diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18688 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 565, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 271.16796875, + "completions/mean_terminated_length": 271.16796875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.15934562776237726, + "epoch": 0.0017699115044247787, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3718360853280971, + "learning_rate": 0.0, + "loss": -0.0009, + "num_tokens": 471851.0, + "reward": 0.48046875, + "reward_std": 0.11373046040534973, + "rewards/execution_accuracy_EX/mean": 0.453125, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9897173643112183, + "sampling/importance_sampling_ratio/min": 0.011167307384312153, + "sampling/sampling_logp_difference/max": 4.494764804840088, + "sampling/sampling_logp_difference/mean": 0.12935766577720642, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 249.3046875, + "completions/mean_terminated_length": 249.3046875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.1653076997026801, + "epoch": 0.0035398230088495575, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.35353629206322235, + "learning_rate": 1.7543859649122805e-08, + "loss": 0.0035, + "num_tokens": 999049.0, + "reward": 0.5732421875, + "reward_std": 0.09824428707361221, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9881603121757507, + "sampling/importance_sampling_ratio/min": 0.006782185286283493, + "sampling/sampling_logp_difference/max": 4.99345588684082, + "sampling/sampling_logp_difference/mean": 0.13611137866973877, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 256.80859375, + "completions/mean_terminated_length": 256.80859375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.14843954052776098, + "epoch": 0.005309734513274336, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.2095262325106797, + "learning_rate": 3.508771929824561e-08, + "loss": 0.0022, + "num_tokens": 1433800.0, + "reward": 0.5658202767372131, + "reward_std": 0.05070105940103531, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9379620552062988, + "sampling/importance_sampling_ratio/mean": 0.9919509887695312, + "sampling/importance_sampling_ratio/min": 0.008978665806353092, + "sampling/sampling_logp_difference/max": 4.71290397644043, + "sampling/sampling_logp_difference/mean": 0.11467862129211426, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 258.3046875, + "completions/mean_terminated_length": 258.3046875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.1667354740202427, + "epoch": 0.007079646017699115, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.43382092455284926, + "learning_rate": 5.2631578947368416e-08, + "loss": 0.0074, + "num_tokens": 2051574.0, + "reward": 0.4507812261581421, + "reward_std": 0.07652123272418976, + "rewards/execution_accuracy_EX/mean": 0.421875, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9905478954315186, + "sampling/importance_sampling_ratio/min": 0.005259412806481123, + "sampling/sampling_logp_difference/max": 5.247735977172852, + "sampling/sampling_logp_difference/mean": 0.13199694454669952, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 254.0625, + "completions/mean_terminated_length": 254.0625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.1663297237828374, + "epoch": 0.008849557522123894, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.22201991026539605, + "learning_rate": 7.017543859649122e-08, + "loss": -0.0038, + "num_tokens": 2615734.0, + "reward": 0.7476562261581421, + "reward_std": 0.040560849010944366, + "rewards/execution_accuracy_EX/mean": 0.734375, + "rewards/execution_accuracy_EX/std": 0.4425306022167206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907875061035156, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.1312158703804016, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 243.65625, + "completions/mean_terminated_length": 243.65625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.15412813518196344, + "epoch": 0.010619469026548672, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5417660434713405, + "learning_rate": 8.771929824561403e-08, + "loss": 0.0025, + "num_tokens": 3078862.0, + "reward": 0.569531261920929, + "reward_std": 0.16495174169540405, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915728569030762, + "sampling/importance_sampling_ratio/min": 0.006744090002030134, + "sampling/sampling_logp_difference/max": 4.999088764190674, + "sampling/sampling_logp_difference/mean": 0.12345244735479355, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 259.73046875, + "completions/mean_terminated_length": 259.73046875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.16022101696580648, + "epoch": 0.012389380530973451, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5129746808708694, + "learning_rate": 1.0526315789473683e-07, + "loss": 0.0017, + "num_tokens": 3646905.0, + "reward": 0.5546875, + "reward_std": 0.19150322675704956, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.938775658607483, + "sampling/importance_sampling_ratio/mean": 0.9897801876068115, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.1270093023777008, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 251.12890625, + "completions/mean_terminated_length": 251.12890625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.15483053773641586, + "epoch": 0.01415929203539823, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5008000600471595, + "learning_rate": 1.2280701754385964e-07, + "loss": 0.0071, + "num_tokens": 3987930.0, + "reward": 0.42851561307907104, + "reward_std": 0.11978859454393387, + "rewards/execution_accuracy_EX/mean": 0.3984375, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904801845550537, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.12343038618564606, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 258.8671875, + "completions/mean_terminated_length": 258.8671875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.1579498378559947, + "epoch": 0.01592920353982301, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.697564281530234, + "learning_rate": 1.4035087719298244e-07, + "loss": 0.0182, + "num_tokens": 4380360.0, + "reward": 0.699414074420929, + "reward_std": 0.18346309661865234, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916104078292847, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.12412801384925842, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 259.6015625, + "completions/mean_terminated_length": 259.6015625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.15939485002309084, + "epoch": 0.017699115044247787, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4724097154417985, + "learning_rate": 1.5789473684210525e-07, + "loss": -0.0058, + "num_tokens": 4767138.0, + "reward": 0.539843738079071, + "reward_std": 0.08787554502487183, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911128878593445, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.12623253464698792, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 242.609375, + "completions/mean_terminated_length": 242.609375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.1538402298465371, + "epoch": 0.019469026548672566, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.536759311923797, + "learning_rate": 1.7543859649122805e-07, + "loss": -0.01, + "num_tokens": 5212222.0, + "reward": 0.43964841961860657, + "reward_std": 0.15701286494731903, + "rewards/execution_accuracy_EX/mean": 0.41015625, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912629127502441, + "sampling/importance_sampling_ratio/min": 0.002483538817614317, + "sampling/sampling_logp_difference/max": 5.99807071685791, + "sampling/sampling_logp_difference/mean": 0.12161983549594879, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 242.28125, + "completions/mean_terminated_length": 242.28125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.16555915400385857, + "epoch": 0.021238938053097345, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4424067167359864, + "learning_rate": 1.9298245614035086e-07, + "loss": -0.0053, + "num_tokens": 5695862.0, + "reward": 0.48417967557907104, + "reward_std": 0.08947963267564774, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9921506643295288, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.12535056471824646, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 279.2578125, + "completions/mean_terminated_length": 279.2578125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.19342941138893366, + "epoch": 0.023008849557522124, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4514916455646264, + "learning_rate": 2.1052631578947366e-07, + "loss": 0.0012, + "num_tokens": 6151192.0, + "reward": 0.3505859375, + "reward_std": 0.13035647571086884, + "rewards/execution_accuracy_EX/mean": 0.31640625, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9960017800331116, + "sampling/importance_sampling_ratio/min": 0.004687149077653885, + "sampling/sampling_logp_difference/max": 5.362930774688721, + "sampling/sampling_logp_difference/mean": 0.1375425159931183, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 271.4296875, + "completions/mean_terminated_length": 271.4296875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.15935497358441353, + "epoch": 0.024778761061946902, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.327113043764012, + "learning_rate": 2.2807017543859647e-07, + "loss": 0.0013, + "num_tokens": 6509142.0, + "reward": 0.3802734613418579, + "reward_std": 0.12165890634059906, + "rewards/execution_accuracy_EX/mean": 0.34765625, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9938601851463318, + "sampling/importance_sampling_ratio/min": 0.0049340371042490005, + "sampling/sampling_logp_difference/max": 5.31159782409668, + "sampling/sampling_logp_difference/mean": 0.12151970714330673, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 535.0, + "completions/max_terminated_length": 535.0, + "completions/mean_length": 242.984375, + "completions/mean_terminated_length": 242.984375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.17131377570331097, + "epoch": 0.02654867256637168, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.63292812729377, + "learning_rate": 2.456140350877193e-07, + "loss": 0.0115, + "num_tokens": 7007938.0, + "reward": 0.7513672113418579, + "reward_std": 0.09449917078018188, + "rewards/execution_accuracy_EX/mean": 0.73828125, + "rewards/execution_accuracy_EX/std": 0.4404313564300537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9872004985809326, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.13804106414318085, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 483.0, + "completions/max_terminated_length": 483.0, + "completions/mean_length": 241.4296875, + "completions/mean_terminated_length": 241.4296875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.15650569833815098, + "epoch": 0.02831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3403532988368406, + "learning_rate": 2.631578947368421e-07, + "loss": 0.0021, + "num_tokens": 7345216.0, + "reward": 0.532421886920929, + "reward_std": 0.07652123272418976, + "rewards/execution_accuracy_EX/mean": 0.5078125, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892227053642273, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.12831050157546997, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 256.02734375, + "completions/mean_terminated_length": 256.02734375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.15152594726532698, + "epoch": 0.03008849557522124, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4201938168281909, + "learning_rate": 2.807017543859649e-07, + "loss": -0.0013, + "num_tokens": 7786391.0, + "reward": 0.632617175579071, + "reward_std": 0.09657369554042816, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9931073188781738, + "sampling/importance_sampling_ratio/min": 0.005292921327054501, + "sampling/sampling_logp_difference/max": 5.241384983062744, + "sampling/sampling_logp_difference/mean": 0.11415106058120728, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 299.6640625, + "completions/mean_terminated_length": 299.6640625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.17045521549880505, + "epoch": 0.03185840707964602, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.42830016979394364, + "learning_rate": 2.982456140350877e-07, + "loss": 0.0001, + "num_tokens": 8425761.0, + "reward": 0.3431640565395355, + "reward_std": 0.10905265808105469, + "rewards/execution_accuracy_EX/mean": 0.30859375, + "rewards/execution_accuracy_EX/std": 0.46281787753105164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894428253173828, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.1361292600631714, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 251.5390625, + "completions/mean_terminated_length": 251.5390625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.15116885397583246, + "epoch": 0.033628318584070796, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.38692038432339304, + "learning_rate": 3.157894736842105e-07, + "loss": -0.0106, + "num_tokens": 9166699.0, + "reward": 0.6585937738418579, + "reward_std": 0.07442296296358109, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9884634017944336, + "sampling/importance_sampling_ratio/min": 0.0015071695670485497, + "sampling/sampling_logp_difference/max": 6.497521877288818, + "sampling/sampling_logp_difference/mean": 0.12785200774669647, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 482.0, + "completions/max_terminated_length": 482.0, + "completions/mean_length": 264.05859375, + "completions/mean_terminated_length": 264.05859375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.16685076151043177, + "epoch": 0.035398230088495575, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5346293409063606, + "learning_rate": 3.333333333333333e-07, + "loss": -0.0003, + "num_tokens": 9636778.0, + "reward": 0.36542969942092896, + "reward_std": 0.11446360498666763, + "rewards/execution_accuracy_EX/mean": 0.33203125, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901782274246216, + "sampling/importance_sampling_ratio/min": 0.008697398006916046, + "sampling/sampling_logp_difference/max": 4.744731426239014, + "sampling/sampling_logp_difference/mean": 0.13019706308841705, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 263.75390625, + "completions/mean_terminated_length": 263.75390625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.17072481475770473, + "epoch": 0.03716814159292035, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.3879771516933411, + "learning_rate": 3.508771929824561e-07, + "loss": -0.0036, + "num_tokens": 10139979.0, + "reward": 0.38398438692092896, + "reward_std": 0.12076221406459808, + "rewards/execution_accuracy_EX/mean": 0.3515625, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916797876358032, + "sampling/importance_sampling_ratio/min": 0.0052751051262021065, + "sampling/sampling_logp_difference/max": 5.244756698608398, + "sampling/sampling_logp_difference/mean": 0.1302974969148636, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 265.765625, + "completions/mean_terminated_length": 265.765625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.1654449887573719, + "epoch": 0.03893805309734513, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.6363805342093116, + "learning_rate": 3.684210526315789e-07, + "loss": -0.0108, + "num_tokens": 10621263.0, + "reward": 0.4322265684604645, + "reward_std": 0.2488056868314743, + "rewards/execution_accuracy_EX/mean": 0.40234375, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9914228916168213, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.1298561841249466, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 242.8828125, + "completions/mean_terminated_length": 242.8828125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.1548225237056613, + "epoch": 0.04070796460176991, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.30343913797362465, + "learning_rate": 3.859649122807017e-07, + "loss": 0.001, + "num_tokens": 11192305.0, + "reward": 0.3876953125, + "reward_std": 0.0709814801812172, + "rewards/execution_accuracy_EX/mean": 0.35546875, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9600321054458618, + "sampling/importance_sampling_ratio/mean": 0.9905182123184204, + "sampling/importance_sampling_ratio/min": 0.008726601488888264, + "sampling/sampling_logp_difference/max": 4.741379261016846, + "sampling/sampling_logp_difference/mean": 0.1252444088459015, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 256.05078125, + "completions/mean_terminated_length": 256.05078125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.16837795451283455, + "epoch": 0.04247787610619469, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.38643293290020925, + "learning_rate": 4.035087719298245e-07, + "loss": 0.0048, + "num_tokens": 11606254.0, + "reward": 0.39140623807907104, + "reward_std": 0.07181769609451294, + "rewards/execution_accuracy_EX/mean": 0.359375, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912103414535522, + "sampling/importance_sampling_ratio/min": 0.010824721306562424, + "sampling/sampling_logp_difference/max": 4.525922775268555, + "sampling/sampling_logp_difference/mean": 0.1307310163974762, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 269.74609375, + "completions/mean_terminated_length": 269.74609375, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.1664047073572874, + "epoch": 0.04424778761061947, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4560689330144635, + "learning_rate": 4.2105263157894733e-07, + "loss": -0.0029, + "num_tokens": 12036397.0, + "reward": 0.4619140625, + "reward_std": 0.14664286375045776, + "rewards/execution_accuracy_EX/mean": 0.43359375, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9932452440261841, + "sampling/importance_sampling_ratio/min": 0.00023083774431142956, + "sampling/sampling_logp_difference/max": 8.373795509338379, + "sampling/sampling_logp_difference/mean": 0.12580636143684387, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 658.0, + "completions/max_terminated_length": 658.0, + "completions/mean_length": 279.5703125, + "completions/mean_terminated_length": 279.5703125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.15781530179083347, + "epoch": 0.04601769911504425, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.261203829935384, + "learning_rate": 4.3859649122807013e-07, + "loss": 0.0033, + "num_tokens": 12561103.0, + "reward": 0.6214843988418579, + "reward_std": 0.06533188372850418, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912391901016235, + "sampling/importance_sampling_ratio/min": 0.008697099052369595, + "sampling/sampling_logp_difference/max": 4.744765758514404, + "sampling/sampling_logp_difference/mean": 0.12695281207561493, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 251.07421875, + "completions/mean_terminated_length": 251.07421875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.15882813464850187, + "epoch": 0.047787610619469026, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.6429945429836382, + "learning_rate": 4.5614035087719294e-07, + "loss": -0.0036, + "num_tokens": 13014050.0, + "reward": 0.576953113079071, + "reward_std": 0.16937553882598877, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991642951965332, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.12487000226974487, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 248.23828125, + "completions/mean_terminated_length": 248.23828125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.16719039157032967, + "epoch": 0.049557522123893805, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.35124885664301203, + "learning_rate": 4.7368421052631574e-07, + "loss": 0.0023, + "num_tokens": 13373951.0, + "reward": 0.5064452886581421, + "reward_std": 0.07390275597572327, + "rewards/execution_accuracy_EX/mean": 0.48046875, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876657128334045, + "sampling/importance_sampling_ratio/min": 0.008669464848935604, + "sampling/sampling_logp_difference/max": 4.747948169708252, + "sampling/sampling_logp_difference/mean": 0.13793246448040009, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 258.015625, + "completions/mean_terminated_length": 258.015625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.16276424657553434, + "epoch": 0.05132743362831858, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.31018533931314113, + "learning_rate": 4.912280701754385e-07, + "loss": 0.009, + "num_tokens": 13801907.0, + "reward": 0.5435546636581421, + "reward_std": 0.07704143971204758, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915119409561157, + "sampling/importance_sampling_ratio/min": 0.014377371408045292, + "sampling/sampling_logp_difference/max": 4.242099761962891, + "sampling/sampling_logp_difference/mean": 0.12507876753807068, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 254.31640625, + "completions/mean_terminated_length": 254.31640625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.16230978816747665, + "epoch": 0.05309734513274336, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6470236391638025, + "learning_rate": 5.087719298245614e-07, + "loss": -0.0002, + "num_tokens": 14166388.0, + "reward": 0.39140623807907104, + "reward_std": 0.12628692388534546, + "rewards/execution_accuracy_EX/mean": 0.359375, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9924486875534058, + "sampling/importance_sampling_ratio/min": 0.005509523209184408, + "sampling/sampling_logp_difference/max": 5.201277256011963, + "sampling/sampling_logp_difference/mean": 0.1254267394542694, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 308.08984375, + "completions/mean_terminated_length": 293.2353210449219, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.17039766628295183, + "epoch": 0.05486725663716814, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.3609975277854385, + "learning_rate": 5.263157894736842e-07, + "loss": -0.0199, + "num_tokens": 14595739.0, + "reward": 0.36152341961860657, + "reward_std": 0.11451171338558197, + "rewards/execution_accuracy_EX/mean": 0.328125, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.992607831954956, + "sampling/importance_sampling_ratio/min": 0.005264220293611288, + "sampling/sampling_logp_difference/max": 5.246822357177734, + "sampling/sampling_logp_difference/mean": 0.12666171789169312, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 266.26953125, + "completions/mean_terminated_length": 266.26953125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.1560642858967185, + "epoch": 0.05663716814159292, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3862649664941044, + "learning_rate": 5.43859649122807e-07, + "loss": -0.0028, + "num_tokens": 15134960.0, + "reward": 0.4916015565395355, + "reward_std": 0.10359475016593933, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896784424781799, + "sampling/importance_sampling_ratio/min": 0.011137904599308968, + "sampling/sampling_logp_difference/max": 4.497401237487793, + "sampling/sampling_logp_difference/mean": 0.12697283923625946, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 273.18359375, + "completions/mean_terminated_length": 273.18359375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.16499491687864065, + "epoch": 0.0584070796460177, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3443077830634362, + "learning_rate": 5.614035087719298e-07, + "loss": 0.0032, + "num_tokens": 15617487.0, + "reward": 0.6400390863418579, + "reward_std": 0.08352725207805634, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896954298019409, + "sampling/importance_sampling_ratio/min": 0.003411046927794814, + "sampling/sampling_logp_difference/max": 5.680736064910889, + "sampling/sampling_logp_difference/mean": 0.12999160587787628, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 256.90234375, + "completions/mean_terminated_length": 256.90234375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.1618333924561739, + "epoch": 0.06017699115044248, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.4956081617232054, + "learning_rate": 5.789473684210526e-07, + "loss": -0.0098, + "num_tokens": 16096726.0, + "reward": 0.5695312023162842, + "reward_std": 0.14044690132141113, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9875262975692749, + "sampling/importance_sampling_ratio/min": 0.007073412649333477, + "sampling/sampling_logp_difference/max": 4.951412200927734, + "sampling/sampling_logp_difference/mean": 0.13545483350753784, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 243.875, + "completions/mean_terminated_length": 243.875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.16362634301185608, + "epoch": 0.061946902654867256, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4716726535681683, + "learning_rate": 5.964912280701754e-07, + "loss": 0.0111, + "num_tokens": 16463574.0, + "reward": 0.5695312023162842, + "reward_std": 0.13379795849323273, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906102418899536, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.1274825632572174, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 253.26171875, + "completions/mean_terminated_length": 253.26171875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.15600297320634127, + "epoch": 0.06371681415929203, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.43647402892283993, + "learning_rate": 6.140350877192982e-07, + "loss": -0.0001, + "num_tokens": 16819209.0, + "reward": 0.3580078184604645, + "reward_std": 0.1251240223646164, + "rewards/execution_accuracy_EX/mean": 0.32421875, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874469041824341, + "sampling/importance_sampling_ratio/min": 0.01435503177344799, + "sampling/sampling_logp_difference/max": 4.243654727935791, + "sampling/sampling_logp_difference/mean": 0.12934981286525726, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 272.01171875, + "completions/mean_terminated_length": 272.01171875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.17558589950203896, + "epoch": 0.06548672566371681, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3917115726809752, + "learning_rate": 6.31578947368421e-07, + "loss": -0.0012, + "num_tokens": 17352396.0, + "reward": 0.46562498807907104, + "reward_std": 0.07592549920082092, + "rewards/execution_accuracy_EX/mean": 0.4375, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9920346736907959, + "sampling/importance_sampling_ratio/min": 0.011149573139846325, + "sampling/sampling_logp_difference/max": 4.496354103088379, + "sampling/sampling_logp_difference/mean": 0.13653096556663513, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 272.6484375, + "completions/mean_terminated_length": 272.6484375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.16311697475612164, + "epoch": 0.06725663716814159, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.46006875693215693, + "learning_rate": 6.491228070175438e-07, + "loss": 0.0085, + "num_tokens": 17832802.0, + "reward": 0.33574220538139343, + "reward_std": 0.1252252757549286, + "rewards/execution_accuracy_EX/mean": 0.30078125, + "rewards/execution_accuracy_EX/std": 0.45949608087539673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890013933181763, + "sampling/importance_sampling_ratio/min": 0.008697140030562878, + "sampling/sampling_logp_difference/max": 4.744760990142822, + "sampling/sampling_logp_difference/mean": 0.133224219083786, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 484.0, + "completions/max_terminated_length": 484.0, + "completions/mean_length": 255.1484375, + "completions/mean_terminated_length": 255.1484375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.1703223865479231, + "epoch": 0.06902654867256637, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.46883165863828125, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0155, + "num_tokens": 18380632.0, + "reward": 0.5732421875, + "reward_std": 0.11382117122411728, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9883578419685364, + "sampling/importance_sampling_ratio/min": 0.01455059926956892, + "sampling/sampling_logp_difference/max": 4.230123043060303, + "sampling/sampling_logp_difference/mean": 0.13755206763744354, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 258.55078125, + "completions/mean_terminated_length": 258.55078125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.16107967123389244, + "epoch": 0.07079646017699115, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.6750277724083866, + "learning_rate": 6.842105263157895e-07, + "loss": -0.0177, + "num_tokens": 18732805.0, + "reward": 0.5732422471046448, + "reward_std": 0.2435562014579773, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9888910055160522, + "sampling/importance_sampling_ratio/min": 0.0067546493373811245, + "sampling/sampling_logp_difference/max": 4.997524261474609, + "sampling/sampling_logp_difference/mean": 0.1301165670156479, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 514.0, + "completions/max_terminated_length": 514.0, + "completions/mean_length": 250.875, + "completions/mean_terminated_length": 250.875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.1552940523251891, + "epoch": 0.07256637168141593, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5005401348111757, + "learning_rate": 7.017543859649122e-07, + "loss": 0.0031, + "num_tokens": 19423813.0, + "reward": 0.632617175579071, + "reward_std": 0.13760298490524292, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985753059387207, + "sampling/importance_sampling_ratio/min": 0.0052642528899014, + "sampling/sampling_logp_difference/max": 5.246816158294678, + "sampling/sampling_logp_difference/mean": 0.1318393349647522, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 520.0, + "completions/max_terminated_length": 520.0, + "completions/mean_length": 244.7265625, + "completions/mean_terminated_length": 244.7265625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.15635372046381235, + "epoch": 0.0743362831858407, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4023042380296677, + "learning_rate": 7.192982456140351e-07, + "loss": 0.0035, + "num_tokens": 19805743.0, + "reward": 0.48046875, + "reward_std": 0.087394580245018, + "rewards/execution_accuracy_EX/mean": 0.453125, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9491380453109741, + "sampling/importance_sampling_ratio/mean": 0.9899490475654602, + "sampling/importance_sampling_ratio/min": 0.011183848604559898, + "sampling/sampling_logp_difference/max": 4.493284702301025, + "sampling/sampling_logp_difference/mean": 0.12644615769386292, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 273.82421875, + "completions/mean_terminated_length": 273.82421875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.15313264727592468, + "epoch": 0.07610619469026549, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.471149655919957, + "learning_rate": 7.368421052631578e-07, + "loss": -0.0015, + "num_tokens": 20292178.0, + "reward": 0.39140626788139343, + "reward_std": 0.11658942699432373, + "rewards/execution_accuracy_EX/mean": 0.359375, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918742775917053, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.1207265704870224, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 245.71484375, + "completions/mean_terminated_length": 245.71484375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.15860996302217245, + "epoch": 0.07787610619469026, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.5926444264300149, + "learning_rate": 7.543859649122807e-07, + "loss": 0.0065, + "num_tokens": 20814217.0, + "reward": 0.6585937738418579, + "reward_std": 0.1951339840888977, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896022081375122, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.12872432172298431, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 683.0, + "completions/max_terminated_length": 683.0, + "completions/mean_length": 286.90234375, + "completions/mean_terminated_length": 286.90234375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.1724842879921198, + "epoch": 0.07964601769911504, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.39127847342188277, + "learning_rate": 7.719298245614034e-07, + "loss": -0.0048, + "num_tokens": 21328128.0, + "reward": 0.37285155057907104, + "reward_std": 0.09240090101957321, + "rewards/execution_accuracy_EX/mean": 0.33984375, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916377067565918, + "sampling/importance_sampling_ratio/min": 0.01261539850383997, + "sampling/sampling_logp_difference/max": 4.372837066650391, + "sampling/sampling_logp_difference/mean": 0.13387610018253326, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 249.42578125, + "completions/mean_terminated_length": 249.42578125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.15575900953263044, + "epoch": 0.08141592920353982, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.595381992519994, + "learning_rate": 7.894736842105263e-07, + "loss": -0.0132, + "num_tokens": 21906413.0, + "reward": 0.576953113079071, + "reward_std": 0.16818717122077942, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918358325958252, + "sampling/importance_sampling_ratio/min": 0.008697489276528358, + "sampling/sampling_logp_difference/max": 4.744720935821533, + "sampling/sampling_logp_difference/mean": 0.1254531294107437, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 237.3125, + "completions/mean_terminated_length": 237.3125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.15542869828641415, + "epoch": 0.0831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3580323522227171, + "learning_rate": 8.07017543859649e-07, + "loss": 0.0006, + "num_tokens": 22426013.0, + "reward": 0.5287109613418579, + "reward_std": 0.1050565242767334, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894780516624451, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.12700873613357544, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 270.91015625, + "completions/mean_terminated_length": 270.91015625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.1632612720131874, + "epoch": 0.08495575221238938, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.4737364078767477, + "learning_rate": 8.245614035087719e-07, + "loss": 0.0131, + "num_tokens": 22927798.0, + "reward": 0.46562498807907104, + "reward_std": 0.1609576940536499, + "rewards/execution_accuracy_EX/mean": 0.4375, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9070905447006226, + "sampling/importance_sampling_ratio/mean": 0.9903364181518555, + "sampling/importance_sampling_ratio/min": 0.01432269811630249, + "sampling/sampling_logp_difference/max": 4.245909690856934, + "sampling/sampling_logp_difference/mean": 0.13301558792591095, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 255.125, + "completions/mean_terminated_length": 255.125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.1544871861115098, + "epoch": 0.08672566371681416, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.9658067733358846, + "learning_rate": 8.421052631578947e-07, + "loss": -0.0034, + "num_tokens": 23282534.0, + "reward": 0.40996092557907104, + "reward_std": 0.1549127697944641, + "rewards/execution_accuracy_EX/mean": 0.37890625, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911001920700073, + "sampling/importance_sampling_ratio/min": 0.0031994825694710016, + "sampling/sampling_logp_difference/max": 5.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.11949102580547333, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 558.0, + "completions/max_terminated_length": 558.0, + "completions/mean_length": 273.28125, + "completions/mean_terminated_length": 273.28125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.16533717513084412, + "epoch": 0.08849557522123894, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4110370892395211, + "learning_rate": 8.596491228070175e-07, + "loss": -0.0018, + "num_tokens": 23766222.0, + "reward": 0.5361328125, + "reward_std": 0.08428344130516052, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990532398223877, + "sampling/importance_sampling_ratio/min": 0.011184120550751686, + "sampling/sampling_logp_difference/max": 4.493260383605957, + "sampling/sampling_logp_difference/mean": 0.1303660273551941, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 239.26171875, + "completions/mean_terminated_length": 239.26171875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.1509690945968032, + "epoch": 0.09026548672566372, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5915328403402159, + "learning_rate": 8.771929824561403e-07, + "loss": 0.0144, + "num_tokens": 24126641.0, + "reward": 0.773632824420929, + "reward_std": 0.14853864908218384, + "rewards/execution_accuracy_EX/mean": 0.76171875, + "rewards/execution_accuracy_EX/std": 0.4268665909767151, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9891676306724548, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.12352632731199265, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 260.34765625, + "completions/mean_terminated_length": 260.34765625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.16321794502437115, + "epoch": 0.0920353982300885, + "frac_reward_zero_std": 0.5, + "grad_norm": 0.5495836081920207, + "learning_rate": 8.947368421052631e-07, + "loss": 0.0095, + "num_tokens": 24667818.0, + "reward": 0.558398425579071, + "reward_std": 0.15073998272418976, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9886378049850464, + "sampling/importance_sampling_ratio/min": 0.01430966705083847, + "sampling/sampling_logp_difference/max": 4.246819972991943, + "sampling/sampling_logp_difference/mean": 0.13047116994857788, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 256.78125, + "completions/mean_terminated_length": 256.78125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.1542206835001707, + "epoch": 0.09380530973451327, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.46657412155303585, + "learning_rate": 9.122807017543859e-07, + "loss": 0.003, + "num_tokens": 25086514.0, + "reward": 0.6363281011581421, + "reward_std": 0.07024834305047989, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989728569984436, + "sampling/importance_sampling_ratio/min": 0.008775152266025543, + "sampling/sampling_logp_difference/max": 4.735831260681152, + "sampling/sampling_logp_difference/mean": 0.1264614462852478, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 251.87890625, + "completions/mean_terminated_length": 251.87890625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.15210619661957026, + "epoch": 0.09557522123893805, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.2814739303666432, + "learning_rate": 9.298245614035087e-07, + "loss": -0.0045, + "num_tokens": 25523475.0, + "reward": 0.6103515625, + "reward_std": 0.045504868030548096, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9884800910949707, + "sampling/importance_sampling_ratio/min": 0.005264220293611288, + "sampling/sampling_logp_difference/max": 5.246822357177734, + "sampling/sampling_logp_difference/mean": 0.12667906284332275, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 271.0234375, + "completions/mean_terminated_length": 271.0234375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.16253683529794216, + "epoch": 0.09734513274336283, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.655573269539638, + "learning_rate": 9.473684210526315e-07, + "loss": -0.0011, + "num_tokens": 26151833.0, + "reward": 0.5361328125, + "reward_std": 0.16318267583847046, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890881776809692, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.13104161620140076, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 270.89453125, + "completions/mean_terminated_length": 270.89453125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.171295078471303, + "epoch": 0.09911504424778761, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.21856908572245218, + "learning_rate": 9.649122807017545e-07, + "loss": 0.0018, + "num_tokens": 26566366.0, + "reward": 0.4990234375, + "reward_std": 0.030420634895563126, + "rewards/execution_accuracy_EX/mean": 0.47265625, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9893931746482849, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.13567079603672028, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 278.71484375, + "completions/mean_terminated_length": 278.71484375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.15868478454649448, + "epoch": 0.10088495575221239, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.42894406154440656, + "learning_rate": 9.82456140350877e-07, + "loss": 0.0079, + "num_tokens": 27160645.0, + "reward": 0.43593746423721313, + "reward_std": 0.10411045700311661, + "rewards/execution_accuracy_EX/mean": 0.40625, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990469217300415, + "sampling/importance_sampling_ratio/min": 0.007318601477891207, + "sampling/sampling_logp_difference/max": 4.9173359870910645, + "sampling/sampling_logp_difference/mean": 0.1256943643093109, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 510.0, + "completions/max_terminated_length": 510.0, + "completions/mean_length": 266.67578125, + "completions/mean_terminated_length": 266.67578125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.1557472189888358, + "epoch": 0.10265486725663717, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.19193666625505582, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 27666418.0, + "reward": 0.736523449420929, + "reward_std": 0.030420634895563126, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860793352127075, + "sampling/importance_sampling_ratio/min": 0.011136556044220924, + "sampling/sampling_logp_difference/max": 4.497522354125977, + "sampling/sampling_logp_difference/mean": 0.1295561045408249, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 584.0, + "completions/max_terminated_length": 584.0, + "completions/mean_length": 276.62109375, + "completions/mean_terminated_length": 276.62109375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.16748062148690224, + "epoch": 0.10442477876106195, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.37844013508385604, + "learning_rate": 1e-06, + "loss": 0.0102, + "num_tokens": 28327585.0, + "reward": 0.5249999761581421, + "reward_std": 0.1216825395822525, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9872063994407654, + "sampling/importance_sampling_ratio/min": 0.0031934145372360945, + "sampling/sampling_logp_difference/max": 5.746664524078369, + "sampling/sampling_logp_difference/mean": 0.13904190063476562, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 248.765625, + "completions/mean_terminated_length": 248.765625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.15811389405280352, + "epoch": 0.10619469026548672, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.471437674054244, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 28752037.0, + "reward": 0.7699218988418579, + "reward_std": 0.056240808218717575, + "rewards/execution_accuracy_EX/mean": 0.7578125, + "rewards/execution_accuracy_EX/std": 0.4292463958263397, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9995441436767578, + "sampling/importance_sampling_ratio/mean": 0.9891387224197388, + "sampling/importance_sampling_ratio/min": 0.011232558637857437, + "sampling/sampling_logp_difference/max": 4.488938808441162, + "sampling/sampling_logp_difference/mean": 0.13061580061912537, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 254.82421875, + "completions/mean_terminated_length": 254.82421875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.15606304723769426, + "epoch": 0.1079646017699115, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3694867959581085, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 29137368.0, + "reward": 0.3951171934604645, + "reward_std": 0.08195790648460388, + "rewards/execution_accuracy_EX/mean": 0.36328125, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906754493713379, + "sampling/importance_sampling_ratio/min": 0.011422280222177505, + "sampling/sampling_logp_difference/max": 4.472189426422119, + "sampling/sampling_logp_difference/mean": 0.12278417497873306, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 247.20703125, + "completions/mean_terminated_length": 247.20703125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.15486662182956934, + "epoch": 0.10973451327433628, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5631731421841643, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 29616333.0, + "reward": 0.3134765625, + "reward_std": 0.13842415809631348, + "rewards/execution_accuracy_EX/mean": 0.27734375, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9910948872566223, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.12670296430587769, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 245.11328125, + "completions/mean_terminated_length": 245.11328125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.14672744553536177, + "epoch": 0.11150442477876106, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.4926263656012665, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 29999274.0, + "reward": 0.6548827886581421, + "reward_std": 0.14507801830768585, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985906720161438, + "sampling/importance_sampling_ratio/min": 0.0010352961253374815, + "sampling/sampling_logp_difference/max": 6.873067855834961, + "sampling/sampling_logp_difference/mean": 0.13023461401462555, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 244.82421875, + "completions/mean_terminated_length": 244.82421875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.140235036611557, + "epoch": 0.11327433628318584, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.31133540422069816, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 30326221.0, + "reward": 0.614062488079071, + "reward_std": 0.07652123272418976, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862856864929199, + "sampling/importance_sampling_ratio/min": 0.011226662434637547, + "sampling/sampling_logp_difference/max": 4.489463806152344, + "sampling/sampling_logp_difference/mean": 0.12468035519123077, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 249.796875, + "completions/mean_terminated_length": 249.796875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.14288225024938583, + "epoch": 0.11504424778761062, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.49435018549400433, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 30967945.0, + "reward": 0.5992187261581421, + "reward_std": 0.11038152128458023, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9403454065322876, + "sampling/importance_sampling_ratio/mean": 0.9919389486312866, + "sampling/importance_sampling_ratio/min": 0.018361039459705353, + "sampling/sampling_logp_difference/max": 3.9975242614746094, + "sampling/sampling_logp_difference/mean": 0.1192697137594223, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 244.9453125, + "completions/mean_terminated_length": 244.9453125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.16346758417785168, + "epoch": 0.1168141592920354, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.705943009209385, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 31329547.0, + "reward": 0.7958984375, + "reward_std": 0.16378909349441528, + "rewards/execution_accuracy_EX/mean": 0.78515625, + "rewards/execution_accuracy_EX/std": 0.4115184545516968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889636635780334, + "sampling/importance_sampling_ratio/min": 0.0067546493373811245, + "sampling/sampling_logp_difference/max": 4.997524261474609, + "sampling/sampling_logp_difference/mean": 0.13075979053974152, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 247.65625, + "completions/mean_terminated_length": 247.65625, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.13692456483840942, + "epoch": 0.11858407079646018, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.41960662240753216, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 31673187.0, + "reward": 0.6177734136581421, + "reward_std": 0.12700936198234558, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900465607643127, + "sampling/importance_sampling_ratio/min": 0.014339148066937923, + "sampling/sampling_logp_difference/max": 4.244761943817139, + "sampling/sampling_logp_difference/mean": 0.11534422636032104, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 231.02734375, + "completions/mean_terminated_length": 231.02734375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.14337879046797752, + "epoch": 0.12035398230088495, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5793720639466535, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 32083402.0, + "reward": 0.6623046398162842, + "reward_std": 0.14669404923915863, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8921527862548828, + "sampling/importance_sampling_ratio/mean": 0.9853760004043579, + "sampling/importance_sampling_ratio/min": 0.014309640042483807, + "sampling/sampling_logp_difference/max": 4.246821880340576, + "sampling/sampling_logp_difference/mean": 0.12616120278835297, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 265.67578125, + "completions/mean_terminated_length": 265.67578125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.15827831160277128, + "epoch": 0.12212389380530973, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.36723289589059144, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 32733591.0, + "reward": 0.4136718511581421, + "reward_std": 0.07652123272418976, + "rewards/execution_accuracy_EX/mean": 0.3828125, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901444911956787, + "sampling/importance_sampling_ratio/min": 0.011146697215735912, + "sampling/sampling_logp_difference/max": 4.496612071990967, + "sampling/sampling_logp_difference/mean": 0.12319838255643845, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 240.4765625, + "completions/mean_terminated_length": 240.4765625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.14707565028220415, + "epoch": 0.12389380530973451, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5123525866198354, + "learning_rate": 1e-06, + "loss": 0.0103, + "num_tokens": 33098577.0, + "reward": 0.6771484613418579, + "reward_std": 0.10976006090641022, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9871964454650879, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.12734374403953552, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 253.390625, + "completions/mean_terminated_length": 253.390625, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.14430231135338545, + "epoch": 0.1256637168141593, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.4393704191662513, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 33566453.0, + "reward": 0.4322265684604645, + "reward_std": 0.07519236207008362, + "rewards/execution_accuracy_EX/mean": 0.40234375, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.979678750038147, + "sampling/importance_sampling_ratio/mean": 0.9862529039382935, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.12432628870010376, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 251.34375, + "completions/mean_terminated_length": 251.34375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.1437322748824954, + "epoch": 0.12743362831858407, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5928725986978491, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 34383021.0, + "reward": 0.6771484613418579, + "reward_std": 0.17059272527694702, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9834222197532654, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.13263297080993652, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 245.2890625, + "completions/mean_terminated_length": 245.2890625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.14456360507756472, + "epoch": 0.12920353982300886, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4596876714189424, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 34942663.0, + "reward": 0.5435546636581421, + "reward_std": 0.07704144716262817, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9871764183044434, + "sampling/importance_sampling_ratio/min": 0.008698852732777596, + "sampling/sampling_logp_difference/max": 4.744564056396484, + "sampling/sampling_logp_difference/mean": 0.12493147701025009, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 605.0, + "completions/max_terminated_length": 605.0, + "completions/mean_length": 269.0234375, + "completions/mean_terminated_length": 269.0234375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.15099895559251308, + "epoch": 0.13097345132743363, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.35108097426422163, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 35485437.0, + "reward": 0.6919921636581421, + "reward_std": 0.055404599756002426, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9850528836250305, + "sampling/importance_sampling_ratio/min": 0.00975518673658371, + "sampling/sampling_logp_difference/max": 4.629956245422363, + "sampling/sampling_logp_difference/mean": 0.13283172249794006, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 267.234375, + "completions/mean_terminated_length": 267.234375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.16901178658008575, + "epoch": 0.13274336283185842, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5273541990780711, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 36026793.0, + "reward": 0.5843750238418579, + "reward_std": 0.14288721978664398, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99019855260849, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13086813688278198, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 271.046875, + "completions/mean_terminated_length": 271.046875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.15238861553370953, + "epoch": 0.13451327433628318, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.40087372480650646, + "learning_rate": 1e-06, + "loss": 0.0118, + "num_tokens": 36493301.0, + "reward": 0.5027344226837158, + "reward_std": 0.08382821083068848, + "rewards/execution_accuracy_EX/mean": 0.4765625, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915053844451904, + "sampling/importance_sampling_ratio/min": 0.014470180496573448, + "sampling/sampling_logp_difference/max": 4.235665321350098, + "sampling/sampling_logp_difference/mean": 0.1226399838924408, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 247.19921875, + "completions/mean_terminated_length": 247.19921875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.1407962953671813, + "epoch": 0.13628318584070798, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3009020661114259, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 36941400.0, + "reward": 0.5732421875, + "reward_std": 0.053622327744960785, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9816585779190063, + "sampling/importance_sampling_ratio/mean": 0.9868438839912415, + "sampling/importance_sampling_ratio/min": 0.004535609390586615, + "sampling/sampling_logp_difference/max": 5.395795822143555, + "sampling/sampling_logp_difference/mean": 0.1235656887292862, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 251.828125, + "completions/mean_terminated_length": 251.828125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.13365309080109, + "epoch": 0.13805309734513274, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.45197690483932235, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 37504860.0, + "reward": 0.40996092557907104, + "reward_std": 0.10851120948791504, + "rewards/execution_accuracy_EX/mean": 0.37890625, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9868944883346558, + "sampling/importance_sampling_ratio/min": 0.006765489932149649, + "sampling/sampling_logp_difference/max": 4.995920658111572, + "sampling/sampling_logp_difference/mean": 0.11721721291542053, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 441.0, + "completions/max_terminated_length": 441.0, + "completions/mean_length": 258.06640625, + "completions/mean_terminated_length": 258.06640625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.1493245316669345, + "epoch": 0.13982300884955753, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.3971716327047687, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 37974557.0, + "reward": 0.5324218273162842, + "reward_std": 0.09523230791091919, + "rewards/execution_accuracy_EX/mean": 0.5078125, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.987461507320404, + "sampling/importance_sampling_ratio/min": 0.01118389144539833, + "sampling/sampling_logp_difference/max": 4.49328088760376, + "sampling/sampling_logp_difference/mean": 0.12770724296569824, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 258.30859375, + "completions/mean_terminated_length": 258.30859375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.13829983118921518, + "epoch": 0.1415929203539823, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.36638522598006096, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 38638780.0, + "reward": 0.5658203363418579, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985849916934967, + "sampling/importance_sampling_ratio/min": 0.008736731484532356, + "sampling/sampling_logp_difference/max": 4.7402191162109375, + "sampling/sampling_logp_difference/mean": 0.12555652856826782, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 465.0, + "completions/max_terminated_length": 465.0, + "completions/mean_length": 249.34765625, + "completions/mean_terminated_length": 249.34765625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.1591026447713375, + "epoch": 0.1433628318584071, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5951723248871006, + "learning_rate": 1e-06, + "loss": 0.0135, + "num_tokens": 39192133.0, + "reward": 0.5287109613418579, + "reward_std": 0.12155766040086746, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9834750890731812, + "sampling/importance_sampling_ratio/min": 0.011136576533317566, + "sampling/sampling_logp_difference/max": 4.497520446777344, + "sampling/sampling_logp_difference/mean": 0.14167138934135437, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 229.1015625, + "completions/mean_terminated_length": 229.1015625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.14118606969714165, + "epoch": 0.14513274336283186, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.49369786226219176, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 39709855.0, + "reward": 0.443359375, + "reward_std": 0.07463589310646057, + "rewards/execution_accuracy_EX/mean": 0.4140625, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9870415925979614, + "sampling/importance_sampling_ratio/min": 0.0053710173815488815, + "sampling/sampling_logp_difference/max": 5.226737976074219, + "sampling/sampling_logp_difference/mean": 0.12739017605781555, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 227.53515625, + "completions/mean_terminated_length": 227.53515625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.14053333550691605, + "epoch": 0.14690265486725665, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.42611443788318915, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 40200184.0, + "reward": 0.5546875, + "reward_std": 0.06034861505031586, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9842078685760498, + "sampling/importance_sampling_ratio/min": 0.0031994825694710016, + "sampling/sampling_logp_difference/max": 5.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.12890583276748657, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 254.375, + "completions/mean_terminated_length": 254.375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.13399124145507812, + "epoch": 0.1486725663716814, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5382356286285921, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 40928104.0, + "reward": 0.588085949420929, + "reward_std": 0.13463234901428223, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9846076965332031, + "sampling/importance_sampling_ratio/min": 0.005270363297313452, + "sampling/sampling_logp_difference/max": 5.2456560134887695, + "sampling/sampling_logp_difference/mean": 0.1227772980928421, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 253.609375, + "completions/mean_terminated_length": 253.609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.14286551717668772, + "epoch": 0.1504424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5308770277208859, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 41251076.0, + "reward": 0.6177734136581421, + "reward_std": 0.10954530537128448, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9866983890533447, + "sampling/importance_sampling_ratio/min": 0.006744090002030134, + "sampling/sampling_logp_difference/max": 4.999088764190674, + "sampling/sampling_logp_difference/mean": 0.1250735968351364, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 474.0, + "completions/max_terminated_length": 474.0, + "completions/mean_length": 241.0703125, + "completions/mean_terminated_length": 241.0703125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.14058785513043404, + "epoch": 0.15221238938053097, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.516128474525671, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 41674758.0, + "reward": 0.6140625476837158, + "reward_std": 0.10150519013404846, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9833188056945801, + "sampling/importance_sampling_ratio/min": 0.009839275851845741, + "sampling/sampling_logp_difference/max": 4.621373176574707, + "sampling/sampling_logp_difference/mean": 0.13053299486637115, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 224.6015625, + "completions/mean_terminated_length": 224.6015625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.12872455874457955, + "epoch": 0.15398230088495576, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3222598225126463, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 42160736.0, + "reward": 0.5992187261581421, + "reward_std": 0.049967922270298004, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812698364257812, + "sampling/importance_sampling_ratio/min": 0.011136570945382118, + "sampling/sampling_logp_difference/max": 4.497520923614502, + "sampling/sampling_logp_difference/mean": 0.12602566182613373, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 255.4765625, + "completions/mean_terminated_length": 255.4765625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.13219175767153502, + "epoch": 0.15575221238938053, + "frac_reward_zero_std": 0.375, + "grad_norm": 0.7502465863244118, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 42696074.0, + "reward": 0.517578125, + "reward_std": 0.22942377626895905, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.982456386089325, + "sampling/importance_sampling_ratio/min": 0.008679235354065895, + "sampling/sampling_logp_difference/max": 4.746821880340576, + "sampling/sampling_logp_difference/mean": 0.1295037716627121, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 260.21875, + "completions/mean_terminated_length": 260.21875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.13940082676708698, + "epoch": 0.15752212389380532, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.5394519972557607, + "learning_rate": 1e-06, + "loss": 0.0098, + "num_tokens": 43249010.0, + "reward": 0.48417967557907104, + "reward_std": 0.14929932355880737, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862859845161438, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.12424317002296448, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 256.921875, + "completions/mean_terminated_length": 256.921875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.1300432663410902, + "epoch": 0.1592920353982301, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3120853116923606, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 43744734.0, + "reward": 0.569531261920929, + "reward_std": 0.0726388692855835, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860806465148926, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.11924497783184052, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 633.0, + "completions/max_terminated_length": 633.0, + "completions/mean_length": 238.203125, + "completions/mean_terminated_length": 238.203125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.1373284813016653, + "epoch": 0.16106194690265488, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.3496778784671871, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 44119490.0, + "reward": 0.62890625, + "reward_std": 0.0726388692855835, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9847703576087952, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.12893569469451904, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 270.34375, + "completions/mean_terminated_length": 270.34375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.1233369167894125, + "epoch": 0.16283185840707964, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.24394913692209896, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 44674442.0, + "reward": 0.5101562738418579, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98430997133255, + "sampling/importance_sampling_ratio/min": 0.009683731012046337, + "sampling/sampling_logp_difference/max": 4.637308120727539, + "sampling/sampling_logp_difference/mean": 0.11890994012355804, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 209.3125, + "completions/mean_terminated_length": 209.3125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.11507104709744453, + "epoch": 0.16460176991150444, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.49668340779219333, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 45071546.0, + "reward": 0.6957030892372131, + "reward_std": 0.10463938117027283, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9818706512451172, + "sampling/importance_sampling_ratio/min": 0.006784724537283182, + "sampling/sampling_logp_difference/max": 4.993081569671631, + "sampling/sampling_logp_difference/mean": 0.119605153799057, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 255.90234375, + "completions/mean_terminated_length": 255.90234375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.12336459616199136, + "epoch": 0.1663716814159292, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.47873542522941465, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 45598369.0, + "reward": 0.49531248211860657, + "reward_std": 0.09993584454059601, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826380014419556, + "sampling/importance_sampling_ratio/min": 9.701930684968829e-05, + "sampling/sampling_logp_difference/max": 9.2406005859375, + "sampling/sampling_logp_difference/mean": 0.12706580758094788, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 224.03125, + "completions/mean_terminated_length": 224.03125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.10378375835716724, + "epoch": 0.168141592920354, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7302596844663868, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 46179961.0, + "reward": 0.6363281011581421, + "reward_std": 0.10118919610977173, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9818361401557922, + "sampling/importance_sampling_ratio/min": 0.0016268499894067645, + "sampling/sampling_logp_difference/max": 6.421109676361084, + "sampling/sampling_logp_difference/mean": 0.1124076396226883, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 255.484375, + "completions/mean_terminated_length": 255.484375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.1202244283631444, + "epoch": 0.16991150442477876, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6519829873603941, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 46693797.0, + "reward": 0.5880858898162842, + "reward_std": 0.15692231059074402, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838235378265381, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.12167157232761383, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 207.453125, + "completions/mean_terminated_length": 207.453125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.10423938743770123, + "epoch": 0.17168141592920355, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6998644369280165, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 47090937.0, + "reward": 0.7105468511581421, + "reward_std": 0.1457461714744568, + "rewards/execution_accuracy_EX/mean": 0.6953125, + "rewards/execution_accuracy_EX/std": 0.4611765742301941, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811939001083374, + "sampling/importance_sampling_ratio/min": 0.014339085668325424, + "sampling/sampling_logp_difference/max": 4.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.1167488768696785, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 243.77734375, + "completions/mean_terminated_length": 243.77734375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.11254391726106405, + "epoch": 0.17345132743362832, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.592336469339125, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 47617008.0, + "reward": 0.3988281190395355, + "reward_std": 0.11865141987800598, + "rewards/execution_accuracy_EX/mean": 0.3671875, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815082550048828, + "sampling/importance_sampling_ratio/min": 0.006885554175823927, + "sampling/sampling_logp_difference/max": 4.978329658508301, + "sampling/sampling_logp_difference/mean": 0.1191565990447998, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 230.40625, + "completions/mean_terminated_length": 230.40625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.11601526476442814, + "epoch": 0.1752212389380531, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.7784756849168607, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 48090568.0, + "reward": 0.4916015863418579, + "reward_std": 0.2303093671798706, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9816043376922607, + "sampling/importance_sampling_ratio/min": 0.0024954548571258783, + "sampling/sampling_logp_difference/max": 5.993284225463867, + "sampling/sampling_logp_difference/mean": 0.12253564596176147, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 231.5390625, + "completions/mean_terminated_length": 231.5390625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.10855316184461117, + "epoch": 0.17699115044247787, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6155129861921234, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 48574786.0, + "reward": 0.614062488079071, + "reward_std": 0.0859283059835434, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9813829660415649, + "sampling/importance_sampling_ratio/min": 0.014280949719250202, + "sampling/sampling_logp_difference/max": 4.248828887939453, + "sampling/sampling_logp_difference/mean": 0.11490459740161896, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 225.52734375, + "completions/mean_terminated_length": 225.52734375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.1176459020934999, + "epoch": 0.17876106194690267, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5859686788725634, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 49274361.0, + "reward": 0.48417970538139343, + "reward_std": 0.11261902004480362, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.983181357383728, + "sampling/importance_sampling_ratio/min": 0.008657840080559254, + "sampling/sampling_logp_difference/max": 4.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.12156712263822556, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 224.70703125, + "completions/mean_terminated_length": 224.70703125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.1159709938801825, + "epoch": 0.18053097345132743, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.5971187276792003, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 49722542.0, + "reward": 0.5509765148162842, + "reward_std": 0.14512471854686737, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808385372161865, + "sampling/importance_sampling_ratio/min": 0.006748217158019543, + "sampling/sampling_logp_difference/max": 4.998476982116699, + "sampling/sampling_logp_difference/mean": 0.12293193489313126, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 240.8515625, + "completions/mean_terminated_length": 240.8515625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.10734113771468401, + "epoch": 0.18230088495575222, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.34698883138858766, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 50303928.0, + "reward": 0.532421886920929, + "reward_std": 0.07749484479427338, + "rewards/execution_accuracy_EX/mean": 0.5078125, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9824478626251221, + "sampling/importance_sampling_ratio/min": 0.007269550114870071, + "sampling/sampling_logp_difference/max": 4.924060821533203, + "sampling/sampling_logp_difference/mean": 0.1154562383890152, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 367.0, + "completions/max_terminated_length": 367.0, + "completions/mean_length": 209.046875, + "completions/mean_terminated_length": 209.046875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.10368814738467336, + "epoch": 0.184070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6533484189858692, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 50859780.0, + "reward": 0.6474609375, + "reward_std": 0.1079154908657074, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9794267416000366, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.11755738407373428, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 249.80859375, + "completions/mean_terminated_length": 249.80859375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.11205186462029815, + "epoch": 0.18584070796460178, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5596451756794907, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 51429315.0, + "reward": 0.7216796875, + "reward_std": 0.08038856089115143, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812313914299011, + "sampling/importance_sampling_ratio/min": 0.0024153217673301697, + "sampling/sampling_logp_difference/max": 6.025922775268555, + "sampling/sampling_logp_difference/mean": 0.12050285935401917, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 401.0, + "completions/max_terminated_length": 401.0, + "completions/mean_length": 209.70703125, + "completions/mean_terminated_length": 209.70703125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.10347587242722511, + "epoch": 0.18761061946902655, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.327592507336327, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 51851544.0, + "reward": 0.669726550579071, + "reward_std": 0.045504868030548096, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9436527490615845, + "sampling/importance_sampling_ratio/mean": 0.9785642623901367, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.1196037083864212, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 204.44140625, + "completions/mean_terminated_length": 204.44140625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.10565184382721782, + "epoch": 0.18938053097345134, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.58343713718625, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 52334249.0, + "reward": 0.7439453601837158, + "reward_std": 0.09104898571968079, + "rewards/execution_accuracy_EX/mean": 0.73046875, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7635939121246338, + "sampling/importance_sampling_ratio/mean": 0.9796919226646423, + "sampling/importance_sampling_ratio/min": 0.0067548807710409164, + "sampling/sampling_logp_difference/max": 4.997489929199219, + "sampling/sampling_logp_difference/mean": 0.12021346390247345, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 203.52734375, + "completions/mean_terminated_length": 203.52734375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.10722668562084436, + "epoch": 0.1911504424778761, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6644319399567485, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 52762816.0, + "reward": 0.6474609375, + "reward_std": 0.10284657776355743, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790235757827759, + "sampling/importance_sampling_ratio/min": 0.005264465697109699, + "sampling/sampling_logp_difference/max": 5.2467756271362305, + "sampling/sampling_logp_difference/mean": 0.1226806491613388, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 232.69140625, + "completions/mean_terminated_length": 232.69140625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.10158722009509802, + "epoch": 0.1929203539823009, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6060721324679824, + "learning_rate": 1e-06, + "loss": 0.021, + "num_tokens": 53326625.0, + "reward": 0.5435546636581421, + "reward_std": 0.10694187134504318, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9834798574447632, + "sampling/importance_sampling_ratio/mean": 0.9809249043464661, + "sampling/importance_sampling_ratio/min": 0.0040969038382172585, + "sampling/sampling_logp_difference/max": 5.497523784637451, + "sampling/sampling_logp_difference/mean": 0.11318153142929077, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 199.62109375, + "completions/mean_terminated_length": 199.62109375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.09194927336648107, + "epoch": 0.19469026548672566, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.7803880697355865, + "learning_rate": 1e-06, + "loss": -0.026, + "num_tokens": 53836016.0, + "reward": 0.6585937738418579, + "reward_std": 0.13558024168014526, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978531002998352, + "sampling/importance_sampling_ratio/min": 0.003188925562426448, + "sampling/sampling_logp_difference/max": 5.748071193695068, + "sampling/sampling_logp_difference/mean": 0.11296068131923676, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 242.75390625, + "completions/mean_terminated_length": 242.75390625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.10220854729413986, + "epoch": 0.19646017699115045, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6219019023040464, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 54415873.0, + "reward": 0.632617175579071, + "reward_std": 0.12484428286552429, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984710693359375, + "sampling/importance_sampling_ratio/min": 0.007291032467037439, + "sampling/sampling_logp_difference/max": 4.921110153198242, + "sampling/sampling_logp_difference/mean": 0.10962589085102081, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 411.0, + "completions/max_terminated_length": 411.0, + "completions/mean_length": 216.36328125, + "completions/mean_terminated_length": 216.36328125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.09269869700074196, + "epoch": 0.19823008849557522, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.39820004079656984, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 54904590.0, + "reward": 0.6251952648162842, + "reward_std": 0.05070105940103531, + "rewards/execution_accuracy_EX/mean": 0.60546875, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978786826133728, + "sampling/importance_sampling_ratio/min": 0.008668440394103527, + "sampling/sampling_logp_difference/max": 4.748066425323486, + "sampling/sampling_logp_difference/mean": 0.11571105569601059, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 195.97265625, + "completions/mean_terminated_length": 195.97265625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.08949707262217999, + "epoch": 0.2, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.9151277910188915, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 55382103.0, + "reward": 0.717968761920929, + "reward_std": 0.15376240015029907, + "rewards/execution_accuracy_EX/mean": 0.703125, + "rewards/execution_accuracy_EX/std": 0.45777595043182373, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8252545595169067, + "sampling/importance_sampling_ratio/mean": 0.9785122871398926, + "sampling/importance_sampling_ratio/min": 0.005275193601846695, + "sampling/sampling_logp_difference/max": 5.244740009307861, + "sampling/sampling_logp_difference/mean": 0.11404772102832794, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 206.4296875, + "completions/mean_terminated_length": 206.4296875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.0843565477989614, + "epoch": 0.20176991150442478, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.7758113158186482, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 55825253.0, + "reward": 0.46562498807907104, + "reward_std": 0.17511767148971558, + "rewards/execution_accuracy_EX/mean": 0.4375, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798601865768433, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.1056603267788887, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 208.6171875, + "completions/mean_terminated_length": 208.6171875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.08362881932407618, + "epoch": 0.20353982300884957, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.6768975964136774, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 56328579.0, + "reward": 0.595507800579071, + "reward_std": 0.1660008728504181, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9331179857254028, + "sampling/importance_sampling_ratio/mean": 0.981865406036377, + "sampling/importance_sampling_ratio/min": 0.005257657263427973, + "sampling/sampling_logp_difference/max": 5.248069763183594, + "sampling/sampling_logp_difference/mean": 0.10724613070487976, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 201.3359375, + "completions/mean_terminated_length": 201.3359375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.08422743389382958, + "epoch": 0.20530973451327433, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7836037832764083, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 56725449.0, + "reward": 0.6029297113418579, + "reward_std": 0.15058711171150208, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796129465103149, + "sampling/importance_sampling_ratio/min": 0.006748081650584936, + "sampling/sampling_logp_difference/max": 4.998497009277344, + "sampling/sampling_logp_difference/mean": 0.1074795126914978, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 198.203125, + "completions/mean_terminated_length": 198.203125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.08709999080747366, + "epoch": 0.20707964601769913, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7483009801443501, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 57198493.0, + "reward": 0.662304699420929, + "reward_std": 0.11717012524604797, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799016118049622, + "sampling/importance_sampling_ratio/min": 0.002495958236977458, + "sampling/sampling_logp_difference/max": 5.993082523345947, + "sampling/sampling_logp_difference/mean": 0.11591166257858276, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 234.34375, + "completions/mean_terminated_length": 234.34375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.11767698405310512, + "epoch": 0.2088495575221239, + "frac_reward_zero_std": 0.4375, + "grad_norm": 0.9452872240731504, + "learning_rate": 1e-06, + "loss": 0.0284, + "num_tokens": 57685477.0, + "reward": 0.4916015863418579, + "reward_std": 0.22398976981639862, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9829739332199097, + "sampling/importance_sampling_ratio/min": 0.006783349439501762, + "sampling/sampling_logp_difference/max": 4.993284225463867, + "sampling/sampling_logp_difference/mean": 0.12430395185947418, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 205.609375, + "completions/mean_terminated_length": 205.609375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.08831424685195088, + "epoch": 0.21061946902654868, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5796923594345035, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 58077233.0, + "reward": 0.606640636920929, + "reward_std": 0.06846607476472855, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9810867309570312, + "sampling/importance_sampling_ratio/min": 0.008661833591759205, + "sampling/sampling_logp_difference/max": 4.748828887939453, + "sampling/sampling_logp_difference/mean": 0.10440923273563385, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 208.79296875, + "completions/mean_terminated_length": 208.79296875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.09712347202003002, + "epoch": 0.21238938053097345, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7197104488845438, + "learning_rate": 1e-06, + "loss": -0.0128, + "num_tokens": 58598716.0, + "reward": 0.606640636920929, + "reward_std": 0.11373046040534973, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9806808233261108, + "sampling/importance_sampling_ratio/min": 0.004097335506230593, + "sampling/sampling_logp_difference/max": 5.497418403625488, + "sampling/sampling_logp_difference/mean": 0.11543069034814835, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 205.27734375, + "completions/mean_terminated_length": 205.27734375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.08803935116156936, + "epoch": 0.21415929203539824, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7640824796201762, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 59011891.0, + "reward": 0.5361328125, + "reward_std": 0.10986313223838806, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9773586392402649, + "sampling/importance_sampling_ratio/min": 0.0049935500137507915, + "sampling/sampling_logp_difference/max": 5.29960823059082, + "sampling/sampling_logp_difference/mean": 0.11789683997631073, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 207.0859375, + "completions/mean_terminated_length": 207.0859375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.09456527512520552, + "epoch": 0.215929203539823, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.2887519228250559, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 59484841.0, + "reward": 0.4878906011581421, + "reward_std": 0.04326736554503441, + "rewards/execution_accuracy_EX/mean": 0.4609375, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9804641008377075, + "sampling/importance_sampling_ratio/min": 0.008669535629451275, + "sampling/sampling_logp_difference/max": 4.7479400634765625, + "sampling/sampling_logp_difference/mean": 0.11307957023382187, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 590.0, + "completions/max_terminated_length": 590.0, + "completions/mean_length": 222.859375, + "completions/mean_terminated_length": 222.859375, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.09229281730949879, + "epoch": 0.2176991150442478, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.669269608737298, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 59865893.0, + "reward": 0.5249999761581421, + "reward_std": 0.14341795444488525, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.947978138923645, + "sampling/importance_sampling_ratio/mean": 0.977138340473175, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.11797585338354111, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 195.48828125, + "completions/mean_terminated_length": 195.48828125, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "entropy": 0.08070922084152699, + "epoch": 0.21946902654867256, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.0568791745676698, + "learning_rate": 1e-06, + "loss": 0.0081, + "num_tokens": 60474018.0, + "reward": 0.4916015565395355, + "reward_std": 0.10807903110980988, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.976543664932251, + "sampling/importance_sampling_ratio/min": 0.005264227278530598, + "sampling/sampling_logp_difference/max": 5.24682092666626, + "sampling/sampling_logp_difference/mean": 0.11132196336984634, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 417.0, + "completions/max_terminated_length": 417.0, + "completions/mean_length": 195.8984375, + "completions/mean_terminated_length": 195.8984375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.08272697357460856, + "epoch": 0.22123893805309736, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5912762532175119, + "learning_rate": 1e-06, + "loss": 0.0133, + "num_tokens": 60749656.0, + "reward": 0.688281238079071, + "reward_std": 0.10188308358192444, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9845452308654785, + "sampling/importance_sampling_ratio/mean": 0.9825228452682495, + "sampling/importance_sampling_ratio/min": 0.004114360548555851, + "sampling/sampling_logp_difference/max": 5.493271827697754, + "sampling/sampling_logp_difference/mean": 0.10054115951061249, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 207.53125, + "completions/mean_terminated_length": 207.53125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.08929994003847241, + "epoch": 0.22300884955752212, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.0750690683806041, + "learning_rate": 1e-06, + "loss": 0.0181, + "num_tokens": 61236336.0, + "reward": 0.513867199420929, + "reward_std": 0.16860431432724, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823257923126221, + "sampling/importance_sampling_ratio/min": 0.00409407215192914, + "sampling/sampling_logp_difference/max": 5.498215198516846, + "sampling/sampling_logp_difference/mean": 0.10947106778621674, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 604.0, + "completions/max_terminated_length": 604.0, + "completions/mean_length": 185.3515625, + "completions/mean_terminated_length": 185.3515625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.08107093488797545, + "epoch": 0.2247787610619469, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.21616024681822563, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 61600874.0, + "reward": 0.7662109136581421, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.75390625, + "rewards/execution_accuracy_EX/std": 0.43157756328582764, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9793136119842529, + "sampling/importance_sampling_ratio/min": 0.004132171627134085, + "sampling/sampling_logp_difference/max": 5.488952159881592, + "sampling/sampling_logp_difference/mean": 0.11020313948392868, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 197.26171875, + "completions/mean_terminated_length": 197.26171875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.09589638793841004, + "epoch": 0.22654867256637168, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5657508168952344, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 62107341.0, + "reward": 0.443359375, + "reward_std": 0.05094154179096222, + "rewards/execution_accuracy_EX/mean": 0.4140625, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819854497909546, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.11627371609210968, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 207.4140625, + "completions/mean_terminated_length": 207.4140625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.08701987843960524, + "epoch": 0.22831858407079647, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4626452630226199, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 62463863.0, + "reward": 0.539843738079071, + "reward_std": 0.05884425342082977, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782624244689941, + "sampling/importance_sampling_ratio/min": 0.006754652131348848, + "sampling/sampling_logp_difference/max": 4.997523784637451, + "sampling/sampling_logp_difference/mean": 0.11296375095844269, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 210.23828125, + "completions/mean_terminated_length": 210.23828125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.08077159384265542, + "epoch": 0.23008849557522124, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6245209958588894, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 62862196.0, + "reward": 0.5880858898162842, + "reward_std": 0.13064873218536377, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9829123020172119, + "sampling/importance_sampling_ratio/min": 0.0015071897068992257, + "sampling/sampling_logp_difference/max": 6.497508525848389, + "sampling/sampling_logp_difference/mean": 0.10167676210403442, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 192.05859375, + "completions/mean_terminated_length": 192.05859375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.08136329706758261, + "epoch": 0.23185840707964603, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5170399270000965, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 63625459.0, + "reward": 0.7291015386581421, + "reward_std": 0.10337549448013306, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9791998863220215, + "sampling/importance_sampling_ratio/min": 0.005695051979273558, + "sampling/sampling_logp_difference/max": 5.168157577514648, + "sampling/sampling_logp_difference/mean": 0.10522396117448807, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 184.19140625, + "completions/mean_terminated_length": 184.19140625, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.07991229742765427, + "epoch": 0.2336283185840708, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7589126452270761, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 64062036.0, + "reward": 0.7105468511581421, + "reward_std": 0.08160266280174255, + "rewards/execution_accuracy_EX/mean": 0.6953125, + "rewards/execution_accuracy_EX/std": 0.4611765742301941, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9750150442123413, + "sampling/importance_sampling_ratio/min": 0.0067546493373811245, + "sampling/sampling_logp_difference/max": 4.997524261474609, + "sampling/sampling_logp_difference/mean": 0.11277009546756744, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 181.96875, + "completions/mean_terminated_length": 181.96875, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.08153297612443566, + "epoch": 0.23539823008849559, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.41604872411299954, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 64443420.0, + "reward": 0.4916015565395355, + "reward_std": 0.053622327744960785, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9806865453720093, + "sampling/importance_sampling_ratio/min": 0.004161639139056206, + "sampling/sampling_logp_difference/max": 5.481846332550049, + "sampling/sampling_logp_difference/mean": 0.1023765578866005, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 361.0, + "completions/max_terminated_length": 361.0, + "completions/mean_length": 202.04296875, + "completions/mean_terminated_length": 202.04296875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.0842071371152997, + "epoch": 0.23716814159292035, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4664542353023535, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 64878871.0, + "reward": 0.6585937142372131, + "reward_std": 0.04326736554503441, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819703698158264, + "sampling/importance_sampling_ratio/min": 0.006755741313099861, + "sampling/sampling_logp_difference/max": 4.9973626136779785, + "sampling/sampling_logp_difference/mean": 0.10626322031021118, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 197.62109375, + "completions/mean_terminated_length": 197.62109375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.08557542972266674, + "epoch": 0.23893805309734514, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6911078436228403, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 65220438.0, + "reward": 0.502734363079071, + "reward_std": 0.11038151383399963, + "rewards/execution_accuracy_EX/mean": 0.4765625, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9810430407524109, + "sampling/importance_sampling_ratio/min": 0.00038086267886683345, + "sampling/sampling_logp_difference/max": 7.873071670532227, + "sampling/sampling_logp_difference/mean": 0.10258093476295471, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 184.6328125, + "completions/mean_terminated_length": 184.6328125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.08875895058736205, + "epoch": 0.2407079646017699, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.39005428484545107, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 65689720.0, + "reward": 0.4507812559604645, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.421875, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9832066297531128, + "sampling/importance_sampling_ratio/min": 0.0031921479385346174, + "sampling/sampling_logp_difference/max": 5.747061252593994, + "sampling/sampling_logp_difference/mean": 0.10745403915643692, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 197.94921875, + "completions/mean_terminated_length": 197.94921875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.09639216307550669, + "epoch": 0.2424778761061947, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5622338166876616, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 66150155.0, + "reward": 0.38398438692092896, + "reward_std": 0.10411045700311661, + "rewards/execution_accuracy_EX/mean": 0.3515625, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815918803215027, + "sampling/importance_sampling_ratio/min": 0.00154885312076658, + "sampling/sampling_logp_difference/max": 6.470240592956543, + "sampling/sampling_logp_difference/mean": 0.10910271853208542, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 194.29296875, + "completions/mean_terminated_length": 194.29296875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.09355801250785589, + "epoch": 0.24424778761061947, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7498448203970273, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 66637238.0, + "reward": 0.5509765148162842, + "reward_std": 0.0833098292350769, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800968170166016, + "sampling/importance_sampling_ratio/min": 0.006755042355507612, + "sampling/sampling_logp_difference/max": 4.997466087341309, + "sampling/sampling_logp_difference/mean": 0.11173395812511444, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 180.32421875, + "completions/mean_terminated_length": 180.32421875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.09145210171118379, + "epoch": 0.24601769911504426, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.7760187653219999, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 67111545.0, + "reward": 0.4916015863418579, + "reward_std": 0.15626469254493713, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9813463091850281, + "sampling/importance_sampling_ratio/min": 0.0006360870320349932, + "sampling/sampling_logp_difference/max": 7.360175132751465, + "sampling/sampling_logp_difference/mean": 0.11378375440835953, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 186.57421875, + "completions/mean_terminated_length": 186.57421875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.08141208300366998, + "epoch": 0.24778761061946902, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.7939820389844423, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 67474860.0, + "reward": 0.6623046398162842, + "reward_std": 0.14393633604049683, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9391729831695557, + "sampling/importance_sampling_ratio/mean": 0.9799111485481262, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.10885395109653473, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 174.40625, + "completions/mean_terminated_length": 174.40625, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.08326558629050851, + "epoch": 0.24955752212389382, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8624962406439426, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 68005348.0, + "reward": 0.755078136920929, + "reward_std": 0.10118919610977173, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9830754995346069, + "sampling/importance_sampling_ratio/min": 0.0068655190989375114, + "sampling/sampling_logp_difference/max": 4.98124361038208, + "sampling/sampling_logp_difference/mean": 0.0987495705485344, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 192.13671875, + "completions/mean_terminated_length": 192.13671875, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.08353960514068604, + "epoch": 0.2513274336283186, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.929708602033235, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 68545703.0, + "reward": 0.6548827886581421, + "reward_std": 0.15161052346229553, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9829370379447937, + "sampling/importance_sampling_ratio/min": 0.011213175021111965, + "sampling/sampling_logp_difference/max": 4.490665912628174, + "sampling/sampling_logp_difference/mean": 0.100138820707798, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 160.61328125, + "completions/mean_terminated_length": 160.61328125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.09222828038036823, + "epoch": 0.25309734513274335, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6757409108026292, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 68904564.0, + "reward": 0.576953113079071, + "reward_std": 0.07442296296358109, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790805578231812, + "sampling/importance_sampling_ratio/min": 0.011156657710671425, + "sampling/sampling_logp_difference/max": 4.495718955993652, + "sampling/sampling_logp_difference/mean": 0.11575712263584137, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 157.25390625, + "completions/mean_terminated_length": 157.25390625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.09697880689054728, + "epoch": 0.25486725663716814, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4644083103615812, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 69252629.0, + "reward": 0.7291015386581421, + "reward_std": 0.045504868030548096, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798318147659302, + "sampling/importance_sampling_ratio/min": 0.0086623290553689, + "sampling/sampling_logp_difference/max": 4.748771667480469, + "sampling/sampling_logp_difference/mean": 0.11303908377885818, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 173.0703125, + "completions/mean_terminated_length": 173.0703125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.08985295332968235, + "epoch": 0.25663716814159293, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.0268459739365057, + "learning_rate": 1e-06, + "loss": -0.0095, + "num_tokens": 69559479.0, + "reward": 0.606640636920929, + "reward_std": 0.14206604659557343, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9195373058319092, + "sampling/importance_sampling_ratio/mean": 0.9820912480354309, + "sampling/importance_sampling_ratio/min": 0.006510779727250338, + "sampling/sampling_logp_difference/max": 5.034296035766602, + "sampling/sampling_logp_difference/mean": 0.10475027561187744, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 163.83984375, + "completions/mean_terminated_length": 163.83984375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.07417428260669112, + "epoch": 0.2584070796460177, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.3256794304953388, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 70051470.0, + "reward": 0.699414074420929, + "reward_std": 0.04421525076031685, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.6169342994689941, + "sampling/importance_sampling_ratio/mean": 0.9794005155563354, + "sampling/importance_sampling_ratio/min": 0.0031864114571362734, + "sampling/sampling_logp_difference/max": 5.748859882354736, + "sampling/sampling_logp_difference/mean": 0.10460924357175827, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 177.9921875, + "completions/mean_terminated_length": 177.9921875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.09738247888162732, + "epoch": 0.26017699115044246, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.3007780571412999, + "learning_rate": 1e-06, + "loss": -0.0108, + "num_tokens": 70392444.0, + "reward": 0.625195324420929, + "reward_std": 0.05070105940103531, + "rewards/execution_accuracy_EX/mean": 0.60546875, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7909278869628906, + "sampling/importance_sampling_ratio/mean": 0.9823647141456604, + "sampling/importance_sampling_ratio/min": 0.014345028437674046, + "sampling/sampling_logp_difference/max": 4.244351863861084, + "sampling/sampling_logp_difference/mean": 0.11186592280864716, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 194.3203125, + "completions/mean_terminated_length": 194.3203125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.09611005429178476, + "epoch": 0.26194690265486725, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5654913747314501, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 70842222.0, + "reward": 0.7105468511581421, + "reward_std": 0.08972012251615524, + "rewards/execution_accuracy_EX/mean": 0.6953125, + "rewards/execution_accuracy_EX/std": 0.4611765742301941, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9849147796630859, + "sampling/importance_sampling_ratio/min": 0.005293986294418573, + "sampling/sampling_logp_difference/max": 5.241183757781982, + "sampling/sampling_logp_difference/mean": 0.10340418666601181, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 394.0, + "completions/max_terminated_length": 394.0, + "completions/mean_length": 194.33984375, + "completions/mean_terminated_length": 194.33984375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.1007878971286118, + "epoch": 0.26371681415929205, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.16816511037158902, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 71237637.0, + "reward": 0.5806640386581421, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98342365026474, + "sampling/importance_sampling_ratio/min": 0.006796163972467184, + "sampling/sampling_logp_difference/max": 4.991396903991699, + "sampling/sampling_logp_difference/mean": 0.1074715182185173, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 290.0, + "completions/max_terminated_length": 290.0, + "completions/mean_length": 175.91015625, + "completions/mean_terminated_length": 175.91015625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.0920578446239233, + "epoch": 0.26548672566371684, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.44195362627941037, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 71556158.0, + "reward": 0.632617175579071, + "reward_std": 0.07525734603404999, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7116446495056152, + "sampling/importance_sampling_ratio/mean": 0.9817265272140503, + "sampling/importance_sampling_ratio/min": 0.0143097760155797, + "sampling/sampling_logp_difference/max": 4.246812343597412, + "sampling/sampling_logp_difference/mean": 0.10374663025140762, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 159.8671875, + "completions/mean_terminated_length": 159.8671875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.09047194477170706, + "epoch": 0.2672566371681416, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.69712253196688, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 72081836.0, + "reward": 0.7884765863418579, + "reward_std": 0.06720219552516937, + "rewards/execution_accuracy_EX/mean": 0.77734375, + "rewards/execution_accuracy_EX/std": 0.41684433817863464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9822363257408142, + "sampling/importance_sampling_ratio/min": 0.0032103233970701694, + "sampling/sampling_logp_difference/max": 5.7413835525512695, + "sampling/sampling_logp_difference/mean": 0.10923825949430466, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 179.2734375, + "completions/mean_terminated_length": 179.2734375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.09650673251599073, + "epoch": 0.26902654867256637, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5597491940819138, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 72610258.0, + "reward": 0.5621093511581421, + "reward_std": 0.08874650299549103, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9807620048522949, + "sampling/importance_sampling_ratio/min": 0.011610008776187897, + "sampling/sampling_logp_difference/max": 4.455887794494629, + "sampling/sampling_logp_difference/mean": 0.11025235056877136, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 208.1484375, + "completions/mean_terminated_length": 208.1484375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.10619036573916674, + "epoch": 0.27079646017699116, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5075640811815857, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 73292632.0, + "reward": 0.5138671398162842, + "reward_std": 0.08477610349655151, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9809649586677551, + "sampling/importance_sampling_ratio/min": 0.000261443987255916, + "sampling/sampling_logp_difference/max": 8.249290466308594, + "sampling/sampling_logp_difference/mean": 0.11439356207847595, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 215.23046875, + "completions/mean_terminated_length": 215.23046875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.09488361096009612, + "epoch": 0.27256637168141595, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5563388246185602, + "learning_rate": 1e-06, + "loss": 0.026, + "num_tokens": 73891331.0, + "reward": 0.6734374761581421, + "reward_std": 0.1068151593208313, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9824247360229492, + "sampling/importance_sampling_ratio/min": 0.004115039948374033, + "sampling/sampling_logp_difference/max": 5.493106842041016, + "sampling/sampling_logp_difference/mean": 0.11030463874340057, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 292.0, + "completions/max_terminated_length": 292.0, + "completions/mean_length": 174.9296875, + "completions/mean_terminated_length": 174.9296875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.10223505413159728, + "epoch": 0.2743362831858407, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.2348828915946183, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 74386833.0, + "reward": 0.5138671398162842, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8823474645614624, + "sampling/importance_sampling_ratio/mean": 0.9790357947349548, + "sampling/importance_sampling_ratio/min": 0.004132173955440521, + "sampling/sampling_logp_difference/max": 5.488951683044434, + "sampling/sampling_logp_difference/mean": 0.12114541977643967, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 193.47265625, + "completions/mean_terminated_length": 193.47265625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.10932498797774315, + "epoch": 0.2761061946902655, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.234084008667381, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 75012122.0, + "reward": 0.5101562738418579, + "reward_std": 0.13818368315696716, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9837905168533325, + "sampling/importance_sampling_ratio/min": 0.000336296419845894, + "sampling/sampling_logp_difference/max": 7.9975175857543945, + "sampling/sampling_logp_difference/mean": 0.11561685800552368, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 196.39453125, + "completions/mean_terminated_length": 196.39453125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.11083815386518836, + "epoch": 0.2778761061946903, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.217694381201557, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 75501535.0, + "reward": 0.5806640386581421, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9802595376968384, + "sampling/importance_sampling_ratio/min": 0.005292918533086777, + "sampling/sampling_logp_difference/max": 5.241385459899902, + "sampling/sampling_logp_difference/mean": 0.11775648593902588, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 195.5, + "completions/mean_terminated_length": 195.5, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.10389725724235177, + "epoch": 0.27964601769911507, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8569816688250197, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 76000319.0, + "reward": 0.6957030892372131, + "reward_std": 0.1004817932844162, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9827717542648315, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.11441849917173386, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 203.1484375, + "completions/mean_terminated_length": 203.1484375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.1024523968808353, + "epoch": 0.2814159292035398, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5038507122925591, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 76535285.0, + "reward": 0.7513672113418579, + "reward_std": 0.0707685649394989, + "rewards/execution_accuracy_EX/mean": 0.73828125, + "rewards/execution_accuracy_EX/std": 0.4404313564300537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9771853685379028, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.12337704002857208, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 405.0, + "completions/max_terminated_length": 405.0, + "completions/mean_length": 192.5234375, + "completions/mean_terminated_length": 192.5234375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.10051512671634555, + "epoch": 0.2831858407079646, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5434956913637382, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 77001947.0, + "reward": 0.47285154461860657, + "reward_std": 0.061382777988910675, + "rewards/execution_accuracy_EX/mean": 0.4453125, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811661243438721, + "sampling/importance_sampling_ratio/min": 0.011125843971967697, + "sampling/sampling_logp_difference/max": 4.4984846115112305, + "sampling/sampling_logp_difference/mean": 0.11524706333875656, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 669.0, + "completions/max_terminated_length": 669.0, + "completions/mean_length": 206.97265625, + "completions/mean_terminated_length": 206.97265625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.09411763353273273, + "epoch": 0.2849557522123894, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.4174101942016626, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 77589332.0, + "reward": 0.6919921636581421, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796286821365356, + "sampling/importance_sampling_ratio/min": 0.006229365710169077, + "sampling/sampling_logp_difference/max": 5.0784807205200195, + "sampling/sampling_logp_difference/mean": 0.11265522241592407, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 554.0, + "completions/max_terminated_length": 554.0, + "completions/mean_length": 193.08984375, + "completions/mean_terminated_length": 193.08984375, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "entropy": 0.0976509964093566, + "epoch": 0.2867256637168142, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5846767822941459, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 77942443.0, + "reward": 0.6140624284744263, + "reward_std": 0.07652123272418976, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.993268609046936, + "sampling/importance_sampling_ratio/mean": 0.975715696811676, + "sampling/importance_sampling_ratio/min": 0.0007850261172279716, + "sampling/sampling_logp_difference/max": 7.14979362487793, + "sampling/sampling_logp_difference/mean": 0.13011512160301208, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 210.14453125, + "completions/mean_terminated_length": 210.14453125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.09765811590477824, + "epoch": 0.2884955752212389, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.9119764887048077, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 78450544.0, + "reward": 0.6474609375, + "reward_std": 0.13664188981056213, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9789226055145264, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.11777396500110626, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 796.0, + "completions/max_terminated_length": 796.0, + "completions/mean_length": 205.0625, + "completions/mean_terminated_length": 205.0625, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.09663508925586939, + "epoch": 0.2902654867256637, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5382454468895027, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 78971856.0, + "reward": 0.6511719226837158, + "reward_std": 0.05624080449342728, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9773013591766357, + "sampling/importance_sampling_ratio/min": 0.003231908194720745, + "sampling/sampling_logp_difference/max": 5.734682559967041, + "sampling/sampling_logp_difference/mean": 0.12293802946805954, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 171.2421875, + "completions/mean_terminated_length": 171.2421875, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "entropy": 0.08938074065372348, + "epoch": 0.2920353982300885, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5094300920717993, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 79465214.0, + "reward": 0.7513672113418579, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.73828125, + "rewards/execution_accuracy_EX/std": 0.4404313564300537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9770304560661316, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.11507593840360641, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 191.33203125, + "completions/mean_terminated_length": 191.33203125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.08660335559397936, + "epoch": 0.2938053097345133, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.5201220315453141, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 79974019.0, + "reward": 0.595507800579071, + "reward_std": 0.09449917078018188, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9787920713424683, + "sampling/importance_sampling_ratio/min": 0.0007223741267807782, + "sampling/sampling_logp_difference/max": 7.232967376708984, + "sampling/sampling_logp_difference/mean": 0.1137903556227684, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 194.05859375, + "completions/mean_terminated_length": 194.05859375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.09493234846740961, + "epoch": 0.29557522123893804, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7179768830419001, + "learning_rate": 1e-06, + "loss": -0.0144, + "num_tokens": 80477682.0, + "reward": 0.3951171636581421, + "reward_std": 0.0833098292350769, + "rewards/execution_accuracy_EX/mean": 0.36328125, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9767428636550903, + "sampling/importance_sampling_ratio/min": 0.006754762027412653, + "sampling/sampling_logp_difference/max": 4.997507572174072, + "sampling/sampling_logp_difference/mean": 0.12088891118764877, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 177.33203125, + "completions/mean_terminated_length": 177.33203125, + "completions/min_length": 78.0, + "completions/min_terminated_length": 78.0, + "entropy": 0.08975711092352867, + "epoch": 0.2973451327433628, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.26217961623506286, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 80897959.0, + "reward": 0.5472656488418579, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.5234375, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.979150652885437, + "sampling/importance_sampling_ratio/min": 0.00527530163526535, + "sampling/sampling_logp_difference/max": 5.244719505310059, + "sampling/sampling_logp_difference/mean": 0.1166551485657692, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 182.2734375, + "completions/mean_terminated_length": 182.2734375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.07896539149805903, + "epoch": 0.2991150442477876, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9767337123578634, + "learning_rate": 1e-06, + "loss": -0.0071, + "num_tokens": 81359517.0, + "reward": 0.5101562738418579, + "reward_std": 0.11978860199451447, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796271324157715, + "sampling/importance_sampling_ratio/min": 0.0024810130707919598, + "sampling/sampling_logp_difference/max": 5.999088287353516, + "sampling/sampling_logp_difference/mean": 0.10805898159742355, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 191.1171875, + "completions/mean_terminated_length": 191.1171875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.07802614104002714, + "epoch": 0.3008849557522124, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.6214905619531037, + "learning_rate": 1e-06, + "loss": 0.0096, + "num_tokens": 81723291.0, + "reward": 0.6771484613418579, + "reward_std": 0.10954713821411133, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9772258996963501, + "sampling/importance_sampling_ratio/min": 0.0019327143672853708, + "sampling/sampling_logp_difference/max": 6.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.11072556674480438, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 180.203125, + "completions/mean_terminated_length": 180.203125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.08707320876419544, + "epoch": 0.30265486725663715, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6307711878672693, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 82066751.0, + "reward": 0.7068359851837158, + "reward_std": 0.09396842122077942, + "rewards/execution_accuracy_EX/mean": 0.69140625, + "rewards/execution_accuracy_EX/std": 0.46281787753105164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9772290587425232, + "sampling/importance_sampling_ratio/min": 0.006749854888767004, + "sampling/sampling_logp_difference/max": 4.998234272003174, + "sampling/sampling_logp_difference/mean": 0.1186441034078598, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 187.46484375, + "completions/mean_terminated_length": 187.46484375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.07379985274747014, + "epoch": 0.30442477876106194, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6957386224605739, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 82550262.0, + "reward": 0.799609363079071, + "reward_std": 0.05937499925494194, + "rewards/execution_accuracy_EX/mean": 0.7890625, + "rewards/execution_accuracy_EX/std": 0.4087733030319214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9391729831695557, + "sampling/importance_sampling_ratio/mean": 0.9753741025924683, + "sampling/importance_sampling_ratio/min": 0.0031850412487983704, + "sampling/sampling_logp_difference/max": 5.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.11013990640640259, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 194.41015625, + "completions/mean_terminated_length": 194.41015625, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.07758137444034219, + "epoch": 0.30619469026548674, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6636042722498399, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 83177151.0, + "reward": 0.6548827886581421, + "reward_std": 0.10142967849969864, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9806905388832092, + "sampling/importance_sampling_ratio/min": 0.005303387530148029, + "sampling/sampling_logp_difference/max": 5.239409446716309, + "sampling/sampling_logp_difference/mean": 0.1041186973452568, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 185.4296875, + "completions/mean_terminated_length": 185.4296875, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.07541646296158433, + "epoch": 0.30796460176991153, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6366634556334453, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 83635709.0, + "reward": 0.8033202886581421, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.79296875, + "rewards/execution_accuracy_EX/std": 0.40597182512283325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774077534675598, + "sampling/importance_sampling_ratio/min": 0.005261880811303854, + "sampling/sampling_logp_difference/max": 5.24726676940918, + "sampling/sampling_logp_difference/mean": 0.10960914194583893, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 206.15625, + "completions/mean_terminated_length": 206.15625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.07524503348395228, + "epoch": 0.30973451327433627, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7066416869876871, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 84120837.0, + "reward": 0.680859386920929, + "reward_std": 0.11508505791425705, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9755487442016602, + "sampling/importance_sampling_ratio/min": 0.0040918686427176, + "sampling/sampling_logp_difference/max": 5.498753547668457, + "sampling/sampling_logp_difference/mean": 0.11219770461320877, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 190.375, + "completions/mean_terminated_length": 190.375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.08618370164185762, + "epoch": 0.31150442477876106, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.0553595652831549, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 84633253.0, + "reward": 0.5732421875, + "reward_std": 0.17616680264472961, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9752297401428223, + "sampling/importance_sampling_ratio/min": 0.005264629144221544, + "sampling/sampling_logp_difference/max": 5.246744632720947, + "sampling/sampling_logp_difference/mean": 0.12716248631477356, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 210.0546875, + "completions/mean_terminated_length": 210.0546875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.08424549270421267, + "epoch": 0.31327433628318585, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.48057719258627274, + "learning_rate": 1e-06, + "loss": -0.0119, + "num_tokens": 85289059.0, + "reward": 0.4173828363418579, + "reward_std": 0.08038855344057083, + "rewards/execution_accuracy_EX/mean": 0.38671875, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9767379760742188, + "sampling/importance_sampling_ratio/min": 0.005398212932050228, + "sampling/sampling_logp_difference/max": 5.221687316894531, + "sampling/sampling_logp_difference/mean": 0.12158403545618057, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 191.33984375, + "completions/mean_terminated_length": 191.33984375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.08516082679852843, + "epoch": 0.31504424778761064, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.4705687030476539, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 85792106.0, + "reward": 0.6437499523162842, + "reward_std": 0.06533188372850418, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.973250687122345, + "sampling/importance_sampling_ratio/min": 0.00674829725176096, + "sampling/sampling_logp_difference/max": 4.998465061187744, + "sampling/sampling_logp_difference/mean": 0.1235455796122551, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 229.20703125, + "completions/mean_terminated_length": 229.20703125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.08670031791552901, + "epoch": 0.3168141592920354, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.702815074671348, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 86352591.0, + "reward": 0.43593746423721313, + "reward_std": 0.13373565673828125, + "rewards/execution_accuracy_EX/mean": 0.40625, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9764511585235596, + "sampling/importance_sampling_ratio/min": 0.005267122760415077, + "sampling/sampling_logp_difference/max": 5.246271133422852, + "sampling/sampling_logp_difference/mean": 0.11672802269458771, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 179.734375, + "completions/mean_terminated_length": 179.734375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.07481331843882799, + "epoch": 0.3185840707964602, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6621452677049885, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 86999787.0, + "reward": 0.669726550579071, + "reward_std": 0.0833098292350769, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9764997959136963, + "sampling/importance_sampling_ratio/min": 0.005276773124933243, + "sampling/sampling_logp_difference/max": 5.24444055557251, + "sampling/sampling_logp_difference/mean": 0.11478103697299957, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 166.0703125, + "completions/mean_terminated_length": 166.0703125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.0695406598970294, + "epoch": 0.32035398230088497, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4821111998957712, + "learning_rate": 1e-06, + "loss": 0.0105, + "num_tokens": 87318317.0, + "reward": 0.5992187261581421, + "reward_std": 0.05884425342082977, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.972252368927002, + "sampling/importance_sampling_ratio/min": 0.0032017212361097336, + "sampling/sampling_logp_difference/max": 5.7440667152404785, + "sampling/sampling_logp_difference/mean": 0.11566232144832611, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 331.0, + "completions/max_terminated_length": 331.0, + "completions/mean_length": 197.2265625, + "completions/mean_terminated_length": 197.2265625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.08103373041376472, + "epoch": 0.32212389380530976, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8033444308064983, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 87922631.0, + "reward": 0.7291015386581421, + "reward_std": 0.10359025001525879, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9738355875015259, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.11636005342006683, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 187.8671875, + "completions/mean_terminated_length": 187.8671875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.06755454652011395, + "epoch": 0.3238938053097345, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7103004493988745, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 88254437.0, + "reward": 0.7476562261581421, + "reward_std": 0.09188519418239594, + "rewards/execution_accuracy_EX/mean": 0.734375, + "rewards/execution_accuracy_EX/std": 0.4425306022167206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9751908183097839, + "sampling/importance_sampling_ratio/min": 0.00675690732896328, + "sampling/sampling_logp_difference/max": 4.997189998626709, + "sampling/sampling_logp_difference/mean": 0.10840918123722076, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 534.0, + "completions/max_terminated_length": 534.0, + "completions/mean_length": 243.51953125, + "completions/mean_terminated_length": 243.51953125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.09385430719703436, + "epoch": 0.3256637168141593, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.6326159391651645, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 88825178.0, + "reward": 0.5992187261581421, + "reward_std": 0.1414703130722046, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9750841856002808, + "sampling/importance_sampling_ratio/min": 0.002553608501330018, + "sampling/sampling_logp_difference/max": 5.970247745513916, + "sampling/sampling_logp_difference/mean": 0.12004029750823975, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 202.0078125, + "completions/mean_terminated_length": 202.0078125, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.07875498337671161, + "epoch": 0.3274336283185841, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7295937325703263, + "learning_rate": 1e-06, + "loss": 0.0128, + "num_tokens": 89263772.0, + "reward": 0.62890625, + "reward_std": 0.11805569380521774, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9781786799430847, + "sampling/importance_sampling_ratio/min": 0.003189993090927601, + "sampling/sampling_logp_difference/max": 5.74773645401001, + "sampling/sampling_logp_difference/mean": 0.11500592529773712, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 201.98828125, + "completions/mean_terminated_length": 201.98828125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.07844761619344354, + "epoch": 0.3292035398230089, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.7682786757708523, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 89788617.0, + "reward": 0.36542969942092896, + "reward_std": 0.13452927768230438, + "rewards/execution_accuracy_EX/mean": 0.33203125, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9788069725036621, + "sampling/importance_sampling_ratio/min": 0.005285183899104595, + "sampling/sampling_logp_difference/max": 5.242847919464111, + "sampling/sampling_logp_difference/mean": 0.10617148131132126, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 374.0, + "completions/max_terminated_length": 374.0, + "completions/mean_length": 190.0078125, + "completions/mean_terminated_length": 190.0078125, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "entropy": 0.074917315505445, + "epoch": 0.3309734513274336, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5566712159455093, + "learning_rate": 1e-06, + "loss": -0.0106, + "num_tokens": 90226587.0, + "reward": 0.5621093511581421, + "reward_std": 0.08301956206560135, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.976703941822052, + "sampling/importance_sampling_ratio/min": 0.008657840080559254, + "sampling/sampling_logp_difference/max": 4.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.10631166398525238, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 314.0, + "completions/max_terminated_length": 314.0, + "completions/mean_length": 181.51171875, + "completions/mean_terminated_length": 181.51171875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.06668154988437891, + "epoch": 0.3327433628318584, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8199597002180569, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 90931934.0, + "reward": 0.755078136920929, + "reward_std": 0.11978859454393387, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9755611419677734, + "sampling/importance_sampling_ratio/min": 0.006765482947230339, + "sampling/sampling_logp_difference/max": 4.995921611785889, + "sampling/sampling_logp_difference/mean": 0.10638159513473511, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 198.29296875, + "completions/mean_terminated_length": 198.29296875, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.06749717053025961, + "epoch": 0.3345132743362832, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6955114778927112, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 91469081.0, + "reward": 0.5435546636581421, + "reward_std": 0.0866614431142807, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9785115718841553, + "sampling/importance_sampling_ratio/min": 0.004092947579920292, + "sampling/sampling_logp_difference/max": 5.498489856719971, + "sampling/sampling_logp_difference/mean": 0.10224524140357971, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 192.2421875, + "completions/mean_terminated_length": 192.2421875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.07650431990623474, + "epoch": 0.336283185840708, + "frac_reward_zero_std": 0.5625, + "grad_norm": 0.8587836678940827, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 91815175.0, + "reward": 0.5472656488418579, + "reward_std": 0.1373625099658966, + "rewards/execution_accuracy_EX/mean": 0.5234375, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.975071907043457, + "sampling/importance_sampling_ratio/min": 0.002527134958654642, + "sampling/sampling_logp_difference/max": 5.980669021606445, + "sampling/sampling_logp_difference/mean": 0.11604353785514832, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 212.04296875, + "completions/mean_terminated_length": 212.04296875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.07852728758007288, + "epoch": 0.3380530973451327, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.534256400305499, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 92274530.0, + "reward": 0.36542966961860657, + "reward_std": 0.08823078870773315, + "rewards/execution_accuracy_EX/mean": 0.33203125, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9817997813224792, + "sampling/importance_sampling_ratio/min": 0.0027461776044219732, + "sampling/sampling_logp_difference/max": 5.897545337677002, + "sampling/sampling_logp_difference/mean": 0.10510879755020142, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 177.33203125, + "completions/mean_terminated_length": 177.33203125, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.06680000270716846, + "epoch": 0.3398230088495575, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.2426648470928369, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 92682391.0, + "reward": 0.5880858898162842, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9757319688796997, + "sampling/importance_sampling_ratio/min": 0.00675286166369915, + "sampling/sampling_logp_difference/max": 4.997788906097412, + "sampling/sampling_logp_difference/mean": 0.10924798250198364, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 194.828125, + "completions/mean_terminated_length": 194.828125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.06723309447988868, + "epoch": 0.3415929203539823, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9267324866086494, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 93189915.0, + "reward": 0.47675782442092896, + "reward_std": 0.1142488494515419, + "rewards/execution_accuracy_EX/mean": 0.44921875, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9761380553245544, + "sampling/importance_sampling_ratio/min": 0.004090502858161926, + "sampling/sampling_logp_difference/max": 5.499087333679199, + "sampling/sampling_logp_difference/mean": 0.10671285539865494, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 168.98046875, + "completions/mean_terminated_length": 168.98046875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.06234596879221499, + "epoch": 0.3433628318584071, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5545643833321279, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 93705510.0, + "reward": 0.873828113079071, + "reward_std": 0.020280424505472183, + "rewards/execution_accuracy_EX/mean": 0.8671875, + "rewards/execution_accuracy_EX/std": 0.3400367796421051, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8843365907669067, + "sampling/importance_sampling_ratio/mean": 0.9771836996078491, + "sampling/importance_sampling_ratio/min": 0.002480123657733202, + "sampling/sampling_logp_difference/max": 5.999446868896484, + "sampling/sampling_logp_difference/mean": 0.10336756706237793, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 177.1171875, + "completions/mean_terminated_length": 177.1171875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.06709920638240874, + "epoch": 0.34513274336283184, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7412955791785233, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 94104804.0, + "reward": 0.62890625, + "reward_std": 0.087394580245018, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9750711917877197, + "sampling/importance_sampling_ratio/min": 0.004090499132871628, + "sampling/sampling_logp_difference/max": 5.499088287353516, + "sampling/sampling_logp_difference/mean": 0.10945422947406769, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 173.7109375, + "completions/mean_terminated_length": 173.7109375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.06990452646277845, + "epoch": 0.34690265486725663, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5765386747084846, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 94538394.0, + "reward": 0.6029296517372131, + "reward_std": 0.05811111629009247, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8910551071166992, + "sampling/importance_sampling_ratio/mean": 0.9760135412216187, + "sampling/importance_sampling_ratio/min": 0.0026021667290478945, + "sampling/sampling_logp_difference/max": 5.95141077041626, + "sampling/sampling_logp_difference/mean": 0.11324626207351685, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 196.79296875, + "completions/mean_terminated_length": 196.79296875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.06319446745328605, + "epoch": 0.3486725663716814, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 95167429.0, + "reward": 0.40625, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.375, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774543046951294, + "sampling/importance_sampling_ratio/min": 0.003188925562426448, + "sampling/sampling_logp_difference/max": 5.748071193695068, + "sampling/sampling_logp_difference/mean": 0.10673161596059799, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 176.35546875, + "completions/mean_terminated_length": 176.35546875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.05642144940793514, + "epoch": 0.3504424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.094257589913928, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 95800784.0, + "reward": 0.532421886920929, + "reward_std": 0.10090947151184082, + "rewards/execution_accuracy_EX/mean": 0.5078125, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9782726764678955, + "sampling/importance_sampling_ratio/mean": 0.9796082973480225, + "sampling/importance_sampling_ratio/min": 0.0007555896881967783, + "sampling/sampling_logp_difference/max": 7.18801212310791, + "sampling/sampling_logp_difference/mean": 0.0956520214676857, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 178.1640625, + "completions/mean_terminated_length": 178.1640625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.06387745635583997, + "epoch": 0.35221238938053095, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8902276678750406, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 96331642.0, + "reward": 0.6140625476837158, + "reward_std": 0.11529980599880219, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.974372386932373, + "sampling/importance_sampling_ratio/min": 0.003185226581990719, + "sampling/sampling_logp_difference/max": 5.749231815338135, + "sampling/sampling_logp_difference/mean": 0.11444615572690964, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 181.69140625, + "completions/mean_terminated_length": 181.69140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.06275583617389202, + "epoch": 0.35398230088495575, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6802751268144084, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 96810571.0, + "reward": 0.62890625, + "reward_std": 0.10140211880207062, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779365658760071, + "sampling/importance_sampling_ratio/min": 0.0067699141800403595, + "sampling/sampling_logp_difference/max": 4.995266914367676, + "sampling/sampling_logp_difference/mean": 0.10834820568561554, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 355.0, + "completions/max_terminated_length": 355.0, + "completions/mean_length": 168.32421875, + "completions/mean_terminated_length": 168.32421875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.062084480887278914, + "epoch": 0.35575221238938054, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2154715550738686, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 97253038.0, + "reward": 0.5435546636581421, + "reward_std": 0.09240090101957321, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9760016202926636, + "sampling/importance_sampling_ratio/min": 0.0068613626062870026, + "sampling/sampling_logp_difference/max": 4.981849193572998, + "sampling/sampling_logp_difference/mean": 0.11295540630817413, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 158.02734375, + "completions/mean_terminated_length": 158.02734375, + "completions/min_length": 35.0, + "completions/min_terminated_length": 35.0, + "entropy": 0.05896363197825849, + "epoch": 0.35752212389380533, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.36200399022984825, + "learning_rate": 1e-06, + "loss": -0.0146, + "num_tokens": 97693605.0, + "reward": 0.587890625, + "reward_std": 0.015625, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 1.9604804515838623, + "sampling/importance_sampling_ratio/mean": 0.9757970571517944, + "sampling/importance_sampling_ratio/min": 0.004094286821782589, + "sampling/sampling_logp_difference/max": 5.498162746429443, + "sampling/sampling_logp_difference/mean": 0.10766394436359406, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 175.375, + "completions/mean_terminated_length": 175.375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.06358649139292538, + "epoch": 0.35929203539823007, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6236938824318042, + "learning_rate": 1e-06, + "loss": -0.0237, + "num_tokens": 98133061.0, + "reward": 0.632617175579071, + "reward_std": 0.06449567526578903, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9777095913887024, + "sampling/importance_sampling_ratio/min": 0.0024810119066387415, + "sampling/sampling_logp_difference/max": 5.999088764190674, + "sampling/sampling_logp_difference/mean": 0.10796424001455307, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 170.3203125, + "completions/mean_terminated_length": 170.3203125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.0637536346912384, + "epoch": 0.36106194690265486, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5662285364513112, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 98600903.0, + "reward": 0.699414074420929, + "reward_std": 0.05811111629009247, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9832741022109985, + "sampling/importance_sampling_ratio/min": 0.0033976626582443714, + "sampling/sampling_logp_difference/max": 5.684667587280273, + "sampling/sampling_logp_difference/mean": 0.09638360142707825, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 178.59765625, + "completions/mean_terminated_length": 178.59765625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.07512175478041172, + "epoch": 0.36283185840707965, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.0520637907170889, + "learning_rate": 1e-06, + "loss": -0.027, + "num_tokens": 98886880.0, + "reward": 0.6177734136581421, + "reward_std": 0.13133010268211365, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9764768481254578, + "sampling/importance_sampling_ratio/min": 0.002480123657733202, + "sampling/sampling_logp_difference/max": 5.999446868896484, + "sampling/sampling_logp_difference/mean": 0.11177542805671692, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 185.0078125, + "completions/mean_terminated_length": 185.0078125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.05705621326342225, + "epoch": 0.36460176991150445, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5785879385509655, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 99266770.0, + "reward": 0.632617175579071, + "reward_std": 0.03512417525053024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.977774977684021, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.10279636085033417, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 186.984375, + "completions/mean_terminated_length": 186.984375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.06327829719521105, + "epoch": 0.3663716814159292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 99836654.0, + "reward": 0.5843750238418579, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9805782437324524, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.10061825811862946, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 598.0, + "completions/max_terminated_length": 598.0, + "completions/mean_length": 197.97265625, + "completions/mean_terminated_length": 197.97265625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.07285571284592152, + "epoch": 0.368141592920354, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.9032628335738638, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 100185207.0, + "reward": 0.4619140625, + "reward_std": 0.15723830461502075, + "rewards/execution_accuracy_EX/mean": 0.43359375, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.965977430343628, + "sampling/importance_sampling_ratio/mean": 0.9789958000183105, + "sampling/importance_sampling_ratio/min": 0.004096913617104292, + "sampling/sampling_logp_difference/max": 5.49752140045166, + "sampling/sampling_logp_difference/mean": 0.10703115165233612, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 170.4296875, + "completions/mean_terminated_length": 170.4296875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.06122932001017034, + "epoch": 0.36991150442477877, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9039600343939009, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 100646517.0, + "reward": 0.4136718809604645, + "reward_std": 0.07442296296358109, + "rewards/execution_accuracy_EX/mean": 0.3828125, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9729261994361877, + "sampling/importance_sampling_ratio/min": 0.005264526233077049, + "sampling/sampling_logp_difference/max": 5.246764183044434, + "sampling/sampling_logp_difference/mean": 0.11402840912342072, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 202.8203125, + "completions/mean_terminated_length": 202.8203125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.08263392420485616, + "epoch": 0.37168141592920356, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5209844367673134, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 100923639.0, + "reward": 0.5658203363418579, + "reward_std": 0.07205817103385925, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790911674499512, + "sampling/importance_sampling_ratio/min": 0.006748989224433899, + "sampling/sampling_logp_difference/max": 4.9983625411987305, + "sampling/sampling_logp_difference/mean": 0.1137848049402237, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 189.3828125, + "completions/mean_terminated_length": 189.3828125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.07528905617073178, + "epoch": 0.3734513274336283, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.0398484156034733, + "learning_rate": 1e-06, + "loss": -0.0172, + "num_tokens": 101362505.0, + "reward": 0.62890625, + "reward_std": 0.1595408022403717, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9802089929580688, + "sampling/importance_sampling_ratio/min": 0.00525125814601779, + "sampling/sampling_logp_difference/max": 5.2492876052856445, + "sampling/sampling_logp_difference/mean": 0.10524387657642365, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 179.046875, + "completions/mean_terminated_length": 179.046875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.06973482808098197, + "epoch": 0.3752212389380531, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.2197917153113347, + "learning_rate": 1e-06, + "loss": -0.0129, + "num_tokens": 101929797.0, + "reward": 0.5435546636581421, + "reward_std": 0.10045605897903442, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796969890594482, + "sampling/importance_sampling_ratio/min": 0.0024955703411251307, + "sampling/sampling_logp_difference/max": 5.9932379722595215, + "sampling/sampling_logp_difference/mean": 0.10591519623994827, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 185.32421875, + "completions/mean_terminated_length": 185.32421875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.06410850444808602, + "epoch": 0.3769911504424779, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.35396196776004946, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 102592568.0, + "reward": 0.3580077886581421, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.32421875, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9786050915718079, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.10772916674613953, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 200.92578125, + "completions/mean_terminated_length": 200.92578125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.0813448466360569, + "epoch": 0.3787610619469027, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.32161294663908047, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 102888853.0, + "reward": 0.6214843988418579, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9744765758514404, + "sampling/importance_sampling_ratio/min": 0.005257983226329088, + "sampling/sampling_logp_difference/max": 5.248007774353027, + "sampling/sampling_logp_difference/mean": 0.11827675253152847, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 181.1328125, + "completions/mean_terminated_length": 181.1328125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.07070233160629869, + "epoch": 0.3805309734513274, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.36521141524658596, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 103277671.0, + "reward": 0.740234375, + "reward_std": 0.05905900150537491, + "rewards/execution_accuracy_EX/mean": 0.7265625, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9761698842048645, + "sampling/importance_sampling_ratio/min": 0.005595050286501646, + "sampling/sampling_logp_difference/max": 5.185873031616211, + "sampling/sampling_logp_difference/mean": 0.10985036939382553, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 180.3359375, + "completions/mean_terminated_length": 180.3359375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.06991010042838752, + "epoch": 0.3823008849557522, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.0403935536754099, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 103639485.0, + "reward": 0.6734374761581421, + "reward_std": 0.15072943270206451, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9755353927612305, + "sampling/importance_sampling_ratio/min": 0.005255233496427536, + "sampling/sampling_logp_difference/max": 5.248530864715576, + "sampling/sampling_logp_difference/mean": 0.11154483258724213, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 194.30078125, + "completions/mean_terminated_length": 194.30078125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.06830351264216006, + "epoch": 0.384070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5593418220164562, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 104138538.0, + "reward": 0.651171863079071, + "reward_std": 0.09950816631317139, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8637901544570923, + "sampling/importance_sampling_ratio/mean": 0.9779515266418457, + "sampling/importance_sampling_ratio/min": 6.336591013678117e-06, + "sampling/sampling_logp_difference/max": 11.969169616699219, + "sampling/sampling_logp_difference/mean": 0.10662736743688583, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 192.55078125, + "completions/mean_terminated_length": 192.55078125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.06964478921145201, + "epoch": 0.3858407079646018, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6540024337199174, + "learning_rate": 1e-06, + "loss": 0.0115, + "num_tokens": 104647751.0, + "reward": 0.6066405773162842, + "reward_std": 0.0726388692855835, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812942147254944, + "sampling/importance_sampling_ratio/min": 0.002566620707511902, + "sampling/sampling_logp_difference/max": 5.965165138244629, + "sampling/sampling_logp_difference/mean": 0.10259804129600525, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 289.0, + "completions/max_terminated_length": 289.0, + "completions/mean_length": 173.1484375, + "completions/mean_terminated_length": 173.1484375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.0755694005638361, + "epoch": 0.38761061946902653, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7416574955562725, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 105010925.0, + "reward": 0.591796875, + "reward_std": 0.07749484479427338, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978164553642273, + "sampling/importance_sampling_ratio/min": 0.005249785725027323, + "sampling/sampling_logp_difference/max": 5.249567985534668, + "sampling/sampling_logp_difference/mean": 0.11254717409610748, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 188.9609375, + "completions/mean_terminated_length": 188.9609375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.06937104044482112, + "epoch": 0.3893805309734513, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8433409695479646, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 105441155.0, + "reward": 0.5621093511581421, + "reward_std": 0.0950193852186203, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9378886222839355, + "sampling/importance_sampling_ratio/mean": 0.9785889983177185, + "sampling/importance_sampling_ratio/min": 0.0067480625584721565, + "sampling/sampling_logp_difference/max": 4.998499870300293, + "sampling/sampling_logp_difference/mean": 0.1028067022562027, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 170.91796875, + "completions/mean_terminated_length": 170.91796875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.0655801563989371, + "epoch": 0.3911504424778761, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8028338115538568, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 105867470.0, + "reward": 0.591796875, + "reward_std": 0.07024835050106049, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.975791096687317, + "sampling/importance_sampling_ratio/mean": 0.9821312427520752, + "sampling/importance_sampling_ratio/min": 0.0054520508274436, + "sampling/sampling_logp_difference/max": 5.211763381958008, + "sampling/sampling_logp_difference/mean": 0.09844622015953064, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 178.98046875, + "completions/mean_terminated_length": 178.98046875, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.06477804412133992, + "epoch": 0.3929203539823009, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8235277333173985, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 106338361.0, + "reward": 0.46562498807907104, + "reward_std": 0.09010109305381775, + "rewards/execution_accuracy_EX/mean": 0.4375, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774770736694336, + "sampling/importance_sampling_ratio/min": 0.0004470261628739536, + "sampling/sampling_logp_difference/max": 7.712893486022949, + "sampling/sampling_logp_difference/mean": 0.11008358001708984, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 192.57421875, + "completions/mean_terminated_length": 192.57421875, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.0667738956399262, + "epoch": 0.39469026548672564, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7797602697375864, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 106897228.0, + "reward": 0.4916015565395355, + "reward_std": 0.08352725207805634, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814401865005493, + "sampling/importance_sampling_ratio/min": 0.00018600340990815312, + "sampling/sampling_logp_difference/max": 8.58974552154541, + "sampling/sampling_logp_difference/mean": 0.09754404425621033, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 193.47265625, + "completions/mean_terminated_length": 193.47265625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.06793347140774131, + "epoch": 0.39646017699115044, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6641950501795775, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 107373061.0, + "reward": 0.5769531726837158, + "reward_std": 0.09113702178001404, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9756093621253967, + "sampling/importance_sampling_ratio/min": 0.004087820183485746, + "sampling/sampling_logp_difference/max": 5.499743461608887, + "sampling/sampling_logp_difference/mean": 0.11636139452457428, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 199.14453125, + "completions/mean_terminated_length": 199.14453125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.06570139154791832, + "epoch": 0.39823008849557523, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8253330595448228, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 107751370.0, + "reward": 0.517578125, + "reward_std": 0.11373046040534973, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9816456437110901, + "sampling/importance_sampling_ratio/min": 0.004120486322790384, + "sampling/sampling_logp_difference/max": 5.49178409576416, + "sampling/sampling_logp_difference/mean": 0.09656339883804321, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 301.0, + "completions/max_terminated_length": 301.0, + "completions/mean_length": 186.046875, + "completions/mean_terminated_length": 186.046875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.06716337846592069, + "epoch": 0.4, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5257796340806438, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 108056662.0, + "reward": 0.24296873807907104, + "reward_std": 0.038778576999902725, + "rewards/execution_accuracy_EX/mean": 0.203125, + "rewards/execution_accuracy_EX/std": 0.40311288833618164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9745264649391174, + "sampling/importance_sampling_ratio/min": 0.006748339161276817, + "sampling/sampling_logp_difference/max": 4.9984588623046875, + "sampling/sampling_logp_difference/mean": 0.11180450022220612, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 194.88671875, + "completions/mean_terminated_length": 194.88671875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.060349891893565655, + "epoch": 0.40176991150442476, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6169298535744425, + "learning_rate": 1e-06, + "loss": -0.016, + "num_tokens": 108442361.0, + "reward": 0.5361327528953552, + "reward_std": 0.07390274852514267, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9765355587005615, + "sampling/importance_sampling_ratio/min": 0.004090401344001293, + "sampling/sampling_logp_difference/max": 5.499112129211426, + "sampling/sampling_logp_difference/mean": 0.10977569222450256, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 363.0, + "completions/max_terminated_length": 363.0, + "completions/mean_length": 188.24609375, + "completions/mean_terminated_length": 188.24609375, + "completions/min_length": 28.0, + "completions/min_terminated_length": 28.0, + "entropy": 0.059034221805632114, + "epoch": 0.40353982300884955, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8415126272540865, + "learning_rate": 1e-06, + "loss": -0.0258, + "num_tokens": 109122088.0, + "reward": 0.6875, + "reward_std": 0.09328272938728333, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9825355410575867, + "sampling/importance_sampling_ratio/min": 0.0015177546301856637, + "sampling/sampling_logp_difference/max": 6.490523338317871, + "sampling/sampling_logp_difference/mean": 0.09474876523017883, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 181.51953125, + "completions/mean_terminated_length": 181.51953125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.06270501064136624, + "epoch": 0.40530973451327434, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8251966686285062, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 109563053.0, + "reward": 0.651171863079071, + "reward_std": 0.07392848283052444, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9764101505279541, + "sampling/importance_sampling_ratio/min": 0.005263828206807375, + "sampling/sampling_logp_difference/max": 5.246896743774414, + "sampling/sampling_logp_difference/mean": 0.10821399092674255, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 414.0, + "completions/max_terminated_length": 414.0, + "completions/mean_length": 209.05078125, + "completions/mean_terminated_length": 209.05078125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.06168009783141315, + "epoch": 0.40707964601769914, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.8744661618125532, + "learning_rate": 1e-06, + "loss": -0.006, + "num_tokens": 110227146.0, + "reward": 0.6177734136581421, + "reward_std": 0.153347909450531, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812148809432983, + "sampling/importance_sampling_ratio/min": 0.0024831658229231834, + "sampling/sampling_logp_difference/max": 5.998220920562744, + "sampling/sampling_logp_difference/mean": 0.09586876630783081, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 179.34375, + "completions/mean_terminated_length": 179.34375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.05431597982533276, + "epoch": 0.4088495575221239, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7920541326338416, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 110745298.0, + "reward": 0.6845703125, + "reward_std": 0.07368800044059753, + "rewards/execution_accuracy_EX/mean": 0.66796875, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790382981300354, + "sampling/importance_sampling_ratio/min": 0.0015095833223313093, + "sampling/sampling_logp_difference/max": 6.495921611785889, + "sampling/sampling_logp_difference/mean": 0.1024990826845169, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 202.5, + "completions/mean_terminated_length": 202.5, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.06586730806156993, + "epoch": 0.41061946902654867, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7462293376460698, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 111153490.0, + "reward": 0.5249999761581421, + "reward_std": 0.07652123272418976, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798729419708252, + "sampling/importance_sampling_ratio/min": 0.0003379088593646884, + "sampling/sampling_logp_difference/max": 7.992734432220459, + "sampling/sampling_logp_difference/mean": 0.10514885187149048, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 184.96484375, + "completions/mean_terminated_length": 184.96484375, + "completions/min_length": 56.0, + "completions/min_terminated_length": 56.0, + "entropy": 0.05353686073794961, + "epoch": 0.41238938053097346, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7913496069083056, + "learning_rate": 1e-06, + "loss": -0.0218, + "num_tokens": 111670745.0, + "reward": 0.42851561307907104, + "reward_std": 0.10561299324035645, + "rewards/execution_accuracy_EX/mean": 0.3984375, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9729653596878052, + "sampling/importance_sampling_ratio/min": 0.005275248549878597, + "sampling/sampling_logp_difference/max": 5.244729518890381, + "sampling/sampling_logp_difference/mean": 0.11015370488166809, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 698.0, + "completions/max_terminated_length": 698.0, + "completions/mean_length": 195.32421875, + "completions/mean_terminated_length": 195.32421875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.05500617087818682, + "epoch": 0.41415929203539825, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.1417912959125394, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 112114684.0, + "reward": 0.43964841961860657, + "reward_std": 0.06481166929006577, + "rewards/execution_accuracy_EX/mean": 0.41015625, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9795290231704712, + "sampling/importance_sampling_ratio/min": 0.0003370318445377052, + "sampling/sampling_logp_difference/max": 7.995333194732666, + "sampling/sampling_logp_difference/mean": 0.09656843543052673, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 336.0, + "completions/max_terminated_length": 336.0, + "completions/mean_length": 181.58203125, + "completions/mean_terminated_length": 181.58203125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.06703933514654636, + "epoch": 0.415929203539823, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.5732457357815264, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 112459521.0, + "reward": 0.7736327648162842, + "reward_std": 0.0866614356637001, + "rewards/execution_accuracy_EX/mean": 0.76171875, + "rewards/execution_accuracy_EX/std": 0.4268665909767151, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774959087371826, + "sampling/importance_sampling_ratio/min": 0.0024823835119605064, + "sampling/sampling_logp_difference/max": 5.998536109924316, + "sampling/sampling_logp_difference/mean": 0.11353403329849243, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 501.0, + "completions/max_terminated_length": 501.0, + "completions/mean_length": 194.515625, + "completions/mean_terminated_length": 194.515625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.0496614808216691, + "epoch": 0.4176991150442478, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 112897989.0, + "reward": 0.34687498211860657, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.3125, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9801385402679443, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.09537176787853241, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 347.0, + "completions/max_terminated_length": 347.0, + "completions/mean_length": 187.15625, + "completions/mean_terminated_length": 187.15625, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "entropy": 0.055510540725663304, + "epoch": 0.4194690265486726, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.66950861514116, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 113518925.0, + "reward": 0.5101562738418579, + "reward_std": 0.05721442401409149, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9761072993278503, + "sampling/importance_sampling_ratio/min": 0.001665646443143487, + "sampling/sampling_logp_difference/max": 6.3975419998168945, + "sampling/sampling_logp_difference/mean": 0.10421738028526306, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 470.0, + "completions/max_terminated_length": 470.0, + "completions/mean_length": 181.828125, + "completions/mean_terminated_length": 181.828125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.052765025524422526, + "epoch": 0.42123893805309737, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5278907844498033, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 113949649.0, + "reward": 0.558398425579071, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984184980392456, + "sampling/importance_sampling_ratio/min": 0.0011158257257193327, + "sampling/sampling_logp_difference/max": 6.798160552978516, + "sampling/sampling_logp_difference/mean": 0.08022935688495636, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 202.4765625, + "completions/mean_terminated_length": 202.4765625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.050976953469216824, + "epoch": 0.4230088495575221, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 114306075.0, + "reward": 0.40625, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.375, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9841535687446594, + "sampling/importance_sampling_ratio/min": 0.00248554814606905, + "sampling/sampling_logp_difference/max": 5.997262001037598, + "sampling/sampling_logp_difference/mean": 0.09012670814990997, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 172.0078125, + "completions/mean_terminated_length": 172.0078125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.05649355659261346, + "epoch": 0.4247787610619469, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 114824525.0, + "reward": 0.5843750238418579, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9764881730079651, + "sampling/importance_sampling_ratio/min": 0.0004408725944813341, + "sampling/sampling_logp_difference/max": 7.726754665374756, + "sampling/sampling_logp_difference/mean": 0.10590438544750214, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 329.0, + "completions/max_terminated_length": 329.0, + "completions/mean_length": 186.21875, + "completions/mean_terminated_length": 186.21875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.04785611876286566, + "epoch": 0.4265486725663717, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8941294560891194, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 115240053.0, + "reward": 0.539843738079071, + "reward_std": 0.10718235373497009, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9794677495956421, + "sampling/importance_sampling_ratio/min": 0.00318659539334476, + "sampling/sampling_logp_difference/max": 5.748802185058594, + "sampling/sampling_logp_difference/mean": 0.0989503562450409, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 341.0, + "completions/max_terminated_length": 341.0, + "completions/mean_length": 173.45703125, + "completions/mean_terminated_length": 173.45703125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.05558731919154525, + "epoch": 0.4283185840707965, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7548009390890925, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 115731722.0, + "reward": 0.6029297113418579, + "reward_std": 0.05697394162416458, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9769210815429688, + "sampling/importance_sampling_ratio/min": 0.0015178514877334237, + "sampling/sampling_logp_difference/max": 6.490459442138672, + "sampling/sampling_logp_difference/mean": 0.10848993808031082, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 785.0, + "completions/max_terminated_length": 785.0, + "completions/mean_length": 192.23828125, + "completions/mean_terminated_length": 192.23828125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.06494789407588542, + "epoch": 0.4300884955752212, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.3621398361939095, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 116154391.0, + "reward": 0.5435546636581421, + "reward_std": 0.05697394162416458, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9804878830909729, + "sampling/importance_sampling_ratio/min": 0.002480123657733202, + "sampling/sampling_logp_difference/max": 5.999446868896484, + "sampling/sampling_logp_difference/mean": 0.1074967011809349, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 344.0, + "completions/max_terminated_length": 344.0, + "completions/mean_length": 190.41015625, + "completions/mean_terminated_length": 190.41015625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.057483400916680694, + "epoch": 0.431858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1091352303266158, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 116655696.0, + "reward": 0.703125, + "reward_std": 0.09052877128124237, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9758817553520203, + "sampling/importance_sampling_ratio/min": 0.0031850412487983704, + "sampling/sampling_logp_difference/max": 5.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.10856863856315613, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 186.36328125, + "completions/mean_terminated_length": 186.36328125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.0515605837572366, + "epoch": 0.4336283185840708, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.46680498589666747, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 117153485.0, + "reward": 0.5806640386581421, + "reward_std": 0.04421525076031685, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978112518787384, + "sampling/importance_sampling_ratio/min": 0.005264220293611288, + "sampling/sampling_logp_difference/max": 5.246822357177734, + "sampling/sampling_logp_difference/mean": 0.09863690286874771, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 156.984375, + "completions/mean_terminated_length": 156.984375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.044441307662054896, + "epoch": 0.4353982300884956, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.7571492986368435, + "learning_rate": 1e-06, + "loss": -0.008, + "num_tokens": 117532105.0, + "reward": 0.725390613079071, + "reward_std": 0.04326736554503441, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9820444583892822, + "sampling/importance_sampling_ratio/min": 0.003185046138241887, + "sampling/sampling_logp_difference/max": 5.749288558959961, + "sampling/sampling_logp_difference/mean": 0.08466016501188278, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 188.6328125, + "completions/mean_terminated_length": 188.6328125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.05099848075769842, + "epoch": 0.43716814159292033, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 117984699.0, + "reward": 0.46562498807907104, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.4375, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9788640141487122, + "sampling/importance_sampling_ratio/min": 0.00218316656537354, + "sampling/sampling_logp_difference/max": 6.126978874206543, + "sampling/sampling_logp_difference/mean": 0.09730532020330429, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 193.28125, + "completions/mean_terminated_length": 193.28125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.047978213522583246, + "epoch": 0.4389380530973451, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7465490631828592, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 118442851.0, + "reward": 0.539843738079071, + "reward_std": 0.05435546487569809, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9754295349121094, + "sampling/importance_sampling_ratio/min": 0.0032013151794672012, + "sampling/sampling_logp_difference/max": 5.7441935539245605, + "sampling/sampling_logp_difference/mean": 0.1029406189918518, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 208.859375, + "completions/mean_terminated_length": 208.859375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.05064773070625961, + "epoch": 0.4407079646017699, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8221328795670951, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 118981119.0, + "reward": 0.3431640863418579, + "reward_std": 0.07108455896377563, + "rewards/execution_accuracy_EX/mean": 0.30859375, + "rewards/execution_accuracy_EX/std": 0.46281787753105164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9846144914627075, + "sampling/importance_sampling_ratio/min": 0.006744742393493652, + "sampling/sampling_logp_difference/max": 4.998991966247559, + "sampling/sampling_logp_difference/mean": 0.08240656554698944, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 179.8515625, + "completions/mean_terminated_length": 179.8515625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.04947967128828168, + "epoch": 0.4424778761061947, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.45951993024724286, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 119446457.0, + "reward": 0.5101562142372131, + "reward_std": 0.049967922270298004, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.867811679840088, + "sampling/importance_sampling_ratio/mean": 0.9825025796890259, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.08983209729194641, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 346.0, + "completions/max_terminated_length": 346.0, + "completions/mean_length": 179.90625, + "completions/mean_terminated_length": 179.90625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.05238872580230236, + "epoch": 0.44424778761061945, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6474577041175358, + "learning_rate": 1e-06, + "loss": 0.006, + "num_tokens": 119965697.0, + "reward": 0.666015625, + "reward_std": 0.06354779005050659, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9805374145507812, + "sampling/importance_sampling_ratio/min": 0.0009419890702702105, + "sampling/sampling_logp_difference/max": 6.967516899108887, + "sampling/sampling_logp_difference/mean": 0.09997466206550598, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 186.69140625, + "completions/mean_terminated_length": 186.69140625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.05170595156960189, + "epoch": 0.44601769911504424, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.4283983436155647, + "learning_rate": 1e-06, + "loss": 0.0113, + "num_tokens": 120466482.0, + "reward": 0.6103515625, + "reward_std": 0.11748611181974411, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790263175964355, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.09992367774248123, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 208.48828125, + "completions/mean_terminated_length": 208.48828125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.04635073058307171, + "epoch": 0.44778761061946903, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7638469973795421, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 120771423.0, + "reward": 0.5806640386581421, + "reward_std": 0.07368800044059753, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799988865852356, + "sampling/importance_sampling_ratio/min": 0.004088143818080425, + "sampling/sampling_logp_difference/max": 5.499664306640625, + "sampling/sampling_logp_difference/mean": 0.09313726425170898, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 192.078125, + "completions/mean_terminated_length": 192.078125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.053239843575283885, + "epoch": 0.4495575221238938, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.536511644069365, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 121268307.0, + "reward": 0.743945300579071, + "reward_std": 0.028423616662621498, + "rewards/execution_accuracy_EX/mean": 0.73046875, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790952205657959, + "sampling/importance_sampling_ratio/min": 5.212580801022826e-12, + "sampling/sampling_logp_difference/max": 25.97994613647461, + "sampling/sampling_logp_difference/mean": 0.09940062463283539, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 178.0625, + "completions/mean_terminated_length": 178.0625, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.049338994547724724, + "epoch": 0.45132743362831856, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.2380090743182313, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 121601363.0, + "reward": 0.517578125, + "reward_std": 0.020280424505472183, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9832853078842163, + "sampling/importance_sampling_ratio/min": 0.004189265891909599, + "sampling/sampling_logp_difference/max": 5.475229740142822, + "sampling/sampling_logp_difference/mean": 0.08592215180397034, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 195.25, + "completions/mean_terminated_length": 195.25, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.05622630682773888, + "epoch": 0.45309734513274336, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.8582414449007862, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 121994963.0, + "reward": 0.3876953125, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.35546875, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9807649850845337, + "sampling/importance_sampling_ratio/min": 0.003188925562426448, + "sampling/sampling_logp_difference/max": 5.748071193695068, + "sampling/sampling_logp_difference/mean": 0.09695159643888474, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 186.734375, + "completions/mean_terminated_length": 186.734375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.04980440391227603, + "epoch": 0.45486725663716815, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.4604056219585437, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 122569375.0, + "reward": 0.6734374761581421, + "reward_std": 0.030661117285490036, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845761656761169, + "sampling/importance_sampling_ratio/min": 0.0024795844219624996, + "sampling/sampling_logp_difference/max": 5.999664306640625, + "sampling/sampling_logp_difference/mean": 0.08232512325048447, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 365.0, + "completions/max_terminated_length": 365.0, + "completions/mean_length": 205.19140625, + "completions/mean_terminated_length": 205.19140625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.049544031266123056, + "epoch": 0.45663716814159294, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0031350145940707, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 122977008.0, + "reward": 0.725390613079071, + "reward_std": 0.07965542376041412, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826902151107788, + "sampling/importance_sampling_ratio/min": 0.004115331918001175, + "sampling/sampling_logp_difference/max": 5.493035793304443, + "sampling/sampling_logp_difference/mean": 0.08731532096862793, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 187.2578125, + "completions/mean_terminated_length": 187.2578125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.05032071494497359, + "epoch": 0.4584070796460177, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.41329523275538693, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 123351122.0, + "reward": 0.6957031488418579, + "reward_std": 0.020280424505472183, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798043966293335, + "sampling/importance_sampling_ratio/min": 0.0024848966859281063, + "sampling/sampling_logp_difference/max": 5.997524261474609, + "sampling/sampling_logp_difference/mean": 0.09387662261724472, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 210.73828125, + "completions/mean_terminated_length": 210.73828125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.05778343975543976, + "epoch": 0.46017699115044247, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1584413078590219, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 123877967.0, + "reward": 0.5843750238418579, + "reward_std": 0.1576429307460785, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.977090060710907, + "sampling/importance_sampling_ratio/min": 0.0031865073833614588, + "sampling/sampling_logp_difference/max": 5.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.10387515276670456, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 194.20703125, + "completions/mean_terminated_length": 194.20703125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.044735269621014595, + "epoch": 0.46194690265486726, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.675966087790948, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 124250644.0, + "reward": 0.632617175579071, + "reward_std": 0.06481166929006577, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9831644296646118, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.0819057822227478, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 194.23046875, + "completions/mean_terminated_length": 194.23046875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.05417222948744893, + "epoch": 0.46371681415929206, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2599778535376105, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 124584415.0, + "reward": 0.651171863079071, + "reward_std": 0.11768850684165955, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9778647422790527, + "sampling/importance_sampling_ratio/min": 0.0019327143672853708, + "sampling/sampling_logp_difference/max": 6.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.10001817345619202, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 222.609375, + "completions/mean_terminated_length": 222.609375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.05858397204428911, + "epoch": 0.4654867256637168, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6036688451438139, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 124979659.0, + "reward": 0.591796875, + "reward_std": 0.04683373123407364, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9813190698623657, + "sampling/importance_sampling_ratio/min": 0.006744090002030134, + "sampling/sampling_logp_difference/max": 4.999088764190674, + "sampling/sampling_logp_difference/mean": 0.09564685821533203, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 160.6328125, + "completions/mean_terminated_length": 160.6328125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.03753574448637664, + "epoch": 0.4672566371681416, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.4649628261772436, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 125469389.0, + "reward": 0.666015625, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9813580513000488, + "sampling/importance_sampling_ratio/min": 0.0011898259399458766, + "sampling/sampling_logp_difference/max": 6.733948230743408, + "sampling/sampling_logp_difference/mean": 0.08440389484167099, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 208.453125, + "completions/mean_terminated_length": 208.453125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.05567732104100287, + "epoch": 0.4690265486725664, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0751693432456957, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 125834625.0, + "reward": 0.6957031488418579, + "reward_std": 0.09209811687469482, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840471148490906, + "sampling/importance_sampling_ratio/min": 0.005292921327054501, + "sampling/sampling_logp_difference/max": 5.241384983062744, + "sampling/sampling_logp_difference/mean": 0.08824936300516129, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 219.375, + "completions/mean_terminated_length": 219.375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.06540046748705208, + "epoch": 0.47079646017699117, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.283988585589163, + "learning_rate": 1e-06, + "loss": 0.0068, + "num_tokens": 126266097.0, + "reward": 0.6511719226837158, + "reward_std": 0.1595407873392105, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814093112945557, + "sampling/importance_sampling_ratio/min": 0.001608553808182478, + "sampling/sampling_logp_difference/max": 6.432419776916504, + "sampling/sampling_logp_difference/mean": 0.10234202444553375, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 216.2265625, + "completions/mean_terminated_length": 216.2265625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.051048469031229615, + "epoch": 0.4725663716814159, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1346975289592114, + "learning_rate": 1e-06, + "loss": -0.0421, + "num_tokens": 126705483.0, + "reward": 0.5287109613418579, + "reward_std": 0.15573394298553467, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9785364270210266, + "sampling/importance_sampling_ratio/min": 0.00038086267886683345, + "sampling/sampling_logp_difference/max": 7.873071670532227, + "sampling/sampling_logp_difference/mean": 0.09751743078231812, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 402.0, + "completions/max_terminated_length": 402.0, + "completions/mean_length": 198.20703125, + "completions/mean_terminated_length": 198.20703125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.05062879668548703, + "epoch": 0.4743362831858407, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5151744225734421, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 127017696.0, + "reward": 0.595507800579071, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9831171035766602, + "sampling/importance_sampling_ratio/min": 0.004088143818080425, + "sampling/sampling_logp_difference/max": 5.499664306640625, + "sampling/sampling_logp_difference/mean": 0.08302123099565506, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 214.34375, + "completions/mean_terminated_length": 214.34375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.06246657809242606, + "epoch": 0.4761061946902655, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5310864689654373, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 127464232.0, + "reward": 0.6585937738418579, + "reward_std": 0.05937499925494194, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839457869529724, + "sampling/importance_sampling_ratio/min": 0.005257717799395323, + "sampling/sampling_logp_difference/max": 5.248058319091797, + "sampling/sampling_logp_difference/mean": 0.09775343537330627, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 182.93359375, + "completions/mean_terminated_length": 182.93359375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.04230481549166143, + "epoch": 0.4778761061946903, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.3355519070848472, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 127867735.0, + "reward": 0.8515625, + "reward_std": 0.12731032073497772, + "rewards/execution_accuracy_EX/mean": 0.84375, + "rewards/execution_accuracy_EX/std": 0.3638034462928772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814539551734924, + "sampling/importance_sampling_ratio/min": 0.00318911112844944, + "sampling/sampling_logp_difference/max": 5.748013019561768, + "sampling/sampling_logp_difference/mean": 0.09095747768878937, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 211.11328125, + "completions/mean_terminated_length": 211.11328125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.05057228496298194, + "epoch": 0.479646017699115, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2842551940544416, + "learning_rate": 1e-06, + "loss": 0.0336, + "num_tokens": 128442660.0, + "reward": 0.48046875, + "reward_std": 0.15835241973400116, + "rewards/execution_accuracy_EX/mean": 0.453125, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9786376953125, + "sampling/importance_sampling_ratio/min": 0.0005632326938211918, + "sampling/sampling_logp_difference/max": 7.481817722320557, + "sampling/sampling_logp_difference/mean": 0.10052070021629333, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 191.65234375, + "completions/mean_terminated_length": 191.65234375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.051969186402857304, + "epoch": 0.4814159292035398, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6625858360139675, + "learning_rate": 1e-06, + "loss": -0.0126, + "num_tokens": 128970811.0, + "reward": 0.5361328125, + "reward_std": 0.05811111629009247, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9781502485275269, + "sampling/importance_sampling_ratio/min": 0.0016268546460196376, + "sampling/sampling_logp_difference/max": 6.421106815338135, + "sampling/sampling_logp_difference/mean": 0.09620077162981033, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 206.5625, + "completions/mean_terminated_length": 206.5625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.05129739479161799, + "epoch": 0.4831858407079646, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.13939541557578, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 129562907.0, + "reward": 0.4173828065395355, + "reward_std": 0.06010813266038895, + "rewards/execution_accuracy_EX/mean": 0.38671875, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9795727729797363, + "sampling/importance_sampling_ratio/min": 0.004132191650569439, + "sampling/sampling_logp_difference/max": 5.48894739151001, + "sampling/sampling_logp_difference/mean": 0.09732958674430847, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 450.0, + "completions/max_terminated_length": 450.0, + "completions/mean_length": 196.6796875, + "completions/mean_terminated_length": 196.6796875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.0472798936534673, + "epoch": 0.4849557522123894, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4090884918554263, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 130071337.0, + "reward": 0.666015625, + "reward_std": 0.04326736554503441, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840908646583557, + "sampling/importance_sampling_ratio/min": 0.00024113479594234377, + "sampling/sampling_logp_difference/max": 8.330154418945312, + "sampling/sampling_logp_difference/mean": 0.08214626461267471, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 189.546875, + "completions/mean_terminated_length": 189.546875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.05129129299893975, + "epoch": 0.48672566371681414, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.1396749090035225, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 130544981.0, + "reward": 0.558398425579071, + "reward_std": 0.11717011034488678, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9846123456954956, + "sampling/importance_sampling_ratio/min": 0.005259412806481123, + "sampling/sampling_logp_difference/max": 5.247735977172852, + "sampling/sampling_logp_difference/mean": 0.0850289911031723, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 181.30859375, + "completions/mean_terminated_length": 181.30859375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.04402087558992207, + "epoch": 0.48849557522123893, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5870295623370129, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 130978468.0, + "reward": 0.49531248211860657, + "reward_std": 0.030661117285490036, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9853346347808838, + "sampling/importance_sampling_ratio/min": 0.004088143818080425, + "sampling/sampling_logp_difference/max": 5.499664306640625, + "sampling/sampling_logp_difference/mean": 0.08171343058347702, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 345.0, + "completions/max_terminated_length": 345.0, + "completions/mean_length": 181.66015625, + "completions/mean_terminated_length": 181.66015625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.04424508870579302, + "epoch": 0.4902654867256637, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7608694404846298, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 131433197.0, + "reward": 0.7328125238418579, + "reward_std": 0.06354779005050659, + "rewards/execution_accuracy_EX/mean": 0.71875, + "rewards/execution_accuracy_EX/std": 0.45048993825912476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811903834342957, + "sampling/importance_sampling_ratio/min": 0.0011717112502083182, + "sampling/sampling_logp_difference/max": 6.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.08624295145273209, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 210.2578125, + "completions/mean_terminated_length": 210.2578125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.05125125776976347, + "epoch": 0.4920353982300885, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6991726105038402, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 131973823.0, + "reward": 0.4507812261581421, + "reward_std": 0.08062904328107834, + "rewards/execution_accuracy_EX/mean": 0.421875, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812140464782715, + "sampling/importance_sampling_ratio/min": 0.0009142700582742691, + "sampling/sampling_logp_difference/max": 6.997384548187256, + "sampling/sampling_logp_difference/mean": 0.08920617401599884, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 199.7109375, + "completions/mean_terminated_length": 199.7109375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.04429618245922029, + "epoch": 0.49380530973451325, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6519449851797027, + "learning_rate": 1e-06, + "loss": -0.0103, + "num_tokens": 132468917.0, + "reward": 0.7513672113418579, + "reward_std": 0.04870403930544853, + "rewards/execution_accuracy_EX/mean": 0.73828125, + "rewards/execution_accuracy_EX/std": 0.4404313564300537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9813372492790222, + "sampling/importance_sampling_ratio/min": 0.003735871519893408, + "sampling/sampling_logp_difference/max": 5.589774131774902, + "sampling/sampling_logp_difference/mean": 0.08783899247646332, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 195.03125, + "completions/mean_terminated_length": 195.03125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.049749566707760096, + "epoch": 0.49557522123893805, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6178204848790358, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 133008285.0, + "reward": 0.632617175579071, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9864822626113892, + "sampling/importance_sampling_ratio/min": 0.0032769273966550827, + "sampling/sampling_logp_difference/max": 5.72084903717041, + "sampling/sampling_logp_difference/mean": 0.07578238844871521, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 167.3671875, + "completions/mean_terminated_length": 167.3671875, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.048610941506922245, + "epoch": 0.49734513274336284, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.3001053262864535, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 133378779.0, + "reward": 0.662304699420929, + "reward_std": 0.11916713416576385, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845118522644043, + "sampling/importance_sampling_ratio/min": 0.0007163186674006283, + "sampling/sampling_logp_difference/max": 7.241385459899902, + "sampling/sampling_logp_difference/mean": 0.08339308947324753, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 308.0, + "completions/max_terminated_length": 308.0, + "completions/mean_length": 182.21484375, + "completions/mean_terminated_length": 182.21484375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.04578643827699125, + "epoch": 0.49911504424778763, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 133763122.0, + "reward": 0.762499988079071, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.75, + "rewards/execution_accuracy_EX/std": 0.4338609278202057, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9852456450462341, + "sampling/importance_sampling_ratio/min": 0.0024810130707919598, + "sampling/sampling_logp_difference/max": 5.999088287353516, + "sampling/sampling_logp_difference/mean": 0.08164516091346741, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 426.0, + "completions/max_terminated_length": 426.0, + "completions/mean_length": 201.76171875, + "completions/mean_terminated_length": 201.76171875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.051659599877893925, + "epoch": 0.5008849557522124, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8233051868514297, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 134202277.0, + "reward": 0.4248046576976776, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.39453125, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9820558428764343, + "sampling/importance_sampling_ratio/min": 0.003193199634552002, + "sampling/sampling_logp_difference/max": 5.746731758117676, + "sampling/sampling_logp_difference/mean": 0.09158714860677719, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 216.67578125, + "completions/mean_terminated_length": 216.67578125, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.04935676441527903, + "epoch": 0.5026548672566372, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8569047889200642, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 134618098.0, + "reward": 0.5992187261581421, + "reward_std": 0.07912467420101166, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9849299192428589, + "sampling/importance_sampling_ratio/min": 0.003188925562426448, + "sampling/sampling_logp_difference/max": 5.748071193695068, + "sampling/sampling_logp_difference/mean": 0.08148159086704254, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 225.0, + "completions/max_terminated_length": 225.0, + "completions/mean_length": 162.4375, + "completions/mean_terminated_length": 162.4375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.04004441061988473, + "epoch": 0.504424778761062, + "frac_reward_zero_std": 0.75, + "grad_norm": 3.1241691990185863, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 135099154.0, + "reward": 0.7179687023162842, + "reward_std": 0.11308803409337997, + "rewards/execution_accuracy_EX/mean": 0.703125, + "rewards/execution_accuracy_EX/std": 0.45777595043182373, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860707521438599, + "sampling/importance_sampling_ratio/min": 0.0002490998012945056, + "sampling/sampling_logp_difference/max": 8.297657012939453, + "sampling/sampling_logp_difference/mean": 0.07750853896141052, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 571.0, + "completions/max_terminated_length": 571.0, + "completions/mean_length": 192.5625, + "completions/mean_terminated_length": 192.5625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.048907139571383595, + "epoch": 0.5061946902654867, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6175879020007512, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 135606450.0, + "reward": 0.595507800579071, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9801557064056396, + "sampling/importance_sampling_ratio/min": 2.072112920359359e-06, + "sampling/sampling_logp_difference/max": 13.086941719055176, + "sampling/sampling_logp_difference/mean": 0.09168112277984619, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 217.5859375, + "completions/mean_terminated_length": 217.5859375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.05278244917280972, + "epoch": 0.5079646017699115, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 136152568.0, + "reward": 0.5843750238418579, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9791572093963623, + "sampling/importance_sampling_ratio/min": 0.0005592286470346153, + "sampling/sampling_logp_difference/max": 7.488952159881592, + "sampling/sampling_logp_difference/mean": 0.09987437725067139, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 195.55078125, + "completions/mean_terminated_length": 195.55078125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.04491586983203888, + "epoch": 0.5097345132743363, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.4065995584543316, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 136480085.0, + "reward": 0.632617175579071, + "reward_std": 0.11321474611759186, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862728714942932, + "sampling/importance_sampling_ratio/min": 0.005253775045275688, + "sampling/sampling_logp_difference/max": 5.24880838394165, + "sampling/sampling_logp_difference/mean": 0.07470975071191788, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 435.0, + "completions/max_terminated_length": 435.0, + "completions/mean_length": 198.65625, + "completions/mean_terminated_length": 198.65625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.04775779810734093, + "epoch": 0.511504424778761, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9866769004800582, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 137116877.0, + "reward": 0.6957031488418579, + "reward_std": 0.07912467420101166, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9820747971534729, + "sampling/importance_sampling_ratio/min": 0.00014151231152936816, + "sampling/sampling_logp_difference/max": 8.863123893737793, + "sampling/sampling_logp_difference/mean": 0.08921748399734497, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 217.7109375, + "completions/mean_terminated_length": 217.7109375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.04724769201129675, + "epoch": 0.5132743362831859, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6554443503936286, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 137842467.0, + "reward": 0.47675779461860657, + "reward_std": 0.05908473581075668, + "rewards/execution_accuracy_EX/mean": 0.44921875, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984429121017456, + "sampling/importance_sampling_ratio/min": 0.0011887246510013938, + "sampling/sampling_logp_difference/max": 6.734874248504639, + "sampling/sampling_logp_difference/mean": 0.0844247043132782, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 207.30078125, + "completions/mean_terminated_length": 207.30078125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.049809438176453114, + "epoch": 0.5150442477876106, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.3153296643027175, + "learning_rate": 1e-06, + "loss": 0.0126, + "num_tokens": 138594368.0, + "reward": 0.4062499701976776, + "reward_std": 0.129853293299675, + "rewards/execution_accuracy_EX/mean": 0.375, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9775841236114502, + "sampling/importance_sampling_ratio/min": 0.0032299254089593887, + "sampling/sampling_logp_difference/max": 5.735296249389648, + "sampling/sampling_logp_difference/mean": 0.0988219678401947, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 215.40234375, + "completions/mean_terminated_length": 215.40234375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.05302939680404961, + "epoch": 0.5168141592920354, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8853634322123276, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 139076519.0, + "reward": 0.7550780773162842, + "reward_std": 0.11059627681970596, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9813735485076904, + "sampling/importance_sampling_ratio/min": 0.001513576484285295, + "sampling/sampling_logp_difference/max": 6.493279933929443, + "sampling/sampling_logp_difference/mean": 0.09062591940164566, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 195.45703125, + "completions/mean_terminated_length": 180.16079711914062, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.03904645750299096, + "epoch": 0.5185840707964602, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.9221575533663118, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 139730060.0, + "reward": 0.740039050579071, + "reward_std": 0.054542817175388336, + "rewards/execution_accuracy_EX/mean": 0.7265625, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9871268272399902, + "sampling/importance_sampling_ratio/min": 0.002846190007403493, + "sampling/sampling_logp_difference/max": 5.86177396774292, + "sampling/sampling_logp_difference/mean": 0.07170381397008896, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 201.98828125, + "completions/mean_terminated_length": 201.98828125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.05013478174805641, + "epoch": 0.5203539823008849, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.7524419425631024, + "learning_rate": 1e-06, + "loss": 0.0191, + "num_tokens": 140184105.0, + "reward": 0.6919921636581421, + "reward_std": 0.13274700939655304, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.979781985282898, + "sampling/importance_sampling_ratio/min": 0.0011875813361257315, + "sampling/sampling_logp_difference/max": 6.735836505889893, + "sampling/sampling_logp_difference/mean": 0.09894461184740067, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 380.0, + "completions/max_terminated_length": 380.0, + "completions/mean_length": 175.625, + "completions/mean_terminated_length": 175.625, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.03481391293462366, + "epoch": 0.5221238938053098, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8123699311326985, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 140567289.0, + "reward": 0.7884765863418579, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.77734375, + "rewards/execution_accuracy_EX/std": 0.41684433817863464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898311495780945, + "sampling/importance_sampling_ratio/min": 9.478514402871951e-05, + "sampling/sampling_logp_difference/max": 9.263897895812988, + "sampling/sampling_logp_difference/mean": 0.06085116043686867, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 205.546875, + "completions/mean_terminated_length": 205.546875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.04844993958249688, + "epoch": 0.5238938053097345, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.2131014991684952, + "learning_rate": 1e-06, + "loss": 0.009, + "num_tokens": 140924805.0, + "reward": 0.6771484017372131, + "reward_std": 0.08779861778020859, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9829519987106323, + "sampling/importance_sampling_ratio/min": 0.005285256542265415, + "sampling/sampling_logp_difference/max": 5.242834091186523, + "sampling/sampling_logp_difference/mean": 0.0873769223690033, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 179.80859375, + "completions/mean_terminated_length": 179.80859375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.042584938229992986, + "epoch": 0.5256637168141592, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5016617854627178, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 141443332.0, + "reward": 0.7216796875, + "reward_std": 0.028423616662621498, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988422155380249, + "sampling/importance_sampling_ratio/min": 0.002524983836337924, + "sampling/sampling_logp_difference/max": 5.981520652770996, + "sampling/sampling_logp_difference/mean": 0.0706477016210556, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 351.0, + "completions/max_terminated_length": 351.0, + "completions/mean_length": 172.2890625, + "completions/mean_terminated_length": 172.2890625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.04733118740841746, + "epoch": 0.5274336283185841, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5456573489553538, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 141728910.0, + "reward": 0.5064452886581421, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.48046875, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984705924987793, + "sampling/importance_sampling_ratio/min": 0.0031865620985627174, + "sampling/sampling_logp_difference/max": 5.748812675476074, + "sampling/sampling_logp_difference/mean": 0.08340511471033096, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 185.26953125, + "completions/mean_terminated_length": 185.26953125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.04411654337309301, + "epoch": 0.5292035398230088, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0345858594488728, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 142188435.0, + "reward": 0.5287109017372131, + "reward_std": 0.08748261630535126, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9853092432022095, + "sampling/importance_sampling_ratio/min": 0.004334472119808197, + "sampling/sampling_logp_difference/max": 5.441155433654785, + "sampling/sampling_logp_difference/mean": 0.08046219497919083, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 183.79296875, + "completions/mean_terminated_length": 183.79296875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.03833493799902499, + "epoch": 0.5309734513274337, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.884015834395577, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 142541822.0, + "reward": 0.5658203363418579, + "reward_std": 0.07390275597572327, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9853612184524536, + "sampling/importance_sampling_ratio/min": 0.0031990553252398968, + "sampling/sampling_logp_difference/max": 5.744899749755859, + "sampling/sampling_logp_difference/mean": 0.07646487653255463, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 201.56640625, + "completions/mean_terminated_length": 201.56640625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.04956588428467512, + "epoch": 0.5327433628318584, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6985191686215754, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 143180767.0, + "reward": 0.5546875, + "reward_std": 0.060348618775606155, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9825830459594727, + "sampling/importance_sampling_ratio/min": 0.001245732419192791, + "sampling/sampling_logp_difference/max": 6.6880316734313965, + "sampling/sampling_logp_difference/mean": 0.0915587842464447, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 324.0, + "completions/max_terminated_length": 324.0, + "completions/mean_length": 192.4609375, + "completions/mean_terminated_length": 192.4609375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.04267005855217576, + "epoch": 0.5345132743362832, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.2753040119330576, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 143641829.0, + "reward": 0.606640636920929, + "reward_std": 0.13108962774276733, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9873267412185669, + "sampling/importance_sampling_ratio/min": 0.0028423357289284468, + "sampling/sampling_logp_difference/max": 5.863129138946533, + "sampling/sampling_logp_difference/mean": 0.07296876609325409, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 196.76953125, + "completions/mean_terminated_length": 196.76953125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.04838581453077495, + "epoch": 0.536283185840708, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8056098105874437, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 144163130.0, + "reward": 0.521289050579071, + "reward_std": 0.09553777426481247, + "rewards/execution_accuracy_EX/mean": 0.49609375, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798790216445923, + "sampling/importance_sampling_ratio/min": 0.00026267211069352925, + "sampling/sampling_logp_difference/max": 8.244604110717773, + "sampling/sampling_logp_difference/mean": 0.0976749137043953, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 199.203125, + "completions/mean_terminated_length": 199.203125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.04596993583254516, + "epoch": 0.5380530973451327, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.088595453549068, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 144717198.0, + "reward": 0.632617175579071, + "reward_std": 0.10337549448013306, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9851846694946289, + "sampling/importance_sampling_ratio/min": 0.0026373614091426134, + "sampling/sampling_logp_difference/max": 5.937976360321045, + "sampling/sampling_logp_difference/mean": 0.078038290143013, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 186.7109375, + "completions/mean_terminated_length": 186.7109375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.045229511335492134, + "epoch": 0.5398230088495575, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7305875337208488, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 145103268.0, + "reward": 0.669726550579071, + "reward_std": 0.053622327744960785, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819443821907043, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.08018605411052704, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 542.0, + "completions/max_terminated_length": 542.0, + "completions/mean_length": 185.27734375, + "completions/mean_terminated_length": 185.27734375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.049617046723142266, + "epoch": 0.5415929203539823, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.2516222054118282, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 145664123.0, + "reward": 0.725390613079071, + "reward_std": 0.06846607476472855, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984667181968689, + "sampling/importance_sampling_ratio/min": 0.0024888881016522646, + "sampling/sampling_logp_difference/max": 5.995919227600098, + "sampling/sampling_logp_difference/mean": 0.08565886318683624, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 565.0, + "completions/max_terminated_length": 565.0, + "completions/mean_length": 220.95703125, + "completions/mean_terminated_length": 220.95703125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.05207423749379814, + "epoch": 0.5433628318584071, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.7788835288655805, + "learning_rate": 1e-06, + "loss": 0.0145, + "num_tokens": 146189728.0, + "reward": 0.7662109136581421, + "reward_std": 0.12365593016147614, + "rewards/execution_accuracy_EX/mean": 0.75390625, + "rewards/execution_accuracy_EX/std": 0.43157756328582764, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812676310539246, + "sampling/importance_sampling_ratio/min": 0.0009238221682608128, + "sampling/sampling_logp_difference/max": 6.986990928649902, + "sampling/sampling_logp_difference/mean": 0.09266985207796097, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 227.53125, + "completions/mean_terminated_length": 227.53125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.05841087154112756, + "epoch": 0.5451327433628319, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8794842773860468, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 146732168.0, + "reward": 0.651171863079071, + "reward_std": 0.1226067915558815, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.980589747428894, + "sampling/importance_sampling_ratio/min": 4.766187339555472e-05, + "sampling/sampling_logp_difference/max": 9.95137882232666, + "sampling/sampling_logp_difference/mean": 0.09845191240310669, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1062.0, + "completions/max_terminated_length": 1062.0, + "completions/mean_length": 222.03515625, + "completions/mean_terminated_length": 222.03515625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.047275930643081665, + "epoch": 0.5469026548672566, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7851132751986495, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 147140641.0, + "reward": 0.5806640386581421, + "reward_std": 0.04421525076031685, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9846183061599731, + "sampling/importance_sampling_ratio/min": 0.0026058880612254143, + "sampling/sampling_logp_difference/max": 5.949981689453125, + "sampling/sampling_logp_difference/mean": 0.08060619235038757, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 207.875, + "completions/mean_terminated_length": 207.875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.052825033431872725, + "epoch": 0.5486725663716814, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5052144243494189, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 147618369.0, + "reward": 0.6029297113418579, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9818271398544312, + "sampling/importance_sampling_ratio/min": 0.0032281808089464903, + "sampling/sampling_logp_difference/max": 5.735836505889893, + "sampling/sampling_logp_difference/mean": 0.09160901606082916, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 203.33984375, + "completions/mean_terminated_length": 203.33984375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.05661896453239024, + "epoch": 0.5504424778761062, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9398966696406653, + "learning_rate": 1e-06, + "loss": -0.0266, + "num_tokens": 148046024.0, + "reward": 0.5435546636581421, + "reward_std": 0.08277907967567444, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9795078039169312, + "sampling/importance_sampling_ratio/min": 0.00043334157089702785, + "sampling/sampling_logp_difference/max": 7.743984222412109, + "sampling/sampling_logp_difference/mean": 0.10273094475269318, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 215.8828125, + "completions/mean_terminated_length": 215.8828125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.049795015482231975, + "epoch": 0.552212389380531, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.44239151228637846, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 148438250.0, + "reward": 0.7550780773162842, + "reward_std": 0.020280424505472183, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9834499359130859, + "sampling/importance_sampling_ratio/min": 0.0019365980988368392, + "sampling/sampling_logp_difference/max": 6.246822357177734, + "sampling/sampling_logp_difference/mean": 0.08328215777873993, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 207.7265625, + "completions/mean_terminated_length": 207.7265625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.05661593144759536, + "epoch": 0.5539823008849557, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7906079758249497, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 148729300.0, + "reward": 0.513867199420929, + "reward_std": 0.055404599756002426, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9804394245147705, + "sampling/importance_sampling_ratio/min": 0.003193076467141509, + "sampling/sampling_logp_difference/max": 5.74677038192749, + "sampling/sampling_logp_difference/mean": 0.09597845375537872, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 372.0, + "completions/max_terminated_length": 372.0, + "completions/mean_length": 210.703125, + "completions/mean_terminated_length": 210.703125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.058622196316719055, + "epoch": 0.5557522123893806, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.30417101421699555, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 149164232.0, + "reward": 0.576953113079071, + "reward_std": 0.020280424505472183, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9781725406646729, + "sampling/importance_sampling_ratio/min": 0.0067480942234396935, + "sampling/sampling_logp_difference/max": 4.998495101928711, + "sampling/sampling_logp_difference/mean": 0.10136409103870392, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 201.7421875, + "completions/mean_terminated_length": 201.7421875, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.05231445445679128, + "epoch": 0.5575221238938053, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.3508241693717966, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 149666806.0, + "reward": 0.5287109613418579, + "reward_std": 0.15431705117225647, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9806950092315674, + "sampling/importance_sampling_ratio/min": 0.0031892010010778904, + "sampling/sampling_logp_difference/max": 5.747984886169434, + "sampling/sampling_logp_difference/mean": 0.09248045831918716, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 201.39453125, + "completions/mean_terminated_length": 201.39453125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.05736704752780497, + "epoch": 0.5592920353982301, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.0488376351308362, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 150245499.0, + "reward": 0.6771484613418579, + "reward_std": 0.13976356387138367, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9780290126800537, + "sampling/importance_sampling_ratio/min": 0.00033918931148946285, + "sampling/sampling_logp_difference/max": 7.988952159881592, + "sampling/sampling_logp_difference/mean": 0.10655570030212402, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 215.56640625, + "completions/mean_terminated_length": 215.56640625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.06249663443304598, + "epoch": 0.5610619469026549, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5141383295398578, + "learning_rate": 1e-06, + "loss": -0.0076, + "num_tokens": 150760828.0, + "reward": 0.688281238079071, + "reward_std": 0.06846607476472855, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9751781225204468, + "sampling/importance_sampling_ratio/min": 0.0025299068074673414, + "sampling/sampling_logp_difference/max": 5.979572772979736, + "sampling/sampling_logp_difference/mean": 0.10954082012176514, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 896.0, + "completions/mean_length": 258.62109375, + "completions/mean_terminated_length": 243.57257080078125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.07188520161435008, + "epoch": 0.5628318584070796, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.5015242477714076, + "learning_rate": 1e-06, + "loss": -0.0341, + "num_tokens": 151237771.0, + "reward": 0.4580078125, + "reward_std": 0.06432904303073883, + "rewards/execution_accuracy_EX/mean": 0.4296875, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9780511856079102, + "sampling/importance_sampling_ratio/min": 0.0025671126786619425, + "sampling/sampling_logp_difference/max": 5.964973449707031, + "sampling/sampling_logp_difference/mean": 0.11118845641613007, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 216.703125, + "completions/mean_terminated_length": 216.703125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.0561389175709337, + "epoch": 0.5646017699115045, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7575117622708043, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 151680719.0, + "reward": 0.5101562738418579, + "reward_std": 0.08874650299549103, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9803179502487183, + "sampling/importance_sampling_ratio/min": 0.0009156086016446352, + "sampling/sampling_logp_difference/max": 6.995921611785889, + "sampling/sampling_logp_difference/mean": 0.0952753871679306, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 199.22265625, + "completions/mean_terminated_length": 199.22265625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.050593928433954716, + "epoch": 0.5663716814159292, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.0694277244538497, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 152146488.0, + "reward": 0.9183593392372131, + "reward_std": 0.08874650299549103, + "rewards/execution_accuracy_EX/mean": 0.9140625, + "rewards/execution_accuracy_EX/std": 0.28082075715065, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9795510768890381, + "sampling/importance_sampling_ratio/min": 0.0015369926113635302, + "sampling/sampling_logp_difference/max": 6.4779276847839355, + "sampling/sampling_logp_difference/mean": 0.09687424451112747, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 251.65234375, + "completions/mean_terminated_length": 251.65234375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.057413500268012285, + "epoch": 0.5681415929203539, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.655237213624733, + "learning_rate": 1e-06, + "loss": 0.0241, + "num_tokens": 152794943.0, + "reward": 0.5435546636581421, + "reward_std": 0.10380767285823822, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9744194746017456, + "sampling/importance_sampling_ratio/min": 1.961269481398631e-05, + "sampling/sampling_logp_difference/max": 10.839333534240723, + "sampling/sampling_logp_difference/mean": 0.10939045995473862, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 386.0, + "completions/max_terminated_length": 386.0, + "completions/mean_length": 209.35546875, + "completions/mean_terminated_length": 209.35546875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.05489099584519863, + "epoch": 0.5699115044247788, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.0280436078170612, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 153286314.0, + "reward": 0.6474609375, + "reward_std": 0.11916713416576385, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9778928756713867, + "sampling/importance_sampling_ratio/min": 0.0031850412487983704, + "sampling/sampling_logp_difference/max": 5.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.10156022757291794, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 211.35546875, + "completions/mean_terminated_length": 211.35546875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.0663001926150173, + "epoch": 0.5716814159292035, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.49757190385671995, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 153744949.0, + "reward": 0.666015625, + "reward_std": 0.04326736554503441, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9802959561347961, + "sampling/importance_sampling_ratio/min": 0.0015048434725031257, + "sampling/sampling_logp_difference/max": 6.499066352844238, + "sampling/sampling_logp_difference/mean": 0.10074768215417862, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 197.546875, + "completions/mean_terminated_length": 197.546875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.04612495796754956, + "epoch": 0.5734513274336284, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.8660383333657942, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 154147761.0, + "reward": 0.7550780773162842, + "reward_std": 0.11248160898685455, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839067459106445, + "sampling/importance_sampling_ratio/min": 0.0024795844219624996, + "sampling/sampling_logp_difference/max": 5.999664306640625, + "sampling/sampling_logp_difference/mean": 0.08446662127971649, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 199.01953125, + "completions/mean_terminated_length": 199.01953125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.0564639566000551, + "epoch": 0.5752212389380531, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5346715840468365, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 154599238.0, + "reward": 0.6548827886581421, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9789228439331055, + "sampling/importance_sampling_ratio/min": 0.004090500995516777, + "sampling/sampling_logp_difference/max": 5.499087810516357, + "sampling/sampling_logp_difference/mean": 0.10110551118850708, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 203.30859375, + "completions/mean_terminated_length": 203.30859375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.051778314635157585, + "epoch": 0.5769911504424778, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6958215029315045, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 155038837.0, + "reward": 0.4878906011581421, + "reward_std": 0.05094154179096222, + "rewards/execution_accuracy_EX/mean": 0.4609375, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9821540117263794, + "sampling/importance_sampling_ratio/min": 0.0007142578833736479, + "sampling/sampling_logp_difference/max": 7.244266510009766, + "sampling/sampling_logp_difference/mean": 0.09247968345880508, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 226.765625, + "completions/mean_terminated_length": 226.765625, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.04970480687916279, + "epoch": 0.5787610619469027, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.4534563573689337, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 155560297.0, + "reward": 0.5621093511581421, + "reward_std": 0.045264385640621185, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9824632406234741, + "sampling/importance_sampling_ratio/min": 0.0009142569615505636, + "sampling/sampling_logp_difference/max": 6.997398853302002, + "sampling/sampling_logp_difference/mean": 0.08942683041095734, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 219.94921875, + "completions/mean_terminated_length": 219.94921875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.05523719475604594, + "epoch": 0.5805309734513274, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7204076470700003, + "learning_rate": 1e-06, + "loss": 0.0039, + "num_tokens": 156139580.0, + "reward": 0.4730468690395355, + "reward_std": 0.06354779005050659, + "rewards/execution_accuracy_EX/mean": 0.4453125, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9784348011016846, + "sampling/importance_sampling_ratio/min": 0.002482479205355048, + "sampling/sampling_logp_difference/max": 5.998497486114502, + "sampling/sampling_logp_difference/mean": 0.10015009343624115, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 632.0, + "completions/max_terminated_length": 632.0, + "completions/mean_length": 218.921875, + "completions/mean_terminated_length": 218.921875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.05623843614012003, + "epoch": 0.5823008849557522, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3809386132069958, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 156672536.0, + "reward": 0.7216796875, + "reward_std": 0.028423616662621498, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.982369065284729, + "sampling/importance_sampling_ratio/min": 0.0002649858652148396, + "sampling/sampling_logp_difference/max": 8.235834121704102, + "sampling/sampling_logp_difference/mean": 0.09450428187847137, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 204.78515625, + "completions/mean_terminated_length": 204.78515625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.05437911790795624, + "epoch": 0.584070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8657381262725702, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 157159409.0, + "reward": 0.5249999761581421, + "reward_std": 0.09491631388664246, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.980783224105835, + "sampling/importance_sampling_ratio/min": 0.0032739758025854826, + "sampling/sampling_logp_difference/max": 5.721750259399414, + "sampling/sampling_logp_difference/mean": 0.09665679186582565, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 202.3828125, + "completions/mean_terminated_length": 202.3828125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.04962495435029268, + "epoch": 0.5858407079646017, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.2940437154492566, + "learning_rate": 1e-06, + "loss": 0.011, + "num_tokens": 157632659.0, + "reward": 0.5992187261581421, + "reward_std": 0.1542913019657135, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9825015068054199, + "sampling/importance_sampling_ratio/min": 4.381871576697449e-08, + "sampling/sampling_logp_difference/max": 16.943204879760742, + "sampling/sampling_logp_difference/mean": 0.09224773943424225, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 213.5, + "completions/mean_terminated_length": 213.5, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.055298845283687115, + "epoch": 0.5876106194690266, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8519505528098199, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 158160835.0, + "reward": 0.5472655892372131, + "reward_std": 0.05937499925494194, + "rewards/execution_accuracy_EX/mean": 0.5234375, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815801382064819, + "sampling/importance_sampling_ratio/min": 0.0019327143672853708, + "sampling/sampling_logp_difference/max": 6.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.09445376694202423, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 207.84375, + "completions/mean_terminated_length": 207.84375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.05007517337799072, + "epoch": 0.5893805309734513, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.823050750507612, + "learning_rate": 1e-06, + "loss": -0.0056, + "num_tokens": 158525099.0, + "reward": 0.4507812559604645, + "reward_std": 0.15564590692520142, + "rewards/execution_accuracy_EX/mean": 0.421875, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839816689491272, + "sampling/importance_sampling_ratio/min": 0.0011984313605353236, + "sampling/sampling_logp_difference/max": 6.726741790771484, + "sampling/sampling_logp_difference/mean": 0.08478345721960068, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 224.5390625, + "completions/mean_terminated_length": 224.5390625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.04792047035880387, + "epoch": 0.5911504424778761, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8612953675729165, + "learning_rate": 1e-06, + "loss": -0.0194, + "num_tokens": 159000085.0, + "reward": 0.5621093511581421, + "reward_std": 0.09313404560089111, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823781251907349, + "sampling/importance_sampling_ratio/min": 0.004097046330571175, + "sampling/sampling_logp_difference/max": 5.497488975524902, + "sampling/sampling_logp_difference/mean": 0.08656078577041626, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 210.578125, + "completions/mean_terminated_length": 210.578125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.0517090258654207, + "epoch": 0.5929203539823009, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 159390233.0, + "reward": 0.703125, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.981236457824707, + "sampling/importance_sampling_ratio/min": 0.003210386261343956, + "sampling/sampling_logp_difference/max": 5.741364002227783, + "sampling/sampling_logp_difference/mean": 0.09414251148700714, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 408.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 195.140625, + "completions/mean_terminated_length": 195.140625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.041676986031234264, + "epoch": 0.5946902654867257, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.043018714675196, + "learning_rate": 1e-06, + "loss": -0.0037, + "num_tokens": 159885517.0, + "reward": 0.7476562261581421, + "reward_std": 0.08664823323488235, + "rewards/execution_accuracy_EX/mean": 0.734375, + "rewards/execution_accuracy_EX/std": 0.4425306022167206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9854810237884521, + "sampling/importance_sampling_ratio/min": 0.003184151602908969, + "sampling/sampling_logp_difference/max": 5.749569416046143, + "sampling/sampling_logp_difference/mean": 0.07963273674249649, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 208.38671875, + "completions/mean_terminated_length": 208.38671875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.047601311933249235, + "epoch": 0.5964601769911504, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 160293520.0, + "reward": 0.6437499523162842, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815754890441895, + "sampling/importance_sampling_ratio/min": 0.0052538178861141205, + "sampling/sampling_logp_difference/max": 5.248800277709961, + "sampling/sampling_logp_difference/mean": 0.08991867303848267, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 384.0, + "completions/max_terminated_length": 384.0, + "completions/mean_length": 203.82421875, + "completions/mean_terminated_length": 203.82421875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.04659639182500541, + "epoch": 0.5982300884955752, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.3176520586239473, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 160707843.0, + "reward": 0.680859386920929, + "reward_std": 0.11080919951200485, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9878112077713013, + "sampling/importance_sampling_ratio/min": 0.0003959884634241462, + "sampling/sampling_logp_difference/max": 7.834125518798828, + "sampling/sampling_logp_difference/mean": 0.07971542328596115, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 221.4453125, + "completions/mean_terminated_length": 221.4453125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.05004101060330868, + "epoch": 0.6, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8184821487127191, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 161317285.0, + "reward": 0.5287109613418579, + "reward_std": 0.1142488420009613, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819902181625366, + "sampling/importance_sampling_ratio/min": 0.002818513195961714, + "sampling/sampling_logp_difference/max": 5.871545791625977, + "sampling/sampling_logp_difference/mean": 0.08730147778987885, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 204.875, + "completions/mean_terminated_length": 204.875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.0555198157671839, + "epoch": 0.6017699115044248, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5882204173611767, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 161877189.0, + "reward": 0.4544921815395355, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.42578125, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826542139053345, + "sampling/importance_sampling_ratio/min": 2.1555084458668716e-05, + "sampling/sampling_logp_difference/max": 10.744898796081543, + "sampling/sampling_logp_difference/mean": 0.0956350713968277, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 179.390625, + "completions/mean_terminated_length": 179.390625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.04553712857887149, + "epoch": 0.6035398230088496, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9355096461144995, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 162453065.0, + "reward": 0.703125, + "reward_std": 0.07495187968015671, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9837697744369507, + "sampling/importance_sampling_ratio/min": 0.004101223312318325, + "sampling/sampling_logp_difference/max": 5.496469974517822, + "sampling/sampling_logp_difference/mean": 0.08368054032325745, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 196.328125, + "completions/mean_terminated_length": 196.328125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.053554473677650094, + "epoch": 0.6053097345132743, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.9007888922325235, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 162880381.0, + "reward": 0.6882812976837158, + "reward_std": 0.09491631388664246, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814350605010986, + "sampling/importance_sampling_ratio/min": 0.0019337818957865238, + "sampling/sampling_logp_difference/max": 6.24827766418457, + "sampling/sampling_logp_difference/mean": 0.09719858318567276, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 214.421875, + "completions/mean_terminated_length": 214.421875, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.047669662395492196, + "epoch": 0.6070796460176991, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.9091267080840625, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 163388857.0, + "reward": 0.6400390863418579, + "reward_std": 0.13133010268211365, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819040298461914, + "sampling/importance_sampling_ratio/min": 0.0005823129904456437, + "sampling/sampling_logp_difference/max": 7.448502540588379, + "sampling/sampling_logp_difference/mean": 0.0882585197687149, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 390.0, + "completions/max_terminated_length": 390.0, + "completions/mean_length": 208.85546875, + "completions/mean_terminated_length": 208.85546875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.046810150146484375, + "epoch": 0.6088495575221239, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6604868357043533, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 164088692.0, + "reward": 0.6066405773162842, + "reward_std": 0.05435546487569809, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9822702407836914, + "sampling/importance_sampling_ratio/min": 8.46540723091043e-19, + "sampling/sampling_logp_difference/max": 41.613128662109375, + "sampling/sampling_logp_difference/mean": 0.08694726228713989, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 212.58203125, + "completions/mean_terminated_length": 212.58203125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.051637756172567606, + "epoch": 0.6106194690265486, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.1926752920876587, + "learning_rate": 1e-06, + "loss": 0.0082, + "num_tokens": 164484105.0, + "reward": 0.5806640386581421, + "reward_std": 0.1496153175830841, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.983299732208252, + "sampling/importance_sampling_ratio/min": 8.099605474853888e-05, + "sampling/sampling_logp_difference/max": 9.421110153198242, + "sampling/sampling_logp_difference/mean": 0.09277094900608063, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 194.34765625, + "completions/mean_terminated_length": 194.34765625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.051617288729175925, + "epoch": 0.6123893805309735, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8638835915686713, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 164853474.0, + "reward": 0.7365233898162842, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808909893035889, + "sampling/importance_sampling_ratio/min": 0.004097010940313339, + "sampling/sampling_logp_difference/max": 5.49749755859375, + "sampling/sampling_logp_difference/mean": 0.09261120855808258, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 275.0, + "completions/max_terminated_length": 275.0, + "completions/mean_length": 187.83984375, + "completions/mean_terminated_length": 187.83984375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.04956577718257904, + "epoch": 0.6141592920353982, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7287753727366357, + "learning_rate": 1e-06, + "loss": 0.0075, + "num_tokens": 165381945.0, + "reward": 0.703125, + "reward_std": 0.07495187968015671, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823348522186279, + "sampling/importance_sampling_ratio/min": 0.0015449458733201027, + "sampling/sampling_logp_difference/max": 6.472766399383545, + "sampling/sampling_logp_difference/mean": 0.09282916784286499, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 230.79296875, + "completions/mean_terminated_length": 215.6352996826172, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.06123170396313071, + "epoch": 0.6159292035398231, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.28615468986647974, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 165921396.0, + "reward": 0.3095703125, + "reward_std": 0.029853563755750656, + "rewards/execution_accuracy_EX/mean": 0.2734375, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819658994674683, + "sampling/importance_sampling_ratio/min": 0.002482872223481536, + "sampling/sampling_logp_difference/max": 5.9983391761779785, + "sampling/sampling_logp_difference/mean": 0.10140170156955719, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 192.40234375, + "completions/mean_terminated_length": 192.40234375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.04973010439425707, + "epoch": 0.6176991150442478, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.2959991477911743, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 166511643.0, + "reward": 0.49531251192092896, + "reward_std": 0.15543298423290253, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9829964637756348, + "sampling/importance_sampling_ratio/min": 0.0019420698517933488, + "sampling/sampling_logp_difference/max": 6.2440009117126465, + "sampling/sampling_logp_difference/mean": 0.08792255818843842, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 201.83984375, + "completions/mean_terminated_length": 201.83984375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.054532683454453945, + "epoch": 0.6194690265486725, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9217644626550645, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 167062098.0, + "reward": 0.6214843988418579, + "reward_std": 0.12887966632843018, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796415567398071, + "sampling/importance_sampling_ratio/min": 0.002482479205355048, + "sampling/sampling_logp_difference/max": 5.998497486114502, + "sampling/sampling_logp_difference/mean": 0.09988877177238464, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 203.640625, + "completions/mean_terminated_length": 203.640625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.050870410865172744, + "epoch": 0.6212389380530974, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.684347992144397, + "learning_rate": 1e-06, + "loss": -0.009, + "num_tokens": 167589030.0, + "reward": 0.643750011920929, + "reward_std": 0.08561230450868607, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9784557819366455, + "sampling/importance_sampling_ratio/min": 0.00443896884098649, + "sampling/sampling_logp_difference/max": 5.417333126068115, + "sampling/sampling_logp_difference/mean": 0.10360198467969894, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 197.03125, + "completions/mean_terminated_length": 197.03125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.06183059047907591, + "epoch": 0.6230088495575221, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9296446289099377, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 168019390.0, + "reward": 0.6585937738418579, + "reward_std": 0.08112169802188873, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98182213306427, + "sampling/importance_sampling_ratio/min": 0.0015048098284751177, + "sampling/sampling_logp_difference/max": 6.499088764190674, + "sampling/sampling_logp_difference/mean": 0.09926865249872208, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 211.5546875, + "completions/mean_terminated_length": 211.5546875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.05494373710826039, + "epoch": 0.6247787610619469, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5958217771488614, + "learning_rate": 1e-06, + "loss": 0.0097, + "num_tokens": 168414156.0, + "reward": 0.48046875, + "reward_std": 0.06554481387138367, + "rewards/execution_accuracy_EX/mean": 0.453125, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9783415198326111, + "sampling/importance_sampling_ratio/min": 0.0019389728549867868, + "sampling/sampling_logp_difference/max": 6.245596885681152, + "sampling/sampling_logp_difference/mean": 0.10239555686712265, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 175.3203125, + "completions/mean_terminated_length": 175.3203125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.04758126800879836, + "epoch": 0.6265486725663717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 168864030.0, + "reward": 0.703125, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9807040095329285, + "sampling/importance_sampling_ratio/min": 0.003184151602908969, + "sampling/sampling_logp_difference/max": 5.749569416046143, + "sampling/sampling_logp_difference/mean": 0.09073126316070557, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 392.0, + "completions/max_terminated_length": 392.0, + "completions/mean_length": 178.32421875, + "completions/mean_terminated_length": 178.32421875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.048853211803361773, + "epoch": 0.6283185840707964, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.308372880282191, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 169303505.0, + "reward": 0.8033202886581421, + "reward_std": 0.07839153707027435, + "rewards/execution_accuracy_EX/mean": 0.79296875, + "rewards/execution_accuracy_EX/std": 0.40597182512283325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9830752611160278, + "sampling/importance_sampling_ratio/min": 0.004142302088439465, + "sampling/sampling_logp_difference/max": 5.486503601074219, + "sampling/sampling_logp_difference/mean": 0.08557950705289841, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 190.31640625, + "completions/mean_terminated_length": 190.31640625, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.04816487431526184, + "epoch": 0.6300884955752213, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7062407214247715, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 169741682.0, + "reward": 0.699414074420929, + "reward_std": 0.07108455151319504, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9809256792068481, + "sampling/importance_sampling_ratio/min": 0.004424495622515678, + "sampling/sampling_logp_difference/max": 5.420598983764648, + "sampling/sampling_logp_difference/mean": 0.08835375308990479, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 389.0, + "completions/max_terminated_length": 389.0, + "completions/mean_length": 197.40625, + "completions/mean_terminated_length": 197.40625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.05359330866485834, + "epoch": 0.631858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7468583667621106, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 170243898.0, + "reward": 0.6548827886581421, + "reward_std": 0.07839153707027435, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9813674688339233, + "sampling/importance_sampling_ratio/min": 5.9131129091838375e-05, + "sampling/sampling_logp_difference/max": 9.735753059387207, + "sampling/sampling_logp_difference/mean": 0.09579256176948547, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 189.12109375, + "completions/mean_terminated_length": 189.12109375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.05604408658109605, + "epoch": 0.6336283185840708, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9916940212874865, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 170734809.0, + "reward": 0.5732421875, + "reward_std": 0.13901987671852112, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9791431427001953, + "sampling/importance_sampling_ratio/min": 0.0015056998236104846, + "sampling/sampling_logp_difference/max": 6.498497486114502, + "sampling/sampling_logp_difference/mean": 0.09788219630718231, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 198.3203125, + "completions/mean_terminated_length": 198.3203125, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.05386378266848624, + "epoch": 0.6353982300884956, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.0309755865968975, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 171463403.0, + "reward": 0.688281238079071, + "reward_std": 0.15640661120414734, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9809324145317078, + "sampling/importance_sampling_ratio/min": 0.0041328733786940575, + "sampling/sampling_logp_difference/max": 5.4887824058532715, + "sampling/sampling_logp_difference/mean": 0.09346139430999756, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 182.609375, + "completions/mean_terminated_length": 182.609375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.04600366437807679, + "epoch": 0.6371681415929203, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3124350596399814, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 171902119.0, + "reward": 0.7884765267372131, + "reward_std": 0.09197140485048294, + "rewards/execution_accuracy_EX/mean": 0.77734375, + "rewards/execution_accuracy_EX/std": 0.41684433817863464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9849363565444946, + "sampling/importance_sampling_ratio/min": 0.0011773352744057775, + "sampling/sampling_logp_difference/max": 6.74450159072876, + "sampling/sampling_logp_difference/mean": 0.08519085496664047, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 187.87890625, + "completions/mean_terminated_length": 187.87890625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.05377147370018065, + "epoch": 0.6389380530973451, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.4650598110023498, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 172359384.0, + "reward": 0.666015625, + "reward_std": 0.12568050622940063, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9768861532211304, + "sampling/importance_sampling_ratio/min": 0.002482479205355048, + "sampling/sampling_logp_difference/max": 5.998497486114502, + "sampling/sampling_logp_difference/mean": 0.1023087203502655, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 176.69921875, + "completions/mean_terminated_length": 176.69921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.05126607744023204, + "epoch": 0.6407079646017699, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8872666430314095, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 172803131.0, + "reward": 0.6437499523162842, + "reward_std": 0.056847233325242996, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808732867240906, + "sampling/importance_sampling_ratio/min": 0.0010354254627600312, + "sampling/sampling_logp_difference/max": 6.872942924499512, + "sampling/sampling_logp_difference/mean": 0.09201299399137497, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 225.88671875, + "completions/mean_terminated_length": 210.70982360839844, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.06373013672418892, + "epoch": 0.6424778761061947, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.5976485009068124, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 173283150.0, + "reward": 0.602734386920929, + "reward_std": 0.18506553769111633, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9750391244888306, + "sampling/importance_sampling_ratio/min": 0.0012697766069322824, + "sampling/sampling_logp_difference/max": 6.668914318084717, + "sampling/sampling_logp_difference/mean": 0.11234313249588013, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 176.3125, + "completions/mean_terminated_length": 176.3125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.0455334996804595, + "epoch": 0.6442477876106195, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6418523030949159, + "learning_rate": 1e-06, + "loss": 0.0094, + "num_tokens": 173789230.0, + "reward": 0.5806640386581421, + "reward_std": 0.04421525076031685, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9783281087875366, + "sampling/importance_sampling_ratio/min": 0.0031850412487983704, + "sampling/sampling_logp_difference/max": 5.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.0926356166601181, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 198.98046875, + "completions/mean_terminated_length": 198.98046875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.05406486289575696, + "epoch": 0.6460176991150443, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.9367138212895019, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 174351353.0, + "reward": 0.5249999761581421, + "reward_std": 0.06084126979112625, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782751798629761, + "sampling/importance_sampling_ratio/min": 0.00177084153983742, + "sampling/sampling_logp_difference/max": 6.336300373077393, + "sampling/sampling_logp_difference/mean": 0.09801220893859863, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 654.0, + "completions/max_terminated_length": 654.0, + "completions/mean_length": 215.2734375, + "completions/mean_terminated_length": 215.2734375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.053189637837931514, + "epoch": 0.647787610619469, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.770460900269305, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 174768815.0, + "reward": 0.5658203363418579, + "reward_std": 0.10045605897903442, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.977229654788971, + "sampling/importance_sampling_ratio/min": 0.0015042709419503808, + "sampling/sampling_logp_difference/max": 6.499446868896484, + "sampling/sampling_logp_difference/mean": 0.10121339559555054, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 206.3671875, + "completions/mean_terminated_length": 206.3671875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.04738211981020868, + "epoch": 0.6495575221238938, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7722517070536362, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 175116877.0, + "reward": 0.699414074420929, + "reward_std": 0.07829029113054276, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9825224876403809, + "sampling/importance_sampling_ratio/min": 0.004092941991984844, + "sampling/sampling_logp_difference/max": 5.498491287231445, + "sampling/sampling_logp_difference/mean": 0.0863746851682663, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 210.30078125, + "completions/mean_terminated_length": 210.30078125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.055021335138008, + "epoch": 0.6513274336283186, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.5931286973307045, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 175642346.0, + "reward": 0.6066405773162842, + "reward_std": 0.060348618775606155, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9747425317764282, + "sampling/importance_sampling_ratio/min": 0.0020871355663985014, + "sampling/sampling_logp_difference/max": 6.171962738037109, + "sampling/sampling_logp_difference/mean": 0.1051156222820282, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 506.0, + "completions/max_terminated_length": 506.0, + "completions/mean_length": 206.6484375, + "completions/mean_terminated_length": 206.6484375, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.0483693964779377, + "epoch": 0.6530973451327433, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0048420387694674, + "learning_rate": 1e-06, + "loss": 0.0222, + "num_tokens": 176231712.0, + "reward": 0.5806640386581421, + "reward_std": 0.08979563415050507, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811728000640869, + "sampling/importance_sampling_ratio/min": 0.0005592286470346153, + "sampling/sampling_logp_difference/max": 7.488952159881592, + "sampling/sampling_logp_difference/mean": 0.08909212052822113, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 181.40625, + "completions/mean_terminated_length": 181.40625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.05278869019821286, + "epoch": 0.6548672566371682, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.671765020499032, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 176548056.0, + "reward": 0.5138671398162842, + "reward_std": 0.04870403930544853, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796327948570251, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.09088835120201111, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 181.40234375, + "completions/mean_terminated_length": 181.40234375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.05211420636624098, + "epoch": 0.6566371681415929, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3643366459300659, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 176929279.0, + "reward": 0.6585937738418579, + "reward_std": 0.11059626936912537, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.977271318435669, + "sampling/importance_sampling_ratio/min": 0.0015074125258252025, + "sampling/sampling_logp_difference/max": 6.497360706329346, + "sampling/sampling_logp_difference/mean": 0.10295344889163971, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 651.0, + "completions/max_terminated_length": 651.0, + "completions/mean_length": 201.60546875, + "completions/mean_terminated_length": 201.60546875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.05875820852816105, + "epoch": 0.6584070796460177, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8798633153397238, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 177608394.0, + "reward": 0.6066405773162842, + "reward_std": 0.06846607476472855, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9736084938049316, + "sampling/importance_sampling_ratio/min": 0.0012599017936736345, + "sampling/sampling_logp_difference/max": 6.676721572875977, + "sampling/sampling_logp_difference/mean": 0.11426495760679245, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 220.0625, + "completions/mean_terminated_length": 220.0625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.0614921017549932, + "epoch": 0.6601769911504425, + "frac_reward_zero_std": 0.625, + "grad_norm": 0.968593531163994, + "learning_rate": 1e-06, + "loss": 0.0218, + "num_tokens": 178039546.0, + "reward": 0.4916015863418579, + "reward_std": 0.15092915296554565, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9781675934791565, + "sampling/importance_sampling_ratio/min": 0.0018237639451399446, + "sampling/sampling_logp_difference/max": 6.3068528175354, + "sampling/sampling_logp_difference/mean": 0.1050269603729248, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 185.83984375, + "completions/mean_terminated_length": 185.83984375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.046782430494204164, + "epoch": 0.6619469026548672, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.40238216482376304, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 178570177.0, + "reward": 0.7216796875, + "reward_std": 0.028423616662621498, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796968698501587, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.0892026275396347, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 187.21484375, + "completions/mean_terminated_length": 187.21484375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.05169937666505575, + "epoch": 0.6637168141592921, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.746614109931729, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 179107976.0, + "reward": 0.5992187261581421, + "reward_std": 0.07965992391109467, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779437780380249, + "sampling/importance_sampling_ratio/min": 0.004161851480603218, + "sampling/sampling_logp_difference/max": 5.481795310974121, + "sampling/sampling_logp_difference/mean": 0.1006498634815216, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 216.21484375, + "completions/mean_terminated_length": 216.21484375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.0502176599111408, + "epoch": 0.6654867256637168, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.08118795745842, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 179564959.0, + "reward": 0.6511718034744263, + "reward_std": 0.10090947151184082, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779660701751709, + "sampling/importance_sampling_ratio/min": 0.0025565973483026028, + "sampling/sampling_logp_difference/max": 5.969078063964844, + "sampling/sampling_logp_difference/mean": 0.09516897797584534, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 197.984375, + "completions/mean_terminated_length": 197.984375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.04846697999164462, + "epoch": 0.6672566371681415, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8292352477826112, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 180157579.0, + "reward": 0.6734374761581421, + "reward_std": 0.08906249701976776, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9787276983261108, + "sampling/importance_sampling_ratio/min": 0.0015072536189109087, + "sampling/sampling_logp_difference/max": 6.497466087341309, + "sampling/sampling_logp_difference/mean": 0.09636777639389038, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 413.0, + "completions/max_terminated_length": 413.0, + "completions/mean_length": 222.7578125, + "completions/mean_terminated_length": 222.7578125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.051815691869705915, + "epoch": 0.6690265486725664, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.2026801682083967, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 180564685.0, + "reward": 0.606640636920929, + "reward_std": 0.06034861505031586, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.976426899433136, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.10420379042625427, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 187.29296875, + "completions/mean_terminated_length": 187.29296875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.05123723135329783, + "epoch": 0.6707964601769911, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.714720121217564, + "learning_rate": 1e-06, + "loss": 0.0187, + "num_tokens": 181089464.0, + "reward": 0.666015625, + "reward_std": 0.06982067227363586, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9776961803436279, + "sampling/importance_sampling_ratio/min": 0.00011370181891834363, + "sampling/sampling_logp_difference/max": 9.081931114196777, + "sampling/sampling_logp_difference/mean": 0.10171456634998322, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 236.0, + "completions/max_terminated_length": 236.0, + "completions/mean_length": 172.80859375, + "completions/mean_terminated_length": 172.80859375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.04394562868401408, + "epoch": 0.672566371681416, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6341696039543058, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 181657943.0, + "reward": 0.6585937738418579, + "reward_std": 0.056240808218717575, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862318634986877, + "sampling/importance_sampling_ratio/min": 0.00528706656768918, + "sampling/sampling_logp_difference/max": 5.242491722106934, + "sampling/sampling_logp_difference/mean": 0.07022035866975784, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 191.91015625, + "completions/mean_terminated_length": 191.91015625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.04886707942932844, + "epoch": 0.6743362831858407, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.6896321519419875, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 182068624.0, + "reward": 0.6474609375, + "reward_std": 0.0866614431142807, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808017611503601, + "sampling/importance_sampling_ratio/min": 0.00026165202143602073, + "sampling/sampling_logp_difference/max": 8.248495101928711, + "sampling/sampling_logp_difference/mean": 0.09078819304704666, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 262.0, + "completions/max_terminated_length": 262.0, + "completions/mean_length": 175.28125, + "completions/mean_terminated_length": 175.28125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.049037616001442075, + "epoch": 0.6761061946902654, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0090588908822031, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 182492152.0, + "reward": 0.6957031488418579, + "reward_std": 0.11968552321195602, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9805042743682861, + "sampling/importance_sampling_ratio/min": 0.003521983977407217, + "sampling/sampling_logp_difference/max": 5.648730754852295, + "sampling/sampling_logp_difference/mean": 0.08683665096759796, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 192.55078125, + "completions/mean_terminated_length": 192.55078125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.050409688614308834, + "epoch": 0.6778761061946903, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8640466841913647, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 183048453.0, + "reward": 0.669726550579071, + "reward_std": 0.08017563819885254, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.979473352432251, + "sampling/importance_sampling_ratio/min": 0.003388156183063984, + "sampling/sampling_logp_difference/max": 5.687469482421875, + "sampling/sampling_logp_difference/mean": 0.09112448245286942, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 183.1171875, + "completions/mean_terminated_length": 183.1171875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.046200540382415056, + "epoch": 0.679646017699115, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8978718109149366, + "learning_rate": 1e-06, + "loss": 0.015, + "num_tokens": 183454835.0, + "reward": 0.6623046398162842, + "reward_std": 0.05811111629009247, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796804785728455, + "sampling/importance_sampling_ratio/min": 0.004090507049113512, + "sampling/sampling_logp_difference/max": 5.499086380004883, + "sampling/sampling_logp_difference/mean": 0.08823394775390625, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 563.0, + "completions/max_terminated_length": 563.0, + "completions/mean_length": 178.359375, + "completions/mean_terminated_length": 178.359375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.04631510376930237, + "epoch": 0.6814159292035398, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.4051374697601621, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 183847823.0, + "reward": 0.7699218988418579, + "reward_std": 0.020280422642827034, + "rewards/execution_accuracy_EX/mean": 0.7578125, + "rewards/execution_accuracy_EX/std": 0.4292463958263397, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9465184211730957, + "sampling/importance_sampling_ratio/mean": 0.9813239574432373, + "sampling/importance_sampling_ratio/min": 0.0024825206492096186, + "sampling/sampling_logp_difference/max": 5.998480796813965, + "sampling/sampling_logp_difference/mean": 0.084245964884758, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 167.84375, + "completions/mean_terminated_length": 167.84375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.04379211855120957, + "epoch": 0.6831858407079646, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3840572222075551, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 184297447.0, + "reward": 0.706835925579071, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.69140625, + "rewards/execution_accuracy_EX/std": 0.46281787753105164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798949956893921, + "sampling/importance_sampling_ratio/min": 0.000913253054022789, + "sampling/sampling_logp_difference/max": 6.998497486114502, + "sampling/sampling_logp_difference/mean": 0.09194238483905792, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 207.5078125, + "completions/mean_terminated_length": 207.5078125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.045693976106122136, + "epoch": 0.6849557522123894, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 184598361.0, + "reward": 0.6437499523162842, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838193655014038, + "sampling/importance_sampling_ratio/min": 0.005315958522260189, + "sampling/sampling_logp_difference/max": 5.23704195022583, + "sampling/sampling_logp_difference/mean": 0.07834985852241516, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 174.4921875, + "completions/mean_terminated_length": 174.4921875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.04458746896125376, + "epoch": 0.6867256637168142, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.077754813369031, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 185375015.0, + "reward": 0.6177734136581421, + "reward_std": 0.05811111629009247, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823440313339233, + "sampling/importance_sampling_ratio/min": 0.0007110103615559638, + "sampling/sampling_logp_difference/max": 7.248823642730713, + "sampling/sampling_logp_difference/mean": 0.08295151591300964, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 228.0, + "completions/max_terminated_length": 228.0, + "completions/mean_length": 156.27734375, + "completions/mean_terminated_length": 156.27734375, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.04180929507128894, + "epoch": 0.6884955752212389, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0727856007341887, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 185739070.0, + "reward": 0.7291015386581421, + "reward_std": 0.08606570959091187, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9825483560562134, + "sampling/importance_sampling_ratio/min": 0.008661825209856033, + "sampling/sampling_logp_difference/max": 4.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.07702117413282394, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 180.2734375, + "completions/mean_terminated_length": 180.2734375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.05284590250812471, + "epoch": 0.6902654867256637, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.1063533524531874, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 186122340.0, + "reward": 0.614062488079071, + "reward_std": 0.049967922270298004, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9789842963218689, + "sampling/importance_sampling_ratio/min": 0.004425975028425455, + "sampling/sampling_logp_difference/max": 5.420264720916748, + "sampling/sampling_logp_difference/mean": 0.0898595005273819, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 186.00390625, + "completions/mean_terminated_length": 186.00390625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.044459878001362085, + "epoch": 0.6920353982300885, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.240905747612231, + "learning_rate": 1e-06, + "loss": -0.0157, + "num_tokens": 186533525.0, + "reward": 0.5546875, + "reward_std": 0.10410863161087036, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9818249344825745, + "sampling/importance_sampling_ratio/min": 3.77207288693171e-05, + "sampling/sampling_logp_difference/max": 10.185300827026367, + "sampling/sampling_logp_difference/mean": 0.08293484151363373, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 205.390625, + "completions/mean_terminated_length": 205.390625, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.05267933080904186, + "epoch": 0.6938053097345133, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3456317076030761, + "learning_rate": 1e-06, + "loss": 0.0142, + "num_tokens": 186962281.0, + "reward": 0.6251952648162842, + "reward_std": 0.028423616662621498, + "rewards/execution_accuracy_EX/mean": 0.60546875, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826935529708862, + "sampling/importance_sampling_ratio/min": 0.0031865073833614588, + "sampling/sampling_logp_difference/max": 5.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.08386804908514023, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 184.10546875, + "completions/mean_terminated_length": 184.10546875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.04479903541505337, + "epoch": 0.695575221238938, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.4925968299990241, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 187343892.0, + "reward": 0.799609363079071, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.7890625, + "rewards/execution_accuracy_EX/std": 0.4087733030319214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9772796630859375, + "sampling/importance_sampling_ratio/min": 6.222480806172825e-06, + "sampling/sampling_logp_difference/max": 11.98734188079834, + "sampling/sampling_logp_difference/mean": 0.0916595607995987, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 378.0, + "completions/max_terminated_length": 378.0, + "completions/mean_length": 182.3984375, + "completions/mean_terminated_length": 182.3984375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.041240089340135455, + "epoch": 0.6973451327433628, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.42924457388175696, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 187843882.0, + "reward": 0.5546875, + "reward_std": 0.030661117285490036, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9829157590866089, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.0787586197257042, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 717.0, + "completions/max_terminated_length": 717.0, + "completions/mean_length": 196.64453125, + "completions/mean_terminated_length": 196.64453125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.048630278557538986, + "epoch": 0.6991150442477876, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5695396265136251, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 188247775.0, + "reward": 0.5806640386581421, + "reward_std": 0.04421525076031685, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798430800437927, + "sampling/importance_sampling_ratio/min": 0.0052618407644331455, + "sampling/sampling_logp_difference/max": 5.247274398803711, + "sampling/sampling_logp_difference/mean": 0.09684212505817413, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 370.0, + "completions/max_terminated_length": 370.0, + "completions/mean_length": 172.38671875, + "completions/mean_terminated_length": 172.38671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.043348495149984956, + "epoch": 0.7008849557522124, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.1359415859648045, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 188688786.0, + "reward": 0.725390613079071, + "reward_std": 0.06533188372850418, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838850498199463, + "sampling/importance_sampling_ratio/min": 0.005257737822830677, + "sampling/sampling_logp_difference/max": 5.248054504394531, + "sampling/sampling_logp_difference/mean": 0.0778856873512268, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 180.15234375, + "completions/mean_terminated_length": 180.15234375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.04276940552517772, + "epoch": 0.7026548672566372, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 189332153.0, + "reward": 0.40625, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.375, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984542965888977, + "sampling/importance_sampling_ratio/min": 0.0024795890785753727, + "sampling/sampling_logp_difference/max": 5.999662399291992, + "sampling/sampling_logp_difference/mean": 0.07923919707536697, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 279.0, + "completions/max_terminated_length": 279.0, + "completions/mean_length": 170.71484375, + "completions/mean_terminated_length": 170.71484375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.04188469937071204, + "epoch": 0.7044247787610619, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.75895835553019, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 189712416.0, + "reward": 0.77734375, + "reward_std": 0.07295486330986023, + "rewards/execution_accuracy_EX/mean": 0.765625, + "rewards/execution_accuracy_EX/std": 0.42443734407424927, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823094606399536, + "sampling/importance_sampling_ratio/min": 0.0005533903604373336, + "sampling/sampling_logp_difference/max": 7.499446868896484, + "sampling/sampling_logp_difference/mean": 0.07819315791130066, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 278.0, + "completions/max_terminated_length": 278.0, + "completions/mean_length": 179.625, + "completions/mean_terminated_length": 179.625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.03992706025019288, + "epoch": 0.7061946902654868, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7621636983177205, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 190079024.0, + "reward": 0.6400390267372131, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874575734138489, + "sampling/importance_sampling_ratio/min": 0.005583661608397961, + "sampling/sampling_logp_difference/max": 5.187910556793213, + "sampling/sampling_logp_difference/mean": 0.06382463872432709, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 179.1875, + "completions/mean_terminated_length": 179.1875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.03766210877802223, + "epoch": 0.7079646017699115, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.467853234197227, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 190387312.0, + "reward": 0.5992187261581421, + "reward_std": 0.038778576999902725, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840128421783447, + "sampling/importance_sampling_ratio/min": 0.0031850473023951054, + "sampling/sampling_logp_difference/max": 5.749288082122803, + "sampling/sampling_logp_difference/mean": 0.0717402920126915, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 352.0, + "completions/max_terminated_length": 352.0, + "completions/mean_length": 174.07421875, + "completions/mean_terminated_length": 174.07421875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.04573288047686219, + "epoch": 0.7097345132743362, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6934957112341334, + "learning_rate": 1e-06, + "loss": -0.0082, + "num_tokens": 190722531.0, + "reward": 0.5583984851837158, + "reward_std": 0.06578528881072998, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.980005145072937, + "sampling/importance_sampling_ratio/min": 0.003192942589521408, + "sampling/sampling_logp_difference/max": 5.746812343597412, + "sampling/sampling_logp_difference/mean": 0.08805866539478302, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 173.546875, + "completions/mean_terminated_length": 173.546875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.04186416487209499, + "epoch": 0.7115044247787611, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.3310115164763248, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 191236959.0, + "reward": 0.5249999761581421, + "reward_std": 0.07912467420101166, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985221266746521, + "sampling/importance_sampling_ratio/min": 4.130156738311406e-16, + "sampling/sampling_logp_difference/max": 35.42304611206055, + "sampling/sampling_logp_difference/mean": 0.07671888172626495, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 326.0, + "completions/max_terminated_length": 326.0, + "completions/mean_length": 176.6171875, + "completions/mean_terminated_length": 176.6171875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.03845847793854773, + "epoch": 0.7132743362831858, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8096221503918023, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 191853597.0, + "reward": 0.49531248211860657, + "reward_std": 0.045264385640621185, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812091588973999, + "sampling/importance_sampling_ratio/min": 0.004135897383093834, + "sampling/sampling_logp_difference/max": 5.488050937652588, + "sampling/sampling_logp_difference/mean": 0.0872371569275856, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 171.15234375, + "completions/mean_terminated_length": 171.15234375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.03325132618192583, + "epoch": 0.7150442477876107, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.4827950115677092, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 192414404.0, + "reward": 0.706835925579071, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.69140625, + "rewards/execution_accuracy_EX/std": 0.46281787753105164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9841488599777222, + "sampling/importance_sampling_ratio/min": 0.00027251752908341587, + "sampling/sampling_logp_difference/max": 8.207807540893555, + "sampling/sampling_logp_difference/mean": 0.06903485208749771, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 252.0, + "completions/max_terminated_length": 252.0, + "completions/mean_length": 159.80859375, + "completions/mean_terminated_length": 159.80859375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.03494338982272893, + "epoch": 0.7168141592920354, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5536657185390912, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 192920723.0, + "reward": 0.5435546636581421, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838402271270752, + "sampling/importance_sampling_ratio/min": 0.006923105102032423, + "sampling/sampling_logp_difference/max": 4.972890853881836, + "sampling/sampling_logp_difference/mean": 0.0706358402967453, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 187.0078125, + "completions/mean_terminated_length": 187.0078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.041024084435775876, + "epoch": 0.7185840707964601, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.523502677399838, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 193425285.0, + "reward": 0.5621093511581421, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814822673797607, + "sampling/importance_sampling_ratio/min": 0.004096942488104105, + "sampling/sampling_logp_difference/max": 5.497514247894287, + "sampling/sampling_logp_difference/mean": 0.078708715736866, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 412.0, + "completions/max_terminated_length": 412.0, + "completions/mean_length": 194.03515625, + "completions/mean_terminated_length": 194.03515625, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.04274896439164877, + "epoch": 0.720353982300885, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6248215674373395, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 193963774.0, + "reward": 0.4544921815395355, + "reward_std": 0.05908473581075668, + "rewards/execution_accuracy_EX/mean": 0.42578125, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811341762542725, + "sampling/importance_sampling_ratio/min": 0.002612431300804019, + "sampling/sampling_logp_difference/max": 5.947474002838135, + "sampling/sampling_logp_difference/mean": 0.08653772622346878, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 201.6015625, + "completions/mean_terminated_length": 201.6015625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.03842438664287329, + "epoch": 0.7221238938053097, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1316616930134138, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 194615736.0, + "reward": 0.5843750238418579, + "reward_std": 0.08204594254493713, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839637279510498, + "sampling/importance_sampling_ratio/min": 0.0024795844219624996, + "sampling/sampling_logp_difference/max": 5.999664306640625, + "sampling/sampling_logp_difference/mean": 0.07672550529241562, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 182.6015625, + "completions/mean_terminated_length": 182.6015625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.03768653352744877, + "epoch": 0.7238938053097345, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7966533799539178, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 194986258.0, + "reward": 0.6548827886581421, + "reward_std": 0.053622327744960785, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985592782497406, + "sampling/importance_sampling_ratio/min": 0.003344968892633915, + "sampling/sampling_logp_difference/max": 5.700297832489014, + "sampling/sampling_logp_difference/mean": 0.07073619961738586, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 174.30078125, + "completions/mean_terminated_length": 174.30078125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.035016599111258984, + "epoch": 0.7256637168141593, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.2511724225416736, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 195700719.0, + "reward": 0.521289050579071, + "reward_std": 0.10024130344390869, + "rewards/execution_accuracy_EX/mean": 0.49609375, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838403463363647, + "sampling/importance_sampling_ratio/min": 0.0007565529667772353, + "sampling/sampling_logp_difference/max": 7.186738014221191, + "sampling/sampling_logp_difference/mean": 0.0730225145816803, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 209.48046875, + "completions/mean_terminated_length": 209.48046875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.03943896200507879, + "epoch": 0.727433628318584, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.44208571827978876, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 196245994.0, + "reward": 0.5732421875, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9817679524421692, + "sampling/importance_sampling_ratio/min": 0.0005733446450904012, + "sampling/sampling_logp_difference/max": 7.464023590087891, + "sampling/sampling_logp_difference/mean": 0.08086696267127991, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 187.9296875, + "completions/mean_terminated_length": 187.9296875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.03866938268765807, + "epoch": 0.7292035398230089, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.5305148511717597, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 196733176.0, + "reward": 0.6029297113418579, + "reward_std": 0.13296175003051758, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874039888381958, + "sampling/importance_sampling_ratio/min": 0.004105238243937492, + "sampling/sampling_logp_difference/max": 5.4954915046691895, + "sampling/sampling_logp_difference/mean": 0.06777317821979523, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 273.0, + "completions/max_terminated_length": 273.0, + "completions/mean_length": 175.828125, + "completions/mean_terminated_length": 175.828125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.033790889428928494, + "epoch": 0.7309734513274336, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.2764865089020971, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 197312076.0, + "reward": 0.6919921636581421, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9858330488204956, + "sampling/importance_sampling_ratio/min": 0.0015042709419503808, + "sampling/sampling_logp_difference/max": 6.499446868896484, + "sampling/sampling_logp_difference/mean": 0.06551600992679596, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 312.0, + "completions/max_terminated_length": 312.0, + "completions/mean_length": 168.9609375, + "completions/mean_terminated_length": 168.9609375, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.036426497739739716, + "epoch": 0.7327433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 197693234.0, + "reward": 0.6437499523162842, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862203598022461, + "sampling/importance_sampling_ratio/min": 0.0012697427300736308, + "sampling/sampling_logp_difference/max": 6.668941020965576, + "sampling/sampling_logp_difference/mean": 0.06786644458770752, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 310.0, + "completions/max_terminated_length": 310.0, + "completions/mean_length": 165.75390625, + "completions/mean_terminated_length": 165.75390625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.030346823390573263, + "epoch": 0.7345132743362832, + "frac_reward_zero_std": 0.875, + "grad_norm": 2.320548928698098, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 198154195.0, + "reward": 0.7291015386581421, + "reward_std": 0.053622327744960785, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9874189496040344, + "sampling/importance_sampling_ratio/min": 0.0019564172253012657, + "sampling/sampling_logp_difference/max": 6.236640453338623, + "sampling/sampling_logp_difference/mean": 0.06267690658569336, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 184.8203125, + "completions/mean_terminated_length": 184.8203125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.03448976203799248, + "epoch": 0.736283185840708, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9572127400179158, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 198593125.0, + "reward": 0.669726550579071, + "reward_std": 0.06481167674064636, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9864132404327393, + "sampling/importance_sampling_ratio/min": 2.042848791461438e-05, + "sampling/sampling_logp_difference/max": 10.798580169677734, + "sampling/sampling_logp_difference/mean": 0.0684976726770401, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 297.0, + "completions/max_terminated_length": 297.0, + "completions/mean_length": 169.91015625, + "completions/mean_terminated_length": 169.91015625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.03351215971633792, + "epoch": 0.7380530973451327, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 199066254.0, + "reward": 0.703125, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9872507452964783, + "sampling/importance_sampling_ratio/min": 0.0002635216515045613, + "sampling/sampling_logp_difference/max": 8.241374969482422, + "sampling/sampling_logp_difference/mean": 0.06463707983493805, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 559.0, + "completions/max_terminated_length": 559.0, + "completions/mean_length": 186.46875, + "completions/mean_terminated_length": 186.46875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.03897794825024903, + "epoch": 0.7398230088495575, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.2433718664716331, + "learning_rate": 1e-06, + "loss": 0.014, + "num_tokens": 199499974.0, + "reward": 0.6808593273162842, + "reward_std": 0.05937499925494194, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9836287498474121, + "sampling/importance_sampling_ratio/min": 0.0024888941552489996, + "sampling/sampling_logp_difference/max": 5.995916843414307, + "sampling/sampling_logp_difference/mean": 0.07531644403934479, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 181.95703125, + "completions/mean_terminated_length": 181.95703125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.03403950703795999, + "epoch": 0.7415929203539823, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3320061389407159, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 200107195.0, + "reward": 0.6994140148162842, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9843035936355591, + "sampling/importance_sampling_ratio/min": 0.0019471559207886457, + "sampling/sampling_logp_difference/max": 6.241385459899902, + "sampling/sampling_logp_difference/mean": 0.07460791617631912, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 208.359375, + "completions/mean_terminated_length": 208.359375, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.03853577165864408, + "epoch": 0.7433628318584071, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.23272584041241848, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 200774711.0, + "reward": 0.5287109017372131, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9851356148719788, + "sampling/importance_sampling_ratio/min": 0.00023882582900114357, + "sampling/sampling_logp_difference/max": 8.339776039123535, + "sampling/sampling_logp_difference/mean": 0.07532660663127899, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 173.88671875, + "completions/mean_terminated_length": 173.88671875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.03371518186759204, + "epoch": 0.7451327433628319, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 201177322.0, + "reward": 0.7625000476837158, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.75, + "rewards/execution_accuracy_EX/std": 0.4338609278202057, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9869147539138794, + "sampling/importance_sampling_ratio/min": 9.133262210525572e-05, + "sampling/sampling_logp_difference/max": 9.301002502441406, + "sampling/sampling_logp_difference/mean": 0.07050107419490814, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 362.0, + "completions/max_terminated_length": 362.0, + "completions/mean_length": 191.5546875, + "completions/mean_terminated_length": 191.5546875, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.03665241762064397, + "epoch": 0.7469026548672566, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.950988605092553, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 201648136.0, + "reward": 0.669726550579071, + "reward_std": 0.10494484007358551, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845188856124878, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.06406085193157196, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 268.0, + "completions/max_terminated_length": 268.0, + "completions/mean_length": 180.25, + "completions/mean_terminated_length": 180.25, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.034022323437966406, + "epoch": 0.7486725663716814, + "frac_reward_zero_std": 0.8125, + "grad_norm": 2.821159023501673, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 202105560.0, + "reward": 0.539843738079071, + "reward_std": 0.0749518871307373, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9856160879135132, + "sampling/importance_sampling_ratio/min": 0.0031994825694710016, + "sampling/sampling_logp_difference/max": 5.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.07072180509567261, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 192.88671875, + "completions/mean_terminated_length": 192.88671875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.03738184436224401, + "epoch": 0.7504424778761062, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.9658226641455354, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 202495659.0, + "reward": 0.651171863079071, + "reward_std": 0.1382867395877838, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9852752089500427, + "sampling/importance_sampling_ratio/min": 0.002482546726241708, + "sampling/sampling_logp_difference/max": 5.998470306396484, + "sampling/sampling_logp_difference/mean": 0.07124799489974976, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 188.5859375, + "completions/mean_terminated_length": 188.5859375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.03595548321027309, + "epoch": 0.7522123893805309, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.9855874928787963, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 202988625.0, + "reward": 0.6103515625, + "reward_std": 0.04870403930544853, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9817183017730713, + "sampling/importance_sampling_ratio/min": 0.00021359855600167066, + "sampling/sampling_logp_difference/max": 8.451412200927734, + "sampling/sampling_logp_difference/mean": 0.0761106014251709, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 260.0, + "completions/max_terminated_length": 260.0, + "completions/mean_length": 176.91015625, + "completions/mean_terminated_length": 176.91015625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.03337432839907706, + "epoch": 0.7539823008849558, + "frac_reward_zero_std": 0.5, + "grad_norm": 1.4695429939309823, + "learning_rate": 1e-06, + "loss": 0.0076, + "num_tokens": 203383130.0, + "reward": 0.49160152673721313, + "reward_std": 0.1823747158050537, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9853971600532532, + "sampling/importance_sampling_ratio/min": 0.0025269424077123404, + "sampling/sampling_logp_difference/max": 5.980745315551758, + "sampling/sampling_logp_difference/mean": 0.06926571577787399, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 170.12890625, + "completions/mean_terminated_length": 170.12890625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.03176326269749552, + "epoch": 0.7557522123893805, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 203821403.0, + "reward": 0.6437499523162842, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.995414137840271, + "sampling/importance_sampling_ratio/mean": 0.9846120476722717, + "sampling/importance_sampling_ratio/min": 0.0024824936408549547, + "sampling/sampling_logp_difference/max": 5.9984917640686035, + "sampling/sampling_logp_difference/mean": 0.06789524853229523, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 281.0, + "completions/max_terminated_length": 281.0, + "completions/mean_length": 182.2578125, + "completions/mean_terminated_length": 182.2578125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.03244726057164371, + "epoch": 0.7575221238938054, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8316523340577135, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 204347885.0, + "reward": 0.517578125, + "reward_std": 0.038778576999902725, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985505998134613, + "sampling/importance_sampling_ratio/min": 0.00033629851532168686, + "sampling/sampling_logp_difference/max": 7.997511386871338, + "sampling/sampling_logp_difference/mean": 0.06805039197206497, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 379.0, + "completions/max_terminated_length": 379.0, + "completions/mean_length": 169.84765625, + "completions/mean_terminated_length": 169.84765625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.034791712765581906, + "epoch": 0.7592920353982301, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 204762262.0, + "reward": 0.5249999761581421, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889514446258545, + "sampling/importance_sampling_ratio/min": 0.004103472921997309, + "sampling/sampling_logp_difference/max": 5.495921611785889, + "sampling/sampling_logp_difference/mean": 0.06241118907928467, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 166.0, + "completions/mean_terminated_length": 166.0, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.03213086456526071, + "epoch": 0.7610619469026548, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.295966117595592, + "learning_rate": 1e-06, + "loss": -0.007, + "num_tokens": 205434294.0, + "reward": 0.550976574420929, + "reward_std": 0.030420634895563126, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9836922287940979, + "sampling/importance_sampling_ratio/min": 0.0024810167960822582, + "sampling/sampling_logp_difference/max": 5.999086856842041, + "sampling/sampling_logp_difference/mean": 0.070546455681324, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 178.58984375, + "completions/mean_terminated_length": 178.58984375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.040628812508657575, + "epoch": 0.7628318584070797, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.1342758940535873, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 206013677.0, + "reward": 0.6882812976837158, + "reward_std": 0.06711415946483612, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9832375049591064, + "sampling/importance_sampling_ratio/min": 0.0031929120887070894, + "sampling/sampling_logp_difference/max": 5.746821880340576, + "sampling/sampling_logp_difference/mean": 0.07909567654132843, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 160.76171875, + "completions/mean_terminated_length": 160.76171875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.03127549251075834, + "epoch": 0.7646017699115044, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.9625981510683317, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 206561648.0, + "reward": 0.6103515625, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98762047290802, + "sampling/importance_sampling_ratio/min": 9.84363941824995e-05, + "sampling/sampling_logp_difference/max": 9.226099967956543, + "sampling/sampling_logp_difference/mean": 0.0587695948779583, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 186.203125, + "completions/mean_terminated_length": 186.203125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.0332630091579631, + "epoch": 0.7663716814159292, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.7271845801407777, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 206947412.0, + "reward": 0.740234375, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.7265625, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9842133522033691, + "sampling/importance_sampling_ratio/min": 6.217168993316591e-05, + "sampling/sampling_logp_difference/max": 9.6856107711792, + "sampling/sampling_logp_difference/mean": 0.069841168820858, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 173.94921875, + "completions/mean_terminated_length": 173.94921875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.03432316554244608, + "epoch": 0.768141592920354, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.8091410639142782, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 207432279.0, + "reward": 0.6029297113418579, + "reward_std": 0.08017563074827194, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838988780975342, + "sampling/importance_sampling_ratio/min": 9.636551112635061e-05, + "sampling/sampling_logp_difference/max": 9.24736213684082, + "sampling/sampling_logp_difference/mean": 0.07580746710300446, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 182.0703125, + "completions/mean_terminated_length": 182.0703125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.034088216023519635, + "epoch": 0.7699115044247787, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.43665188277664685, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 207853993.0, + "reward": 0.4507812559604645, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.421875, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898681640625, + "sampling/importance_sampling_ratio/min": 0.006783537101000547, + "sampling/sampling_logp_difference/max": 4.993256568908691, + "sampling/sampling_logp_difference/mean": 0.05816485732793808, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 180.8984375, + "completions/mean_terminated_length": 180.8984375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.03289527911692858, + "epoch": 0.7716814159292036, + "frac_reward_zero_std": 0.6875, + "grad_norm": 2.0580073029426735, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 208356975.0, + "reward": 0.6400390863418579, + "reward_std": 0.1142488494515419, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9842815399169922, + "sampling/importance_sampling_ratio/min": 0.0024888820480555296, + "sampling/sampling_logp_difference/max": 5.995921611785889, + "sampling/sampling_logp_difference/mean": 0.07519666850566864, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 294.0, + "completions/max_terminated_length": 294.0, + "completions/mean_length": 189.5859375, + "completions/mean_terminated_length": 189.5859375, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.03927030530758202, + "epoch": 0.7734513274336283, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1689930695761859, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 208824757.0, + "reward": 0.35429686307907104, + "reward_std": 0.08561231195926666, + "rewards/execution_accuracy_EX/mean": 0.3203125, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9804680943489075, + "sampling/importance_sampling_ratio/min": 0.0020814009476453066, + "sampling/sampling_logp_difference/max": 6.174714088439941, + "sampling/sampling_logp_difference/mean": 0.08233625441789627, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 315.0, + "completions/max_terminated_length": 315.0, + "completions/mean_length": 185.22265625, + "completions/mean_terminated_length": 185.22265625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.038315842393785715, + "epoch": 0.7752212389380531, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3226393860724158, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 209453918.0, + "reward": 0.517578125, + "reward_std": 0.020280424505472183, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845386743545532, + "sampling/importance_sampling_ratio/min": 0.0009394197259098291, + "sampling/sampling_logp_difference/max": 6.970248222351074, + "sampling/sampling_logp_difference/mean": 0.07381948083639145, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 266.0, + "completions/max_terminated_length": 266.0, + "completions/mean_length": 167.7421875, + "completions/mean_terminated_length": 167.7421875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.032037795754149556, + "epoch": 0.7769911504424779, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.4730571685464607, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 210061340.0, + "reward": 0.532421886920929, + "reward_std": 0.020280422642827034, + "rewards/execution_accuracy_EX/mean": 0.5078125, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9852868318557739, + "sampling/importance_sampling_ratio/min": 9.692386811366305e-05, + "sampling/sampling_logp_difference/max": 9.241584777832031, + "sampling/sampling_logp_difference/mean": 0.0676056370139122, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 180.7578125, + "completions/mean_terminated_length": 180.7578125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.03670450847130269, + "epoch": 0.7787610619469026, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8201680403355739, + "learning_rate": 1e-06, + "loss": -0.0152, + "num_tokens": 210467102.0, + "reward": 0.49531248211860657, + "reward_std": 0.045264385640621185, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.939831256866455, + "sampling/importance_sampling_ratio/mean": 0.9840477108955383, + "sampling/importance_sampling_ratio/min": 0.0025771246291697025, + "sampling/sampling_logp_difference/max": 5.961081027984619, + "sampling/sampling_logp_difference/mean": 0.07285960018634796, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 198.6875, + "completions/mean_terminated_length": 198.6875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.032644074875861406, + "epoch": 0.7805309734513274, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7477070387075161, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 210948686.0, + "reward": 0.5992187261581421, + "reward_std": 0.05721442401409149, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845725297927856, + "sampling/importance_sampling_ratio/min": 0.0011875852942466736, + "sampling/sampling_logp_difference/max": 6.735833168029785, + "sampling/sampling_logp_difference/mean": 0.06937295198440552, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 250.0, + "completions/max_terminated_length": 250.0, + "completions/mean_length": 172.484375, + "completions/mean_terminated_length": 172.484375, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.03287287475541234, + "epoch": 0.7823008849557522, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.899724565990717, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 211300922.0, + "reward": 0.688281238079071, + "reward_std": 0.049967922270298004, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9851868152618408, + "sampling/importance_sampling_ratio/min": 0.0024831669870764017, + "sampling/sampling_logp_difference/max": 5.998220443725586, + "sampling/sampling_logp_difference/mean": 0.07144519686698914, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 300.0, + "completions/mean_length": 184.71875, + "completions/mean_terminated_length": 169.38040161132812, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.035513352835550904, + "epoch": 0.784070796460177, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.9146950972535406, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 211712578.0, + "reward": 0.6175780892372131, + "reward_std": 0.14168648421764374, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9865785837173462, + "sampling/importance_sampling_ratio/min": 0.0011792866280302405, + "sampling/sampling_logp_difference/max": 6.74284553527832, + "sampling/sampling_logp_difference/mean": 0.06966756284236908, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 432.0, + "completions/max_terminated_length": 432.0, + "completions/mean_length": 191.4765625, + "completions/mean_terminated_length": 191.4765625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.0361675291787833, + "epoch": 0.7858407079646018, + "frac_reward_zero_std": 0.5625, + "grad_norm": 2.2496813942982503, + "learning_rate": 1e-06, + "loss": -0.0154, + "num_tokens": 212339036.0, + "reward": 0.42109373211860657, + "reward_std": 0.17300689220428467, + "rewards/execution_accuracy_EX/mean": 0.390625, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9848983287811279, + "sampling/importance_sampling_ratio/min": 0.0024888820480555296, + "sampling/sampling_logp_difference/max": 5.995921611785889, + "sampling/sampling_logp_difference/mean": 0.07575321197509766, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 476.0, + "completions/max_terminated_length": 476.0, + "completions/mean_length": 194.79296875, + "completions/mean_terminated_length": 194.79296875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.040215677581727505, + "epoch": 0.7876106194690266, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.0276933605333982, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 212860727.0, + "reward": 0.43964844942092896, + "reward_std": 0.06481166929006577, + "rewards/execution_accuracy_EX/mean": 0.41015625, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9863396883010864, + "sampling/importance_sampling_ratio/min": 7.312193338293582e-05, + "sampling/sampling_logp_difference/max": 9.523382186889648, + "sampling/sampling_logp_difference/mean": 0.07146722823381424, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 180.28125, + "completions/mean_terminated_length": 180.28125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.03822091734036803, + "epoch": 0.7893805309734513, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.8830218320974383, + "learning_rate": 1e-06, + "loss": -0.0058, + "num_tokens": 213361839.0, + "reward": 0.569531261920929, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9836409091949463, + "sampling/importance_sampling_ratio/min": 0.0052536651492118835, + "sampling/sampling_logp_difference/max": 5.248829364776611, + "sampling/sampling_logp_difference/mean": 0.07403667271137238, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 188.38671875, + "completions/mean_terminated_length": 188.38671875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.03450642281677574, + "epoch": 0.7911504424778761, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6405524088104448, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 213707474.0, + "reward": 0.5843750238418579, + "reward_std": 0.040560849010944366, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860996007919312, + "sampling/importance_sampling_ratio/min": 0.0002619486185722053, + "sampling/sampling_logp_difference/max": 8.24736213684082, + "sampling/sampling_logp_difference/mean": 0.06750054657459259, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 184.12109375, + "completions/mean_terminated_length": 184.12109375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.04336669575423002, + "epoch": 0.7929203539823009, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.817175148002452, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 214067121.0, + "reward": 0.5472656488418579, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.5234375, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798983335494995, + "sampling/importance_sampling_ratio/min": 0.0032281840685755014, + "sampling/sampling_logp_difference/max": 5.735835552215576, + "sampling/sampling_logp_difference/mean": 0.08915063738822937, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 316.0, + "completions/max_terminated_length": 316.0, + "completions/mean_length": 173.921875, + "completions/mean_terminated_length": 173.921875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.03771132975816727, + "epoch": 0.7946902654867256, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.5856246164059995, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 214391373.0, + "reward": 0.5658203363418579, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9863474369049072, + "sampling/importance_sampling_ratio/min": 0.0004315741534810513, + "sampling/sampling_logp_difference/max": 7.748071193695068, + "sampling/sampling_logp_difference/mean": 0.06461955606937408, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 183.80859375, + "completions/mean_terminated_length": 183.80859375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.037194730481132865, + "epoch": 0.7964601769911505, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.4804346036068103, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 214861884.0, + "reward": 0.5287109017372131, + "reward_std": 0.07616598159074783, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9863549470901489, + "sampling/importance_sampling_ratio/min": 0.0003532337723299861, + "sampling/sampling_logp_difference/max": 7.948380470275879, + "sampling/sampling_logp_difference/mean": 0.06500714272260666, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 182.0234375, + "completions/mean_terminated_length": 182.0234375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.035413203528150916, + "epoch": 0.7982300884955752, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.119154931881152, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 215579506.0, + "reward": 0.6734374761581421, + "reward_std": 0.09323528409004211, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9855575561523438, + "sampling/importance_sampling_ratio/min": 0.0007124387775547802, + "sampling/sampling_logp_difference/max": 7.246816635131836, + "sampling/sampling_logp_difference/mean": 0.07431971281766891, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 182.41796875, + "completions/mean_terminated_length": 182.41796875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.035863753175362945, + "epoch": 0.8, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5394763027315568, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 216128717.0, + "reward": 0.6474609375, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9888901710510254, + "sampling/importance_sampling_ratio/min": 0.0004761523741763085, + "sampling/sampling_logp_difference/max": 7.649772644042969, + "sampling/sampling_logp_difference/mean": 0.0634334608912468, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 295.0, + "completions/max_terminated_length": 295.0, + "completions/mean_length": 178.10546875, + "completions/mean_terminated_length": 178.10546875, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.03369049716275185, + "epoch": 0.8017699115044248, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.061376033042985, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 216696888.0, + "reward": 0.595507800579071, + "reward_std": 0.04870404303073883, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9875693917274475, + "sampling/importance_sampling_ratio/min": 0.0019405841594561934, + "sampling/sampling_logp_difference/max": 6.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.06381489336490631, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 193.5625, + "completions/mean_terminated_length": 193.5625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.03183405671734363, + "epoch": 0.8035398230088495, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.163643440838458, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 217087896.0, + "reward": 0.5287109017372131, + "reward_std": 0.12355467677116394, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.987615704536438, + "sampling/importance_sampling_ratio/min": 0.0019327143672853708, + "sampling/sampling_logp_difference/max": 6.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.06233280897140503, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 195.9375, + "completions/mean_terminated_length": 195.9375, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.03245685843285173, + "epoch": 0.8053097345132744, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1170246667784294, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 217509992.0, + "reward": 0.5732421875, + "reward_std": 0.10797779262065887, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9909529685974121, + "sampling/importance_sampling_ratio/min": 0.0015071752713993192, + "sampling/sampling_logp_difference/max": 6.497518062591553, + "sampling/sampling_logp_difference/mean": 0.06087698042392731, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 183.734375, + "completions/mean_terminated_length": 183.734375, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.036316481186077, + "epoch": 0.8070796460176991, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3919319523376626, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 217870452.0, + "reward": 0.6474609375, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9881608486175537, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.06095626950263977, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 321.0, + "completions/max_terminated_length": 321.0, + "completions/mean_length": 195.24609375, + "completions/mean_terminated_length": 195.24609375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.03276994626503438, + "epoch": 0.8088495575221238, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.0613938658220035, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 218433171.0, + "reward": 0.44707030057907104, + "reward_std": 0.05697394162416458, + "rewards/execution_accuracy_EX/mean": 0.41796875, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900121092796326, + "sampling/importance_sampling_ratio/min": 0.0031865073833614588, + "sampling/sampling_logp_difference/max": 5.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.05962977185845375, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 548.0, + "completions/max_terminated_length": 548.0, + "completions/mean_length": 189.1484375, + "completions/mean_terminated_length": 189.1484375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.036552409175783396, + "epoch": 0.8106194690265487, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.41713833635820313, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 218929705.0, + "reward": 0.6548827886581421, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9827380180358887, + "sampling/importance_sampling_ratio/min": 0.0009120672475546598, + "sampling/sampling_logp_difference/max": 6.9997968673706055, + "sampling/sampling_logp_difference/mean": 0.07468201965093613, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 339.0, + "completions/max_terminated_length": 339.0, + "completions/mean_length": 171.25390625, + "completions/mean_terminated_length": 171.25390625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.03858993435278535, + "epoch": 0.8123893805309734, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4547161853605264, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 219291178.0, + "reward": 0.569531261920929, + "reward_std": 0.09993584454059601, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860604405403137, + "sampling/importance_sampling_ratio/min": 0.0016887388192117214, + "sampling/sampling_logp_difference/max": 6.383773326873779, + "sampling/sampling_logp_difference/mean": 0.07278476655483246, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 373.0, + "completions/max_terminated_length": 373.0, + "completions/mean_length": 185.296875, + "completions/mean_terminated_length": 185.296875, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.0370300910435617, + "epoch": 0.8141592920353983, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7536610409679617, + "learning_rate": 1e-06, + "loss": -0.006, + "num_tokens": 219825382.0, + "reward": 0.7662109136581421, + "reward_std": 0.035124171525239944, + "rewards/execution_accuracy_EX/mean": 0.75390625, + "rewards/execution_accuracy_EX/std": 0.43157756328582764, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9864706993103027, + "sampling/importance_sampling_ratio/min": 0.00020719511667266488, + "sampling/sampling_logp_difference/max": 8.481849670410156, + "sampling/sampling_logp_difference/mean": 0.06992129981517792, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 328.0, + "completions/max_terminated_length": 328.0, + "completions/mean_length": 209.9296875, + "completions/mean_terminated_length": 209.9296875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.03714966797269881, + "epoch": 0.815929203539823, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.7471992836036272, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 220408820.0, + "reward": 0.5806640386581421, + "reward_std": 0.12804345786571503, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985925018787384, + "sampling/importance_sampling_ratio/min": 0.0005697862943634391, + "sampling/sampling_logp_difference/max": 7.470249176025391, + "sampling/sampling_logp_difference/mean": 0.07193641364574432, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 229.0, + "completions/max_terminated_length": 229.0, + "completions/mean_length": 162.3515625, + "completions/mean_terminated_length": 162.3515625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.030318079399876297, + "epoch": 0.8176991150442477, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.1945294855346704, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 220734398.0, + "reward": 0.743945300579071, + "reward_std": 0.06578528881072998, + "rewards/execution_accuracy_EX/mean": 0.73046875, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892048835754395, + "sampling/importance_sampling_ratio/min": 0.002484897617250681, + "sampling/sampling_logp_difference/max": 5.997523784637451, + "sampling/sampling_logp_difference/mean": 0.06512968242168427, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 393.0, + "completions/max_terminated_length": 393.0, + "completions/mean_length": 195.61328125, + "completions/mean_terminated_length": 195.61328125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.03934220178052783, + "epoch": 0.8194690265486726, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7712496840257055, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 221092699.0, + "reward": 0.5324218273162842, + "reward_std": 0.049967922270298004, + "rewards/execution_accuracy_EX/mean": 0.5078125, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839807748794556, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.07511478662490845, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 178.2734375, + "completions/mean_terminated_length": 178.2734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.04063643258996308, + "epoch": 0.8212389380530973, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.712399909787215, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 221633729.0, + "reward": 0.5249999761581421, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9836113452911377, + "sampling/importance_sampling_ratio/min": 0.0007850354304537177, + "sampling/sampling_logp_difference/max": 7.149781703948975, + "sampling/sampling_logp_difference/mean": 0.08031530678272247, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 334.0, + "completions/max_terminated_length": 334.0, + "completions/mean_length": 173.35546875, + "completions/mean_terminated_length": 173.35546875, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.03969687083736062, + "epoch": 0.8230088495575221, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1193689068657768, + "learning_rate": 1e-06, + "loss": -0.0281, + "num_tokens": 222047436.0, + "reward": 0.5769531726837158, + "reward_std": 0.107462078332901, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9835950136184692, + "sampling/importance_sampling_ratio/min": 0.002526219468563795, + "sampling/sampling_logp_difference/max": 5.98103141784668, + "sampling/sampling_logp_difference/mean": 0.07873211801052094, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 183.34375, + "completions/mean_terminated_length": 183.34375, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.037796722492203116, + "epoch": 0.8247787610619469, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7730252508665224, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 222485588.0, + "reward": 0.3876953125, + "reward_std": 0.06578528881072998, + "rewards/execution_accuracy_EX/mean": 0.35546875, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9872028827667236, + "sampling/importance_sampling_ratio/min": 0.0005597883719019592, + "sampling/sampling_logp_difference/max": 7.487951755523682, + "sampling/sampling_logp_difference/mean": 0.06885313987731934, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 264.0, + "completions/max_terminated_length": 264.0, + "completions/mean_length": 182.00390625, + "completions/mean_terminated_length": 182.00390625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.03711854084394872, + "epoch": 0.8265486725663717, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 223031541.0, + "reward": 0.5843750238418579, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9859459400177002, + "sampling/importance_sampling_ratio/min": 0.0019365980988368392, + "sampling/sampling_logp_difference/max": 6.246822357177734, + "sampling/sampling_logp_difference/mean": 0.06760353595018387, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 322.0, + "completions/max_terminated_length": 322.0, + "completions/mean_length": 181.09765625, + "completions/mean_terminated_length": 181.09765625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.0391436277423054, + "epoch": 0.8283185840707965, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.2556989483267407, + "learning_rate": 1e-06, + "loss": 0.012, + "num_tokens": 223600334.0, + "reward": 0.5806640386581421, + "reward_std": 0.07525734603404999, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826628565788269, + "sampling/importance_sampling_ratio/min": 0.004103472921997309, + "sampling/sampling_logp_difference/max": 5.495921611785889, + "sampling/sampling_logp_difference/mean": 0.07907578349113464, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 177.7109375, + "completions/mean_terminated_length": 177.7109375, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.03847864316776395, + "epoch": 0.8300884955752212, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7362766343426851, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 224060308.0, + "reward": 0.5287109613418579, + "reward_std": 0.035124171525239944, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906367063522339, + "sampling/importance_sampling_ratio/min": 0.0003400211571715772, + "sampling/sampling_logp_difference/max": 7.986502647399902, + "sampling/sampling_logp_difference/mean": 0.062188658863306046, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 175.51953125, + "completions/mean_terminated_length": 175.51953125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.03579380200244486, + "epoch": 0.831858407079646, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3110324854586604, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 224612025.0, + "reward": 0.6623046398162842, + "reward_std": 0.09824429452419281, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9885303974151611, + "sampling/importance_sampling_ratio/min": 0.0024954716209322214, + "sampling/sampling_logp_difference/max": 5.993277549743652, + "sampling/sampling_logp_difference/mean": 0.06727813929319382, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 287.0, + "completions/max_terminated_length": 287.0, + "completions/mean_length": 169.53125, + "completions/mean_terminated_length": 169.53125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.03632258530706167, + "epoch": 0.8336283185840708, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.7906746524692754, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 224977745.0, + "reward": 0.7476562261581421, + "reward_std": 0.13797074556350708, + "rewards/execution_accuracy_EX/mean": 0.734375, + "rewards/execution_accuracy_EX/std": 0.4425306022167206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.99134761095047, + "sampling/importance_sampling_ratio/min": 0.0019376660929992795, + "sampling/sampling_logp_difference/max": 6.246271133422852, + "sampling/sampling_logp_difference/mean": 0.058021172881126404, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 175.54296875, + "completions/mean_terminated_length": 175.54296875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.03317859407979995, + "epoch": 0.8353982300884956, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8567654641940903, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 225427868.0, + "reward": 0.632617175579071, + "reward_std": 0.053622327744960785, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867100119590759, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.06782960146665573, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 243.0, + "completions/max_terminated_length": 243.0, + "completions/mean_length": 179.52734375, + "completions/mean_terminated_length": 179.52734375, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.03570391912944615, + "epoch": 0.8371681415929203, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.5440183663895908, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 225805555.0, + "reward": 0.4990234375, + "reward_std": 0.05048813670873642, + "rewards/execution_accuracy_EX/mean": 0.47265625, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9873706102371216, + "sampling/importance_sampling_ratio/min": 0.0019318372942507267, + "sampling/sampling_logp_difference/max": 6.249283790588379, + "sampling/sampling_logp_difference/mean": 0.06798580288887024, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 182.265625, + "completions/mean_terminated_length": 182.265625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.04240417550317943, + "epoch": 0.8389380530973451, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8522710654737147, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 226347639.0, + "reward": 0.539843738079071, + "reward_std": 0.05721442401409149, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9849438667297363, + "sampling/importance_sampling_ratio/min": 0.003188925562426448, + "sampling/sampling_logp_difference/max": 5.748071193695068, + "sampling/sampling_logp_difference/mean": 0.08094295114278793, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 169.17578125, + "completions/mean_terminated_length": 169.17578125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.0391388984862715, + "epoch": 0.8407079646017699, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9064649033043038, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 226922724.0, + "reward": 0.799609363079071, + "reward_std": 0.06554480642080307, + "rewards/execution_accuracy_EX/mean": 0.7890625, + "rewards/execution_accuracy_EX/std": 0.4087733030319214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.987274169921875, + "sampling/importance_sampling_ratio/min": 0.005257665179669857, + "sampling/sampling_logp_difference/max": 5.248068332672119, + "sampling/sampling_logp_difference/mean": 0.06695103645324707, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 368.0, + "completions/max_terminated_length": 368.0, + "completions/mean_length": 186.29296875, + "completions/mean_terminated_length": 186.29296875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.041356358444318175, + "epoch": 0.8424778761061947, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0380525349286314, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 227391007.0, + "reward": 0.5621093511581421, + "reward_std": 0.10232636332511902, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9837192296981812, + "sampling/importance_sampling_ratio/min": 0.003200841834768653, + "sampling/sampling_logp_difference/max": 5.7443413734436035, + "sampling/sampling_logp_difference/mean": 0.07811155170202255, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 193.703125, + "completions/mean_terminated_length": 193.703125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.041427367250435054, + "epoch": 0.8442477876106195, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.2581858157057406, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 227715651.0, + "reward": 0.6326172351837158, + "reward_std": 0.11007605493068695, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9858970642089844, + "sampling/importance_sampling_ratio/min": 0.00043688679579645395, + "sampling/sampling_logp_difference/max": 7.735836505889893, + "sampling/sampling_logp_difference/mean": 0.07141127437353134, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 299.0, + "completions/max_terminated_length": 299.0, + "completions/mean_length": 178.8671875, + "completions/mean_terminated_length": 178.8671875, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.043659768998622894, + "epoch": 0.8460176991150442, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9622439982396505, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 228137233.0, + "reward": 0.5843750238418579, + "reward_std": 0.08404296636581421, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9816052913665771, + "sampling/importance_sampling_ratio/min": 2.148682324332185e-05, + "sampling/sampling_logp_difference/max": 10.74807071685791, + "sampling/sampling_logp_difference/mean": 0.08495950698852539, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 333.0, + "completions/max_terminated_length": 333.0, + "completions/mean_length": 168.34765625, + "completions/mean_terminated_length": 168.34765625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.03739546122960746, + "epoch": 0.8477876106194691, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.2040538486718038, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 228664554.0, + "reward": 0.703125, + "reward_std": 0.08539755642414093, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839541912078857, + "sampling/importance_sampling_ratio/min": 0.0024954539258033037, + "sampling/sampling_logp_difference/max": 5.993284702301025, + "sampling/sampling_logp_difference/mean": 0.07889965176582336, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 175.8046875, + "completions/mean_terminated_length": 175.8046875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.04593391972593963, + "epoch": 0.8495575221238938, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7868897679508872, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 228971576.0, + "reward": 0.5509765148162842, + "reward_std": 0.06578528881072998, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823975563049316, + "sampling/importance_sampling_ratio/min": 0.002584121422842145, + "sampling/sampling_logp_difference/max": 5.958369731903076, + "sampling/sampling_logp_difference/mean": 0.08234436810016632, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 168.58203125, + "completions/mean_terminated_length": 168.58203125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.04308780119754374, + "epoch": 0.8513274336283185, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6876962004942724, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 229415037.0, + "reward": 0.5472656488418579, + "reward_std": 0.05094154179096222, + "rewards/execution_accuracy_EX/mean": 0.5234375, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876067042350769, + "sampling/importance_sampling_ratio/min": 0.0032308525405824184, + "sampling/sampling_logp_difference/max": 5.73500919342041, + "sampling/sampling_logp_difference/mean": 0.06762432307004929, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 186.64453125, + "completions/mean_terminated_length": 186.64453125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.05345354042947292, + "epoch": 0.8530973451327434, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.24413067492461082, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 229808850.0, + "reward": 0.5806640386581421, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9797642230987549, + "sampling/importance_sampling_ratio/min": 0.0005695176660083234, + "sampling/sampling_logp_difference/max": 7.4707207679748535, + "sampling/sampling_logp_difference/mean": 0.09888408333063126, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 270.0, + "completions/max_terminated_length": 270.0, + "completions/mean_length": 174.8125, + "completions/mean_terminated_length": 174.8125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.04117256891913712, + "epoch": 0.8548672566371681, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3534241590255191, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 230344946.0, + "reward": 0.4173828065395355, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.38671875, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9865865111351013, + "sampling/importance_sampling_ratio/min": 8.781182003758659e-08, + "sampling/sampling_logp_difference/max": 16.248069763183594, + "sampling/sampling_logp_difference/mean": 0.07125282287597656, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 169.78125, + "completions/mean_terminated_length": 169.78125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.039728819159790874, + "epoch": 0.856637168141593, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8414924061731868, + "learning_rate": 1e-06, + "loss": -0.0005, + "num_tokens": 230661546.0, + "reward": 0.6103515625, + "reward_std": 0.05048813670873642, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9854416847229004, + "sampling/importance_sampling_ratio/min": 0.0011722506023943424, + "sampling/sampling_logp_difference/max": 6.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.0713871419429779, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 397.0, + "completions/max_terminated_length": 397.0, + "completions/mean_length": 194.578125, + "completions/mean_terminated_length": 194.578125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.043377350782975554, + "epoch": 0.8584070796460177, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.1445156488374872, + "learning_rate": 1e-06, + "loss": -0.0125, + "num_tokens": 231145054.0, + "reward": 0.550976574420929, + "reward_std": 0.10591846704483032, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860328435897827, + "sampling/importance_sampling_ratio/min": 0.0015062455786392093, + "sampling/sampling_logp_difference/max": 6.498135089874268, + "sampling/sampling_logp_difference/mean": 0.07490431517362595, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 170.09765625, + "completions/mean_terminated_length": 170.09765625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.03980531729757786, + "epoch": 0.8601769911504424, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.7532126264704515, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 231484215.0, + "reward": 0.77734375, + "reward_std": 0.06554481387138367, + "rewards/execution_accuracy_EX/mean": 0.765625, + "rewards/execution_accuracy_EX/std": 0.42443734407424927, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9866953492164612, + "sampling/importance_sampling_ratio/min": 0.0032041221857070923, + "sampling/sampling_logp_difference/max": 5.743317127227783, + "sampling/sampling_logp_difference/mean": 0.06984348595142365, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 174.8046875, + "completions/mean_terminated_length": 174.8046875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.048828796949237585, + "epoch": 0.8619469026548673, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.0203868657198039, + "learning_rate": 1e-06, + "loss": 0.0051, + "num_tokens": 231902053.0, + "reward": 0.6363281011581421, + "reward_std": 0.08204594254493713, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9810388684272766, + "sampling/importance_sampling_ratio/min": 0.0025510797277092934, + "sampling/sampling_logp_difference/max": 5.971238613128662, + "sampling/sampling_logp_difference/mean": 0.08873553574085236, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 169.30078125, + "completions/mean_terminated_length": 169.30078125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.04062080825679004, + "epoch": 0.863716814159292, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5125065401826144, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 232398818.0, + "reward": 0.6994140148162842, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9858344197273254, + "sampling/importance_sampling_ratio/min": 0.0052536954171955585, + "sampling/sampling_logp_difference/max": 5.248823642730713, + "sampling/sampling_logp_difference/mean": 0.07541064918041229, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 179.10546875, + "completions/mean_terminated_length": 179.10546875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.04024584265425801, + "epoch": 0.8654867256637168, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7319043885174177, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 232948749.0, + "reward": 0.7476562261581421, + "reward_std": 0.049967922270298004, + "rewards/execution_accuracy_EX/mean": 0.734375, + "rewards/execution_accuracy_EX/std": 0.4425306022167206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9848055839538574, + "sampling/importance_sampling_ratio/min": 0.0005553600494749844, + "sampling/sampling_logp_difference/max": 7.495893955230713, + "sampling/sampling_logp_difference/mean": 0.07531032711267471, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 169.4921875, + "completions/mean_terminated_length": 169.4921875, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.0417027052026242, + "epoch": 0.8672566371681416, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7221173075640632, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 233386011.0, + "reward": 0.717968761920929, + "reward_std": 0.08340054005384445, + "rewards/execution_accuracy_EX/mean": 0.703125, + "rewards/execution_accuracy_EX/std": 0.45777595043182373, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9391865730285645, + "sampling/importance_sampling_ratio/mean": 0.9820489883422852, + "sampling/importance_sampling_ratio/min": 0.003193895798176527, + "sampling/sampling_logp_difference/max": 5.746513843536377, + "sampling/sampling_logp_difference/mean": 0.08255937695503235, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 151.703125, + "completions/mean_terminated_length": 151.703125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "entropy": 0.04000973026268184, + "epoch": 0.8690265486725663, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7895176978648266, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 233772111.0, + "reward": 0.513867199420929, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984424352645874, + "sampling/importance_sampling_ratio/min": 0.0024810119066387415, + "sampling/sampling_logp_difference/max": 5.999088764190674, + "sampling/sampling_logp_difference/mean": 0.07751961052417755, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 161.765625, + "completions/mean_terminated_length": 161.765625, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.04194347094744444, + "epoch": 0.8707964601769912, + "frac_reward_zero_std": 0.875, + "grad_norm": 1.0357085296636368, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 234180659.0, + "reward": 0.7550780773162842, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9825524091720581, + "sampling/importance_sampling_ratio/min": 0.002486311597749591, + "sampling/sampling_logp_difference/max": 5.996954917907715, + "sampling/sampling_logp_difference/mean": 0.08252972364425659, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 400.0, + "completions/max_terminated_length": 400.0, + "completions/mean_length": 188.39453125, + "completions/mean_terminated_length": 188.39453125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.05364944553002715, + "epoch": 0.8725663716814159, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6061087133622067, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 234841560.0, + "reward": 0.5880858898162842, + "reward_std": 0.035124171525239944, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840527176856995, + "sampling/importance_sampling_ratio/min": 0.0031951875425875187, + "sampling/sampling_logp_difference/max": 5.746109485626221, + "sampling/sampling_logp_difference/mean": 0.08586367964744568, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 168.3984375, + "completions/mean_terminated_length": 168.3984375, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.042989730602130294, + "epoch": 0.8743362831858407, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9857441156888342, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 235374958.0, + "reward": 0.48046875, + "reward_std": 0.0712219625711441, + "rewards/execution_accuracy_EX/mean": 0.453125, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845379590988159, + "sampling/importance_sampling_ratio/min": 0.004095880314707756, + "sampling/sampling_logp_difference/max": 5.49777364730835, + "sampling/sampling_logp_difference/mean": 0.07833441346883774, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 247.0, + "completions/max_terminated_length": 247.0, + "completions/mean_length": 165.23828125, + "completions/mean_terminated_length": 165.23828125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.04796855547465384, + "epoch": 0.8761061946902655, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 235827579.0, + "reward": 0.703125, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9842017889022827, + "sampling/importance_sampling_ratio/min": 0.006766402628272772, + "sampling/sampling_logp_difference/max": 4.995785713195801, + "sampling/sampling_logp_difference/mean": 0.08210749924182892, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 354.0, + "completions/max_terminated_length": 354.0, + "completions/mean_length": 167.4765625, + "completions/mean_terminated_length": 167.4765625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.048083732137456536, + "epoch": 0.8778761061946903, + "frac_reward_zero_std": 0.6875, + "grad_norm": 2.0256388741525395, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 236379413.0, + "reward": 0.6326172351837158, + "reward_std": 0.12441661953926086, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826089143753052, + "sampling/importance_sampling_ratio/min": 0.0002779910864774138, + "sampling/sampling_logp_difference/max": 8.187921524047852, + "sampling/sampling_logp_difference/mean": 0.08782650530338287, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 296.0, + "completions/max_terminated_length": 296.0, + "completions/mean_length": 164.984375, + "completions/mean_terminated_length": 164.984375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.04291428369469941, + "epoch": 0.879646017699115, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.920506452423874, + "learning_rate": 1e-06, + "loss": 0.0012, + "num_tokens": 236815073.0, + "reward": 0.5732421875, + "reward_std": 0.06449567526578903, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9873014688491821, + "sampling/importance_sampling_ratio/min": 0.004105329979211092, + "sampling/sampling_logp_difference/max": 5.495469093322754, + "sampling/sampling_logp_difference/mean": 0.07400424778461456, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 175.3125, + "completions/mean_terminated_length": 175.3125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.048128560883924365, + "epoch": 0.8814159292035398, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7191630296610151, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 237275441.0, + "reward": 0.703125, + "reward_std": 0.05937499925494194, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9805392026901245, + "sampling/importance_sampling_ratio/min": 0.004097620956599712, + "sampling/sampling_logp_difference/max": 5.497348785400391, + "sampling/sampling_logp_difference/mean": 0.09002789855003357, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 165.09765625, + "completions/mean_terminated_length": 165.09765625, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.05205330019816756, + "epoch": 0.8831858407079646, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6440712071633822, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 237665162.0, + "reward": 0.5992187261581421, + "reward_std": 0.05721442401409149, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9803601503372192, + "sampling/importance_sampling_ratio/min": 0.0052624354138970375, + "sampling/sampling_logp_difference/max": 5.247161388397217, + "sampling/sampling_logp_difference/mean": 0.09621217846870422, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 176.1171875, + "completions/mean_terminated_length": 176.1171875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.05902375653386116, + "epoch": 0.8849557522123894, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.323538404628415, + "learning_rate": 1e-06, + "loss": 0.0172, + "num_tokens": 238106696.0, + "reward": 0.5101562142372131, + "reward_std": 0.11658942699432373, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9775343537330627, + "sampling/importance_sampling_ratio/min": 0.004173972178250551, + "sampling/sampling_logp_difference/max": 5.47888708114624, + "sampling/sampling_logp_difference/mean": 0.10734754800796509, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 257.0, + "completions/max_terminated_length": 257.0, + "completions/mean_length": 162.65234375, + "completions/mean_terminated_length": 162.65234375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.046103555243462324, + "epoch": 0.8867256637168142, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.9277323806583541, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 238603839.0, + "reward": 0.703125, + "reward_std": 0.07592549920082092, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819241166114807, + "sampling/importance_sampling_ratio/min": 0.0013437034795060754, + "sampling/sampling_logp_difference/max": 6.612325668334961, + "sampling/sampling_logp_difference/mean": 0.08260732889175415, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 353.0, + "completions/max_terminated_length": 353.0, + "completions/mean_length": 177.9609375, + "completions/mean_terminated_length": 177.9609375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.04882032400928438, + "epoch": 0.8884955752212389, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.4037562098587755, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 239362517.0, + "reward": 0.4322265386581421, + "reward_std": 0.10337550193071365, + "rewards/execution_accuracy_EX/mean": 0.40234375, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799545407295227, + "sampling/importance_sampling_ratio/min": 5.0001915802289076e-11, + "sampling/sampling_logp_difference/max": 23.71895980834961, + "sampling/sampling_logp_difference/mean": 0.09593340009450912, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 493.0, + "completions/max_terminated_length": 493.0, + "completions/mean_length": 191.703125, + "completions/mean_terminated_length": 191.703125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.049478446366265416, + "epoch": 0.8902654867256637, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8559094032526826, + "learning_rate": 1e-06, + "loss": -0.0072, + "num_tokens": 239693641.0, + "reward": 0.6474609375, + "reward_std": 0.10142967104911804, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796859622001648, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.09414543211460114, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 177.3359375, + "completions/mean_terminated_length": 177.3359375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.046070623910054564, + "epoch": 0.8920353982300885, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.3550449847298833, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 240304031.0, + "reward": 0.632617175579071, + "reward_std": 0.12852442264556885, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798822402954102, + "sampling/importance_sampling_ratio/min": 0.000726876372937113, + "sampling/sampling_logp_difference/max": 7.226754188537598, + "sampling/sampling_logp_difference/mean": 0.089444600045681, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 369.0, + "completions/max_terminated_length": 369.0, + "completions/mean_length": 177.08984375, + "completions/mean_terminated_length": 177.08984375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.05228368123061955, + "epoch": 0.8938053097345132, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.43048480587039784, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 240826870.0, + "reward": 0.6029297113418579, + "reward_std": 0.028423616662621498, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9821292757987976, + "sampling/importance_sampling_ratio/min": 0.005258575081825256, + "sampling/sampling_logp_difference/max": 5.247895240783691, + "sampling/sampling_logp_difference/mean": 0.08833771198987961, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 256.0, + "completions/max_terminated_length": 256.0, + "completions/mean_length": 170.33203125, + "completions/mean_terminated_length": 170.33203125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.05010732146911323, + "epoch": 0.8955752212389381, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.8311929706929964, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 241308203.0, + "reward": 0.725390613079071, + "reward_std": 0.02968749962747097, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9830573797225952, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.08577954769134521, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 272.0, + "completions/max_terminated_length": 272.0, + "completions/mean_length": 176.390625, + "completions/mean_terminated_length": 176.390625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.0489632326643914, + "epoch": 0.8973451327433628, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.006347222049109, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 241942863.0, + "reward": 0.7365233898162842, + "reward_std": 0.13036097586154938, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9818917512893677, + "sampling/importance_sampling_ratio/min": 0.0026035383343696594, + "sampling/sampling_logp_difference/max": 5.950883865356445, + "sampling/sampling_logp_difference/mean": 0.09044378995895386, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 284.0, + "completions/max_terminated_length": 284.0, + "completions/mean_length": 179.421875, + "completions/mean_terminated_length": 179.421875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.05083822226151824, + "epoch": 0.8991150442477877, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5067714796461338, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 242432283.0, + "reward": 0.47304683923721313, + "reward_std": 0.038778576999902725, + "rewards/execution_accuracy_EX/mean": 0.4453125, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774977564811707, + "sampling/importance_sampling_ratio/min": 0.002825167030096054, + "sampling/sampling_logp_difference/max": 5.869187831878662, + "sampling/sampling_logp_difference/mean": 0.09798707067966461, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 271.0, + "completions/max_terminated_length": 271.0, + "completions/mean_length": 172.3671875, + "completions/mean_terminated_length": 172.3671875, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.04754835646599531, + "epoch": 0.9008849557522124, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6421275733765186, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 243101129.0, + "reward": 0.8218749761581421, + "reward_std": 0.08301956206560135, + "rewards/execution_accuracy_EX/mean": 0.8125, + "rewards/execution_accuracy_EX/std": 0.3910769522190094, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9835774898529053, + "sampling/importance_sampling_ratio/min": 0.0032113860361278057, + "sampling/sampling_logp_difference/max": 5.741052627563477, + "sampling/sampling_logp_difference/mean": 0.08551837503910065, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 469.0, + "completions/max_terminated_length": 469.0, + "completions/mean_length": 176.01953125, + "completions/mean_terminated_length": 176.01953125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "entropy": 0.055300379171967506, + "epoch": 0.9026548672566371, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.4032649365281848, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 243547742.0, + "reward": 0.6882812976837158, + "reward_std": 0.15326790511608124, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808853268623352, + "sampling/importance_sampling_ratio/min": 7.492009899578989e-05, + "sampling/sampling_logp_difference/max": 9.499088287353516, + "sampling/sampling_logp_difference/mean": 0.09364474564790726, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 377.0, + "completions/max_terminated_length": 377.0, + "completions/mean_length": 192.40625, + "completions/mean_terminated_length": 192.40625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.04891200643032789, + "epoch": 0.904424778761062, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7218189877140291, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 244127046.0, + "reward": 0.703125, + "reward_std": 0.049967922270298004, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808559417724609, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.09143536537885666, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 358.0, + "completions/max_terminated_length": 358.0, + "completions/mean_length": 168.1484375, + "completions/mean_terminated_length": 168.1484375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.05485077598132193, + "epoch": 0.9061946902654867, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.8174867814838335, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 244631372.0, + "reward": 0.910937488079071, + "reward_std": 0.045264385640621185, + "rewards/execution_accuracy_EX/mean": 0.90625, + "rewards/execution_accuracy_EX/std": 0.2920515835285187, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814273118972778, + "sampling/importance_sampling_ratio/min": 0.0031994853634387255, + "sampling/sampling_logp_difference/max": 5.744765281677246, + "sampling/sampling_logp_difference/mean": 0.09440530836582184, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 293.0, + "completions/max_terminated_length": 293.0, + "completions/mean_length": 184.03515625, + "completions/mean_terminated_length": 184.03515625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.04843215038999915, + "epoch": 0.9079646017699115, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5072985230057611, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 245028037.0, + "reward": 0.7476562261581421, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.734375, + "rewards/execution_accuracy_EX/std": 0.4425306022167206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9813686013221741, + "sampling/importance_sampling_ratio/min": 0.0005568118067458272, + "sampling/sampling_logp_difference/max": 7.493283271789551, + "sampling/sampling_logp_difference/mean": 0.09296062588691711, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 342.0, + "completions/max_terminated_length": 342.0, + "completions/mean_length": 168.08984375, + "completions/mean_terminated_length": 168.08984375, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.057977584190666676, + "epoch": 0.9097345132743363, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.0103608316081492, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 245387148.0, + "reward": 0.6919921636581421, + "reward_std": 0.0866614431142807, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879189729690552, + "sampling/importance_sampling_ratio/min": 0.0024848966859281063, + "sampling/sampling_logp_difference/max": 5.997524261474609, + "sampling/sampling_logp_difference/mean": 0.08319272100925446, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 306.0, + "completions/max_terminated_length": 306.0, + "completions/mean_length": 189.7421875, + "completions/mean_terminated_length": 189.7421875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.04901131591759622, + "epoch": 0.911504424778761, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.8316976643499827, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 245865034.0, + "reward": 0.5695312023162842, + "reward_std": 0.12638607621192932, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9830834865570068, + "sampling/importance_sampling_ratio/min": 0.0010456463787704706, + "sampling/sampling_logp_difference/max": 6.863120079040527, + "sampling/sampling_logp_difference/mean": 0.08485496044158936, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 418.0, + "completions/max_terminated_length": 418.0, + "completions/mean_length": 194.8046875, + "completions/mean_terminated_length": 194.8046875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.056694274535402656, + "epoch": 0.9132743362831859, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5754258337433564, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 246329976.0, + "reward": 0.62890625, + "reward_std": 0.04326736554503441, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790133237838745, + "sampling/importance_sampling_ratio/min": 0.003186937188729644, + "sampling/sampling_logp_difference/max": 5.748694896697998, + "sampling/sampling_logp_difference/mean": 0.09674401581287384, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 198.66015625, + "completions/mean_terminated_length": 198.66015625, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.06033860193565488, + "epoch": 0.9150442477876106, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3946633431561115, + "learning_rate": 1e-06, + "loss": -0.0142, + "num_tokens": 247059329.0, + "reward": 0.539843738079071, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782962203025818, + "sampling/importance_sampling_ratio/min": 0.0024888820480555296, + "sampling/sampling_logp_difference/max": 5.995921611785889, + "sampling/sampling_logp_difference/mean": 0.10552509129047394, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 349.0, + "completions/max_terminated_length": 349.0, + "completions/mean_length": 173.5234375, + "completions/mean_terminated_length": 173.5234375, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.048745911568403244, + "epoch": 0.9168141592920354, + "frac_reward_zero_std": 0.9375, + "grad_norm": 1.8271823749521028, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 247469447.0, + "reward": 0.6103515625, + "reward_std": 0.030420634895563126, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9841370582580566, + "sampling/importance_sampling_ratio/min": 0.005257880315184593, + "sampling/sampling_logp_difference/max": 5.248027324676514, + "sampling/sampling_logp_difference/mean": 0.08694303035736084, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 350.0, + "completions/max_terminated_length": 350.0, + "completions/mean_length": 199.87890625, + "completions/mean_terminated_length": 199.87890625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.0591482964809984, + "epoch": 0.9185840707964602, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.622852537075716, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 247833976.0, + "reward": 0.5398436784744263, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840697050094604, + "sampling/importance_sampling_ratio/min": 0.002481157425791025, + "sampling/sampling_logp_difference/max": 5.999030113220215, + "sampling/sampling_logp_difference/mean": 0.0902697741985321, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 189.89453125, + "completions/mean_terminated_length": 189.89453125, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "entropy": 0.05301560740917921, + "epoch": 0.9203539823008849, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6566269409422131, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 248239853.0, + "reward": 0.7958984375, + "reward_std": 0.04453124850988388, + "rewards/execution_accuracy_EX/mean": 0.78515625, + "rewards/execution_accuracy_EX/std": 0.4115184545516968, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9830227494239807, + "sampling/importance_sampling_ratio/min": 0.0031995694153010845, + "sampling/sampling_logp_difference/max": 5.744739055633545, + "sampling/sampling_logp_difference/mean": 0.08880066126585007, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 192.5390625, + "completions/mean_terminated_length": 192.5390625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.05712575395591557, + "epoch": 0.9221238938053097, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.7597496145836427, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 248684183.0, + "reward": 0.5138671398162842, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9793723225593567, + "sampling/importance_sampling_ratio/min": 1.3860160834155977e-05, + "sampling/sampling_logp_difference/max": 11.186491966247559, + "sampling/sampling_logp_difference/mean": 0.0985555425286293, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 340.0, + "completions/max_terminated_length": 340.0, + "completions/mean_length": 192.3828125, + "completions/mean_terminated_length": 192.3828125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.0574874640442431, + "epoch": 0.9238938053097345, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.24561183861228908, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 249131209.0, + "reward": 0.5287109017372131, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9751148223876953, + "sampling/importance_sampling_ratio/min": 0.003188966540619731, + "sampling/sampling_logp_difference/max": 5.748058319091797, + "sampling/sampling_logp_difference/mean": 0.11055392771959305, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 192.28515625, + "completions/mean_terminated_length": 192.28515625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.06137994769960642, + "epoch": 0.9256637168141593, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5716437595398852, + "learning_rate": 1e-06, + "loss": 0.0046, + "num_tokens": 249658322.0, + "reward": 0.6957030892372131, + "reward_std": 0.06034861505031586, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811362028121948, + "sampling/importance_sampling_ratio/min": 0.005255925469100475, + "sampling/sampling_logp_difference/max": 5.248399257659912, + "sampling/sampling_logp_difference/mean": 0.10237579792737961, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 183.34375, + "completions/mean_terminated_length": 183.34375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.05871612951159477, + "epoch": 0.9274336283185841, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.622556044290377, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 249997802.0, + "reward": 0.6103515625, + "reward_std": 0.09763786941766739, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9783234596252441, + "sampling/importance_sampling_ratio/min": 0.004098503850400448, + "sampling/sampling_logp_difference/max": 5.497133255004883, + "sampling/sampling_logp_difference/mean": 0.10579982399940491, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 265.0, + "completions/max_terminated_length": 265.0, + "completions/mean_length": 179.80859375, + "completions/mean_terminated_length": 179.80859375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.055529567413032055, + "epoch": 0.9292035398230089, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8976085435185657, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 250346505.0, + "reward": 0.7365233898162842, + "reward_std": 0.08078206330537796, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.983036458492279, + "sampling/importance_sampling_ratio/min": 0.00045901790144853294, + "sampling/sampling_logp_difference/max": 7.6864213943481445, + "sampling/sampling_logp_difference/mean": 0.09104293584823608, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 288.0, + "completions/max_terminated_length": 288.0, + "completions/mean_length": 177.71484375, + "completions/mean_terminated_length": 177.71484375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.06272785738110542, + "epoch": 0.9309734513274336, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.0219138521532776, + "learning_rate": 1e-06, + "loss": 0.0146, + "num_tokens": 250647472.0, + "reward": 0.5546875, + "reward_std": 0.10589273273944855, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8711411952972412, + "sampling/importance_sampling_ratio/mean": 0.9805116057395935, + "sampling/importance_sampling_ratio/min": 0.005310718901455402, + "sampling/sampling_logp_difference/max": 5.238028049468994, + "sampling/sampling_logp_difference/mean": 0.10020837932825089, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 320.0, + "completions/max_terminated_length": 320.0, + "completions/mean_length": 175.24609375, + "completions/mean_terminated_length": 175.24609375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.05689932149834931, + "epoch": 0.9327433628318584, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 251211823.0, + "reward": 0.6437499523162842, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.980980396270752, + "sampling/importance_sampling_ratio/min": 0.0040929848328232765, + "sampling/sampling_logp_difference/max": 5.498480796813965, + "sampling/sampling_logp_difference/mean": 0.09796097874641418, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 193.703125, + "completions/mean_terminated_length": 193.703125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.062146071111783385, + "epoch": 0.9345132743362832, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5326054705327706, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 251620947.0, + "reward": 0.5546875, + "reward_std": 0.030661117285490036, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9767408967018127, + "sampling/importance_sampling_ratio/min": 0.0015209248522296548, + "sampling/sampling_logp_difference/max": 6.488436698913574, + "sampling/sampling_logp_difference/mean": 0.1093941330909729, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 194.80859375, + "completions/mean_terminated_length": 194.80859375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.06090115010738373, + "epoch": 0.9362831858407079, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.20750105176528, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 252115154.0, + "reward": 0.48417967557907104, + "reward_std": 0.09449917078018188, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9795873761177063, + "sampling/importance_sampling_ratio/min": 0.0003456080157775432, + "sampling/sampling_logp_difference/max": 7.970205307006836, + "sampling/sampling_logp_difference/mean": 0.10623567551374435, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 267.0, + "completions/max_terminated_length": 267.0, + "completions/mean_length": 169.55078125, + "completions/mean_terminated_length": 169.55078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.060906231636181474, + "epoch": 0.9380530973451328, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9140394386063178, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 252529247.0, + "reward": 0.606640636920929, + "reward_std": 0.12389640510082245, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9793052077293396, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.10013088583946228, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 291.0, + "completions/max_terminated_length": 291.0, + "completions/mean_length": 172.71484375, + "completions/mean_terminated_length": 172.71484375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.05663017835468054, + "epoch": 0.9398230088495575, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.8041390764017795, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 252913334.0, + "reward": 0.814453125, + "reward_std": 0.12731033563613892, + "rewards/execution_accuracy_EX/mean": 0.8046875, + "rewards/execution_accuracy_EX/std": 0.39721766114234924, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9821113348007202, + "sampling/importance_sampling_ratio/min": 0.0068273525685071945, + "sampling/sampling_logp_difference/max": 4.986818313598633, + "sampling/sampling_logp_difference/mean": 0.0965123251080513, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 330.0, + "completions/max_terminated_length": 330.0, + "completions/mean_length": 185.41796875, + "completions/mean_terminated_length": 185.41796875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.05432614218443632, + "epoch": 0.9415929203539823, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5232407083993299, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 253241169.0, + "reward": 0.6103515625, + "reward_std": 0.053622327744960785, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9848209619522095, + "sampling/importance_sampling_ratio/min": 0.008679499849677086, + "sampling/sampling_logp_difference/max": 4.746791362762451, + "sampling/sampling_logp_difference/mean": 0.08514297753572464, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 332.0, + "completions/max_terminated_length": 332.0, + "completions/mean_length": 165.63671875, + "completions/mean_terminated_length": 165.63671875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.06238645710982382, + "epoch": 0.9433628318584071, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5907705205102637, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 253643204.0, + "reward": 0.651171863079071, + "reward_std": 0.038778576999902725, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9802834987640381, + "sampling/importance_sampling_ratio/min": 1.945988151419442e-05, + "sampling/sampling_logp_difference/max": 10.847155570983887, + "sampling/sampling_logp_difference/mean": 0.100850909948349, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 237.0, + "completions/max_terminated_length": 237.0, + "completions/mean_length": 162.7578125, + "completions/mean_terminated_length": 162.7578125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.053772482089698315, + "epoch": 0.9451327433628318, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 254035622.0, + "reward": 0.8218749761581421, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.8125, + "rewards/execution_accuracy_EX/std": 0.3910769522190094, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782205820083618, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.1011219471693039, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 286.0, + "completions/max_terminated_length": 286.0, + "completions/mean_length": 170.2421875, + "completions/mean_terminated_length": 170.2421875, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.051883811596781015, + "epoch": 0.9469026548672567, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.697677563144362, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 254385844.0, + "reward": 0.7439453601837158, + "reward_std": 0.0866614431142807, + "rewards/execution_accuracy_EX/mean": 0.73046875, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9833471775054932, + "sampling/importance_sampling_ratio/min": 0.0052607497200369835, + "sampling/sampling_logp_difference/max": 5.247481822967529, + "sampling/sampling_logp_difference/mean": 0.08491303026676178, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 171.109375, + "completions/mean_terminated_length": 171.109375, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.0560115126427263, + "epoch": 0.9486725663716814, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.206609341950761, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 254947040.0, + "reward": 0.6103515625, + "reward_std": 0.07390274852514267, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9810516834259033, + "sampling/importance_sampling_ratio/min": 0.005258296616375446, + "sampling/sampling_logp_difference/max": 5.247948169708252, + "sampling/sampling_logp_difference/mean": 0.09410244226455688, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 269.0, + "completions/max_terminated_length": 269.0, + "completions/mean_length": 180.51171875, + "completions/mean_terminated_length": 180.51171875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.049978159135207534, + "epoch": 0.9504424778761061, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.21017277426867115, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 255493315.0, + "reward": 0.5806640386581421, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9818358421325684, + "sampling/importance_sampling_ratio/min": 5.0185037252958864e-05, + "sampling/sampling_logp_difference/max": 9.89979362487793, + "sampling/sampling_logp_difference/mean": 0.09051823616027832, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 169.96875, + "completions/mean_terminated_length": 169.96875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.05133820232003927, + "epoch": 0.952212389380531, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.36520972457989764, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 256092875.0, + "reward": 0.5695312023162842, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9783322811126709, + "sampling/importance_sampling_ratio/min": 0.0009590815170668066, + "sampling/sampling_logp_difference/max": 6.9495344161987305, + "sampling/sampling_logp_difference/mean": 0.10050339996814728, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 227.0, + "completions/max_terminated_length": 227.0, + "completions/mean_length": 168.6484375, + "completions/mean_terminated_length": 168.6484375, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.05146279325708747, + "epoch": 0.9539823008849557, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7296065223027633, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 256570081.0, + "reward": 0.6919921636581421, + "reward_std": 0.07568501681089401, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9797185659408569, + "sampling/importance_sampling_ratio/min": 0.0035121028777211905, + "sampling/sampling_logp_difference/max": 5.651540279388428, + "sampling/sampling_logp_difference/mean": 0.09460432827472687, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 327.0, + "completions/max_terminated_length": 327.0, + "completions/mean_length": 169.3984375, + "completions/mean_terminated_length": 169.3984375, + "completions/min_length": 27.0, + "completions/min_terminated_length": 27.0, + "entropy": 0.05147869186475873, + "epoch": 0.9557522123893806, + "frac_reward_zero_std": 0.6875, + "grad_norm": 0.9547430898811416, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 257257047.0, + "reward": 0.4691406190395355, + "reward_std": 0.1422634869813919, + "rewards/execution_accuracy_EX/mean": 0.44140625, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.981183648109436, + "sampling/importance_sampling_ratio/min": 0.0006282316753640771, + "sampling/sampling_logp_difference/max": 7.372601509094238, + "sampling/sampling_logp_difference/mean": 0.09387682378292084, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 276.0, + "completions/max_terminated_length": 276.0, + "completions/mean_length": 162.2109375, + "completions/mean_terminated_length": 162.2109375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.05744664301164448, + "epoch": 0.9575221238938053, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.7382732182583928, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 257654445.0, + "reward": 0.7365233898162842, + "reward_std": 0.07525734603404999, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9807277321815491, + "sampling/importance_sampling_ratio/min": 0.004093420226126909, + "sampling/sampling_logp_difference/max": 5.4983744621276855, + "sampling/sampling_logp_difference/mean": 0.09675689786672592, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 303.0, + "completions/max_terminated_length": 303.0, + "completions/mean_length": 175.2890625, + "completions/mean_terminated_length": 175.2890625, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.05315975029952824, + "epoch": 0.95929203539823, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5996381421609973, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 258145047.0, + "reward": 0.595507800579071, + "reward_std": 0.041397057473659515, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814416170120239, + "sampling/importance_sampling_ratio/min": 0.0011875813361257315, + "sampling/sampling_logp_difference/max": 6.735836505889893, + "sampling/sampling_logp_difference/mean": 0.09460874646902084, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 172.28515625, + "completions/mean_terminated_length": 172.28515625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.048761931248009205, + "epoch": 0.9610619469026549, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.6912616879192374, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 258579136.0, + "reward": 0.5621093511581421, + "reward_std": 0.08090877532958984, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9836956262588501, + "sampling/importance_sampling_ratio/min": 1.9459073428151896e-06, + "sampling/sampling_logp_difference/max": 13.149782180786133, + "sampling/sampling_logp_difference/mean": 0.08460815250873566, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 254.0, + "completions/max_terminated_length": 254.0, + "completions/mean_length": 173.69140625, + "completions/mean_terminated_length": 173.69140625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.05467568803578615, + "epoch": 0.9628318584070796, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8738373465420919, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 259090177.0, + "reward": 0.5361328125, + "reward_std": 0.0718044862151146, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.982956051826477, + "sampling/importance_sampling_ratio/min": 0.0024824803695082664, + "sampling/sampling_logp_difference/max": 5.998497009277344, + "sampling/sampling_logp_difference/mean": 0.08767026662826538, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 323.0, + "completions/max_terminated_length": 323.0, + "completions/mean_length": 165.88671875, + "completions/mean_terminated_length": 165.88671875, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.05155494483187795, + "epoch": 0.9646017699115044, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.7896884142075465, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 259411028.0, + "reward": 0.6585937142372131, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826200604438782, + "sampling/importance_sampling_ratio/min": 0.0041324361227452755, + "sampling/sampling_logp_difference/max": 5.488888263702393, + "sampling/sampling_logp_difference/mean": 0.08633853495121002, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 164.6953125, + "completions/mean_terminated_length": 164.6953125, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.051994505571201444, + "epoch": 0.9663716814159292, + "frac_reward_zero_std": 1.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 259858118.0, + "reward": 0.5249999761581421, + "reward_std": 0.0, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819843173027039, + "sampling/importance_sampling_ratio/min": 0.0008800112409517169, + "sampling/sampling_logp_difference/max": 7.035575866699219, + "sampling/sampling_logp_difference/mean": 0.09068815410137177, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 285.0, + "completions/max_terminated_length": 285.0, + "completions/mean_length": 171.08203125, + "completions/mean_terminated_length": 171.08203125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.050545030273497105, + "epoch": 0.968141592920354, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.5448697818832311, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 260393563.0, + "reward": 0.4359374940395355, + "reward_std": 0.030661117285490036, + "rewards/execution_accuracy_EX/mean": 0.40625, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798040390014648, + "sampling/importance_sampling_ratio/min": 0.0019341905135661364, + "sampling/sampling_logp_difference/max": 6.248066425323486, + "sampling/sampling_logp_difference/mean": 0.09756286442279816, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 249.0, + "completions/max_terminated_length": 249.0, + "completions/mean_length": 157.25390625, + "completions/mean_terminated_length": 157.25390625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.052860751980915666, + "epoch": 0.9699115044247788, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.7457001889811058, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 260947084.0, + "reward": 0.6697266101837158, + "reward_std": 0.07519236952066422, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800780415534973, + "sampling/importance_sampling_ratio/min": 0.0031861092429608107, + "sampling/sampling_logp_difference/max": 5.748954772949219, + "sampling/sampling_logp_difference/mean": 0.10188010334968567, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 255.0, + "completions/max_terminated_length": 255.0, + "completions/mean_length": 150.0390625, + "completions/mean_terminated_length": 150.0390625, + "completions/min_length": 87.0, + "completions/min_terminated_length": 87.0, + "entropy": 0.05352012813091278, + "epoch": 0.9716814159292035, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.3492184162003309, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 261414470.0, + "reward": 0.6919921636581421, + "reward_std": 0.023934828117489815, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9829031825065613, + "sampling/importance_sampling_ratio/min": 0.009969072416424751, + "sampling/sampling_logp_difference/max": 4.608267784118652, + "sampling/sampling_logp_difference/mean": 0.09117774665355682, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 246.0, + "completions/max_terminated_length": 246.0, + "completions/mean_length": 155.1171875, + "completions/mean_terminated_length": 155.1171875, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "entropy": 0.05170729919336736, + "epoch": 0.9734513274336283, + "frac_reward_zero_std": 0.75, + "grad_norm": 1.3065584756127775, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 261869492.0, + "reward": 0.62890625, + "reward_std": 0.1043233871459961, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9832204580307007, + "sampling/importance_sampling_ratio/min": 0.00417554285377264, + "sampling/sampling_logp_difference/max": 5.478510856628418, + "sampling/sampling_logp_difference/mean": 0.08806648850440979, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 282.0, + "completions/max_terminated_length": 282.0, + "completions/mean_length": 164.1484375, + "completions/mean_terminated_length": 164.1484375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.062490148935467005, + "epoch": 0.9752212389380531, + "frac_reward_zero_std": 0.6875, + "grad_norm": 1.4955446863354496, + "learning_rate": 1e-06, + "loss": 0.007, + "num_tokens": 262226042.0, + "reward": 0.5621093511581421, + "reward_std": 0.11319971084594727, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9793463945388794, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.10468199104070663, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 304.0, + "completions/max_terminated_length": 304.0, + "completions/mean_length": 155.7109375, + "completions/mean_terminated_length": 155.7109375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.056072956416755915, + "epoch": 0.9769911504424779, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.946122111598964, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 262709712.0, + "reward": 0.7179687023162842, + "reward_std": 0.026553306728601456, + "rewards/execution_accuracy_EX/mean": 0.703125, + "rewards/execution_accuracy_EX/std": 0.45777595043182373, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9797009229660034, + "sampling/importance_sampling_ratio/min": 0.004090499132871628, + "sampling/sampling_logp_difference/max": 5.499088287353516, + "sampling/sampling_logp_difference/mean": 0.10099714994430542, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 348.0, + "completions/max_terminated_length": 348.0, + "completions/mean_length": 163.41015625, + "completions/mean_terminated_length": 163.41015625, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.056844214210286736, + "epoch": 0.9787610619469026, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.889253296836571, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 263306409.0, + "reward": 0.5658203363418579, + "reward_std": 0.055404599756002426, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9828378558158875, + "sampling/importance_sampling_ratio/min": 0.0033242932986468077, + "sampling/sampling_logp_difference/max": 5.706498146057129, + "sampling/sampling_logp_difference/mean": 0.0983411967754364, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 283.0, + "completions/max_terminated_length": 283.0, + "completions/mean_length": 159.23046875, + "completions/mean_terminated_length": 159.23046875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.05631491937674582, + "epoch": 0.9805309734513274, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.2544378290389382, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 263603796.0, + "reward": 0.8070312738418579, + "reward_std": 0.0499679259955883, + "rewards/execution_accuracy_EX/mean": 0.796875, + "rewards/execution_accuracy_EX/std": 0.40311288833618164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9835492968559265, + "sampling/importance_sampling_ratio/min": 0.001442394102923572, + "sampling/sampling_logp_difference/max": 6.5414509773254395, + "sampling/sampling_logp_difference/mean": 0.08899315446615219, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 261.0, + "completions/max_terminated_length": 261.0, + "completions/mean_length": 169.78515625, + "completions/mean_terminated_length": 169.78515625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.05844833888113499, + "epoch": 0.9823008849557522, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.5146563491046112, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 264018349.0, + "reward": 0.4210937023162842, + "reward_std": 0.038778576999902725, + "rewards/execution_accuracy_EX/mean": 0.390625, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9787590503692627, + "sampling/importance_sampling_ratio/min": 0.0015058118151500821, + "sampling/sampling_logp_difference/max": 6.498423099517822, + "sampling/sampling_logp_difference/mean": 0.10254551470279694, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 172.40625, + "completions/mean_terminated_length": 172.40625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.05133802676573396, + "epoch": 0.984070796460177, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.0498848879693101, + "learning_rate": 1e-06, + "loss": 0.0086, + "num_tokens": 264443157.0, + "reward": 0.558398425579071, + "reward_std": 0.08078206330537796, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9822711944580078, + "sampling/importance_sampling_ratio/min": 0.004132171627134085, + "sampling/sampling_logp_difference/max": 5.488952159881592, + "sampling/sampling_logp_difference/mean": 0.08806522935628891, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 277.0, + "completions/max_terminated_length": 277.0, + "completions/mean_length": 152.20703125, + "completions/mean_terminated_length": 152.20703125, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "entropy": 0.055271646939218044, + "epoch": 0.9858407079646018, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.8322525142419908, + "learning_rate": 1e-06, + "loss": 0.0112, + "num_tokens": 265026458.0, + "reward": 0.6251952648162842, + "reward_std": 0.05811111629009247, + "rewards/execution_accuracy_EX/mean": 0.60546875, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9772956371307373, + "sampling/importance_sampling_ratio/min": 0.005265550222247839, + "sampling/sampling_logp_difference/max": 5.246569633483887, + "sampling/sampling_logp_difference/mean": 0.10262373834848404, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 173.953125, + "completions/mean_terminated_length": 173.953125, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.05820183688774705, + "epoch": 0.9876106194690265, + "frac_reward_zero_std": 0.8125, + "grad_norm": 1.8725625512806454, + "learning_rate": 1e-06, + "loss": 0.0148, + "num_tokens": 265447966.0, + "reward": 0.680859386920929, + "reward_std": 0.05905900523066521, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796777963638306, + "sampling/importance_sampling_ratio/min": 0.005275107454508543, + "sampling/sampling_logp_difference/max": 5.24475622177124, + "sampling/sampling_logp_difference/mean": 0.0990913063287735, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 280.0, + "completions/max_terminated_length": 280.0, + "completions/mean_length": 153.2421875, + "completions/mean_terminated_length": 153.2421875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.05820212885737419, + "epoch": 0.9893805309734514, + "frac_reward_zero_std": 0.875, + "grad_norm": 0.6727502812774927, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 265763596.0, + "reward": 0.8330078125, + "reward_std": 0.06010813266038895, + "rewards/execution_accuracy_EX/mean": 0.82421875, + "rewards/execution_accuracy_EX/std": 0.3813795745372772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9842797517776489, + "sampling/importance_sampling_ratio/min": 0.0011746090603992343, + "sampling/sampling_logp_difference/max": 6.746819972991943, + "sampling/sampling_logp_difference/mean": 0.08817964792251587, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 317.0, + "completions/max_terminated_length": 317.0, + "completions/mean_length": 186.328125, + "completions/mean_terminated_length": 186.328125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.06822470435872674, + "epoch": 0.9911504424778761, + "frac_reward_zero_std": 0.5625, + "grad_norm": 1.3424420761424296, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 266293536.0, + "reward": 0.5472656488418579, + "reward_std": 0.19300755858421326, + "rewards/execution_accuracy_EX/mean": 0.5234375, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800787568092346, + "sampling/importance_sampling_ratio/min": 0.00248520215973258, + "sampling/sampling_logp_difference/max": 5.997401237487793, + "sampling/sampling_logp_difference/mean": 0.10830190777778625, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 244.0, + "completions/max_terminated_length": 244.0, + "completions/mean_length": 144.22265625, + "completions/mean_terminated_length": 144.22265625, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.06672902987338603, + "epoch": 0.9929203539823008, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.44198925947237244, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 266530905.0, + "reward": 0.7662109136581421, + "reward_std": 0.014843749813735485, + "rewards/execution_accuracy_EX/mean": 0.75390625, + "rewards/execution_accuracy_EX/std": 0.43157756328582764, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9607163667678833, + "sampling/importance_sampling_ratio/mean": 0.9820749163627625, + "sampling/importance_sampling_ratio/min": 0.0003362954594194889, + "sampling/sampling_logp_difference/max": 7.997520446777344, + "sampling/sampling_logp_difference/mean": 0.09850668162107468, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 313.0, + "completions/max_terminated_length": 313.0, + "completions/mean_length": 165.62109375, + "completions/mean_terminated_length": 165.62109375, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.06595823029056191, + "epoch": 0.9946902654867257, + "frac_reward_zero_std": 0.8125, + "grad_norm": 0.849059295057431, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 266922376.0, + "reward": 0.5732421875, + "reward_std": 0.05811111629009247, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9761077165603638, + "sampling/importance_sampling_ratio/min": 0.0015476843109354377, + "sampling/sampling_logp_difference/max": 6.4709954261779785, + "sampling/sampling_logp_difference/mean": 0.11305363476276398, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 235.0, + "completions/max_terminated_length": 235.0, + "completions/mean_length": 155.05078125, + "completions/mean_terminated_length": 155.05078125, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.06390288984403014, + "epoch": 0.9964601769911504, + "frac_reward_zero_std": 0.75, + "grad_norm": 0.8978575867859608, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 267254933.0, + "reward": 0.5509765148162842, + "reward_std": 0.07421875, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9787781834602356, + "sampling/importance_sampling_ratio/min": 0.001508211949840188, + "sampling/sampling_logp_difference/max": 6.496830463409424, + "sampling/sampling_logp_difference/mean": 0.1093311756849289, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 174.3046875, + "completions/mean_terminated_length": 174.3046875, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.06668579182587564, + "epoch": 0.9982300884955753, + "frac_reward_zero_std": 0.625, + "grad_norm": 1.307785124920412, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 267812467.0, + "reward": 0.558398425579071, + "reward_std": 0.1498282551765442, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9778845906257629, + "sampling/importance_sampling_ratio/min": 0.0019405841594561934, + "sampling/sampling_logp_difference/max": 6.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.11229446530342102, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 408.0, + "completions/mean_length": 192.6484375, + "completions/mean_terminated_length": 161.91339111328125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.06363298557698727, + "epoch": 1.0, + "frac_reward_zero_std": 0.9375, + "grad_norm": 0.28230888146679, + "learning_rate": 1e-06, + "loss": -0.0321, + "num_tokens": 268163225.0, + "reward": 0.880859375, + "reward_std": 0.0010673906654119492, + "rewards/execution_accuracy_EX/mean": 0.875, + "rewards/execution_accuracy_EX/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814881086349487, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.10160448402166367, + "step": 565 + }, + { + "epoch": 1.0, + "step": 565, + "total_flos": 0.0, + "train_loss": 0.00011464951547538549, + "train_runtime": 5541.4616, + "train_samples_per_second": 1.633, + "train_steps_per_second": 0.102 + } + ], + "logging_steps": 1, + "max_steps": 565, + "num_input_tokens_seen": 268163225, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}