diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13543 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.15245064410397133, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 509.875, + "completions/mean_terminated_length": 509.875, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.0003049012882079427, + "grad_norm": 0.29205822944641113, + "kl": 0.0, + "learning_rate": 0.0, + "loss": 0.0179, + "num_tokens": 89984.0, + "reward": 0.008750000968575478, + "reward_std": 0.01237436942756176, + "rewards/format_reward/mean": 0.08749999850988388, + "rewards/format_reward/std": 0.28434914350509644, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 454.3500061035156, + "completions/mean_terminated_length": 454.3500061035156, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.0006098025764158854, + "grad_norm": 0.17669807374477386, + "kl": 0.0, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.0108, + "num_tokens": 173100.0, + "reward": 0.003750000149011612, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.03750000149011612, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1389.0, + "completions/max_terminated_length": 1389.0, + "completions/mean_length": 524.1000366210938, + "completions/mean_terminated_length": 524.1000366210938, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.000914703864623828, + "grad_norm": 0.16274994611740112, + "kl": 0.0002689361572265625, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0008, + "num_tokens": 264720.0, + "reward": 0.003750000149011612, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.03750000149011612, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1683.0, + "completions/max_terminated_length": 1683.0, + "completions/mean_length": 530.7125244140625, + "completions/mean_terminated_length": 530.7125244140625, + "completions/min_length": 52.0, + "completions/min_terminated_length": 52.0, + "epoch": 0.0012196051528317708, + "grad_norm": 0.0007452780846506357, + "kl": 0.0002593994140625, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.0, + "num_tokens": 361177.0, + "reward": 0.0, + "reward_std": 0.0, + "rewards/format_reward/mean": 0.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1767.0, + "completions/mean_length": 514.2750244140625, + "completions/mean_terminated_length": 494.86077880859375, + "completions/min_length": 48.0, + "completions/min_terminated_length": 48.0, + "epoch": 0.0015245064410397133, + "grad_norm": 0.1586974710226059, + "kl": 0.00028252601623535156, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0063, + "num_tokens": 447999.0, + "reward": 0.003750000149011612, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.03750000149011612, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1261.0, + "completions/mean_length": 532.2250366210938, + "completions/mean_terminated_length": 513.0379638671875, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.001829407729247656, + "grad_norm": 0.2026820033788681, + "kl": 0.0002999305725097656, + "learning_rate": 1.6666666666666667e-06, + "loss": -0.0177, + "num_tokens": 538403.0, + "reward": 0.0062500000931322575, + "reward_std": 0.008838835172355175, + "rewards/format_reward/mean": 0.0625, + "rewards/format_reward/std": 0.2435886710882187, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1397.0, + "completions/mean_length": 524.6875, + "completions/mean_terminated_length": 505.40509033203125, + "completions/min_length": 50.0, + "completions/min_terminated_length": 50.0, + "epoch": 0.0021343090174555986, + "grad_norm": 0.19141650199890137, + "kl": 0.00039958953857421875, + "learning_rate": 2.0000000000000003e-06, + "loss": -0.0069, + "num_tokens": 628748.0, + "reward": 0.003750000149011612, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.03750000149011612, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1454.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 480.625, + "completions/mean_terminated_length": 480.625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.0024392103056635416, + "grad_norm": 0.29890286922454834, + "kl": 0.0007905960083007812, + "learning_rate": 2.3333333333333336e-06, + "loss": -0.0039, + "num_tokens": 714662.0, + "reward": 0.010000000707805157, + "reward_std": 0.014142136089503765, + "rewards/format_reward/mean": 0.10000000149011612, + "rewards/format_reward/std": 0.3018927574157715, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1123.0, + "completions/mean_length": 520.4625244140625, + "completions/mean_terminated_length": 501.1265869140625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.002744111593871484, + "grad_norm": 0.36780887842178345, + "kl": 0.0013713836669921875, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0206, + "num_tokens": 807605.0, + "reward": 0.021250000223517418, + "reward_std": 0.0265165064483881, + "rewards/format_reward/mean": 0.21250000596046448, + "rewards/format_reward/std": 0.4116576611995697, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1675.0, + "completions/max_terminated_length": 1675.0, + "completions/mean_length": 538.6625366210938, + "completions/mean_terminated_length": 538.6625366210938, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.0030490128820794267, + "grad_norm": 0.4874690771102905, + "kl": 0.003643035888671875, + "learning_rate": 3e-06, + "loss": 0.0225, + "num_tokens": 897952.0, + "reward": 0.05625000223517418, + "reward_std": 0.04065864160656929, + "rewards/format_reward/mean": 0.5625, + "rewards/format_reward/std": 0.4992082417011261, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 517.8500366210938, + "completions/mean_terminated_length": 517.8500366210938, + "completions/min_length": 81.0, + "completions/min_terminated_length": 81.0, + "epoch": 0.0033539141702873696, + "grad_norm": 0.49236273765563965, + "kl": 0.00574493408203125, + "learning_rate": 3.3333333333333333e-06, + "loss": -0.004, + "num_tokens": 989026.0, + "reward": 0.06625000387430191, + "reward_std": 0.03712311014533043, + "rewards/format_reward/mean": 0.6625000238418579, + "rewards/format_reward/std": 0.47584035992622375, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 464.4875183105469, + "completions/mean_terminated_length": 464.4875183105469, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.003658815458495312, + "grad_norm": 0.3598475754261017, + "kl": 0.01023101806640625, + "learning_rate": 3.6666666666666666e-06, + "loss": -0.0077, + "num_tokens": 1075109.0, + "reward": 0.08749999850988388, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.33280548453330994, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1385.0, + "completions/max_terminated_length": 1385.0, + "completions/mean_length": 446.8000183105469, + "completions/mean_terminated_length": 446.8000183105469, + "completions/min_length": 24.0, + "completions/min_terminated_length": 24.0, + "epoch": 0.003963716746703255, + "grad_norm": 0.37172672152519226, + "kl": 0.013092041015625, + "learning_rate": 4.000000000000001e-06, + "loss": -0.013, + "num_tokens": 1160069.0, + "reward": 0.08125000447034836, + "reward_std": 0.01944543793797493, + "rewards/format_reward/mean": 0.8125, + "rewards/format_reward/std": 0.39277493953704834, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1425.0, + "completions/max_terminated_length": 1425.0, + "completions/mean_length": 507.1750183105469, + "completions/mean_terminated_length": 507.1750183105469, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.004268618034911197, + "grad_norm": 0.1720256209373474, + "kl": 0.015472412109375, + "learning_rate": 4.333333333333334e-06, + "loss": 0.0009, + "num_tokens": 1252237.0, + "reward": 0.0937500074505806, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.2435886710882187, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1569.0, + "completions/max_terminated_length": 1569.0, + "completions/mean_length": 400.88751220703125, + "completions/mean_terminated_length": 400.88751220703125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.00457351932311914, + "grad_norm": 0.18945425748825073, + "kl": 0.02191162109375, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0034, + "num_tokens": 1330022.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1017.0, + "completions/max_terminated_length": 1017.0, + "completions/mean_length": 378.8500061035156, + "completions/mean_terminated_length": 378.8500061035156, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.004878420611327083, + "grad_norm": 0.10855857282876968, + "kl": 0.026611328125, + "learning_rate": 5e-06, + "loss": -0.0117, + "num_tokens": 1410306.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1170.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 407.0, + "completions/mean_terminated_length": 407.0, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.005183321899535026, + "grad_norm": 0.1446218341588974, + "kl": 0.036224365234375, + "learning_rate": 4.999952797253148e-06, + "loss": 0.0001, + "num_tokens": 1486402.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 369.8625183105469, + "completions/mean_terminated_length": 348.6202697753906, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.005488223187742968, + "grad_norm": 0.21489065885543823, + "kl": 0.03070068359375, + "learning_rate": 4.9998111909931225e-06, + "loss": -0.004, + "num_tokens": 1560439.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1078.0, + "completions/max_terminated_length": 1078.0, + "completions/mean_length": 372.3374938964844, + "completions/mean_terminated_length": 372.3374938964844, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.005793124475950911, + "grad_norm": 0.04406031593680382, + "kl": 0.0367431640625, + "learning_rate": 4.999575187161439e-06, + "loss": 0.0004, + "num_tokens": 1634478.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1460.0, + "completions/max_terminated_length": 1460.0, + "completions/mean_length": 385.125, + "completions/mean_terminated_length": 385.125, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.006098025764158853, + "grad_norm": 0.018597273156046867, + "kl": 0.02655029296875, + "learning_rate": 4.9992447956603455e-06, + "loss": 0.0003, + "num_tokens": 1715820.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025000000000000022, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1865.0, + "completions/mean_length": 392.8500061035156, + "completions/mean_terminated_length": 350.4102478027344, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.006402927052366796, + "grad_norm": 0.19325323402881622, + "kl": 0.029815673828125, + "learning_rate": 4.998820030352409e-06, + "loss": 0.0281, + "num_tokens": 1795344.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 334.625, + "completions/mean_terminated_length": 334.625, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.006707828340574739, + "grad_norm": 0.13783958554267883, + "kl": 0.031219482421875, + "learning_rate": 4.998300909059929e-06, + "loss": -0.0075, + "num_tokens": 1870718.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1610.0, + "completions/max_terminated_length": 1610.0, + "completions/mean_length": 328.76251220703125, + "completions/mean_terminated_length": 328.76251220703125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.007012729628782682, + "grad_norm": 0.0586252324283123, + "kl": 0.0343017578125, + "learning_rate": 4.997687453564198e-06, + "loss": 0.0003, + "num_tokens": 1945877.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1775.0, + "completions/max_terminated_length": 1775.0, + "completions/mean_length": 403.4875183105469, + "completions/mean_terminated_length": 403.4875183105469, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.007317630916990624, + "grad_norm": 0.032569218426942825, + "kl": 0.024200439453125, + "learning_rate": 4.9969796896045775e-06, + "loss": 0.0002, + "num_tokens": 2028546.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.03749999999999998, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 424.1000061035156, + "completions/mean_terminated_length": 360.8311767578125, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.007622532205198567, + "grad_norm": 0.11784270405769348, + "kl": 0.022308349609375, + "learning_rate": 4.996177646877426e-06, + "loss": 0.0503, + "num_tokens": 2108420.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 764.0, + "completions/max_terminated_length": 764.0, + "completions/mean_length": 363.6125183105469, + "completions/mean_terminated_length": 363.6125183105469, + "completions/min_length": 57.0, + "completions/min_terminated_length": 57.0, + "epoch": 0.00792743349340651, + "grad_norm": 0.029199976474046707, + "kl": 0.022918701171875, + "learning_rate": 4.995281359034851e-06, + "loss": 0.0002, + "num_tokens": 2184783.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 361.01251220703125, + "completions/mean_terminated_length": 361.01251220703125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.008232334781614453, + "grad_norm": 0.027427662163972855, + "kl": 0.024810791015625, + "learning_rate": 4.994290863683296e-06, + "loss": 0.0002, + "num_tokens": 2260382.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2038.0, + "completions/max_terminated_length": 2038.0, + "completions/mean_length": 450.63751220703125, + "completions/mean_terminated_length": 450.63751220703125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.008537236069822395, + "grad_norm": 0.011162430979311466, + "kl": 0.0190887451171875, + "learning_rate": 4.99320620238196e-06, + "loss": 0.0002, + "num_tokens": 2345657.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1869.0, + "completions/mean_length": 497.01251220703125, + "completions/mean_terminated_length": 477.3797607421875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.008842137358030338, + "grad_norm": 0.09861345589160919, + "kl": 0.01690673828125, + "learning_rate": 4.99202742064106e-06, + "loss": 0.0143, + "num_tokens": 2432992.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 524.5625, + "completions/mean_terminated_length": 505.27850341796875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.00914703864623828, + "grad_norm": 0.08128828555345535, + "kl": 0.017486572265625, + "learning_rate": 4.990754567919917e-06, + "loss": 0.0157, + "num_tokens": 2526345.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 462.7749938964844, + "completions/mean_terminated_length": 442.7088623046875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.009451939934446223, + "grad_norm": 0.16708894073963165, + "kl": 0.01788330078125, + "learning_rate": 4.989387697624881e-06, + "loss": 0.021, + "num_tokens": 2608671.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 542.2000122070312, + "completions/mean_terminated_length": 523.1392822265625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "epoch": 0.009756841222654166, + "grad_norm": 0.05789264664053917, + "kl": 0.0167236328125, + "learning_rate": 4.987926867107095e-06, + "loss": 0.0154, + "num_tokens": 2702183.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1897.0, + "completions/max_terminated_length": 1897.0, + "completions/mean_length": 499.20001220703125, + "completions/mean_terminated_length": 499.20001220703125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.010061742510862108, + "grad_norm": 0.009858060628175735, + "kl": 0.018798828125, + "learning_rate": 4.986372137660078e-06, + "loss": 0.0002, + "num_tokens": 2787461.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025000000000000022, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 614.0375366210938, + "completions/mean_terminated_length": 577.2692260742188, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.010366643799070051, + "grad_norm": 0.10538285225629807, + "kl": 0.0189208984375, + "learning_rate": 4.984723574517165e-06, + "loss": 0.0309, + "num_tokens": 2887760.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1250.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 528.1625366210938, + "completions/mean_terminated_length": 528.1625366210938, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.010671545087277993, + "grad_norm": 0.1755860298871994, + "kl": 0.019439697265625, + "learning_rate": 4.9829812468487655e-06, + "loss": 0.0171, + "num_tokens": 2981059.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1143.0, + "completions/mean_length": 573.625, + "completions/mean_terminated_length": 554.9620361328125, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "epoch": 0.010976446375485937, + "grad_norm": 0.10435257107019424, + "kl": 0.018310546875, + "learning_rate": 4.981145227759457e-06, + "loss": 0.0148, + "num_tokens": 3077261.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1156.0, + "completions/max_terminated_length": 1156.0, + "completions/mean_length": 525.2625122070312, + "completions/mean_terminated_length": 525.2625122070312, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.01128134766369388, + "grad_norm": 0.11834096163511276, + "kl": 0.01971435546875, + "learning_rate": 4.979215594284924e-06, + "loss": -0.0022, + "num_tokens": 3168516.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1372.0, + "completions/max_terminated_length": 1372.0, + "completions/mean_length": 530.125, + "completions/mean_terminated_length": 530.125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.011586248951901822, + "grad_norm": 0.1209435909986496, + "kl": 0.0193939208984375, + "learning_rate": 4.977192427388722e-06, + "loss": 0.0082, + "num_tokens": 3260722.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1229.0, + "completions/mean_length": 550.6124877929688, + "completions/mean_terminated_length": 531.6582641601562, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.011891150240109765, + "grad_norm": 0.127229243516922, + "kl": 0.019744873046875, + "learning_rate": 4.9750758119588824e-06, + "loss": 0.0111, + "num_tokens": 3350507.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1225.0, + "completions/max_terminated_length": 1225.0, + "completions/mean_length": 476.4125061035156, + "completions/mean_terminated_length": 476.4125061035156, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.012196051528317707, + "grad_norm": 0.00884264800697565, + "kl": 0.01934814453125, + "learning_rate": 4.972865836804349e-06, + "loss": 0.0002, + "num_tokens": 3435038.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2025.0, + "completions/max_terminated_length": 2025.0, + "completions/mean_length": 523.1749877929688, + "completions/mean_terminated_length": 523.1749877929688, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.01250095281652565, + "grad_norm": 0.05174906179308891, + "kl": 0.018310546875, + "learning_rate": 4.970562594651254e-06, + "loss": 0.0024, + "num_tokens": 3527120.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1158.0, + "completions/max_terminated_length": 1158.0, + "completions/mean_length": 429.125, + "completions/mean_terminated_length": 429.125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.012805854104733592, + "grad_norm": 0.008613626472651958, + "kl": 0.02117919921875, + "learning_rate": 4.968166182139026e-06, + "loss": 0.0002, + "num_tokens": 3606646.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 415.57501220703125, + "completions/mean_terminated_length": 415.57501220703125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.013110755392941535, + "grad_norm": 0.15895198285579681, + "kl": 0.021728515625, + "learning_rate": 4.9656766998163306e-06, + "loss": -0.0001, + "num_tokens": 3686402.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1111.0, + "completions/max_terminated_length": 1111.0, + "completions/mean_length": 457.8374938964844, + "completions/mean_terminated_length": 457.8374938964844, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.013415656681149479, + "grad_norm": 0.0086384741589427, + "kl": 0.019500732421875, + "learning_rate": 4.963094252136865e-06, + "loss": 0.0002, + "num_tokens": 3771961.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 943.0, + "completions/max_terminated_length": 943.0, + "completions/mean_length": 392.7375183105469, + "completions/mean_terminated_length": 392.7375183105469, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.01372055796935742, + "grad_norm": 0.00958193838596344, + "kl": 0.022064208984375, + "learning_rate": 4.960418947454958e-06, + "loss": 0.0002, + "num_tokens": 3850288.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1287.0, + "completions/max_terminated_length": 1287.0, + "completions/mean_length": 470.5249938964844, + "completions/mean_terminated_length": 470.5249938964844, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.014025459257565364, + "grad_norm": 0.008218112401664257, + "kl": 0.018829345703125, + "learning_rate": 4.957650898021038e-06, + "loss": 0.0002, + "num_tokens": 3937080.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1030.0, + "completions/mean_length": 383.45001220703125, + "completions/mean_terminated_length": 362.3797607421875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.014330360545773305, + "grad_norm": 0.08822523057460785, + "kl": 0.021636962890625, + "learning_rate": 4.954790219976915e-06, + "loss": 0.0225, + "num_tokens": 4014978.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 438.5625, + "completions/mean_terminated_length": 438.5625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.014635261833981249, + "grad_norm": 0.1345287412405014, + "kl": 0.0201416015625, + "learning_rate": 4.95183703335091e-06, + "loss": -0.0076, + "num_tokens": 4095191.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1120.0, + "completions/max_terminated_length": 1120.0, + "completions/mean_length": 431.4125061035156, + "completions/mean_terminated_length": 431.4125061035156, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.01494016312218919, + "grad_norm": 0.008926213718950748, + "kl": 0.019744873046875, + "learning_rate": 4.948791462052819e-06, + "loss": 0.0002, + "num_tokens": 4175032.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1367.0, + "completions/mean_length": 470.625, + "completions/mean_terminated_length": 450.6582336425781, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.015245064410397134, + "grad_norm": 0.08256660401821136, + "kl": 0.0185546875, + "learning_rate": 4.945653633868716e-06, + "loss": 0.0172, + "num_tokens": 4258442.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1280.0, + "completions/max_terminated_length": 1280.0, + "completions/mean_length": 416.125, + "completions/mean_terminated_length": 416.125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.015549965698605077, + "grad_norm": 0.03866208344697952, + "kl": 0.021453857421875, + "learning_rate": 4.942423680455584e-06, + "loss": 0.0002, + "num_tokens": 4338172.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 421.0874938964844, + "completions/mean_terminated_length": 421.0874938964844, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.01585486698681302, + "grad_norm": 0.008591040968894958, + "kl": 0.021087646484375, + "learning_rate": 4.939101737335802e-06, + "loss": 0.0002, + "num_tokens": 4421777.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1862.0, + "completions/max_terminated_length": 1862.0, + "completions/mean_length": 466.82501220703125, + "completions/mean_terminated_length": 466.82501220703125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.016159768275020962, + "grad_norm": 0.009130142629146576, + "kl": 0.02008056640625, + "learning_rate": 4.935687943891447e-06, + "loss": 0.0002, + "num_tokens": 4510599.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1244.0, + "completions/max_terminated_length": 1244.0, + "completions/mean_length": 418.9375, + "completions/mean_terminated_length": 418.9375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.016464669563228906, + "grad_norm": 0.007702388800680637, + "kl": 0.01898193359375, + "learning_rate": 4.932182443358458e-06, + "loss": 0.0002, + "num_tokens": 4589492.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1052.0, + "completions/max_terminated_length": 1052.0, + "completions/mean_length": 425.9375, + "completions/mean_terminated_length": 425.9375, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.016769570851436846, + "grad_norm": 0.4790143668651581, + "kl": 0.12841796875, + "learning_rate": 4.928585382820616e-06, + "loss": 0.0013, + "num_tokens": 4673435.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1364.0, + "completions/max_terminated_length": 1364.0, + "completions/mean_length": 433.2875061035156, + "completions/mean_terminated_length": 433.2875061035156, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "epoch": 0.01707447213964479, + "grad_norm": 0.009019332937896252, + "kl": 0.021881103515625, + "learning_rate": 4.924896913203376e-06, + "loss": 0.0002, + "num_tokens": 4752248.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1579.0, + "completions/max_terminated_length": 1579.0, + "completions/mean_length": 468.7124938964844, + "completions/mean_terminated_length": 468.7124938964844, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.017379373427852732, + "grad_norm": 0.010964120738208294, + "kl": 0.02215576171875, + "learning_rate": 4.921117189267535e-06, + "loss": 0.0002, + "num_tokens": 4833935.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 458.9750061035156, + "completions/mean_terminated_length": 458.9750061035156, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.017684274716060676, + "grad_norm": 0.07652813196182251, + "kl": 0.026702880859375, + "learning_rate": 4.917246369602742e-06, + "loss": 0.0062, + "num_tokens": 4917547.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1608.0, + "completions/max_terminated_length": 1608.0, + "completions/mean_length": 455.3125, + "completions/mean_terminated_length": 455.3125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.01798917600426862, + "grad_norm": 0.1456306278705597, + "kl": 0.03131103515625, + "learning_rate": 4.9132846166208355e-06, + "loss": -0.0105, + "num_tokens": 5001028.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1673.0, + "completions/max_terminated_length": 1673.0, + "completions/mean_length": 439.875, + "completions/mean_terminated_length": 439.875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.01829407729247656, + "grad_norm": 0.013068665750324726, + "kl": 0.0283203125, + "learning_rate": 4.9092320965490365e-06, + "loss": 0.0003, + "num_tokens": 5082706.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1421.0, + "completions/max_terminated_length": 1421.0, + "completions/mean_length": 432.8625183105469, + "completions/mean_terminated_length": 432.8625183105469, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.018598978580684503, + "grad_norm": 0.1498623639345169, + "kl": 0.03411865234375, + "learning_rate": 4.905088979422971e-06, + "loss": -0.0192, + "num_tokens": 5167075.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 478.20001220703125, + "completions/mean_terminated_length": 478.20001220703125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.018903879868892446, + "grad_norm": 0.15401065349578857, + "kl": 0.03167724609375, + "learning_rate": 4.900855439079536e-06, + "loss": 0.0069, + "num_tokens": 5255569.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1416.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 468.125, + "completions/mean_terminated_length": 468.125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.01920878115710039, + "grad_norm": 0.05759025737643242, + "kl": 0.029327392578125, + "learning_rate": 4.8965316531496055e-06, + "loss": -0.0007, + "num_tokens": 5340617.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1037.0, + "completions/max_terminated_length": 1037.0, + "completions/mean_length": 426.7875061035156, + "completions/mean_terminated_length": 426.7875061035156, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.019513682445308333, + "grad_norm": 0.2939658761024475, + "kl": 0.03350830078125, + "learning_rate": 4.892117803050578e-06, + "loss": 0.022, + "num_tokens": 5423220.0, + "reward": 0.0925000011920929, + "reward_std": 0.01060660183429718, + "rewards/format_reward/mean": 0.925000011920929, + "rewards/format_reward/std": 0.2650531232357025, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1884.0, + "completions/max_terminated_length": 1884.0, + "completions/mean_length": 502.9875183105469, + "completions/mean_terminated_length": 502.9875183105469, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.019818583733516273, + "grad_norm": 0.15384595096111298, + "kl": 0.0291748046875, + "learning_rate": 4.887614073978761e-06, + "loss": 0.0003, + "num_tokens": 5513533.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 496.1499938964844, + "completions/mean_terminated_length": 476.50634765625, + "completions/min_length": 45.0, + "completions/min_terminated_length": 45.0, + "epoch": 0.020123485021724216, + "grad_norm": 0.13877885043621063, + "kl": 0.0316162109375, + "learning_rate": 4.883020654901609e-06, + "loss": 0.0111, + "num_tokens": 5601189.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1679.0, + "completions/max_terminated_length": 1679.0, + "completions/mean_length": 439.95001220703125, + "completions/mean_terminated_length": 439.95001220703125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.02042838630993216, + "grad_norm": 0.14163745939731598, + "kl": 0.0322265625, + "learning_rate": 4.878337738549785e-06, + "loss": 0.0032, + "num_tokens": 5684995.0, + "reward": 0.0925000011920929, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.925000011920929, + "rewards/format_reward/std": 0.2650531232357025, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 409.3625183105469, + "completions/mean_terminated_length": 409.3625183105469, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.020733287598140103, + "grad_norm": 0.23476682603359222, + "kl": 0.034271240234375, + "learning_rate": 4.873565521409082e-06, + "loss": 0.0071, + "num_tokens": 5763802.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1514.0, + "completions/mean_length": 387.625, + "completions/mean_terminated_length": 366.60760498046875, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.021038188886348046, + "grad_norm": 0.15044650435447693, + "kl": 0.037353515625, + "learning_rate": 4.868704203712173e-06, + "loss": 0.0222, + "num_tokens": 5840026.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1254.0, + "completions/max_terminated_length": 1254.0, + "completions/mean_length": 370.6625061035156, + "completions/mean_terminated_length": 370.6625061035156, + "completions/min_length": 42.0, + "completions/min_terminated_length": 42.0, + "epoch": 0.021343090174555986, + "grad_norm": 0.012564298696815968, + "kl": 0.03436279296875, + "learning_rate": 4.86375398943021e-06, + "loss": 0.0003, + "num_tokens": 5920369.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1343.0, + "completions/max_terminated_length": 1343.0, + "completions/mean_length": 390.3625183105469, + "completions/mean_terminated_length": 390.3625183105469, + "completions/min_length": 39.0, + "completions/min_terminated_length": 39.0, + "epoch": 0.02164799146276393, + "grad_norm": 0.0150454081594944, + "kl": 0.0364990234375, + "learning_rate": 4.858715086264274e-06, + "loss": 0.0004, + "num_tokens": 6000374.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1405.0, + "completions/max_terminated_length": 1405.0, + "completions/mean_length": 405.95001220703125, + "completions/mean_terminated_length": 405.95001220703125, + "completions/min_length": 46.0, + "completions/min_terminated_length": 46.0, + "epoch": 0.021952892750971873, + "grad_norm": 0.17304885387420654, + "kl": 0.04144287109375, + "learning_rate": 4.853587705636646e-06, + "loss": 0.0302, + "num_tokens": 6079606.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 817.0, + "completions/max_terminated_length": 817.0, + "completions/mean_length": 375.45001220703125, + "completions/mean_terminated_length": 375.45001220703125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.022257794039179817, + "grad_norm": 0.2335526943206787, + "kl": 0.0367431640625, + "learning_rate": 4.84837206268195e-06, + "loss": -0.0071, + "num_tokens": 6155698.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1201.0, + "completions/max_terminated_length": 1201.0, + "completions/mean_length": 385.4875183105469, + "completions/mean_terminated_length": 385.4875183105469, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.02256269532738776, + "grad_norm": 0.17186102271080017, + "kl": 0.0338134765625, + "learning_rate": 4.8430683762381195e-06, + "loss": 0.0066, + "num_tokens": 6234269.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1636.0, + "completions/mean_length": 410.1000061035156, + "completions/mean_terminated_length": 389.3670959472656, + "completions/min_length": 47.0, + "completions/min_terminated_length": 47.0, + "epoch": 0.0228675966155957, + "grad_norm": 0.05610320344567299, + "kl": 0.035552978515625, + "learning_rate": 4.837676868837213e-06, + "loss": 0.0143, + "num_tokens": 6314337.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 946.0, + "completions/max_terminated_length": 946.0, + "completions/mean_length": 370.7250061035156, + "completions/mean_terminated_length": 370.7250061035156, + "completions/min_length": 74.0, + "completions/min_terminated_length": 74.0, + "epoch": 0.023172497903803643, + "grad_norm": 0.17599275708198547, + "kl": 0.03692626953125, + "learning_rate": 4.832197766696085e-06, + "loss": 0.013, + "num_tokens": 6387825.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1283.0, + "completions/mean_length": 457.0625, + "completions/mean_terminated_length": 436.924072265625, + "completions/min_length": 63.0, + "completions/min_terminated_length": 63.0, + "epoch": 0.023477399192011587, + "grad_norm": 0.07211699336767197, + "kl": 0.03021240234375, + "learning_rate": 4.826631299706887e-06, + "loss": 0.0221, + "num_tokens": 6476254.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025000000000000022, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1299.0, + "completions/mean_length": 454.0500183105469, + "completions/mean_terminated_length": 413.17950439453125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.02378230048021953, + "grad_norm": 0.13199199736118317, + "kl": 0.028106689453125, + "learning_rate": 4.820977701427424e-06, + "loss": 0.0396, + "num_tokens": 6559896.0, + "reward": 0.0949999988079071, + "reward_std": 0.0070710680447518826, + "rewards/format_reward/mean": 0.949999988079071, + "rewards/format_reward/std": 0.21931999921798706, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1246.0, + "completions/max_terminated_length": 1246.0, + "completions/mean_length": 467.07501220703125, + "completions/mean_terminated_length": 467.07501220703125, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.02408720176842747, + "grad_norm": 0.1922425776720047, + "kl": 0.04315185546875, + "learning_rate": 4.81523720907136e-06, + "loss": 0.0114, + "num_tokens": 6644664.0, + "reward": 0.0962500050663948, + "reward_std": 0.00530330091714859, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 915.0, + "completions/max_terminated_length": 915.0, + "completions/mean_length": 427.0500183105469, + "completions/mean_terminated_length": 427.0500183105469, + "completions/min_length": 70.0, + "completions/min_terminated_length": 70.0, + "epoch": 0.024392103056635413, + "grad_norm": 0.10455196350812912, + "kl": 0.036407470703125, + "learning_rate": 4.809410063498254e-06, + "loss": 0.0061, + "num_tokens": 6728988.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1238.0, + "completions/max_terminated_length": 1238.0, + "completions/mean_length": 461.5500183105469, + "completions/mean_terminated_length": 461.5500183105469, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.024697004344843357, + "grad_norm": 0.09028248488903046, + "kl": 0.032958984375, + "learning_rate": 4.8034965092034656e-06, + "loss": -0.0011, + "num_tokens": 6817160.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1799.0, + "completions/max_terminated_length": 1799.0, + "completions/mean_length": 499.2250061035156, + "completions/mean_terminated_length": 499.2250061035156, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.0250019056330513, + "grad_norm": 0.010679539293050766, + "kl": 0.0279541015625, + "learning_rate": 4.797496794307889e-06, + "loss": 0.0003, + "num_tokens": 6910458.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1101.0, + "completions/max_terminated_length": 1101.0, + "completions/mean_length": 460.26251220703125, + "completions/mean_terminated_length": 460.26251220703125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.025306806921259244, + "grad_norm": 0.009925225749611855, + "kl": 0.027679443359375, + "learning_rate": 4.791411170547545e-06, + "loss": 0.0003, + "num_tokens": 6997335.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1793.0, + "completions/max_terminated_length": 1793.0, + "completions/mean_length": 476.9375, + "completions/mean_terminated_length": 476.9375, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.025611708209467184, + "grad_norm": 0.14034686982631683, + "kl": 0.0299072265625, + "learning_rate": 4.785239893263017e-06, + "loss": 0.0066, + "num_tokens": 7084896.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1289.0, + "completions/max_terminated_length": 1289.0, + "completions/mean_length": 459.01251220703125, + "completions/mean_terminated_length": 459.01251220703125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.025916609497675127, + "grad_norm": 0.18238796293735504, + "kl": 0.03082275390625, + "learning_rate": 4.778983221388742e-06, + "loss": 0.0036, + "num_tokens": 7169579.0, + "reward": 0.0949999988079071, + "reward_std": 0.0070710680447518826, + "rewards/format_reward/mean": 0.949999988079071, + "rewards/format_reward/std": 0.21931999921798706, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 922.0, + "completions/max_terminated_length": 922.0, + "completions/mean_length": 425.125, + "completions/mean_terminated_length": 425.125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.02622151078588307, + "grad_norm": 0.12785503268241882, + "kl": 0.03192138671875, + "learning_rate": 4.77264141744214e-06, + "loss": -0.0056, + "num_tokens": 7255699.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 988.0, + "completions/mean_length": 474.32501220703125, + "completions/mean_terminated_length": 454.4050598144531, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "epoch": 0.026526412074091014, + "grad_norm": 0.09049192816019058, + "kl": 0.029388427734375, + "learning_rate": 4.766214747512603e-06, + "loss": 0.017, + "num_tokens": 7338231.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 874.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 412.7250061035156, + "completions/mean_terminated_length": 412.7250061035156, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.026831313362298957, + "grad_norm": 0.012042547576129436, + "kl": 0.031829833984375, + "learning_rate": 4.759703481250331e-06, + "loss": 0.0003, + "num_tokens": 7421839.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 975.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 380.7124938964844, + "completions/mean_terminated_length": 380.7124938964844, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.027136214650506897, + "grad_norm": 0.012744505889713764, + "kl": 0.031402587890625, + "learning_rate": 4.753107891855015e-06, + "loss": 0.0003, + "num_tokens": 7496678.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1102.0, + "completions/max_terminated_length": 1102.0, + "completions/mean_length": 421.6499938964844, + "completions/mean_terminated_length": 421.6499938964844, + "completions/min_length": 41.0, + "completions/min_terminated_length": 41.0, + "epoch": 0.02744111593871484, + "grad_norm": 0.09909161925315857, + "kl": 0.031219482421875, + "learning_rate": 4.746428256064375e-06, + "loss": 0.0088, + "num_tokens": 7579184.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 992.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 365.3500061035156, + "completions/mean_terminated_length": 365.3500061035156, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.027746017226922784, + "grad_norm": 0.011725598014891148, + "kl": 0.03399658203125, + "learning_rate": 4.7396648541425534e-06, + "loss": 0.0003, + "num_tokens": 7658790.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 655.0, + "completions/max_terminated_length": 655.0, + "completions/mean_length": 378.8125, + "completions/mean_terminated_length": 378.8125, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.028050918515130727, + "grad_norm": 0.01128524262458086, + "kl": 0.031982421875, + "learning_rate": 4.732817969868348e-06, + "loss": 0.0003, + "num_tokens": 7736683.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1329.0, + "completions/max_terminated_length": 1329.0, + "completions/mean_length": 430.6875, + "completions/mean_terminated_length": 430.6875, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.02835581980333867, + "grad_norm": 0.01020871289074421, + "kl": 0.029205322265625, + "learning_rate": 4.7258878905233095e-06, + "loss": 0.0003, + "num_tokens": 7820178.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 434.7749938964844, + "completions/mean_terminated_length": 434.7749938964844, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.02866072109154661, + "grad_norm": 0.13812255859375, + "kl": 0.030548095703125, + "learning_rate": 4.718874906879688e-06, + "loss": 0.0087, + "num_tokens": 7906574.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 392.76251220703125, + "completions/mean_terminated_length": 392.76251220703125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.028965622379754554, + "grad_norm": 0.05963268131017685, + "kl": 0.03778076171875, + "learning_rate": 4.711779313188231e-06, + "loss": 0.0004, + "num_tokens": 7986005.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1148.0, + "completions/max_terminated_length": 1148.0, + "completions/mean_length": 402.0500183105469, + "completions/mean_terminated_length": 402.0500183105469, + "completions/min_length": 61.0, + "completions/min_terminated_length": 61.0, + "epoch": 0.029270523667962497, + "grad_norm": 0.011644248850643635, + "kl": 0.033355712890625, + "learning_rate": 4.70460140716584e-06, + "loss": 0.0003, + "num_tokens": 8071719.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1219.0, + "completions/max_terminated_length": 1219.0, + "completions/mean_length": 417.5625, + "completions/mean_terminated_length": 417.5625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.02957542495617044, + "grad_norm": 0.01004591304808855, + "kl": 0.0301513671875, + "learning_rate": 4.697341489983076e-06, + "loss": 0.0003, + "num_tokens": 8156948.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1250.0, + "completions/mean_length": 436.9250183105469, + "completions/mean_terminated_length": 416.5316467285156, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.02988032624437838, + "grad_norm": 0.09471645951271057, + "kl": 0.027435302734375, + "learning_rate": 4.6899998662515215e-06, + "loss": 0.0162, + "num_tokens": 8240926.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1559.0, + "completions/max_terminated_length": 1559.0, + "completions/mean_length": 369.82501220703125, + "completions/mean_terminated_length": 369.82501220703125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.030185227532586324, + "grad_norm": 0.010060581378638744, + "kl": 0.029296875, + "learning_rate": 4.682576844011007e-06, + "loss": 0.0003, + "num_tokens": 8318122.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 847.0, + "completions/max_terminated_length": 847.0, + "completions/mean_length": 439.9125061035156, + "completions/mean_terminated_length": 439.9125061035156, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "epoch": 0.030490128820794268, + "grad_norm": 0.013250144198536873, + "kl": 0.02764892578125, + "learning_rate": 4.675072734716678e-06, + "loss": 0.0003, + "num_tokens": 8399933.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1053.0, + "completions/max_terminated_length": 1053.0, + "completions/mean_length": 440.5375061035156, + "completions/mean_terminated_length": 440.5375061035156, + "completions/min_length": 49.0, + "completions/min_terminated_length": 49.0, + "epoch": 0.03079503010900221, + "grad_norm": 0.0173508208245039, + "kl": 0.024932861328125, + "learning_rate": 4.667487853225931e-06, + "loss": 0.0002, + "num_tokens": 8486574.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1100.0, + "completions/max_terminated_length": 1100.0, + "completions/mean_length": 436.01251220703125, + "completions/mean_terminated_length": 436.01251220703125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.031099931397210154, + "grad_norm": 0.010417678393423557, + "kl": 0.028106689453125, + "learning_rate": 4.659822517785203e-06, + "loss": 0.0003, + "num_tokens": 8567453.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 445.5249938964844, + "completions/mean_terminated_length": 445.5249938964844, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.0314048326854181, + "grad_norm": 0.00837908685207367, + "kl": 0.0263671875, + "learning_rate": 4.6520770500166165e-06, + "loss": 0.0003, + "num_tokens": 8653671.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 405.76251220703125, + "completions/mean_terminated_length": 405.76251220703125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.03170973397362604, + "grad_norm": 0.008642381988465786, + "kl": 0.026397705078125, + "learning_rate": 4.644251774904487e-06, + "loss": 0.0003, + "num_tokens": 8733950.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1472.0, + "completions/max_terminated_length": 1472.0, + "completions/mean_length": 447.6750183105469, + "completions/mean_terminated_length": 447.6750183105469, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.032014635261833985, + "grad_norm": 0.010713264346122742, + "kl": 0.026611328125, + "learning_rate": 4.636347020781684e-06, + "loss": 0.0003, + "num_tokens": 8816570.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 819.0, + "completions/max_terminated_length": 819.0, + "completions/mean_length": 436.88751220703125, + "completions/mean_terminated_length": 436.88751220703125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.032319536550041925, + "grad_norm": 0.0077039930038154125, + "kl": 0.02520751953125, + "learning_rate": 4.6283631193158605e-06, + "loss": 0.0003, + "num_tokens": 8898097.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1872.0, + "completions/max_terminated_length": 1872.0, + "completions/mean_length": 467.5874938964844, + "completions/mean_terminated_length": 467.5874938964844, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.032624437838249865, + "grad_norm": 0.01947968266904354, + "kl": 0.027191162109375, + "learning_rate": 4.620300405495532e-06, + "loss": 0.0003, + "num_tokens": 8987072.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1248.0, + "completions/max_terminated_length": 1248.0, + "completions/mean_length": 428.8000183105469, + "completions/mean_terminated_length": 428.8000183105469, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.03292933912645781, + "grad_norm": 0.009133207611739635, + "kl": 0.0252685546875, + "learning_rate": 4.612159217616022e-06, + "loss": 0.0003, + "num_tokens": 9066656.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1211.0, + "completions/max_terminated_length": 1211.0, + "completions/mean_length": 431.13751220703125, + "completions/mean_terminated_length": 431.13751220703125, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.03323424041466575, + "grad_norm": 0.009221093729138374, + "kl": 0.027191162109375, + "learning_rate": 4.603939897265268e-06, + "loss": 0.0003, + "num_tokens": 9148285.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 425.76251220703125, + "completions/mean_terminated_length": 425.76251220703125, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.03353914170287369, + "grad_norm": 0.010868406854569912, + "kl": 0.027496337890625, + "learning_rate": 4.595642789309492e-06, + "loss": 0.0003, + "num_tokens": 9233742.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 417.0249938964844, + "completions/mean_terminated_length": 417.0249938964844, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.03384404299108164, + "grad_norm": 0.008578047156333923, + "kl": 0.0242919921875, + "learning_rate": 4.587268241878724e-06, + "loss": 0.0002, + "num_tokens": 9314770.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1450.0, + "completions/max_terminated_length": 1450.0, + "completions/mean_length": 478.7375183105469, + "completions/mean_terminated_length": 478.7375183105469, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.03414894427928958, + "grad_norm": 0.008643762208521366, + "kl": 0.024017333984375, + "learning_rate": 4.578816606352205e-06, + "loss": 0.0002, + "num_tokens": 9403055.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 512.1625366210938, + "completions/mean_terminated_length": 492.7215270996094, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.034453845567497525, + "grad_norm": 0.06351924687623978, + "kl": 0.023223876953125, + "learning_rate": 4.570288237343632e-06, + "loss": 0.0118, + "num_tokens": 9494696.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1082.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 431.6625061035156, + "completions/mean_terminated_length": 431.6625061035156, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.034758746855705465, + "grad_norm": 0.007523035630583763, + "kl": 0.0238037109375, + "learning_rate": 4.561683492686289e-06, + "loss": 0.0002, + "num_tokens": 9578089.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 421.6625061035156, + "completions/mean_terminated_length": 421.6625061035156, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.035063648143913405, + "grad_norm": 0.1079186275601387, + "kl": 0.02276611328125, + "learning_rate": 4.5530027334180285e-06, + "loss": 0.0148, + "num_tokens": 9662240.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1270.0, + "completions/max_terminated_length": 1270.0, + "completions/mean_length": 467.4875183105469, + "completions/mean_terminated_length": 467.4875183105469, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.03536854943212135, + "grad_norm": 0.12511233985424042, + "kl": 0.02191162109375, + "learning_rate": 4.544246323766122e-06, + "loss": -0.0006, + "num_tokens": 9748367.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1029.0, + "completions/max_terminated_length": 1029.0, + "completions/mean_length": 433.3374938964844, + "completions/mean_terminated_length": 433.3374938964844, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.03567345072032929, + "grad_norm": 0.12101423740386963, + "kl": 0.02276611328125, + "learning_rate": 4.535414631131983e-06, + "loss": 0.0056, + "num_tokens": 9830026.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 452.8500061035156, + "completions/mean_terminated_length": 452.8500061035156, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.03597835200853724, + "grad_norm": 0.00823962688446045, + "kl": 0.02374267578125, + "learning_rate": 4.526508026075746e-06, + "loss": 0.0002, + "num_tokens": 9913714.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 452.6625061035156, + "completions/mean_terminated_length": 432.4683532714844, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.03628325329674518, + "grad_norm": 0.11805210262537003, + "kl": 0.023345947265625, + "learning_rate": 4.517526882300721e-06, + "loss": 0.0268, + "num_tokens": 10000177.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 464.2375183105469, + "completions/mean_terminated_length": 464.2375183105469, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.03658815458495312, + "grad_norm": 0.0064226859249174595, + "kl": 0.019989013671875, + "learning_rate": 4.508471576637713e-06, + "loss": 0.0002, + "num_tokens": 10086616.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1389.0, + "completions/max_terminated_length": 1389.0, + "completions/mean_length": 479.45001220703125, + "completions/mean_terminated_length": 479.45001220703125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.036893055873161065, + "grad_norm": 0.006146405823528767, + "kl": 0.022857666015625, + "learning_rate": 4.499342489029211e-06, + "loss": 0.0002, + "num_tokens": 10171666.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1140.0, + "completions/max_terminated_length": 1140.0, + "completions/mean_length": 464.1125183105469, + "completions/mean_terminated_length": 464.1125183105469, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.037197957161369005, + "grad_norm": 0.007421064656227827, + "kl": 0.022613525390625, + "learning_rate": 4.490140002513449e-06, + "loss": 0.0002, + "num_tokens": 10258639.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1230.0, + "completions/max_terminated_length": 1230.0, + "completions/mean_length": 502.25, + "completions/mean_terminated_length": 502.25, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.03750285844957695, + "grad_norm": 0.006498878821730614, + "kl": 0.020965576171875, + "learning_rate": 4.48086450320833e-06, + "loss": 0.0002, + "num_tokens": 10346595.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1148.0, + "completions/max_terminated_length": 1148.0, + "completions/mean_length": 427.3125, + "completions/mean_terminated_length": 427.3125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.03780775973778489, + "grad_norm": 0.007438625209033489, + "kl": 0.02313232421875, + "learning_rate": 4.4715163802952266e-06, + "loss": 0.0002, + "num_tokens": 10430592.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1051.0, + "completions/mean_length": 515.0250244140625, + "completions/mean_terminated_length": 495.6202697753906, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.03811266102599283, + "grad_norm": 0.06223122030496597, + "kl": 0.019805908203125, + "learning_rate": 4.462096026002655e-06, + "loss": 0.0203, + "num_tokens": 10520862.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1879.0, + "completions/max_terminated_length": 1879.0, + "completions/mean_length": 481.13751220703125, + "completions/mean_terminated_length": 481.13751220703125, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.03841756231420078, + "grad_norm": 0.005953509360551834, + "kl": 0.0211181640625, + "learning_rate": 4.4526038355898144e-06, + "loss": 0.0002, + "num_tokens": 10610079.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 435.5874938964844, + "completions/mean_terminated_length": 435.5874938964844, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.03872246360240872, + "grad_norm": 0.0068345870822668076, + "kl": 0.0206298828125, + "learning_rate": 4.4430402073300035e-06, + "loss": 0.0002, + "num_tokens": 10691984.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 920.0, + "completions/max_terminated_length": 920.0, + "completions/mean_length": 446.7749938964844, + "completions/mean_terminated_length": 446.7749938964844, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.039027364890616666, + "grad_norm": 0.006349269766360521, + "kl": 0.02191162109375, + "learning_rate": 4.433405542493909e-06, + "loss": 0.0002, + "num_tokens": 10771836.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1166.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 460.25, + "completions/mean_terminated_length": 460.25, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.039332266178824606, + "grad_norm": 0.006095725577324629, + "kl": 0.02166748046875, + "learning_rate": 4.4237002453327734e-06, + "loss": 0.0002, + "num_tokens": 10854488.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1164.0, + "completions/max_terminated_length": 1164.0, + "completions/mean_length": 464.75, + "completions/mean_terminated_length": 464.75, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.039637167467032546, + "grad_norm": 0.006780361291021109, + "kl": 0.021820068359375, + "learning_rate": 4.4139247230614245e-06, + "loss": 0.0002, + "num_tokens": 10942168.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 443.2749938964844, + "completions/mean_terminated_length": 443.2749938964844, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.03994206875524049, + "grad_norm": 0.006308922544121742, + "kl": 0.020904541015625, + "learning_rate": 4.404079385841201e-06, + "loss": 0.0002, + "num_tokens": 11027114.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1570.0, + "completions/max_terminated_length": 1570.0, + "completions/mean_length": 498.5874938964844, + "completions/mean_terminated_length": 498.5874938964844, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.04024697004344843, + "grad_norm": 0.010841727256774902, + "kl": 0.021697998046875, + "learning_rate": 4.394164646762734e-06, + "loss": 0.0002, + "num_tokens": 11117615.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 979.0, + "completions/max_terminated_length": 979.0, + "completions/mean_length": 403.3000183105469, + "completions/mean_terminated_length": 403.3000183105469, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.04055187133165638, + "grad_norm": 0.007032663561403751, + "kl": 0.023834228515625, + "learning_rate": 4.384180921828618e-06, + "loss": 0.0002, + "num_tokens": 11191853.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 448.3125, + "completions/mean_terminated_length": 448.3125, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "epoch": 0.04085677261986432, + "grad_norm": 0.006273228675127029, + "kl": 0.02056884765625, + "learning_rate": 4.374128629935955e-06, + "loss": 0.0002, + "num_tokens": 11275020.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 829.0, + "completions/max_terminated_length": 829.0, + "completions/mean_length": 439.76251220703125, + "completions/mean_terminated_length": 439.76251220703125, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.04116167390807226, + "grad_norm": 0.0052821701392531395, + "kl": 0.0211181640625, + "learning_rate": 4.364008192858781e-06, + "loss": 0.0002, + "num_tokens": 11360009.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.0, + "completions/max_terminated_length": 1046.0, + "completions/mean_length": 441.2875061035156, + "completions/mean_terminated_length": 441.2875061035156, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.041466575196280206, + "grad_norm": 0.00741475960239768, + "kl": 0.020751953125, + "learning_rate": 4.353820035230366e-06, + "loss": 0.0002, + "num_tokens": 11444966.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1484.0, + "completions/max_terminated_length": 1484.0, + "completions/mean_length": 485.1125183105469, + "completions/mean_terminated_length": 485.1125183105469, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "epoch": 0.041771476484488146, + "grad_norm": 0.10163833945989609, + "kl": 0.020782470703125, + "learning_rate": 4.3435645845254e-06, + "loss": -0.0073, + "num_tokens": 11527187.0, + "reward": 0.09875000268220901, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1046.0, + "completions/max_terminated_length": 1046.0, + "completions/mean_length": 451.38751220703125, + "completions/mean_terminated_length": 451.38751220703125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.04207637777269609, + "grad_norm": 0.0059946151450276375, + "kl": 0.020355224609375, + "learning_rate": 4.333242271042054e-06, + "loss": 0.0002, + "num_tokens": 11607904.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1082.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 485.8500061035156, + "completions/mean_terminated_length": 485.8500061035156, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.04238127906090403, + "grad_norm": 0.007027114741504192, + "kl": 0.02001953125, + "learning_rate": 4.32285352788393e-06, + "loss": 0.0002, + "num_tokens": 11692268.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1380.0, + "completions/max_terminated_length": 1380.0, + "completions/mean_length": 472.1000061035156, + "completions/mean_terminated_length": 472.1000061035156, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.04268618034911197, + "grad_norm": 0.022472839802503586, + "kl": 0.022674560546875, + "learning_rate": 4.312398790941882e-06, + "loss": 0.0002, + "num_tokens": 11779880.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1011.0, + "completions/max_terminated_length": 1011.0, + "completions/mean_length": 468.63751220703125, + "completions/mean_terminated_length": 468.63751220703125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.04299108163731992, + "grad_norm": 0.004842875991016626, + "kl": 0.020355224609375, + "learning_rate": 4.301878498875735e-06, + "loss": 0.0002, + "num_tokens": 11865031.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1296.0, + "completions/max_terminated_length": 1296.0, + "completions/mean_length": 521.7625122070312, + "completions/mean_terminated_length": 521.7625122070312, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.04329598292552786, + "grad_norm": 0.005916159600019455, + "kl": 0.019195556640625, + "learning_rate": 4.291293093095873e-06, + "loss": 0.0002, + "num_tokens": 11953252.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1441.0, + "completions/max_terminated_length": 1441.0, + "completions/mean_length": 452.1750183105469, + "completions/mean_terminated_length": 452.1750183105469, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.043600884213735806, + "grad_norm": 0.0050492798909544945, + "kl": 0.02130126953125, + "learning_rate": 4.280643017744723e-06, + "loss": 0.0002, + "num_tokens": 12031248.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1363.0, + "completions/max_terminated_length": 1363.0, + "completions/mean_length": 465.51251220703125, + "completions/mean_terminated_length": 465.51251220703125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.043905785501943746, + "grad_norm": 0.005258447024971247, + "kl": 0.021026611328125, + "learning_rate": 4.269928719678117e-06, + "loss": 0.0002, + "num_tokens": 12112583.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1134.0, + "completions/max_terminated_length": 1134.0, + "completions/mean_length": 457.1750183105469, + "completions/mean_terminated_length": 457.1750183105469, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.044210686790151686, + "grad_norm": 0.004629280883818865, + "kl": 0.0196533203125, + "learning_rate": 4.2591506484465426e-06, + "loss": 0.0002, + "num_tokens": 12192925.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1470.0, + "completions/max_terminated_length": 1470.0, + "completions/mean_length": 493.0375061035156, + "completions/mean_terminated_length": 493.0375061035156, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.04451558807835963, + "grad_norm": 0.1090981587767601, + "kl": 0.019500732421875, + "learning_rate": 4.248309256276283e-06, + "loss": -0.0041, + "num_tokens": 12284868.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 449.9375, + "completions/mean_terminated_length": 449.9375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.04482048936656757, + "grad_norm": 0.005555329844355583, + "kl": 0.019439697265625, + "learning_rate": 4.23740499805044e-06, + "loss": 0.0002, + "num_tokens": 12368695.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1056.0, + "completions/max_terminated_length": 1056.0, + "completions/mean_length": 460.8625183105469, + "completions/mean_terminated_length": 460.8625183105469, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.04512539065477552, + "grad_norm": 0.08225194364786148, + "kl": 0.023101806640625, + "learning_rate": 4.22643833128985e-06, + "loss": 0.0046, + "num_tokens": 12453566.0, + "reward": 0.09875001013278961, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1620.0, + "completions/max_terminated_length": 1620.0, + "completions/mean_length": 472.3000183105469, + "completions/mean_terminated_length": 472.3000183105469, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.04543029194298346, + "grad_norm": 0.004510453902184963, + "kl": 0.0183258056640625, + "learning_rate": 4.215409716133885e-06, + "loss": 0.0002, + "num_tokens": 12535486.0, + "reward": 0.10000000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1829.0, + "completions/mean_length": 469.6125183105469, + "completions/mean_terminated_length": 449.6329345703125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.0457351932311914, + "grad_norm": 0.13615085184574127, + "kl": 0.019378662109375, + "learning_rate": 4.204319615321151e-06, + "loss": 0.0102, + "num_tokens": 12618469.0, + "reward": 0.09750000387430191, + "reward_std": 0.0035355340223759413, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0, + "rewards/unicoder_reward_fn/std": 0.0, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 468.1750183105469, + "completions/mean_terminated_length": 468.1750183105469, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.04604009451939935, + "grad_norm": 0.13893385231494904, + "kl": 0.01971435546875, + "learning_rate": 4.193168494170065e-06, + "loss": -0.0089, + "num_tokens": 12700391.0, + "reward": 0.125, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 906.0, + "completions/max_terminated_length": 906.0, + "completions/mean_length": 450.0874938964844, + "completions/mean_terminated_length": 450.0874938964844, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.04634499580760729, + "grad_norm": 0.24638357758522034, + "kl": 0.020843505859375, + "learning_rate": 4.181956820559339e-06, + "loss": 0.0055, + "num_tokens": 12783810.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 874.0, + "completions/mean_length": 482.5874938964844, + "completions/mean_terminated_length": 462.77215576171875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.046649897095815226, + "grad_norm": 0.19727939367294312, + "kl": 0.01953125, + "learning_rate": 4.170685064908342e-06, + "loss": 0.0268, + "num_tokens": 12870267.0, + "reward": 0.16124999523162842, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1430.0, + "completions/max_terminated_length": 1430.0, + "completions/mean_length": 474.9125061035156, + "completions/mean_terminated_length": 474.9125061035156, + "completions/min_length": 205.0, + "completions/min_terminated_length": 205.0, + "epoch": 0.04695479838402317, + "grad_norm": 0.16645734012126923, + "kl": 0.02166748046875, + "learning_rate": 4.159353700157365e-06, + "loss": -0.006, + "num_tokens": 12958860.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1404.0, + "completions/max_terminated_length": 1404.0, + "completions/mean_length": 426.45001220703125, + "completions/mean_terminated_length": 426.45001220703125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.04725969967223111, + "grad_norm": 0.11824029684066772, + "kl": 0.022247314453125, + "learning_rate": 4.14796320174778e-06, + "loss": 0.0015, + "num_tokens": 13040700.0, + "reward": 0.23750002682209015, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1033.0, + "completions/max_terminated_length": 1033.0, + "completions/mean_length": 474.4250183105469, + "completions/mean_terminated_length": 474.4250183105469, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.04756460096043906, + "grad_norm": 0.20552416145801544, + "kl": 0.01910400390625, + "learning_rate": 4.136514047602087e-06, + "loss": 0.0055, + "num_tokens": 13127266.0, + "reward": 0.15000000596046448, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 409.7375183105469, + "completions/mean_terminated_length": 409.7375183105469, + "completions/min_length": 71.0, + "completions/min_terminated_length": 71.0, + "epoch": 0.047869502248647, + "grad_norm": 0.2528684735298157, + "kl": 0.02264404296875, + "learning_rate": 4.1250067181038635e-06, + "loss": 0.0083, + "num_tokens": 13205545.0, + "reward": 0.13625000417232513, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 397.0500183105469, + "completions/mean_terminated_length": 397.0500183105469, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "epoch": 0.04817440353685494, + "grad_norm": 0.20501503348350525, + "kl": 0.021697998046875, + "learning_rate": 4.113441696077608e-06, + "loss": -0.0023, + "num_tokens": 13284185.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1467.0, + "completions/max_terminated_length": 1467.0, + "completions/mean_length": 413.4250183105469, + "completions/mean_terminated_length": 413.4250183105469, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.04847930482506289, + "grad_norm": 0.2203351855278015, + "kl": 0.025726318359375, + "learning_rate": 4.101819466768484e-06, + "loss": 0.0014, + "num_tokens": 13358097.0, + "reward": 0.2250000238418579, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 953.0, + "completions/max_terminated_length": 953.0, + "completions/mean_length": 425.38751220703125, + "completions/mean_terminated_length": 425.38751220703125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "epoch": 0.04878420611327083, + "grad_norm": 0.1432938277721405, + "kl": 0.026611328125, + "learning_rate": 4.0901405178219535e-06, + "loss": 0.0072, + "num_tokens": 13440104.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 989.0, + "completions/max_terminated_length": 989.0, + "completions/mean_length": 431.8374938964844, + "completions/mean_terminated_length": 431.8374938964844, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.049089107401478774, + "grad_norm": 0.09334322065114975, + "kl": 0.025604248046875, + "learning_rate": 4.078405339263326e-06, + "loss": 0.0031, + "num_tokens": 13520051.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1557.0, + "completions/max_terminated_length": 1557.0, + "completions/mean_length": 435.3625183105469, + "completions/mean_terminated_length": 435.3625183105469, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.049394008689686714, + "grad_norm": 0.1539635807275772, + "kl": 0.0269775390625, + "learning_rate": 4.06661442347719e-06, + "loss": 0.0098, + "num_tokens": 13602028.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1262.0, + "completions/max_terminated_length": 1262.0, + "completions/mean_length": 461.9125061035156, + "completions/mean_terminated_length": 461.9125061035156, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.049698909977894654, + "grad_norm": 0.18769684433937073, + "kl": 0.02703857421875, + "learning_rate": 4.054768265186758e-06, + "loss": 0.0163, + "num_tokens": 13687471.0, + "reward": 0.1625000238418579, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1472.0, + "completions/max_terminated_length": 1472.0, + "completions/mean_length": 474.4624938964844, + "completions/mean_terminated_length": 474.4624938964844, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "epoch": 0.0500038112661026, + "grad_norm": 0.13729074597358704, + "kl": 0.02593994140625, + "learning_rate": 4.0428673614331036e-06, + "loss": 0.0008, + "num_tokens": 13769654.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 451.1625061035156, + "completions/mean_terminated_length": 451.1625061035156, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.05030871255431054, + "grad_norm": 0.15735217928886414, + "kl": 0.027252197265625, + "learning_rate": 4.030912211554316e-06, + "loss": -0.004, + "num_tokens": 13847909.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 890.0, + "completions/max_terminated_length": 890.0, + "completions/mean_length": 466.01251220703125, + "completions/mean_terminated_length": 466.01251220703125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.05061361384251849, + "grad_norm": 0.1063733845949173, + "kl": 0.026611328125, + "learning_rate": 4.018903317164539e-06, + "loss": 0.0017, + "num_tokens": 13932246.0, + "reward": 0.13750000298023224, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1003.0, + "completions/max_terminated_length": 1003.0, + "completions/mean_length": 464.6125183105469, + "completions/mean_terminated_length": 464.6125183105469, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.05091851513072643, + "grad_norm": 0.18809597194194794, + "kl": 0.029754638671875, + "learning_rate": 4.006841182132932e-06, + "loss": -0.0045, + "num_tokens": 14018471.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1099.0, + "completions/max_terminated_length": 1099.0, + "completions/mean_length": 481.875, + "completions/mean_terminated_length": 481.875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.05122341641893437, + "grad_norm": 0.1885657161474228, + "kl": 0.029754638671875, + "learning_rate": 3.9947263125625195e-06, + "loss": -0.0012, + "num_tokens": 14104497.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 417.9624938964844, + "completions/mean_terminated_length": 417.9624938964844, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "epoch": 0.051528317707142314, + "grad_norm": 0.28072646260261536, + "kl": 0.0343017578125, + "learning_rate": 3.982559216768967e-06, + "loss": 0.0045, + "num_tokens": 14183420.0, + "reward": 0.2500000298023224, + "reward_std": 0.1414213627576828, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.15000000596046448, + "rewards/unicoder_reward_fn/std": 0.35932427644729614, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 455.07501220703125, + "completions/mean_terminated_length": 455.07501220703125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.051833218995350254, + "grad_norm": 0.16700702905654907, + "kl": 0.04742431640625, + "learning_rate": 3.970340405259245e-06, + "loss": -0.0007, + "num_tokens": 14268502.0, + "reward": 0.13625000417232513, + "reward_std": 0.05126523971557617, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 435.3374938964844, + "completions/mean_terminated_length": 435.3374938964844, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.0521381202835582, + "grad_norm": 0.2016594558954239, + "kl": 0.03515625, + "learning_rate": 3.958070390710214e-06, + "loss": 0.0184, + "num_tokens": 14347941.0, + "reward": 0.2250000238418579, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 840.0, + "completions/max_terminated_length": 840.0, + "completions/mean_length": 430.5500183105469, + "completions/mean_terminated_length": 430.5500183105469, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.05244302157176614, + "grad_norm": 0.14722523093223572, + "kl": 0.033660888671875, + "learning_rate": 3.945749687947109e-06, + "loss": -0.0036, + "num_tokens": 14429165.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 390.01251220703125, + "completions/mean_terminated_length": 390.01251220703125, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.05274792285997408, + "grad_norm": 0.176780104637146, + "kl": 0.04071044921875, + "learning_rate": 3.933378813921942e-06, + "loss": -0.0041, + "num_tokens": 14507066.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1310.0, + "completions/max_terminated_length": 1310.0, + "completions/mean_length": 426.2124938964844, + "completions/mean_terminated_length": 426.2124938964844, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.05305282414818203, + "grad_norm": 0.11216724663972855, + "kl": 0.03741455078125, + "learning_rate": 3.920958287691811e-06, + "loss": -0.003, + "num_tokens": 14595147.0, + "reward": 0.1875000298023224, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1236.0, + "completions/max_terminated_length": 1236.0, + "completions/mean_length": 421.0, + "completions/mean_terminated_length": 421.0, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.05335772543638997, + "grad_norm": 0.012539403513073921, + "kl": 0.03759765625, + "learning_rate": 3.908488630397121e-06, + "loss": 0.0004, + "num_tokens": 14679785.0, + "reward": 0.1250000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 386.1625061035156, + "completions/mean_terminated_length": 386.1625061035156, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.053662626724597914, + "grad_norm": 0.28521767258644104, + "kl": 0.0400390625, + "learning_rate": 3.8959703652397175e-06, + "loss": 0.0077, + "num_tokens": 14754882.0, + "reward": 0.1600000113248825, + "reward_std": 0.09192388504743576, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 921.0, + "completions/max_terminated_length": 921.0, + "completions/mean_length": 396.88751220703125, + "completions/mean_terminated_length": 396.88751220703125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.053967528012805854, + "grad_norm": 0.2568298876285553, + "kl": 0.04534912109375, + "learning_rate": 3.883404017460935e-06, + "loss": 0.0135, + "num_tokens": 14835143.0, + "reward": 0.1625000238418579, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1079.0, + "completions/max_terminated_length": 1079.0, + "completions/mean_length": 396.6000061035156, + "completions/mean_terminated_length": 396.6000061035156, + "completions/min_length": 60.0, + "completions/min_terminated_length": 60.0, + "epoch": 0.054272429301013794, + "grad_norm": 0.17389893531799316, + "kl": 0.041015625, + "learning_rate": 3.870790114319559e-06, + "loss": -0.006, + "num_tokens": 14917165.0, + "reward": 0.11000000685453415, + "reward_std": 0.02121320180594921, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1661.0, + "completions/max_terminated_length": 1661.0, + "completions/mean_length": 365.9875183105469, + "completions/mean_terminated_length": 365.9875183105469, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.05457733058922174, + "grad_norm": 0.27765053510665894, + "kl": 0.043212890625, + "learning_rate": 3.858129185069701e-06, + "loss": -0.0025, + "num_tokens": 14992326.0, + "reward": 0.14874999225139618, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1098.0, + "completions/max_terminated_length": 1098.0, + "completions/mean_length": 381.3125, + "completions/mean_terminated_length": 381.3125, + "completions/min_length": 83.0, + "completions/min_terminated_length": 83.0, + "epoch": 0.05488223187742968, + "grad_norm": 0.1604042500257492, + "kl": 0.0423583984375, + "learning_rate": 3.845421760938597e-06, + "loss": -0.0024, + "num_tokens": 15069501.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 986.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 403.6000061035156, + "completions/mean_terminated_length": 403.6000061035156, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.05518713316563763, + "grad_norm": 0.11354026198387146, + "kl": 0.0467529296875, + "learning_rate": 3.832668375104312e-06, + "loss": 0.0026, + "num_tokens": 15150189.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 911.0, + "completions/max_terminated_length": 911.0, + "completions/mean_length": 389.9875183105469, + "completions/mean_terminated_length": 389.9875183105469, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.05549203445384557, + "grad_norm": 0.07974027842283249, + "kl": 0.04486083984375, + "learning_rate": 3.8198695626733725e-06, + "loss": 0.0057, + "num_tokens": 15228680.0, + "reward": 0.16250000894069672, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1088.0, + "completions/max_terminated_length": 1088.0, + "completions/mean_length": 422.125, + "completions/mean_terminated_length": 422.125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.05579693574205351, + "grad_norm": 0.13898016512393951, + "kl": 0.04345703125, + "learning_rate": 3.8070258606583156e-06, + "loss": 0.0012, + "num_tokens": 15308694.0, + "reward": 0.1612500101327896, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1294.0, + "completions/max_terminated_length": 1294.0, + "completions/mean_length": 412.57501220703125, + "completions/mean_terminated_length": 412.57501220703125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.056101837030261455, + "grad_norm": 0.22532965242862701, + "kl": 0.04083251953125, + "learning_rate": 3.7941378079551544e-06, + "loss": 0.0157, + "num_tokens": 15386502.0, + "reward": 0.19875001907348633, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 407.45001220703125, + "completions/mean_terminated_length": 407.45001220703125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.056406738318469395, + "grad_norm": 0.1868802011013031, + "kl": 0.0443115234375, + "learning_rate": 3.7812059453207677e-06, + "loss": 0.0052, + "num_tokens": 15465030.0, + "reward": 0.16250000894069672, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 415.6625061035156, + "completions/mean_terminated_length": 415.6625061035156, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.05671163960667734, + "grad_norm": 0.1819785237312317, + "kl": 0.04437255859375, + "learning_rate": 3.768230815350213e-06, + "loss": -0.0017, + "num_tokens": 15545053.0, + "reward": 0.2237500250339508, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 947.0, + "completions/max_terminated_length": 947.0, + "completions/mean_length": 415.26251220703125, + "completions/mean_terminated_length": 415.26251220703125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.05701654089488528, + "grad_norm": 0.22398653626441956, + "kl": 0.04693603515625, + "learning_rate": 3.7552129624539557e-06, + "loss": 0.0024, + "num_tokens": 15624898.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1042.0, + "completions/max_terminated_length": 1042.0, + "completions/mean_length": 471.25, + "completions/mean_terminated_length": 471.25, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "epoch": 0.05732144218309322, + "grad_norm": 0.15245965123176575, + "kl": 0.04522705078125, + "learning_rate": 3.7421529328350316e-06, + "loss": 0.0105, + "num_tokens": 15711106.0, + "reward": 0.11000000685453415, + "reward_std": 0.02121320180594921, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1312.0, + "completions/mean_length": 498.1625061035156, + "completions/mean_terminated_length": 478.5443115234375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "epoch": 0.05762634347130117, + "grad_norm": 0.2877877652645111, + "kl": 0.0426025390625, + "learning_rate": 3.7290512744661274e-06, + "loss": 0.0201, + "num_tokens": 15802865.0, + "reward": 0.14374999701976776, + "reward_std": 0.07954951375722885, + "rewards/format_reward/mean": 0.9375, + "rewards/format_reward/std": 0.2435886710882187, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1213.0, + "completions/max_terminated_length": 1213.0, + "completions/mean_length": 463.7250061035156, + "completions/mean_terminated_length": 463.7250061035156, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.05793124475950911, + "grad_norm": 0.220382422208786, + "kl": 0.044677734375, + "learning_rate": 3.715908537066589e-06, + "loss": -0.0025, + "num_tokens": 15888049.0, + "reward": 0.19875001907348633, + "reward_std": 0.10783378034830093, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 992.0, + "completions/mean_length": 464.0874938964844, + "completions/mean_terminated_length": 444.0379943847656, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.058236146047717055, + "grad_norm": 0.15473699569702148, + "kl": 0.0445556640625, + "learning_rate": 3.7027252720793538e-06, + "loss": 0.0245, + "num_tokens": 15971338.0, + "reward": 0.1862500160932541, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1325.0, + "completions/mean_length": 499.75, + "completions/mean_terminated_length": 480.15191650390625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.058541047335924995, + "grad_norm": 0.2423507124185562, + "kl": 0.04345703125, + "learning_rate": 3.689502032647817e-06, + "loss": 0.0232, + "num_tokens": 16060400.0, + "reward": 0.17125001549720764, + "reward_std": 0.0760139748454094, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531530380249, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1072.0, + "completions/max_terminated_length": 1072.0, + "completions/mean_length": 456.5375061035156, + "completions/mean_terminated_length": 456.5375061035156, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.058845948624132935, + "grad_norm": 0.14700448513031006, + "kl": 0.046142578125, + "learning_rate": 3.6762393735926245e-06, + "loss": -0.0042, + "num_tokens": 16147755.0, + "reward": 0.11125000566244125, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1706.0, + "completions/max_terminated_length": 1706.0, + "completions/mean_length": 496.1000061035156, + "completions/mean_terminated_length": 496.1000061035156, + "completions/min_length": 64.0, + "completions/min_terminated_length": 64.0, + "epoch": 0.05915084991234088, + "grad_norm": 0.109529048204422, + "kl": 0.0516357421875, + "learning_rate": 3.6629378513883852e-06, + "loss": 0.005, + "num_tokens": 16239961.0, + "reward": 0.17500002682209015, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 457.7749938964844, + "completions/mean_terminated_length": 457.7749938964844, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.05945575120054882, + "grad_norm": 0.1422124207019806, + "kl": 0.051513671875, + "learning_rate": 3.6495980241403307e-06, + "loss": 0.0032, + "num_tokens": 16321133.0, + "reward": 0.125, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 986.0, + "completions/mean_length": 506.3000183105469, + "completions/mean_terminated_length": 486.7848205566406, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.05976065248875676, + "grad_norm": 0.14283646643161774, + "kl": 0.051513671875, + "learning_rate": 3.636220451560896e-06, + "loss": 0.0142, + "num_tokens": 16412955.0, + "reward": 0.12375000864267349, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1063.0, + "completions/max_terminated_length": 1063.0, + "completions/mean_length": 490.1625061035156, + "completions/mean_terminated_length": 490.1625061035156, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.06006555377696471, + "grad_norm": 0.1564817726612091, + "kl": 0.0517578125, + "learning_rate": 3.622805694946235e-06, + "loss": 0.0033, + "num_tokens": 16498910.0, + "reward": 0.14875002205371857, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 429.9250183105469, + "completions/mean_terminated_length": 429.9250183105469, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.06037045506517265, + "grad_norm": 0.1235664039850235, + "kl": 0.057861328125, + "learning_rate": 3.609354317152667e-06, + "loss": 0.0017, + "num_tokens": 16578398.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1308.0, + "completions/max_terminated_length": 1308.0, + "completions/mean_length": 541.0499877929688, + "completions/mean_terminated_length": 541.0499877929688, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.060675356353380595, + "grad_norm": 0.1469779908657074, + "kl": 0.050048828125, + "learning_rate": 3.595866882573063e-06, + "loss": 0.0093, + "num_tokens": 16673246.0, + "reward": 0.12375000864267349, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 492.2250061035156, + "completions/mean_terminated_length": 492.2250061035156, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.060980257641588535, + "grad_norm": 0.16372045874595642, + "kl": 0.052734375, + "learning_rate": 3.5823439571131675e-06, + "loss": 0.0072, + "num_tokens": 16758306.0, + "reward": 0.23750002682209015, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1513.0, + "completions/max_terminated_length": 1513.0, + "completions/mean_length": 539.6875, + "completions/mean_terminated_length": 539.6875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.061285158929796475, + "grad_norm": 0.14389963448047638, + "kl": 0.0560302734375, + "learning_rate": 3.5687861081678477e-06, + "loss": -0.0008, + "num_tokens": 16850709.0, + "reward": 0.125, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 499.1875, + "completions/mean_terminated_length": 499.1875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.06159006021800442, + "grad_norm": 0.21020865440368652, + "kl": 0.0526123046875, + "learning_rate": 3.555193904597291e-06, + "loss": -0.0215, + "num_tokens": 16938202.0, + "reward": 0.1612500101327896, + "reward_std": 0.09015611559152603, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1098.0, + "completions/max_terminated_length": 1098.0, + "completions/mean_length": 517.1875, + "completions/mean_terminated_length": 517.1875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "epoch": 0.06189496150621236, + "grad_norm": 0.19664756953716278, + "kl": 0.053466796875, + "learning_rate": 3.541567916703138e-06, + "loss": -0.0017, + "num_tokens": 17028301.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 495.375, + "completions/mean_terminated_length": 495.375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.06219986279442031, + "grad_norm": 0.04992162436246872, + "kl": 0.0518798828125, + "learning_rate": 3.5279087162045517e-06, + "loss": 0.0006, + "num_tokens": 17116071.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1911.0, + "completions/max_terminated_length": 1911.0, + "completions/mean_length": 544.8624877929688, + "completions/mean_terminated_length": 544.8624877929688, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "epoch": 0.06250476408262826, + "grad_norm": 0.06470425426959991, + "kl": 0.05096435546875, + "learning_rate": 3.5142168762142265e-06, + "loss": -0.0015, + "num_tokens": 17205530.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1154.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 468.8625183105469, + "completions/mean_terminated_length": 468.8625183105469, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.0628096653708362, + "grad_norm": 0.25689446926116943, + "kl": 0.0528564453125, + "learning_rate": 3.500492971214347e-06, + "loss": 0.0098, + "num_tokens": 17287257.0, + "reward": 0.2225000113248825, + "reward_std": 0.14495688676834106, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1482.0, + "completions/max_terminated_length": 1482.0, + "completions/mean_length": 536.5625, + "completions/mean_terminated_length": 536.5625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "epoch": 0.06311456665904414, + "grad_norm": 0.1306605488061905, + "kl": 0.04937744140625, + "learning_rate": 3.48673757703248e-06, + "loss": -0.0057, + "num_tokens": 17376676.0, + "reward": 0.20000003278255463, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 539.6749877929688, + "completions/mean_terminated_length": 539.6749877929688, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "epoch": 0.06341946794725208, + "grad_norm": 0.1695621907711029, + "kl": 0.04779052734375, + "learning_rate": 3.472951270817418e-06, + "loss": -0.005, + "num_tokens": 17472766.0, + "reward": 0.12250001728534698, + "reward_std": 0.03889087215065956, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 511.9624938964844, + "completions/mean_terminated_length": 511.9624938964844, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.06372436923546002, + "grad_norm": 0.1479700207710266, + "kl": 0.0560302734375, + "learning_rate": 3.4591346310149578e-06, + "loss": 0.0052, + "num_tokens": 17560111.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1759.0, + "completions/max_terminated_length": 1759.0, + "completions/mean_length": 515.7374877929688, + "completions/mean_terminated_length": 515.7374877929688, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.06402927052366797, + "grad_norm": 0.10783959180116653, + "kl": 0.05224609375, + "learning_rate": 3.445288237343632e-06, + "loss": 0.0036, + "num_tokens": 17646166.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1273.0, + "completions/max_terminated_length": 1273.0, + "completions/mean_length": 477.3625183105469, + "completions/mean_terminated_length": 477.3625183105469, + "completions/min_length": 73.0, + "completions/min_terminated_length": 73.0, + "epoch": 0.06433417181187591, + "grad_norm": 0.1348573863506317, + "kl": 0.051513671875, + "learning_rate": 3.4314126707703895e-06, + "loss": -0.0013, + "num_tokens": 17733095.0, + "reward": 0.1875000149011612, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 981.0, + "completions/max_terminated_length": 981.0, + "completions/mean_length": 463.5, + "completions/mean_terminated_length": 463.5, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.06463907310008385, + "grad_norm": 0.18264122307300568, + "kl": 0.05426025390625, + "learning_rate": 3.4175085134862128e-06, + "loss": 0.0083, + "num_tokens": 17817549.0, + "reward": 0.19875001907348633, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 476.375, + "completions/mean_terminated_length": 476.375, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.06494397438829179, + "grad_norm": 0.17124944925308228, + "kl": 0.0538330078125, + "learning_rate": 3.4035763488816953e-06, + "loss": 0.0049, + "num_tokens": 17903083.0, + "reward": 0.14875002205371857, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1036.0, + "completions/max_terminated_length": 1036.0, + "completions/mean_length": 465.2375183105469, + "completions/mean_terminated_length": 465.2375183105469, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.06524887567649973, + "grad_norm": 0.1577434241771698, + "kl": 0.0517578125, + "learning_rate": 3.3896167615225594e-06, + "loss": 0.0007, + "num_tokens": 17987156.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1317.0, + "completions/max_terminated_length": 1317.0, + "completions/mean_length": 489.1875, + "completions/mean_terminated_length": 489.1875, + "completions/min_length": 10.0, + "completions/min_terminated_length": 10.0, + "epoch": 0.06555377696470767, + "grad_norm": 0.2025170624256134, + "kl": 0.05126953125, + "learning_rate": 3.375630337125133e-06, + "loss": 0.006, + "num_tokens": 18078797.0, + "reward": 0.13500002026557922, + "reward_std": 0.05656854063272476, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 469.8374938964844, + "completions/mean_terminated_length": 469.8374938964844, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.06585867825291562, + "grad_norm": 0.12381523847579956, + "kl": 0.05096435546875, + "learning_rate": 3.361617662531772e-06, + "loss": -0.0053, + "num_tokens": 18165112.0, + "reward": 0.21250002086162567, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 939.0, + "completions/max_terminated_length": 939.0, + "completions/mean_length": 411.3500061035156, + "completions/mean_terminated_length": 411.3500061035156, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.06616357954112356, + "grad_norm": 0.16251295804977417, + "kl": 0.05572509765625, + "learning_rate": 3.347579325686237e-06, + "loss": -0.0059, + "num_tokens": 18246038.0, + "reward": 0.1875000149011612, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 416.6000061035156, + "completions/mean_terminated_length": 416.6000061035156, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.0664684808293315, + "grad_norm": 0.22227387130260468, + "kl": 0.059814453125, + "learning_rate": 3.333515915609027e-06, + "loss": -0.0285, + "num_tokens": 18330676.0, + "reward": 0.1862500160932541, + "reward_std": 0.09015611559152603, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1560.0, + "completions/mean_length": 467.13751220703125, + "completions/mean_terminated_length": 447.1265869140625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.06677338211753944, + "grad_norm": 0.05367077514529228, + "kl": 0.0504150390625, + "learning_rate": 3.3194280223726616e-06, + "loss": 0.0194, + "num_tokens": 18418503.0, + "reward": 0.14875000715255737, + "reward_std": 0.0017677670111879706, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1152.0, + "completions/max_terminated_length": 1152.0, + "completions/mean_length": 448.9875183105469, + "completions/mean_terminated_length": 448.9875183105469, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.06707828340574738, + "grad_norm": 0.015069164335727692, + "kl": 0.0482177734375, + "learning_rate": 3.305316237076927e-06, + "loss": 0.0005, + "num_tokens": 18507032.0, + "reward": 0.15000002086162567, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 475.1499938964844, + "completions/mean_terminated_length": 475.1499938964844, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.06738318469395534, + "grad_norm": 0.20499880611896515, + "kl": 0.0531005859375, + "learning_rate": 3.291181151824071e-06, + "loss": 0.0038, + "num_tokens": 18592450.0, + "reward": 0.2250000238418579, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 434.0874938964844, + "completions/mean_terminated_length": 434.0874938964844, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "epoch": 0.06768808598216328, + "grad_norm": 0.1476088911294937, + "kl": 0.052734375, + "learning_rate": 3.27702335969396e-06, + "loss": -0.0044, + "num_tokens": 18676199.0, + "reward": 0.21250002086162567, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1681.0, + "completions/mean_length": 511.5, + "completions/mean_terminated_length": 492.0506591796875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.06799298727037122, + "grad_norm": 0.2085094004869461, + "kl": 0.05047607421875, + "learning_rate": 3.2628434547191985e-06, + "loss": 0.0071, + "num_tokens": 18767377.0, + "reward": 0.17250001430511475, + "reward_std": 0.07424621284008026, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 484.45001220703125, + "completions/mean_terminated_length": 484.45001220703125, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "epoch": 0.06829788855857916, + "grad_norm": 0.20141257345676422, + "kl": 0.0501708984375, + "learning_rate": 3.2486420318601973e-06, + "loss": -0.0068, + "num_tokens": 18856697.0, + "reward": 0.21250000596046448, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1449.0, + "completions/max_terminated_length": 1449.0, + "completions/mean_length": 461.0625, + "completions/mean_terminated_length": 461.0625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.0686027898467871, + "grad_norm": 0.21380314230918884, + "kl": 0.05499267578125, + "learning_rate": 3.2344196869802187e-06, + "loss": 0.0102, + "num_tokens": 18940914.0, + "reward": 0.2250000238418579, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1515.0, + "completions/max_terminated_length": 1515.0, + "completions/mean_length": 479.88751220703125, + "completions/mean_terminated_length": 479.88751220703125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.06890769113499505, + "grad_norm": 0.2144291251897812, + "kl": 0.0491943359375, + "learning_rate": 3.2201770168203694e-06, + "loss": 0.0036, + "num_tokens": 19026411.0, + "reward": 0.27500003576278687, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.17499999701976776, + "rewards/unicoder_reward_fn/std": 0.3823643922805786, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 492.5249938964844, + "completions/mean_terminated_length": 492.5249938964844, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.06921259242320299, + "grad_norm": 0.08882104605436325, + "kl": 0.04925537109375, + "learning_rate": 3.205914618974563e-06, + "loss": 0.0047, + "num_tokens": 19112903.0, + "reward": 0.20000003278255463, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 780.0, + "completions/max_terminated_length": 780.0, + "completions/mean_length": 435.6625061035156, + "completions/mean_terminated_length": 435.6625061035156, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.06951749371141093, + "grad_norm": 0.21108779311180115, + "kl": 0.05218505859375, + "learning_rate": 3.1916330918644496e-06, + "loss": -0.006, + "num_tokens": 19190060.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1416.0, + "completions/max_terminated_length": 1416.0, + "completions/mean_length": 546.9249877929688, + "completions/mean_terminated_length": 546.9249877929688, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.06982239499961887, + "grad_norm": 0.11964229494333267, + "kl": 0.0482177734375, + "learning_rate": 3.177333034714303e-06, + "loss": -0.0016, + "num_tokens": 19287010.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1137.0, + "completions/max_terminated_length": 1137.0, + "completions/mean_length": 467.1625061035156, + "completions/mean_terminated_length": 467.1625061035156, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.07012729628782681, + "grad_norm": 0.0965242087841034, + "kl": 0.0548095703125, + "learning_rate": 3.1630150475258813e-06, + "loss": 0.0042, + "num_tokens": 19371217.0, + "reward": 0.17499999701976776, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 524.3375244140625, + "completions/mean_terminated_length": 524.3375244140625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.07043219757603476, + "grad_norm": 0.11118588596582413, + "kl": 0.04840087890625, + "learning_rate": 3.148679731053252e-06, + "loss": 0.0046, + "num_tokens": 19466568.0, + "reward": 0.125, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1177.0, + "completions/max_terminated_length": 1177.0, + "completions/mean_length": 543.4249877929688, + "completions/mean_terminated_length": 543.4249877929688, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.0707370988642427, + "grad_norm": 0.15548087656497955, + "kl": 0.0535888671875, + "learning_rate": 3.1343276867775805e-06, + "loss": 0.0054, + "num_tokens": 19557878.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1154.0, + "completions/max_terminated_length": 1154.0, + "completions/mean_length": 510.375, + "completions/mean_terminated_length": 510.375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.07104200015245064, + "grad_norm": 0.10689180344343185, + "kl": 0.050048828125, + "learning_rate": 3.1199595168819043e-06, + "loss": -0.0, + "num_tokens": 19643374.0, + "reward": 0.20000003278255463, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 474.9375, + "completions/mean_terminated_length": 474.9375, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.07134690144065858, + "grad_norm": 0.20179383456707, + "kl": 0.05975341796875, + "learning_rate": 3.105575824225852e-06, + "loss": -0.019, + "num_tokens": 19726871.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 525.5125122070312, + "completions/mean_terminated_length": 525.5125122070312, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "epoch": 0.07165180272886652, + "grad_norm": 0.1247158870100975, + "kl": 0.05096435546875, + "learning_rate": 3.091177212320363e-06, + "loss": 0.0034, + "num_tokens": 19819440.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1189.0, + "completions/max_terminated_length": 1189.0, + "completions/mean_length": 502.88751220703125, + "completions/mean_terminated_length": 502.88751220703125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.07195670401707448, + "grad_norm": 0.10479047149419785, + "kl": 0.05401611328125, + "learning_rate": 3.0767642853023538e-06, + "loss": 0.0005, + "num_tokens": 19905347.0, + "reward": 0.16250000894069672, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1130.0, + "completions/max_terminated_length": 1130.0, + "completions/mean_length": 520.3624877929688, + "completions/mean_terminated_length": 520.3624877929688, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.07226160530528242, + "grad_norm": 0.13667507469654083, + "kl": 0.05584716796875, + "learning_rate": 3.062337647909376e-06, + "loss": 0.0007, + "num_tokens": 19995426.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1233.0, + "completions/max_terminated_length": 1233.0, + "completions/mean_length": 538.1500244140625, + "completions/mean_terminated_length": 538.1500244140625, + "completions/min_length": 219.0, + "completions/min_terminated_length": 219.0, + "epoch": 0.07256650659349036, + "grad_norm": 0.15229485929012299, + "kl": 0.055419921875, + "learning_rate": 3.04789790545424e-06, + "loss": -0.004, + "num_tokens": 20086754.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1696.0, + "completions/max_terminated_length": 1696.0, + "completions/mean_length": 592.6625366210938, + "completions/mean_terminated_length": 592.6625366210938, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.0728714078816983, + "grad_norm": 0.20365579426288605, + "kl": 0.05059814453125, + "learning_rate": 3.033445663799621e-06, + "loss": -0.0056, + "num_tokens": 20185295.0, + "reward": 0.13625000417232513, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1354.0, + "completions/max_terminated_length": 1354.0, + "completions/mean_length": 504.13751220703125, + "completions/mean_terminated_length": 504.13751220703125, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.07317630916990624, + "grad_norm": 0.13447296619415283, + "kl": 0.05511474609375, + "learning_rate": 3.018981529332633e-06, + "loss": 0.0061, + "num_tokens": 20268892.0, + "reward": 0.1875000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1257.0, + "completions/max_terminated_length": 1257.0, + "completions/mean_length": 489.4875183105469, + "completions/mean_terminated_length": 489.4875183105469, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "epoch": 0.07348121045811419, + "grad_norm": 0.18564718961715698, + "kl": 0.05279541015625, + "learning_rate": 3.00450610893939e-06, + "loss": 0.0078, + "num_tokens": 20353243.0, + "reward": 0.21250002086162567, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1298.0, + "completions/max_terminated_length": 1298.0, + "completions/mean_length": 544.4000244140625, + "completions/mean_terminated_length": 544.4000244140625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.07378611174632213, + "grad_norm": 0.18029005825519562, + "kl": 0.0494384765625, + "learning_rate": 2.9900200099795396e-06, + "loss": 0.0083, + "num_tokens": 20446257.0, + "reward": 0.11124999821186066, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1963.0, + "completions/max_terminated_length": 1963.0, + "completions/mean_length": 501.1000061035156, + "completions/mean_terminated_length": 501.1000061035156, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "epoch": 0.07409101303453007, + "grad_norm": 0.06964573264122009, + "kl": 0.0528564453125, + "learning_rate": 2.9755238402607826e-06, + "loss": -0.0022, + "num_tokens": 20535433.0, + "reward": 0.1625000238418579, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1388.0, + "completions/max_terminated_length": 1388.0, + "completions/mean_length": 435.5, + "completions/mean_terminated_length": 435.5, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "epoch": 0.07439591432273801, + "grad_norm": 0.19546860456466675, + "kl": 0.057861328125, + "learning_rate": 2.961018208013367e-06, + "loss": 0.0005, + "num_tokens": 20618669.0, + "reward": 0.14875002205371857, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1013.0, + "completions/max_terminated_length": 1013.0, + "completions/mean_length": 512.2000122070312, + "completions/mean_terminated_length": 512.2000122070312, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.07470081561094595, + "grad_norm": 0.1879434734582901, + "kl": 0.0555419921875, + "learning_rate": 2.9465037218645694e-06, + "loss": 0.0154, + "num_tokens": 20708125.0, + "reward": 0.20000003278255463, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1509.0, + "completions/max_terminated_length": 1509.0, + "completions/mean_length": 459.4624938964844, + "completions/mean_terminated_length": 459.4624938964844, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.0750057168991539, + "grad_norm": 0.1770198494195938, + "kl": 0.0557861328125, + "learning_rate": 2.9319809908131604e-06, + "loss": 0.0076, + "num_tokens": 20795776.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 499.3625183105469, + "completions/mean_terminated_length": 499.3625183105469, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.07531061818736184, + "grad_norm": 0.2515522539615631, + "kl": 0.05352783203125, + "learning_rate": 2.917450624203847e-06, + "loss": -0.001, + "num_tokens": 20885125.0, + "reward": 0.20000003278255463, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1376.0, + "completions/max_terminated_length": 1376.0, + "completions/mean_length": 467.0874938964844, + "completions/mean_terminated_length": 467.0874938964844, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.07561551947556978, + "grad_norm": 0.1412237584590912, + "kl": 0.05340576171875, + "learning_rate": 2.9029132317017118e-06, + "loss": -0.0074, + "num_tokens": 20971492.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 483.2250061035156, + "completions/mean_terminated_length": 483.2250061035156, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.07592042076377772, + "grad_norm": 0.08514195680618286, + "kl": 0.05108642578125, + "learning_rate": 2.888369423266629e-06, + "loss": 0.0062, + "num_tokens": 21055896.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1658.0, + "completions/max_terminated_length": 1658.0, + "completions/mean_length": 498.8374938964844, + "completions/mean_terminated_length": 498.8374938964844, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.07622532205198566, + "grad_norm": 0.011307273991405964, + "kl": 0.05059814453125, + "learning_rate": 2.8738198091276712e-06, + "loss": 0.0005, + "num_tokens": 21144641.0, + "reward": 0.1250000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1124.0, + "completions/max_terminated_length": 1124.0, + "completions/mean_length": 490.13751220703125, + "completions/mean_terminated_length": 490.13751220703125, + "completions/min_length": 68.0, + "completions/min_terminated_length": 68.0, + "epoch": 0.07653022334019362, + "grad_norm": 0.06738625466823578, + "kl": 0.05255126953125, + "learning_rate": 2.859264999757509e-06, + "loss": 0.0021, + "num_tokens": 21236112.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1016.0, + "completions/max_terminated_length": 1016.0, + "completions/mean_length": 440.5500183105469, + "completions/mean_terminated_length": 440.5500183105469, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.07683512462840156, + "grad_norm": 0.1394670307636261, + "kl": 0.0572509765625, + "learning_rate": 2.8447056058467928e-06, + "loss": -0.0058, + "num_tokens": 21317460.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 412.1125183105469, + "completions/mean_terminated_length": 412.1125183105469, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.0771400259166095, + "grad_norm": 0.18536435067653656, + "kl": 0.05535888671875, + "learning_rate": 2.830142238278531e-06, + "loss": 0.0095, + "num_tokens": 21397969.0, + "reward": 0.2250000238418579, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1132.0, + "completions/max_terminated_length": 1132.0, + "completions/mean_length": 485.13751220703125, + "completions/mean_terminated_length": 485.13751220703125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.07744492720481744, + "grad_norm": 0.16279473900794983, + "kl": 0.05804443359375, + "learning_rate": 2.81557550810246e-06, + "loss": 0.0074, + "num_tokens": 21483776.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1400.0, + "completions/max_terminated_length": 1400.0, + "completions/mean_length": 501.2124938964844, + "completions/mean_terminated_length": 501.2124938964844, + "completions/min_length": 228.0, + "completions/min_terminated_length": 228.0, + "epoch": 0.07774982849302538, + "grad_norm": 0.136715367436409, + "kl": 0.05419921875, + "learning_rate": 2.8010060265094026e-06, + "loss": -0.0002, + "num_tokens": 21573965.0, + "reward": 0.1625000238418579, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1386.0, + "completions/max_terminated_length": 1386.0, + "completions/mean_length": 516.1500244140625, + "completions/mean_terminated_length": 516.1500244140625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.07805472978123333, + "grad_norm": 0.23355121910572052, + "kl": 0.0560302734375, + "learning_rate": 2.786434404805629e-06, + "loss": -0.0051, + "num_tokens": 21660273.0, + "reward": 0.17375002801418304, + "reward_std": 0.10783378034830093, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531530380249, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1097.0, + "completions/max_terminated_length": 1097.0, + "completions/mean_length": 484.8999938964844, + "completions/mean_terminated_length": 484.8999938964844, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.07835963106944127, + "grad_norm": 0.07816074043512344, + "kl": 0.05767822265625, + "learning_rate": 2.771861254387199e-06, + "loss": 0.0014, + "num_tokens": 21750347.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1437.0, + "completions/max_terminated_length": 1437.0, + "completions/mean_length": 518.9000244140625, + "completions/mean_terminated_length": 518.9000244140625, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "epoch": 0.07866453235764921, + "grad_norm": 0.37866440415382385, + "kl": 0.08160400390625, + "learning_rate": 2.7572871867143204e-06, + "loss": 0.0085, + "num_tokens": 21846487.0, + "reward": 0.14875000715255737, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1023.0, + "completions/max_terminated_length": 1023.0, + "completions/mean_length": 467.82501220703125, + "completions/mean_terminated_length": 467.82501220703125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.07896943364585715, + "grad_norm": 0.2316463440656662, + "kl": 0.058837890625, + "learning_rate": 2.742712813285681e-06, + "loss": -0.0169, + "num_tokens": 21932599.0, + "reward": 0.23750002682209015, + "reward_std": 0.1237436905503273, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1109.0, + "completions/max_terminated_length": 1109.0, + "completions/mean_length": 454.5874938964844, + "completions/mean_terminated_length": 454.5874938964844, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.07927433493406509, + "grad_norm": 0.21413984894752502, + "kl": 0.0577392578125, + "learning_rate": 2.7281387456128017e-06, + "loss": 0.0008, + "num_tokens": 22020246.0, + "reward": 0.18250000476837158, + "reward_std": 0.02474873699247837, + "rewards/format_reward/mean": 0.949999988079071, + "rewards/format_reward/std": 0.21931999921798706, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1379.0, + "completions/max_terminated_length": 1379.0, + "completions/mean_length": 490.07501220703125, + "completions/mean_terminated_length": 490.07501220703125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.07957923622227304, + "grad_norm": 0.2023562639951706, + "kl": 0.05303955078125, + "learning_rate": 2.7135655951943716e-06, + "loss": -0.0012, + "num_tokens": 22106920.0, + "reward": 0.14750002324581146, + "reward_std": 0.03889087215065956, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 454.7875061035156, + "completions/mean_terminated_length": 454.7875061035156, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.07988413751048098, + "grad_norm": 0.36662039160728455, + "kl": 0.0594482421875, + "learning_rate": 2.698993973490598e-06, + "loss": 0.0226, + "num_tokens": 22187965.0, + "reward": 0.13375000655651093, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.8374999761581421, + "rewards/format_reward/std": 0.3712363839149475, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1335.0, + "completions/max_terminated_length": 1335.0, + "completions/mean_length": 514.9125366210938, + "completions/mean_terminated_length": 514.9125366210938, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "epoch": 0.08018903879868892, + "grad_norm": 0.3255445659160614, + "kl": 0.056640625, + "learning_rate": 2.6844244918975416e-06, + "loss": 0.0268, + "num_tokens": 22278120.0, + "reward": 0.16250000894069672, + "reward_std": 0.08131727576255798, + "rewards/format_reward/mean": 0.875, + "rewards/format_reward/std": 0.33280548453330994, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1663.0, + "completions/mean_length": 513.7374877929688, + "completions/mean_terminated_length": 494.31646728515625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.08049394008689686, + "grad_norm": 0.39617183804512024, + "kl": 0.05194091796875, + "learning_rate": 2.66985776172147e-06, + "loss": 0.0428, + "num_tokens": 22365541.0, + "reward": 0.15125001966953278, + "reward_std": 0.10429824888706207, + "rewards/format_reward/mean": 0.887499988079071, + "rewards/format_reward/std": 0.3179742097854614, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1199.0, + "completions/max_terminated_length": 1199.0, + "completions/mean_length": 497.7250061035156, + "completions/mean_terminated_length": 497.7250061035156, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "epoch": 0.0807988413751048, + "grad_norm": 0.1744794100522995, + "kl": 0.0535888671875, + "learning_rate": 2.6552943941532088e-06, + "loss": -0.0065, + "num_tokens": 22453057.0, + "reward": 0.12250001728534698, + "reward_std": 0.03889087215065956, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1705.0, + "completions/max_terminated_length": 1705.0, + "completions/mean_length": 525.3250122070312, + "completions/mean_terminated_length": 525.3250122070312, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.08110374266331276, + "grad_norm": 0.20356491208076477, + "kl": 0.052001953125, + "learning_rate": 2.6407350002424927e-06, + "loss": 0.0127, + "num_tokens": 22544929.0, + "reward": 0.14625000953674316, + "reward_std": 0.04065864160656929, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1503.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 471.7749938964844, + "completions/mean_terminated_length": 471.7749938964844, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.0814086439515207, + "grad_norm": 0.18078124523162842, + "kl": 0.04986572265625, + "learning_rate": 2.626180190872329e-06, + "loss": -0.01, + "num_tokens": 22630733.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1374.0, + "completions/max_terminated_length": 1374.0, + "completions/mean_length": 462.3125, + "completions/mean_terminated_length": 462.3125, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "epoch": 0.08171354523972864, + "grad_norm": 0.12597592175006866, + "kl": 0.05572509765625, + "learning_rate": 2.611630576733372e-06, + "loss": 0.0038, + "num_tokens": 22715850.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1875.0, + "completions/max_terminated_length": 1875.0, + "completions/mean_length": 507.1499938964844, + "completions/mean_terminated_length": 507.1499938964844, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "epoch": 0.08201844652793658, + "grad_norm": 0.23196038603782654, + "kl": 0.06365966796875, + "learning_rate": 2.5970867682982885e-06, + "loss": 0.013, + "num_tokens": 22803240.0, + "reward": 0.13375000655651093, + "reward_std": 0.05833630636334419, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1414.0, + "completions/max_terminated_length": 1414.0, + "completions/mean_length": 464.3374938964844, + "completions/mean_terminated_length": 464.3374938964844, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.08232334781614452, + "grad_norm": 0.23718927800655365, + "kl": 0.05914306640625, + "learning_rate": 2.582549375796154e-06, + "loss": 0.0149, + "num_tokens": 22891955.0, + "reward": 0.14750002324581146, + "reward_std": 0.07424621284008026, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1625.0, + "completions/max_terminated_length": 1625.0, + "completions/mean_length": 543.7000122070312, + "completions/mean_terminated_length": 543.7000122070312, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "epoch": 0.08262824910435247, + "grad_norm": 0.19418348371982574, + "kl": 0.0516357421875, + "learning_rate": 2.568019009186841e-06, + "loss": 0.0201, + "num_tokens": 22988511.0, + "reward": 0.17125001549720764, + "reward_std": 0.07601398229598999, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1268.0, + "completions/max_terminated_length": 1268.0, + "completions/mean_length": 453.5874938964844, + "completions/mean_terminated_length": 453.5874938964844, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "epoch": 0.08293315039256041, + "grad_norm": 0.20042388141155243, + "kl": 0.05975341796875, + "learning_rate": 2.5534962781354317e-06, + "loss": -0.0057, + "num_tokens": 23078212.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1336.0, + "completions/max_terminated_length": 1336.0, + "completions/mean_length": 435.8500061035156, + "completions/mean_terminated_length": 435.8500061035156, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.08323805168076835, + "grad_norm": 0.12753447890281677, + "kl": 0.054443359375, + "learning_rate": 2.538981791986634e-06, + "loss": 0.0026, + "num_tokens": 23165238.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1411.0, + "completions/max_terminated_length": 1411.0, + "completions/mean_length": 473.75, + "completions/mean_terminated_length": 473.75, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.08354295296897629, + "grad_norm": 0.12009397894144058, + "kl": 0.052978515625, + "learning_rate": 2.524476159739218e-06, + "loss": 0.005, + "num_tokens": 23252430.0, + "reward": 0.17500002682209015, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1256.0, + "completions/max_terminated_length": 1256.0, + "completions/mean_length": 451.63751220703125, + "completions/mean_terminated_length": 451.63751220703125, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.08384785425718423, + "grad_norm": 0.1936834752559662, + "kl": 0.051513671875, + "learning_rate": 2.5099799900204607e-06, + "loss": 0.0055, + "num_tokens": 23334535.0, + "reward": 0.1612500250339508, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1269.0, + "completions/max_terminated_length": 1269.0, + "completions/mean_length": 455.9125061035156, + "completions/mean_terminated_length": 455.9125061035156, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.08415275554539219, + "grad_norm": 0.164746955037117, + "kl": 0.052490234375, + "learning_rate": 2.4954938910606108e-06, + "loss": 0.005, + "num_tokens": 23418978.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1540.0, + "completions/max_terminated_length": 1540.0, + "completions/mean_length": 451.3999938964844, + "completions/mean_terminated_length": 451.3999938964844, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.08445765683360013, + "grad_norm": 0.07814321666955948, + "kl": 0.05133056640625, + "learning_rate": 2.481018470667368e-06, + "loss": 0.0041, + "num_tokens": 23501772.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1570.0, + "completions/max_terminated_length": 1570.0, + "completions/mean_length": 468.1125183105469, + "completions/mean_terminated_length": 468.1125183105469, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.08476255812180807, + "grad_norm": 0.11970165371894836, + "kl": 0.0550537109375, + "learning_rate": 2.4665543362003802e-06, + "loss": 0.0026, + "num_tokens": 23588901.0, + "reward": 0.19875001907348633, + "reward_std": 0.03712311014533043, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1174.0, + "completions/max_terminated_length": 1174.0, + "completions/mean_length": 471.25, + "completions/mean_terminated_length": 471.25, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.085067459410016, + "grad_norm": 0.18868288397789001, + "kl": 0.05419921875, + "learning_rate": 2.4521020945457615e-06, + "loss": 0.0044, + "num_tokens": 23678053.0, + "reward": 0.16250000894069672, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1797.0, + "completions/max_terminated_length": 1797.0, + "completions/mean_length": 431.6875, + "completions/mean_terminated_length": 431.6875, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.08537236069822395, + "grad_norm": 0.17599807679653168, + "kl": 0.0562744140625, + "learning_rate": 2.4376623520906255e-06, + "loss": -0.0002, + "num_tokens": 23761322.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 949.0, + "completions/max_terminated_length": 949.0, + "completions/mean_length": 419.5874938964844, + "completions/mean_terminated_length": 419.5874938964844, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.0856772619864319, + "grad_norm": 0.015127049759030342, + "kl": 0.05780029296875, + "learning_rate": 2.4232357146976478e-06, + "loss": 0.0006, + "num_tokens": 23842793.0, + "reward": 0.1250000149011612, + "reward_std": 0.0, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1091.0, + "completions/max_terminated_length": 1091.0, + "completions/mean_length": 405.07501220703125, + "completions/mean_terminated_length": 405.07501220703125, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.08598216327463984, + "grad_norm": 0.15912564098834991, + "kl": 0.05450439453125, + "learning_rate": 2.408822787679637e-06, + "loss": 0.0003, + "num_tokens": 23921019.0, + "reward": 0.17375002801418304, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1962.0, + "completions/max_terminated_length": 1962.0, + "completions/mean_length": 465.9250183105469, + "completions/mean_terminated_length": 465.9250183105469, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.08628706456284778, + "grad_norm": 0.18430808186531067, + "kl": 0.05889892578125, + "learning_rate": 2.3944241757741475e-06, + "loss": -0.0003, + "num_tokens": 24009529.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1259.0, + "completions/max_terminated_length": 1259.0, + "completions/mean_length": 461.6750183105469, + "completions/mean_terminated_length": 461.6750183105469, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "epoch": 0.08659196585105572, + "grad_norm": 0.10521125048398972, + "kl": 0.054931640625, + "learning_rate": 2.380040483118097e-06, + "loss": 0.0058, + "num_tokens": 24098463.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 431.9624938964844, + "completions/mean_terminated_length": 431.9624938964844, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.08689686713926366, + "grad_norm": 0.18808409571647644, + "kl": 0.0546875, + "learning_rate": 2.365672313222419e-06, + "loss": 0.0099, + "num_tokens": 24181320.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1415.0, + "completions/max_terminated_length": 1415.0, + "completions/mean_length": 448.2875061035156, + "completions/mean_terminated_length": 448.2875061035156, + "completions/min_length": 44.0, + "completions/min_terminated_length": 44.0, + "epoch": 0.08720176842747161, + "grad_norm": 0.1849883794784546, + "kl": 0.05560302734375, + "learning_rate": 2.351320268946749e-06, + "loss": -0.0067, + "num_tokens": 24268271.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1215.0, + "completions/max_terminated_length": 1215.0, + "completions/mean_length": 485.70001220703125, + "completions/mean_terminated_length": 485.70001220703125, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.08750666971567955, + "grad_norm": 0.32291901111602783, + "kl": 0.1077880859375, + "learning_rate": 2.336984952474119e-06, + "loss": 0.0046, + "num_tokens": 24359331.0, + "reward": 0.12375000864267349, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1483.0, + "completions/max_terminated_length": 1483.0, + "completions/mean_length": 439.2875061035156, + "completions/mean_terminated_length": 439.2875061035156, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.08781157100388749, + "grad_norm": 0.27845945954322815, + "kl": 0.058837890625, + "learning_rate": 2.322666965285697e-06, + "loss": -0.0068, + "num_tokens": 24440342.0, + "reward": 0.21250002086162567, + "reward_std": 0.12374367564916611, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 856.0, + "completions/max_terminated_length": 856.0, + "completions/mean_length": 394.5375061035156, + "completions/mean_terminated_length": 394.5375061035156, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.08811647229209543, + "grad_norm": 0.22477968037128448, + "kl": 0.05712890625, + "learning_rate": 2.3083669081355507e-06, + "loss": 0.0006, + "num_tokens": 24518207.0, + "reward": 0.14875002205371857, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1919.0, + "completions/max_terminated_length": 1919.0, + "completions/mean_length": 493.63751220703125, + "completions/mean_terminated_length": 493.63751220703125, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.08842137358030337, + "grad_norm": 0.209952712059021, + "kl": 0.05535888671875, + "learning_rate": 2.2940853810254377e-06, + "loss": -0.008, + "num_tokens": 24604480.0, + "reward": 0.1862500160932541, + "reward_std": 0.09015611559152603, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1040.0, + "completions/max_terminated_length": 1040.0, + "completions/mean_length": 449.51251220703125, + "completions/mean_terminated_length": 449.51251220703125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.08872627486851133, + "grad_norm": 0.21413753926753998, + "kl": 0.051513671875, + "learning_rate": 2.2798229831796313e-06, + "loss": 0.0089, + "num_tokens": 24687085.0, + "reward": 0.12000000476837158, + "reward_std": 0.0070710680447518826, + "rewards/format_reward/mean": 0.949999988079071, + "rewards/format_reward/std": 0.21931999921798706, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1057.0, + "completions/max_terminated_length": 1057.0, + "completions/mean_length": 448.9375, + "completions/mean_terminated_length": 448.9375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "epoch": 0.08903117615671927, + "grad_norm": 0.2990638315677643, + "kl": 0.04931640625, + "learning_rate": 2.2655803130197816e-06, + "loss": -0.0008, + "num_tokens": 24772748.0, + "reward": 0.11500000953674316, + "reward_std": 0.04949747398495674, + "rewards/format_reward/mean": 0.8999999761581421, + "rewards/format_reward/std": 0.3018927276134491, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 444.3625183105469, + "completions/mean_terminated_length": 444.3625183105469, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.0893360774449272, + "grad_norm": 0.30735084414482117, + "kl": 0.051025390625, + "learning_rate": 2.2513579681398034e-06, + "loss": -0.01, + "num_tokens": 24861053.0, + "reward": 0.15375001728534698, + "reward_std": 0.10076271742582321, + "rewards/format_reward/mean": 0.9125000238418579, + "rewards/format_reward/std": 0.28434911370277405, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1176.0, + "completions/max_terminated_length": 1176.0, + "completions/mean_length": 424.51251220703125, + "completions/mean_terminated_length": 424.51251220703125, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "epoch": 0.08964097873313515, + "grad_norm": 0.28689104318618774, + "kl": 0.052001953125, + "learning_rate": 2.237156545280803e-06, + "loss": -0.0063, + "num_tokens": 24942834.0, + "reward": 0.1300000101327896, + "reward_std": 0.06010407209396362, + "rewards/format_reward/mean": 0.925000011920929, + "rewards/format_reward/std": 0.2650531232357025, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1527.0, + "completions/max_terminated_length": 1527.0, + "completions/mean_length": 504.5375061035156, + "completions/mean_terminated_length": 504.5375061035156, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "epoch": 0.08994588002134309, + "grad_norm": 0.25867027044296265, + "kl": 0.05072021484375, + "learning_rate": 2.2229766403060403e-06, + "loss": -0.0047, + "num_tokens": 25030547.0, + "reward": 0.16500000655651093, + "reward_std": 0.11313708871603012, + "rewards/format_reward/mean": 0.8999999761581421, + "rewards/format_reward/std": 0.3018927574157715, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1292.0, + "completions/max_terminated_length": 1292.0, + "completions/mean_length": 453.5249938964844, + "completions/mean_terminated_length": 453.5249938964844, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.09025078130955104, + "grad_norm": 0.30247601866722107, + "kl": 0.0528564453125, + "learning_rate": 2.2088188481759305e-06, + "loss": -0.0018, + "num_tokens": 25109955.0, + "reward": 0.17750000953674316, + "reward_std": 0.06717514246702194, + "rewards/format_reward/mean": 0.8999999761581421, + "rewards/format_reward/std": 0.3018927276134491, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1423.0, + "completions/max_terminated_length": 1423.0, + "completions/mean_length": 478.95001220703125, + "completions/mean_terminated_length": 478.95001220703125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.09055568259775898, + "grad_norm": 0.20232892036437988, + "kl": 0.0496826171875, + "learning_rate": 2.194683762923073e-06, + "loss": 0.0001, + "num_tokens": 25195167.0, + "reward": 0.12000000476837158, + "reward_std": 0.04242640733718872, + "rewards/format_reward/mean": 0.949999988079071, + "rewards/format_reward/std": 0.21931999921798706, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1428.0, + "completions/max_terminated_length": 1428.0, + "completions/mean_length": 459.76251220703125, + "completions/mean_terminated_length": 459.76251220703125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.09086058388596692, + "grad_norm": 0.28715941309928894, + "kl": 0.05517578125, + "learning_rate": 2.1805719776273387e-06, + "loss": -0.0145, + "num_tokens": 25272854.0, + "reward": 0.1925000250339508, + "reward_std": 0.08131728321313858, + "rewards/format_reward/mean": 0.925000011920929, + "rewards/format_reward/std": 0.2650531232357025, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1391.0, + "completions/max_terminated_length": 1391.0, + "completions/mean_length": 474.9250183105469, + "completions/mean_terminated_length": 474.9250183105469, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.09116548517417486, + "grad_norm": 0.19802211225032806, + "kl": 0.05157470703125, + "learning_rate": 2.166484084390974e-06, + "loss": 0.0061, + "num_tokens": 25365288.0, + "reward": 0.1862500160932541, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 983.0, + "completions/max_terminated_length": 983.0, + "completions/mean_length": 489.63751220703125, + "completions/mean_terminated_length": 489.63751220703125, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "epoch": 0.0914703864623828, + "grad_norm": 0.2699923813343048, + "kl": 0.0726318359375, + "learning_rate": 2.1524206743137636e-06, + "loss": -0.0068, + "num_tokens": 25452087.0, + "reward": 0.18500001728534698, + "reward_std": 0.12727922201156616, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1338.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 514.1500244140625, + "completions/mean_terminated_length": 514.1500244140625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.09177528775059074, + "grad_norm": 0.1984640210866928, + "kl": 0.05224609375, + "learning_rate": 2.1383823374682287e-06, + "loss": 0.0169, + "num_tokens": 25546801.0, + "reward": 0.1612500101327896, + "reward_std": 0.09015611559152603, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1245.0, + "completions/max_terminated_length": 1245.0, + "completions/mean_length": 491.25, + "completions/mean_terminated_length": 491.25, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.0920801890387987, + "grad_norm": 0.18095120787620544, + "kl": 0.05096435546875, + "learning_rate": 2.124369662874868e-06, + "loss": 0.0081, + "num_tokens": 25636455.0, + "reward": 0.12375000864267349, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2041.0, + "completions/max_terminated_length": 2041.0, + "completions/mean_length": 464.95001220703125, + "completions/mean_terminated_length": 464.95001220703125, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.09238509032700663, + "grad_norm": 0.1827562004327774, + "kl": 0.05792236328125, + "learning_rate": 2.110383238477441e-06, + "loss": 0.0124, + "num_tokens": 25718507.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 987.0, + "completions/max_terminated_length": 987.0, + "completions/mean_length": 474.45001220703125, + "completions/mean_terminated_length": 474.45001220703125, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.09268999161521457, + "grad_norm": 0.2177320420742035, + "kl": 0.05657958984375, + "learning_rate": 2.096423651118305e-06, + "loss": -0.0, + "num_tokens": 25806627.0, + "reward": 0.1862500160932541, + "reward_std": 0.09015611559152603, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1985.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 498.0375061035156, + "completions/mean_terminated_length": 498.0375061035156, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.09299489290342251, + "grad_norm": 0.1999090313911438, + "kl": 0.05413818359375, + "learning_rate": 2.082491486513788e-06, + "loss": 0.011, + "num_tokens": 25894094.0, + "reward": 0.17375002801418304, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1739.0, + "completions/max_terminated_length": 1739.0, + "completions/mean_length": 488.5625, + "completions/mean_terminated_length": 488.5625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.09329979419163045, + "grad_norm": 0.18753774464130402, + "kl": 0.05328369140625, + "learning_rate": 2.0685873292296116e-06, + "loss": -0.0051, + "num_tokens": 25985229.0, + "reward": 0.19875001907348633, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 914.0, + "completions/max_terminated_length": 914.0, + "completions/mean_length": 454.63751220703125, + "completions/mean_terminated_length": 454.63751220703125, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "epoch": 0.0936046954798384, + "grad_norm": 0.17221041023731232, + "kl": 0.052001953125, + "learning_rate": 2.054711762656369e-06, + "loss": -0.0021, + "num_tokens": 26069054.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1486.0, + "completions/max_terminated_length": 1486.0, + "completions/mean_length": 523.4874877929688, + "completions/mean_terminated_length": 523.4874877929688, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.09390959676804635, + "grad_norm": 0.0688505545258522, + "kl": 0.0504150390625, + "learning_rate": 2.040865368985044e-06, + "loss": -0.0007, + "num_tokens": 26164117.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1228.0, + "completions/max_terminated_length": 1228.0, + "completions/mean_length": 457.9624938964844, + "completions/mean_terminated_length": 457.9624938964844, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.09421449805625429, + "grad_norm": 0.09899485111236572, + "kl": 0.05426025390625, + "learning_rate": 2.027048729182583e-06, + "loss": 0.0, + "num_tokens": 26247982.0, + "reward": 0.13750000298023224, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 460.8500061035156, + "completions/mean_terminated_length": 460.8500061035156, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.09451939934446223, + "grad_norm": 0.15719051659107208, + "kl": 0.0552978515625, + "learning_rate": 2.0132624229675205e-06, + "loss": 0.0058, + "num_tokens": 26335480.0, + "reward": 0.1625000238418579, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1182.0, + "completions/max_terminated_length": 1182.0, + "completions/mean_length": 408.1750183105469, + "completions/mean_terminated_length": 408.1750183105469, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.09482430063267017, + "grad_norm": 0.16415221989154816, + "kl": 0.05621337890625, + "learning_rate": 1.9995070287856546e-06, + "loss": -0.0017, + "num_tokens": 26414044.0, + "reward": 0.1875000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1355.0, + "completions/max_terminated_length": 1355.0, + "completions/mean_length": 459.1000061035156, + "completions/mean_terminated_length": 459.1000061035156, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "epoch": 0.09512920192087812, + "grad_norm": 0.1567731350660324, + "kl": 0.05615234375, + "learning_rate": 1.985783123785774e-06, + "loss": -0.0001, + "num_tokens": 26499808.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1798.0, + "completions/max_terminated_length": 1798.0, + "completions/mean_length": 490.32501220703125, + "completions/mean_terminated_length": 490.32501220703125, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "epoch": 0.09543410320908606, + "grad_norm": 0.16579455137252808, + "kl": 0.0517578125, + "learning_rate": 1.9720912837954486e-06, + "loss": -0.0028, + "num_tokens": 26584350.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1157.0, + "completions/max_terminated_length": 1157.0, + "completions/mean_length": 443.6125183105469, + "completions/mean_terminated_length": 443.6125183105469, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "epoch": 0.095739004497294, + "grad_norm": 0.1825886368751526, + "kl": 0.0528564453125, + "learning_rate": 1.958432083296862e-06, + "loss": 0.0058, + "num_tokens": 26668297.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1138.0, + "completions/max_terminated_length": 1138.0, + "completions/mean_length": 489.1875, + "completions/mean_terminated_length": 489.1875, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.09604390578550194, + "grad_norm": 0.22271665930747986, + "kl": 0.05328369140625, + "learning_rate": 1.9448060954027093e-06, + "loss": -0.0087, + "num_tokens": 26753640.0, + "reward": 0.2250000238418579, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 975.0, + "completions/max_terminated_length": 975.0, + "completions/mean_length": 444.26251220703125, + "completions/mean_terminated_length": 444.26251220703125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "epoch": 0.09634880707370988, + "grad_norm": 0.18735575675964355, + "kl": 0.052734375, + "learning_rate": 1.931213891832153e-06, + "loss": -0.0087, + "num_tokens": 26833649.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1114.0, + "completions/max_terminated_length": 1114.0, + "completions/mean_length": 444.4624938964844, + "completions/mean_terminated_length": 444.4624938964844, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.09665370836191783, + "grad_norm": 0.2518155574798584, + "kl": 0.05218505859375, + "learning_rate": 1.9176560428868336e-06, + "loss": 0.023, + "num_tokens": 26913246.0, + "reward": 0.30000004172325134, + "reward_std": 0.1414213627576828, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.20000000298023224, + "rewards/unicoder_reward_fn/std": 0.4025236964225769, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 397.3500061035156, + "completions/mean_terminated_length": 397.3500061035156, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.09695860965012577, + "grad_norm": 0.16875123977661133, + "kl": 0.0560302734375, + "learning_rate": 1.9041331174269373e-06, + "loss": 0.0031, + "num_tokens": 26990552.0, + "reward": 0.1875000149011612, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 918.0, + "completions/max_terminated_length": 918.0, + "completions/mean_length": 463.1499938964844, + "completions/mean_terminated_length": 463.1499938964844, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.09726351093833371, + "grad_norm": 0.17430393397808075, + "kl": 0.05401611328125, + "learning_rate": 1.8906456828473341e-06, + "loss": 0.0036, + "num_tokens": 27076338.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1058.0, + "completions/max_terminated_length": 1058.0, + "completions/mean_length": 436.9125061035156, + "completions/mean_terminated_length": 436.9125061035156, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.09756841222654165, + "grad_norm": 0.1987638920545578, + "kl": 0.0550537109375, + "learning_rate": 1.8771943050537656e-06, + "loss": 0.0036, + "num_tokens": 27159067.0, + "reward": 0.20000003278255463, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1049.0, + "completions/max_terminated_length": 1049.0, + "completions/mean_length": 444.95001220703125, + "completions/mean_terminated_length": 444.95001220703125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.0978733135147496, + "grad_norm": 0.14804542064666748, + "kl": 0.05242919921875, + "learning_rate": 1.8637795484391046e-06, + "loss": 0.0037, + "num_tokens": 27241497.0, + "reward": 0.21250002086162567, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2032.0, + "completions/max_terminated_length": 2032.0, + "completions/mean_length": 520.5250244140625, + "completions/mean_terminated_length": 520.5250244140625, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.09817821480295755, + "grad_norm": 0.18733462691307068, + "kl": 0.05059814453125, + "learning_rate": 1.8504019758596698e-06, + "loss": 0.0081, + "num_tokens": 27333439.0, + "reward": 0.20000003278255463, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1542.0, + "completions/max_terminated_length": 1542.0, + "completions/mean_length": 422.7124938964844, + "completions/mean_terminated_length": 422.7124938964844, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.09848311609116549, + "grad_norm": 0.19084835052490234, + "kl": 0.05145263671875, + "learning_rate": 1.8370621486116163e-06, + "loss": 0.0045, + "num_tokens": 27412614.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1109.0, + "completions/max_terminated_length": 1109.0, + "completions/mean_length": 480.5375061035156, + "completions/mean_terminated_length": 480.5375061035156, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.09878801737937343, + "grad_norm": 0.1860656887292862, + "kl": 0.04840087890625, + "learning_rate": 1.823760626407377e-06, + "loss": 0.0028, + "num_tokens": 27498657.0, + "reward": 0.1862500160932541, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1159.0, + "completions/max_terminated_length": 1159.0, + "completions/mean_length": 472.2749938964844, + "completions/mean_terminated_length": 472.2749938964844, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.09909291866758137, + "grad_norm": 0.09327766299247742, + "kl": 0.053955078125, + "learning_rate": 1.8104979673521838e-06, + "loss": -0.002, + "num_tokens": 27583791.0, + "reward": 0.17500002682209015, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1286.0, + "completions/max_terminated_length": 1286.0, + "completions/mean_length": 432.0, + "completions/mean_terminated_length": 432.0, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.09939781995578931, + "grad_norm": 0.2940045893192291, + "kl": 0.05206298828125, + "learning_rate": 1.7972747279206482e-06, + "loss": -0.0095, + "num_tokens": 27664373.0, + "reward": 0.2500000298023224, + "reward_std": 0.1414213627576828, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.15000000596046448, + "rewards/unicoder_reward_fn/std": 0.35932427644729614, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1213.0, + "completions/max_terminated_length": 1213.0, + "completions/mean_length": 431.9875183105469, + "completions/mean_terminated_length": 431.9875183105469, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.09970272124399726, + "grad_norm": 0.21221143007278442, + "kl": 0.05511474609375, + "learning_rate": 1.7840914629334122e-06, + "loss": -0.0028, + "num_tokens": 27747314.0, + "reward": 0.20000003278255463, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1116.0, + "completions/max_terminated_length": 1116.0, + "completions/mean_length": 424.32501220703125, + "completions/mean_terminated_length": 424.32501220703125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.1000076225322052, + "grad_norm": 0.1102011427283287, + "kl": 0.05322265625, + "learning_rate": 1.7709487255338731e-06, + "loss": -0.0044, + "num_tokens": 27829286.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 447.625, + "completions/mean_terminated_length": 447.625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.10031252382041314, + "grad_norm": 0.21028603613376617, + "kl": 0.05322265625, + "learning_rate": 1.7578470671649684e-06, + "loss": 0.0188, + "num_tokens": 27912630.0, + "reward": 0.2237500250339508, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1119.0, + "completions/max_terminated_length": 1119.0, + "completions/mean_length": 422.70001220703125, + "completions/mean_terminated_length": 422.70001220703125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.10061742510862108, + "grad_norm": 0.09216304868459702, + "kl": 0.05438232421875, + "learning_rate": 1.744787037546045e-06, + "loss": -0.0007, + "num_tokens": 27997754.0, + "reward": 0.2500000298023224, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.15000000596046448, + "rewards/unicoder_reward_fn/std": 0.35932427644729614, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 414.6499938964844, + "completions/mean_terminated_length": 414.6499938964844, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.10092232639682902, + "grad_norm": 0.12062133103609085, + "kl": 0.04913330078125, + "learning_rate": 1.731769184649788e-06, + "loss": -0.0032, + "num_tokens": 28076546.0, + "reward": 0.1875000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1300.0, + "completions/mean_length": 455.8999938964844, + "completions/mean_terminated_length": 435.7468566894531, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "epoch": 0.10122722768503697, + "grad_norm": 0.25286737084388733, + "kl": 0.05377197265625, + "learning_rate": 1.7187940546792325e-06, + "loss": 0.0184, + "num_tokens": 28159116.0, + "reward": 0.23375001549720764, + "reward_std": 0.09369164705276489, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1366.0, + "completions/max_terminated_length": 1366.0, + "completions/mean_length": 470.07501220703125, + "completions/mean_terminated_length": 470.07501220703125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.10153212897324491, + "grad_norm": 0.2345660924911499, + "kl": 0.048095703125, + "learning_rate": 1.7058621920448465e-06, + "loss": 0.0209, + "num_tokens": 28244778.0, + "reward": 0.23625002801418304, + "reward_std": 0.12551145255565643, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 441.3374938964844, + "completions/mean_terminated_length": 441.3374938964844, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.10183703026145285, + "grad_norm": 0.07905353605747223, + "kl": 0.053466796875, + "learning_rate": 1.6929741393416855e-06, + "loss": 0.0036, + "num_tokens": 28330591.0, + "reward": 0.13750000298023224, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1783.0, + "completions/max_terminated_length": 1783.0, + "completions/mean_length": 437.6750183105469, + "completions/mean_terminated_length": 437.6750183105469, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.1021419315496608, + "grad_norm": 0.19001714885234833, + "kl": 0.04754638671875, + "learning_rate": 1.6801304373266286e-06, + "loss": -0.0027, + "num_tokens": 28411101.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1038.0, + "completions/max_terminated_length": 1038.0, + "completions/mean_length": 408.375, + "completions/mean_terminated_length": 408.375, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.10244683283786873, + "grad_norm": 0.1954682320356369, + "kl": 0.052490234375, + "learning_rate": 1.667331624895689e-06, + "loss": 0.0003, + "num_tokens": 28490713.0, + "reward": 0.17500001192092896, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 436.26251220703125, + "completions/mean_terminated_length": 436.26251220703125, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.10275173412607669, + "grad_norm": 0.16154824197292328, + "kl": 0.053466796875, + "learning_rate": 1.6545782390614037e-06, + "loss": 0.001, + "num_tokens": 28573186.0, + "reward": 0.12375000864267349, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 941.0, + "completions/max_terminated_length": 941.0, + "completions/mean_length": 400.07501220703125, + "completions/mean_terminated_length": 400.07501220703125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.10305663541428463, + "grad_norm": 0.11140483617782593, + "kl": 0.0548095703125, + "learning_rate": 1.6418708149302992e-06, + "loss": 0.0025, + "num_tokens": 28650560.0, + "reward": 0.16250000894069672, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1155.0, + "completions/max_terminated_length": 1155.0, + "completions/mean_length": 389.75, + "completions/mean_terminated_length": 389.75, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.10336153670249257, + "grad_norm": 0.19081075489521027, + "kl": 0.0506591796875, + "learning_rate": 1.6292098856804423e-06, + "loss": 0.0001, + "num_tokens": 28726146.0, + "reward": 0.1875000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 435.8625183105469, + "completions/mean_terminated_length": 435.8625183105469, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "epoch": 0.10366643799070051, + "grad_norm": 0.16997206211090088, + "kl": 0.04998779296875, + "learning_rate": 1.6165959825390661e-06, + "loss": 0.006, + "num_tokens": 28809467.0, + "reward": 0.16250000894069672, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1509.0, + "completions/max_terminated_length": 1509.0, + "completions/mean_length": 440.0, + "completions/mean_terminated_length": 440.0, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.10397133927890845, + "grad_norm": 0.10942406207323074, + "kl": 0.0523681640625, + "learning_rate": 1.604029634760284e-06, + "loss": 0.0081, + "num_tokens": 28894931.0, + "reward": 0.1875000298023224, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 903.0, + "completions/max_terminated_length": 903.0, + "completions/mean_length": 396.8500061035156, + "completions/mean_terminated_length": 396.8500061035156, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.1042762405671164, + "grad_norm": 0.16135147213935852, + "kl": 0.04913330078125, + "learning_rate": 1.59151136960288e-06, + "loss": 0.0017, + "num_tokens": 28969419.0, + "reward": 0.17500001192092896, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 408.3625183105469, + "completions/mean_terminated_length": 408.3625183105469, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.10458114185532434, + "grad_norm": 0.1693839132785797, + "kl": 0.053955078125, + "learning_rate": 1.5790417123081903e-06, + "loss": -0.0086, + "num_tokens": 29045756.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 410.7250061035156, + "completions/mean_terminated_length": 410.7250061035156, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.10488604314353228, + "grad_norm": 0.16823109984397888, + "kl": 0.0499267578125, + "learning_rate": 1.5666211860780583e-06, + "loss": 0.0005, + "num_tokens": 29125906.0, + "reward": 0.1862500160932541, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 860.0, + "completions/max_terminated_length": 860.0, + "completions/mean_length": 385.1000061035156, + "completions/mean_terminated_length": 385.1000061035156, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.10519094443174022, + "grad_norm": 0.13253702223300934, + "kl": 0.05218505859375, + "learning_rate": 1.5542503120528918e-06, + "loss": -0.0031, + "num_tokens": 29206558.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 682.0, + "completions/max_terminated_length": 682.0, + "completions/mean_length": 393.1750183105469, + "completions/mean_terminated_length": 393.1750183105469, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "epoch": 0.10549584571994816, + "grad_norm": 0.16460463404655457, + "kl": 0.0513916015625, + "learning_rate": 1.5419296092897866e-06, + "loss": 0.0061, + "num_tokens": 29285000.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 865.0, + "completions/max_terminated_length": 865.0, + "completions/mean_length": 383.76251220703125, + "completions/mean_terminated_length": 383.76251220703125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.10580074700815612, + "grad_norm": 0.23074668645858765, + "kl": 0.05499267578125, + "learning_rate": 1.529659594740755e-06, + "loss": -0.0055, + "num_tokens": 29362279.0, + "reward": 0.23625002801418304, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 982.0, + "completions/max_terminated_length": 982.0, + "completions/mean_length": 395.2124938964844, + "completions/mean_terminated_length": 395.2124938964844, + "completions/min_length": 75.0, + "completions/min_terminated_length": 75.0, + "epoch": 0.10610564829636406, + "grad_norm": 0.13016793131828308, + "kl": 0.0482177734375, + "learning_rate": 1.5174407832310338e-06, + "loss": 0.0076, + "num_tokens": 29443256.0, + "reward": 0.17500002682209015, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1482.0, + "completions/max_terminated_length": 1482.0, + "completions/mean_length": 392.5625, + "completions/mean_terminated_length": 392.5625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.106410549584572, + "grad_norm": 0.21937747299671173, + "kl": 0.05255126953125, + "learning_rate": 1.5052736874374815e-06, + "loss": 0.0085, + "num_tokens": 29518973.0, + "reward": 0.17375001311302185, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 444.9624938964844, + "completions/mean_terminated_length": 444.9624938964844, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.10671545087277994, + "grad_norm": 0.15810927748680115, + "kl": 0.04937744140625, + "learning_rate": 1.4931588178670695e-06, + "loss": -0.0024, + "num_tokens": 29604052.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 925.0, + "completions/max_terminated_length": 925.0, + "completions/mean_length": 406.4375, + "completions/mean_terminated_length": 406.4375, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.10702035216098787, + "grad_norm": 0.11895695328712463, + "kl": 0.0513916015625, + "learning_rate": 1.4810966828354605e-06, + "loss": 0.0007, + "num_tokens": 29683913.0, + "reward": 0.20000003278255463, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1118.0, + "completions/max_terminated_length": 1118.0, + "completions/mean_length": 422.38751220703125, + "completions/mean_terminated_length": 422.38751220703125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.10732525344919583, + "grad_norm": 0.2081456184387207, + "kl": 0.04931640625, + "learning_rate": 1.469087788445684e-06, + "loss": 0.0127, + "num_tokens": 29764884.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1124.0, + "completions/max_terminated_length": 1124.0, + "completions/mean_length": 426.8999938964844, + "completions/mean_terminated_length": 426.8999938964844, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "epoch": 0.10763015473740377, + "grad_norm": 0.1928683966398239, + "kl": 0.04541015625, + "learning_rate": 1.4571326385668965e-06, + "loss": 0.0038, + "num_tokens": 29844874.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 452.3500061035156, + "completions/mean_terminated_length": 432.15191650390625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.10793505602561171, + "grad_norm": 0.26351362466812134, + "kl": 0.05120849609375, + "learning_rate": 1.4452317348132434e-06, + "loss": 0.0117, + "num_tokens": 29929006.0, + "reward": 0.2237500250339508, + "reward_std": 0.10783378034830093, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1156.0, + "completions/max_terminated_length": 1156.0, + "completions/mean_length": 418.01251220703125, + "completions/mean_terminated_length": 418.01251220703125, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "epoch": 0.10823995731381965, + "grad_norm": 0.14276431500911713, + "kl": 0.05224609375, + "learning_rate": 1.4333855765228104e-06, + "loss": -0.001, + "num_tokens": 30014969.0, + "reward": 0.1625000238418579, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 996.0, + "completions/max_terminated_length": 996.0, + "completions/mean_length": 434.125, + "completions/mean_terminated_length": 434.125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.10854485860202759, + "grad_norm": 0.1563921421766281, + "kl": 0.04644775390625, + "learning_rate": 1.421594660736675e-06, + "loss": -0.0034, + "num_tokens": 30097137.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.025000000000000022, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1260.0, + "completions/mean_length": 481.4125061035156, + "completions/mean_terminated_length": 441.24359130859375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.10884975989023554, + "grad_norm": 0.17907285690307617, + "kl": 0.04522705078125, + "learning_rate": 1.4098594821780476e-06, + "loss": 0.0443, + "num_tokens": 30185818.0, + "reward": 0.12250001728534698, + "reward_std": 0.03889087215065956, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1816.0, + "completions/max_terminated_length": 1816.0, + "completions/mean_length": 467.26251220703125, + "completions/mean_terminated_length": 467.26251220703125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.10915466117844348, + "grad_norm": 0.15856198966503143, + "kl": 0.04522705078125, + "learning_rate": 1.3981805332315174e-06, + "loss": -0.0018, + "num_tokens": 30272527.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1055.0, + "completions/max_terminated_length": 1055.0, + "completions/mean_length": 451.0625, + "completions/mean_terminated_length": 451.0625, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.10945956246665142, + "grad_norm": 0.23278465867042542, + "kl": 0.04681396484375, + "learning_rate": 1.3865583039223929e-06, + "loss": 0.0174, + "num_tokens": 30353838.0, + "reward": 0.1875000298023224, + "reward_std": 0.12374367564916611, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 900.0, + "completions/max_terminated_length": 900.0, + "completions/mean_length": 409.57501220703125, + "completions/mean_terminated_length": 409.57501220703125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.10976446375485936, + "grad_norm": 0.16648580133914948, + "kl": 0.04815673828125, + "learning_rate": 1.374993281896137e-06, + "loss": 0.0039, + "num_tokens": 30434910.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531530380249, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 912.0, + "completions/max_terminated_length": 912.0, + "completions/mean_length": 399.6875, + "completions/mean_terminated_length": 399.6875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "epoch": 0.1100693650430673, + "grad_norm": 0.23581717908382416, + "kl": 0.05224609375, + "learning_rate": 1.3634859523979134e-06, + "loss": 0.0181, + "num_tokens": 30514995.0, + "reward": 0.1600000113248825, + "reward_std": 0.05656854063272476, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 967.0, + "completions/max_terminated_length": 967.0, + "completions/mean_length": 392.2749938964844, + "completions/mean_terminated_length": 392.2749938964844, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.11037426633127526, + "grad_norm": 0.17342065274715424, + "kl": 0.04815673828125, + "learning_rate": 1.3520367982522208e-06, + "loss": 0.01, + "num_tokens": 30592935.0, + "reward": 0.14875002205371857, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1000.0, + "completions/max_terminated_length": 1000.0, + "completions/mean_length": 385.63751220703125, + "completions/mean_terminated_length": 385.63751220703125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.1106791676194832, + "grad_norm": 0.17184416949748993, + "kl": 0.05023193359375, + "learning_rate": 1.3406462998426358e-06, + "loss": 0.0057, + "num_tokens": 30670002.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1234.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 424.88751220703125, + "completions/mean_terminated_length": 424.88751220703125, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.11098406890769114, + "grad_norm": 0.2137637883424759, + "kl": 0.04620361328125, + "learning_rate": 1.3293149350916595e-06, + "loss": 0.0155, + "num_tokens": 30751821.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1108.0, + "completions/max_terminated_length": 1108.0, + "completions/mean_length": 383.45001220703125, + "completions/mean_terminated_length": 383.45001220703125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.11128897019589908, + "grad_norm": 0.2274777889251709, + "kl": 0.05029296875, + "learning_rate": 1.3180431794406623e-06, + "loss": 0.0004, + "num_tokens": 30827591.0, + "reward": 0.2250000238418579, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1117.0, + "completions/max_terminated_length": 1117.0, + "completions/mean_length": 414.125, + "completions/mean_terminated_length": 414.125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.11159387148410702, + "grad_norm": 0.22330601513385773, + "kl": 0.04931640625, + "learning_rate": 1.3068315058299358e-06, + "loss": 0.0006, + "num_tokens": 30911769.0, + "reward": 0.1625000238418579, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 913.0, + "completions/max_terminated_length": 913.0, + "completions/mean_length": 381.7749938964844, + "completions/mean_terminated_length": 381.7749938964844, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.11189877277231497, + "grad_norm": 0.2143508344888687, + "kl": 0.05047607421875, + "learning_rate": 1.2956803846788503e-06, + "loss": 0.0126, + "num_tokens": 30991433.0, + "reward": 0.1625000238418579, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1107.0, + "completions/max_terminated_length": 1107.0, + "completions/mean_length": 405.5375061035156, + "completions/mean_terminated_length": 405.5375061035156, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "epoch": 0.11220367406052291, + "grad_norm": 0.1381106972694397, + "kl": 0.04779052734375, + "learning_rate": 1.284590283866116e-06, + "loss": 0.0077, + "num_tokens": 31074002.0, + "reward": 0.16250000894069672, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 985.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 417.6000061035156, + "completions/mean_terminated_length": 417.6000061035156, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.11250857534873085, + "grad_norm": 0.1931677609682083, + "kl": 0.04852294921875, + "learning_rate": 1.2735616687101518e-06, + "loss": 0.0044, + "num_tokens": 31156898.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1339.0, + "completions/max_terminated_length": 1339.0, + "completions/mean_length": 402.76251220703125, + "completions/mean_terminated_length": 402.76251220703125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "epoch": 0.11281347663693879, + "grad_norm": 0.2690817713737488, + "kl": 0.0499267578125, + "learning_rate": 1.2625950019495614e-06, + "loss": 0.0277, + "num_tokens": 31240057.0, + "reward": 0.19875001907348633, + "reward_std": 0.10783378034830093, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 389.38751220703125, + "completions/mean_terminated_length": 389.38751220703125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.11311837792514673, + "grad_norm": 0.1295740157365799, + "kl": 0.0501708984375, + "learning_rate": 1.251690743723718e-06, + "loss": 0.0025, + "num_tokens": 31319472.0, + "reward": 0.1875000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 368.63751220703125, + "completions/mean_terminated_length": 368.63751220703125, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "epoch": 0.11342327921335468, + "grad_norm": 0.14982041716575623, + "kl": 0.049560546875, + "learning_rate": 1.2408493515534581e-06, + "loss": 0.0029, + "num_tokens": 31394757.0, + "reward": 0.1862500160932541, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1478.0, + "completions/max_terminated_length": 1478.0, + "completions/mean_length": 414.5375061035156, + "completions/mean_terminated_length": 414.5375061035156, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "epoch": 0.11372818050156262, + "grad_norm": 0.14369921386241913, + "kl": 0.04833984375, + "learning_rate": 1.2300712803218834e-06, + "loss": 0.0156, + "num_tokens": 31476508.0, + "reward": 0.20000003278255463, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 851.0, + "completions/max_terminated_length": 851.0, + "completions/mean_length": 377.6125183105469, + "completions/mean_terminated_length": 377.6125183105469, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.11403308178977056, + "grad_norm": 0.15167175233364105, + "kl": 0.05120849609375, + "learning_rate": 1.2193569822552772e-06, + "loss": -0.0041, + "num_tokens": 31554491.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 864.0, + "completions/max_terminated_length": 864.0, + "completions/mean_length": 390.1625061035156, + "completions/mean_terminated_length": 390.1625061035156, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.1143379830779785, + "grad_norm": 0.1379869431257248, + "kl": 0.04913330078125, + "learning_rate": 1.2087069069041268e-06, + "loss": -0.011, + "num_tokens": 31634458.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 927.0, + "completions/max_terminated_length": 927.0, + "completions/mean_length": 346.1125183105469, + "completions/mean_terminated_length": 346.1125183105469, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.11464288436618644, + "grad_norm": 0.19083495438098907, + "kl": 0.05242919921875, + "learning_rate": 1.1981215011242654e-06, + "loss": 0.0079, + "num_tokens": 31708299.0, + "reward": 0.17500001192092896, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 756.0, + "completions/max_terminated_length": 756.0, + "completions/mean_length": 353.0, + "completions/mean_terminated_length": 353.0, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "epoch": 0.1149477856543944, + "grad_norm": 0.31387728452682495, + "kl": 0.05279541015625, + "learning_rate": 1.1876012090581184e-06, + "loss": 0.0001, + "num_tokens": 31779549.0, + "reward": 0.21250002086162567, + "reward_std": 0.12374367564916611, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1206.0, + "completions/max_terminated_length": 1206.0, + "completions/mean_length": 377.7375183105469, + "completions/mean_terminated_length": 377.7375183105469, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "epoch": 0.11525268694260234, + "grad_norm": 0.23074573278427124, + "kl": 0.0509033203125, + "learning_rate": 1.177146472116071e-06, + "loss": -0.0032, + "num_tokens": 31858142.0, + "reward": 0.14875000715255737, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 945.0, + "completions/max_terminated_length": 945.0, + "completions/mean_length": 377.0, + "completions/mean_terminated_length": 377.0, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.11555758823081028, + "grad_norm": 0.08496691286563873, + "kl": 0.04949951171875, + "learning_rate": 1.1667577289579462e-06, + "loss": 0.0013, + "num_tokens": 31940052.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1028.0, + "completions/max_terminated_length": 1028.0, + "completions/mean_length": 431.20001220703125, + "completions/mean_terminated_length": 431.20001220703125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.11586248951901822, + "grad_norm": 0.18607525527477264, + "kl": 0.0474853515625, + "learning_rate": 1.1564354154746007e-06, + "loss": 0.0094, + "num_tokens": 32024476.0, + "reward": 0.1875000149011612, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 704.0, + "completions/max_terminated_length": 704.0, + "completions/mean_length": 343.3999938964844, + "completions/mean_terminated_length": 343.3999938964844, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.11616739080722616, + "grad_norm": 0.16431212425231934, + "kl": 0.05230712890625, + "learning_rate": 1.146179964769635e-06, + "loss": -0.0015, + "num_tokens": 32104930.0, + "reward": 0.13625000417232513, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 432.13751220703125, + "completions/mean_terminated_length": 432.13751220703125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.11647229209543411, + "grad_norm": 0.16867312788963318, + "kl": 0.0478515625, + "learning_rate": 1.1359918071412195e-06, + "loss": -0.003, + "num_tokens": 32193079.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 416.9375, + "completions/mean_terminated_length": 416.9375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "epoch": 0.11677719338364205, + "grad_norm": 0.2334057241678238, + "kl": 0.05169677734375, + "learning_rate": 1.1258713700640456e-06, + "loss": 0.003, + "num_tokens": 32277986.0, + "reward": 0.15000000596046448, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1081.0, + "completions/max_terminated_length": 1081.0, + "completions/mean_length": 364.625, + "completions/mean_terminated_length": 364.625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.11708209467184999, + "grad_norm": 0.3081050217151642, + "kl": 0.05218505859375, + "learning_rate": 1.115819078171383e-06, + "loss": -0.01, + "num_tokens": 32352954.0, + "reward": 0.19875001907348633, + "reward_std": 0.14318911731243134, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1087.0, + "completions/max_terminated_length": 1087.0, + "completions/mean_length": 454.5500183105469, + "completions/mean_terminated_length": 454.5500183105469, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "epoch": 0.11738699596005793, + "grad_norm": 0.21313853561878204, + "kl": 0.04852294921875, + "learning_rate": 1.1058353532372667e-06, + "loss": 0.0039, + "num_tokens": 32439852.0, + "reward": 0.1862500160932541, + "reward_std": 0.09015612304210663, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 889.0, + "completions/max_terminated_length": 889.0, + "completions/mean_length": 414.13751220703125, + "completions/mean_terminated_length": 414.13751220703125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.11769189724826587, + "grad_norm": 0.17836220562458038, + "kl": 0.0465087890625, + "learning_rate": 1.0959206141587998e-06, + "loss": 0.0003, + "num_tokens": 32516357.0, + "reward": 0.15000000596046448, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1639.0, + "completions/max_terminated_length": 1639.0, + "completions/mean_length": 415.2124938964844, + "completions/mean_terminated_length": 415.2124938964844, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.11799679853647381, + "grad_norm": 0.1351797878742218, + "kl": 0.04815673828125, + "learning_rate": 1.0860752769385766e-06, + "loss": 0.0018, + "num_tokens": 32595424.0, + "reward": 0.17500002682209015, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531530380249, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 828.0, + "completions/mean_length": 421.2749938964844, + "completions/mean_terminated_length": 400.6835632324219, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "epoch": 0.11830169982468176, + "grad_norm": 0.11575304716825485, + "kl": 0.047119140625, + "learning_rate": 1.0762997546672279e-06, + "loss": 0.0158, + "num_tokens": 32679826.0, + "reward": 0.11125000566244125, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1026.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 440.7124938964844, + "completions/mean_terminated_length": 440.7124938964844, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.1186066011128897, + "grad_norm": 0.12233486771583557, + "kl": 0.0482177734375, + "learning_rate": 1.0665944575060914e-06, + "loss": 0.0091, + "num_tokens": 32761793.0, + "reward": 0.13750001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 938.0, + "completions/max_terminated_length": 938.0, + "completions/mean_length": 404.7375183105469, + "completions/mean_terminated_length": 404.7375183105469, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.11891150240109764, + "grad_norm": 0.2841652035713196, + "kl": 0.0499267578125, + "learning_rate": 1.056959792669997e-06, + "loss": 0.0192, + "num_tokens": 32843012.0, + "reward": 0.21250002086162567, + "reward_std": 0.1237436905503273, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 957.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 399.38751220703125, + "completions/mean_terminated_length": 399.38751220703125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "epoch": 0.11921640368930558, + "grad_norm": 0.2231462299823761, + "kl": 0.047607421875, + "learning_rate": 1.0473961644101856e-06, + "loss": 0.0048, + "num_tokens": 32925553.0, + "reward": 0.27500003576278687, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.17499999701976776, + "rewards/unicoder_reward_fn/std": 0.3823643922805786, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 863.0, + "completions/max_terminated_length": 863.0, + "completions/mean_length": 418.5249938964844, + "completions/mean_terminated_length": 418.5249938964844, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "epoch": 0.11952130497751352, + "grad_norm": 0.12139161676168442, + "kl": 0.048095703125, + "learning_rate": 1.037903973997345e-06, + "loss": 0.0032, + "num_tokens": 33008785.0, + "reward": 0.17375002801418304, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 850.0, + "completions/max_terminated_length": 850.0, + "completions/mean_length": 434.7749938964844, + "completions/mean_terminated_length": 434.7749938964844, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.11982620626572148, + "grad_norm": 0.1002897098660469, + "kl": 0.0506591796875, + "learning_rate": 1.0284836197047737e-06, + "loss": -0.0004, + "num_tokens": 33091849.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 367.88751220703125, + "completions/mean_terminated_length": 367.88751220703125, + "completions/min_length": 62.0, + "completions/min_terminated_length": 62.0, + "epoch": 0.12013110755392942, + "grad_norm": 0.2262028157711029, + "kl": 0.05035400390625, + "learning_rate": 1.0191354967916712e-06, + "loss": 0.0012, + "num_tokens": 33165528.0, + "reward": 0.23750002682209015, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 855.0, + "completions/max_terminated_length": 855.0, + "completions/mean_length": 413.01251220703125, + "completions/mean_terminated_length": 413.01251220703125, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.12043600884213736, + "grad_norm": 0.3028506636619568, + "kl": 0.05047607421875, + "learning_rate": 1.0098599974865515e-06, + "loss": 0.0182, + "num_tokens": 33247377.0, + "reward": 0.29750001430511475, + "reward_std": 0.14495688676834106, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.20000000298023224, + "rewards/unicoder_reward_fn/std": 0.4025236964225769, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1170.0, + "completions/max_terminated_length": 1170.0, + "completions/mean_length": 427.3999938964844, + "completions/mean_terminated_length": 427.3999938964844, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.1207409101303453, + "grad_norm": 0.16660848259925842, + "kl": 0.04803466796875, + "learning_rate": 1.0006575109707898e-06, + "loss": -0.0099, + "num_tokens": 33329183.0, + "reward": 0.20000003278255463, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 929.0, + "completions/max_terminated_length": 929.0, + "completions/mean_length": 375.51251220703125, + "completions/mean_terminated_length": 375.51251220703125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.12104581141855324, + "grad_norm": 0.25240859389305115, + "kl": 0.05181884765625, + "learning_rate": 9.915284233622877e-07, + "loss": -0.0053, + "num_tokens": 33405410.0, + "reward": 0.17500002682209015, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 376.38751220703125, + "completions/mean_terminated_length": 376.38751220703125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "epoch": 0.12135071270676119, + "grad_norm": 0.21786899864673615, + "kl": 0.05023193359375, + "learning_rate": 9.824731176992796e-07, + "loss": -0.0137, + "num_tokens": 33478769.0, + "reward": 0.17500002682209015, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1164.0, + "completions/max_terminated_length": 1164.0, + "completions/mean_length": 442.9250183105469, + "completions/mean_terminated_length": 442.9250183105469, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "epoch": 0.12165561399496913, + "grad_norm": 0.21049322187900543, + "kl": 0.0450439453125, + "learning_rate": 9.734919739242543e-07, + "loss": 0.014, + "num_tokens": 33566537.0, + "reward": 0.17500002682209015, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 963.0, + "completions/max_terminated_length": 963.0, + "completions/mean_length": 431.51251220703125, + "completions/mean_terminated_length": 431.51251220703125, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "epoch": 0.12196051528317707, + "grad_norm": 0.23430253565311432, + "kl": 0.04595947265625, + "learning_rate": 9.645853688680177e-07, + "loss": 0.0128, + "num_tokens": 33649636.0, + "reward": 0.1875000149011612, + "reward_std": 0.12374367564916611, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 990.0, + "completions/max_terminated_length": 990.0, + "completions/mean_length": 411.4750061035156, + "completions/mean_terminated_length": 411.4750061035156, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.12226541657138501, + "grad_norm": 0.21432843804359436, + "kl": 0.04547119140625, + "learning_rate": 9.557536762338786e-07, + "loss": 0.0005, + "num_tokens": 33727294.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1485.0, + "completions/max_terminated_length": 1485.0, + "completions/mean_length": 446.7124938964844, + "completions/mean_terminated_length": 446.7124938964844, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.12257031785959295, + "grad_norm": 0.1339891105890274, + "kl": 0.044921875, + "learning_rate": 9.46997266581973e-07, + "loss": -0.0003, + "num_tokens": 33810863.0, + "reward": 0.20000003278255463, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1266.0, + "completions/max_terminated_length": 1266.0, + "completions/mean_length": 431.7250061035156, + "completions/mean_terminated_length": 431.7250061035156, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "epoch": 0.1228752191478009, + "grad_norm": 0.2357596755027771, + "kl": 0.04827880859375, + "learning_rate": 9.383165073137115e-07, + "loss": 0.0063, + "num_tokens": 33893307.0, + "reward": 0.2500000298023224, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.15000000596046448, + "rewards/unicoder_reward_fn/std": 0.35932427644729614, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1068.0, + "completions/max_terminated_length": 1068.0, + "completions/mean_length": 452.8374938964844, + "completions/mean_terminated_length": 452.8374938964844, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.12318012043600884, + "grad_norm": 0.19236986339092255, + "kl": 0.04608154296875, + "learning_rate": 9.297117626563687e-07, + "loss": 0.0161, + "num_tokens": 33979250.0, + "reward": 0.20000003278255463, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 993.0, + "completions/max_terminated_length": 993.0, + "completions/mean_length": 438.32501220703125, + "completions/mean_terminated_length": 438.32501220703125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.12348502172421678, + "grad_norm": 0.20204347372055054, + "kl": 0.04681396484375, + "learning_rate": 9.211833936477957e-07, + "loss": 0.0012, + "num_tokens": 34062038.0, + "reward": 0.1875000149011612, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 1175.0, + "completions/mean_length": 434.0625, + "completions/mean_terminated_length": 413.6329345703125, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "epoch": 0.12378992301242472, + "grad_norm": 0.14883990585803986, + "kl": 0.04541015625, + "learning_rate": 9.127317581212753e-07, + "loss": 0.0263, + "num_tokens": 34147497.0, + "reward": 0.17375002801418304, + "reward_std": 0.07247845083475113, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 437.5375061035156, + "completions/mean_terminated_length": 437.5375061035156, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.12409482430063266, + "grad_norm": 0.17637896537780762, + "kl": 0.04620361328125, + "learning_rate": 9.043572106905084e-07, + "loss": 0.0249, + "num_tokens": 34230302.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 437.8500061035156, + "completions/mean_terminated_length": 437.8500061035156, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.12439972558884062, + "grad_norm": 0.18132470548152924, + "kl": 0.04522705078125, + "learning_rate": 8.960601027347321e-07, + "loss": 0.0439, + "num_tokens": 34312412.0, + "reward": 0.19875001907348633, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927574157715, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1449.0, + "completions/max_terminated_length": 1449.0, + "completions/mean_length": 454.0375061035156, + "completions/mean_terminated_length": 454.0375061035156, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.12470462687704856, + "grad_norm": 0.1201486811041832, + "kl": 0.045166015625, + "learning_rate": 8.878407823839788e-07, + "loss": -0.0122, + "num_tokens": 34397757.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 857.0, + "completions/max_terminated_length": 857.0, + "completions/mean_length": 423.7749938964844, + "completions/mean_terminated_length": 423.7749938964844, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "epoch": 0.1250095281652565, + "grad_norm": 0.17683278024196625, + "kl": 0.05145263671875, + "learning_rate": 8.796995945044689e-07, + "loss": -0.0036, + "num_tokens": 34478837.0, + "reward": 0.22500000894069672, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1375.0, + "completions/max_terminated_length": 1375.0, + "completions/mean_length": 416.9125061035156, + "completions/mean_terminated_length": 416.9125061035156, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "epoch": 0.12531442945346444, + "grad_norm": 0.22345435619354248, + "kl": 0.0445556640625, + "learning_rate": 8.716368806841405e-07, + "loss": 0.0144, + "num_tokens": 34557896.0, + "reward": 0.1875000149011612, + "reward_std": 0.12374367564916611, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1253.0, + "completions/max_terminated_length": 1253.0, + "completions/mean_length": 397.9375, + "completions/mean_terminated_length": 397.9375, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.1256193307416724, + "grad_norm": 0.16894420981407166, + "kl": 0.0504150390625, + "learning_rate": 8.636529792183171e-07, + "loss": -0.0049, + "num_tokens": 34637915.0, + "reward": 0.15000000596046448, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 399.6875, + "completions/mean_terminated_length": 399.6875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "epoch": 0.12592423202988032, + "grad_norm": 0.1841939240694046, + "kl": 0.052490234375, + "learning_rate": 8.557482250955144e-07, + "loss": 0.0012, + "num_tokens": 34720942.0, + "reward": 0.17500001192092896, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 966.0, + "completions/max_terminated_length": 966.0, + "completions/mean_length": 393.1499938964844, + "completions/mean_terminated_length": 393.1499938964844, + "completions/min_length": 112.0, + "completions/min_terminated_length": 112.0, + "epoch": 0.12622913331808827, + "grad_norm": 0.22368940711021423, + "kl": 0.04644775390625, + "learning_rate": 8.479229499833844e-07, + "loss": -0.0111, + "num_tokens": 34798866.0, + "reward": 0.27500003576278687, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.17499999701976776, + "rewards/unicoder_reward_fn/std": 0.3823643922805786, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1222.0, + "completions/max_terminated_length": 1222.0, + "completions/mean_length": 440.0, + "completions/mean_terminated_length": 440.0, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.12653403460629623, + "grad_norm": 0.19953802227973938, + "kl": 0.044677734375, + "learning_rate": 8.401774822147976e-07, + "loss": -0.0027, + "num_tokens": 34883180.0, + "reward": 0.16250000894069672, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1347.0, + "completions/max_terminated_length": 1347.0, + "completions/mean_length": 436.6750183105469, + "completions/mean_terminated_length": 436.6750183105469, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "epoch": 0.12683893589450415, + "grad_norm": 0.14660529792308807, + "kl": 0.045166015625, + "learning_rate": 8.325121467740695e-07, + "loss": 0.0062, + "num_tokens": 34971670.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 961.0, + "completions/max_terminated_length": 961.0, + "completions/mean_length": 415.125, + "completions/mean_terminated_length": 415.125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1271438371827121, + "grad_norm": 3.1824533939361572, + "kl": 0.06732177734375, + "learning_rate": 8.249272652833226e-07, + "loss": 0.0124, + "num_tokens": 35050342.0, + "reward": 0.20000000298023224, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 956.0, + "completions/max_terminated_length": 956.0, + "completions/mean_length": 420.4875183105469, + "completions/mean_terminated_length": 420.4875183105469, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "epoch": 0.12744873847092003, + "grad_norm": 0.16419346630573273, + "kl": 0.0458984375, + "learning_rate": 8.174231559889931e-07, + "loss": 0.0151, + "num_tokens": 35132545.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 390.51251220703125, + "completions/mean_terminated_length": 390.51251220703125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "epoch": 0.12775363975912798, + "grad_norm": 0.18130381405353546, + "kl": 0.04888916015625, + "learning_rate": 8.100001337484787e-07, + "loss": 0.0067, + "num_tokens": 35214504.0, + "reward": 0.23750002682209015, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1199.0, + "completions/max_terminated_length": 1199.0, + "completions/mean_length": 409.6499938964844, + "completions/mean_terminated_length": 409.6499938964844, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.12805854104733594, + "grad_norm": 0.16933445632457733, + "kl": 0.0457763671875, + "learning_rate": 8.026585100169251e-07, + "loss": 0.0005, + "num_tokens": 35296306.0, + "reward": 0.14875000715255737, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 425.6625061035156, + "completions/mean_terminated_length": 425.6625061035156, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "epoch": 0.12836344233554386, + "grad_norm": 0.27452513575553894, + "kl": 0.05029296875, + "learning_rate": 7.953985928341601e-07, + "loss": 0.0212, + "num_tokens": 35380561.0, + "reward": 0.20000003278255463, + "reward_std": 0.1414213627576828, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1043.0, + "completions/max_terminated_length": 1043.0, + "completions/mean_length": 399.8374938964844, + "completions/mean_terminated_length": 399.8374938964844, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.12866834362375182, + "grad_norm": 0.19873785972595215, + "kl": 0.046142578125, + "learning_rate": 7.882206868117693e-07, + "loss": -0.0027, + "num_tokens": 35459580.0, + "reward": 0.26250001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.16249999403953552, + "rewards/unicoder_reward_fn/std": 0.3712363839149475, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 787.0, + "completions/max_terminated_length": 787.0, + "completions/mean_length": 397.26251220703125, + "completions/mean_terminated_length": 397.26251220703125, + "completions/min_length": 72.0, + "completions/min_terminated_length": 72.0, + "epoch": 0.12897324491195974, + "grad_norm": 0.14467059075832367, + "kl": 0.04571533203125, + "learning_rate": 7.81125093120313e-07, + "loss": 0.0103, + "num_tokens": 35538789.0, + "reward": 0.17500002682209015, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 390.25, + "completions/mean_terminated_length": 390.25, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "epoch": 0.1292781462001677, + "grad_norm": 0.1401596963405609, + "kl": 0.046630859375, + "learning_rate": 7.741121094766916e-07, + "loss": 0.0055, + "num_tokens": 35620247.0, + "reward": 0.21250002086162567, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1279.0, + "completions/max_terminated_length": 1279.0, + "completions/mean_length": 372.9750061035156, + "completions/mean_terminated_length": 372.9750061035156, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.12958304748837565, + "grad_norm": 0.16129828989505768, + "kl": 0.04754638671875, + "learning_rate": 7.671820301316532e-07, + "loss": 0.0035, + "num_tokens": 35694351.0, + "reward": 0.1625000238418579, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 930.0, + "completions/max_terminated_length": 930.0, + "completions/mean_length": 384.51251220703125, + "completions/mean_terminated_length": 384.51251220703125, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.12988794877658358, + "grad_norm": 0.22551488876342773, + "kl": 0.0595703125, + "learning_rate": 7.603351458574474e-07, + "loss": -0.0164, + "num_tokens": 35772664.0, + "reward": 0.17375002801418304, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1142.0, + "completions/max_terminated_length": 1142.0, + "completions/mean_length": 409.26251220703125, + "completions/mean_terminated_length": 409.26251220703125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.13019285006479153, + "grad_norm": 0.2181655764579773, + "kl": 0.04736328125, + "learning_rate": 7.535717439356255e-07, + "loss": 0.0084, + "num_tokens": 35853321.0, + "reward": 0.17500002682209015, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531530380249, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 406.8999938964844, + "completions/mean_terminated_length": 406.8999938964844, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "epoch": 0.13049775135299946, + "grad_norm": 0.09705975651741028, + "kl": 0.0478515625, + "learning_rate": 7.46892108144986e-07, + "loss": 0.0028, + "num_tokens": 35937717.0, + "reward": 0.1250000149011612, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1916.0, + "completions/max_terminated_length": 1916.0, + "completions/mean_length": 414.5500183105469, + "completions/mean_terminated_length": 414.5500183105469, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.1308026526412074, + "grad_norm": 0.14803965389728546, + "kl": 0.04669189453125, + "learning_rate": 7.402965187496697e-07, + "loss": 0.0065, + "num_tokens": 36022223.0, + "reward": 0.11124999821186066, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 396.9250183105469, + "completions/mean_terminated_length": 396.9250183105469, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "epoch": 0.13110755392941534, + "grad_norm": 0.15297779440879822, + "kl": 0.04736328125, + "learning_rate": 7.337852524873974e-07, + "loss": 0.0036, + "num_tokens": 36103699.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1753.0, + "completions/max_terminated_length": 1753.0, + "completions/mean_length": 428.375, + "completions/mean_terminated_length": 428.375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "epoch": 0.1314124552176233, + "grad_norm": 0.16848964989185333, + "kl": 0.05059814453125, + "learning_rate": 7.273585825578608e-07, + "loss": 0.0129, + "num_tokens": 36186971.0, + "reward": 0.17375002801418304, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531530380249, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1241.0, + "completions/max_terminated_length": 1241.0, + "completions/mean_length": 424.7250061035156, + "completions/mean_terminated_length": 424.7250061035156, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.13171735650583125, + "grad_norm": 0.26175057888031006, + "kl": 0.04840087890625, + "learning_rate": 7.21016778611259e-07, + "loss": 0.0095, + "num_tokens": 36270993.0, + "reward": 0.19875001907348633, + "reward_std": 0.10783378034830093, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 359.32501220703125, + "completions/mean_terminated_length": 359.32501220703125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.13202225779403917, + "grad_norm": 0.1780603528022766, + "kl": 0.047607421875, + "learning_rate": 7.147601067369835e-07, + "loss": 0.0054, + "num_tokens": 36347327.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 954.0, + "completions/max_terminated_length": 954.0, + "completions/mean_length": 399.6750183105469, + "completions/mean_terminated_length": 399.6750183105469, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.13232715908224713, + "grad_norm": 0.09010346233844757, + "kl": 0.04693603515625, + "learning_rate": 7.085888294524561e-07, + "loss": -0.0012, + "num_tokens": 36426937.0, + "reward": 0.11250000447034836, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1012.0, + "completions/max_terminated_length": 1012.0, + "completions/mean_length": 381.9750061035156, + "completions/mean_terminated_length": 381.9750061035156, + "completions/min_length": 59.0, + "completions/min_terminated_length": 59.0, + "epoch": 0.13263206037045505, + "grad_norm": 0.24115842580795288, + "kl": 0.04803466796875, + "learning_rate": 7.025032056921117e-07, + "loss": 0.0098, + "num_tokens": 36506775.0, + "reward": 0.19875001907348633, + "reward_std": 0.10783378034830093, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 868.0, + "completions/max_terminated_length": 868.0, + "completions/mean_length": 406.75, + "completions/mean_terminated_length": 406.75, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.132936961658663, + "grad_norm": 0.15974393486976624, + "kl": 0.048583984375, + "learning_rate": 6.965034907965349e-07, + "loss": 0.0085, + "num_tokens": 36588207.0, + "reward": 0.1625000238418579, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 830.0, + "completions/max_terminated_length": 830.0, + "completions/mean_length": 341.8500061035156, + "completions/mean_terminated_length": 341.8500061035156, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "epoch": 0.13324186294687096, + "grad_norm": 0.17505139112472534, + "kl": 0.04583740234375, + "learning_rate": 6.905899365017462e-07, + "loss": 0.0065, + "num_tokens": 36660923.0, + "reward": 0.13750000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 965.0, + "completions/max_terminated_length": 965.0, + "completions/mean_length": 390.1750183105469, + "completions/mean_terminated_length": 390.1750183105469, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.13354676423507889, + "grad_norm": 0.13170388340950012, + "kl": 0.04925537109375, + "learning_rate": 6.847627909286409e-07, + "loss": 0.0008, + "num_tokens": 36740415.0, + "reward": 0.13625000417232513, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 837.0, + "completions/max_terminated_length": 837.0, + "completions/mean_length": 403.45001220703125, + "completions/mean_terminated_length": 403.45001220703125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.13385166552328684, + "grad_norm": 0.19051723182201385, + "kl": 0.05145263671875, + "learning_rate": 6.790222985725761e-07, + "loss": 0.0026, + "num_tokens": 36819045.0, + "reward": 0.14875002205371857, + "reward_std": 0.07247844338417053, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 381.1750183105469, + "completions/mean_terminated_length": 381.1750183105469, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "epoch": 0.13415656681149477, + "grad_norm": 0.16633227467536926, + "kl": 0.048095703125, + "learning_rate": 6.733687002931141e-07, + "loss": 0.0078, + "num_tokens": 36899337.0, + "reward": 0.21250002086162567, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1014.0, + "completions/max_terminated_length": 1014.0, + "completions/mean_length": 383.6625061035156, + "completions/mean_terminated_length": 383.6625061035156, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.13446146809970272, + "grad_norm": 0.18865156173706055, + "kl": 0.04852294921875, + "learning_rate": 6.678022333039158e-07, + "loss": 0.0073, + "num_tokens": 36977848.0, + "reward": 0.2237500250339508, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 726.0, + "completions/max_terminated_length": 726.0, + "completions/mean_length": 358.82501220703125, + "completions/mean_terminated_length": 358.82501220703125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.13476636938791067, + "grad_norm": 0.19032283127307892, + "kl": 0.04852294921875, + "learning_rate": 6.623231311627876e-07, + "loss": 0.0042, + "num_tokens": 37050838.0, + "reward": 0.26250001788139343, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.16249999403953552, + "rewards/unicoder_reward_fn/std": 0.3712363839149475, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1254.0, + "completions/max_terminated_length": 1254.0, + "completions/mean_length": 400.8625183105469, + "completions/mean_terminated_length": 400.8625183105469, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "epoch": 0.1350712706761186, + "grad_norm": 0.11788418143987656, + "kl": 0.04779052734375, + "learning_rate": 6.569316237618811e-07, + "loss": 0.0015, + "num_tokens": 37131997.0, + "reward": 0.125, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1129.0, + "completions/max_terminated_length": 1129.0, + "completions/mean_length": 392.4750061035156, + "completions/mean_terminated_length": 392.4750061035156, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.13537617196432655, + "grad_norm": 0.14593246579170227, + "kl": 0.05035400390625, + "learning_rate": 6.516279373180499e-07, + "loss": 0.0039, + "num_tokens": 37214855.0, + "reward": 0.19875001907348633, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 411.125, + "completions/mean_terminated_length": 411.125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "epoch": 0.13568107325253448, + "grad_norm": 0.1687837392091751, + "kl": 0.04461669921875, + "learning_rate": 6.464122943633543e-07, + "loss": 0.01, + "num_tokens": 37298811.0, + "reward": 0.1875000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1185.0, + "completions/max_terminated_length": 1185.0, + "completions/mean_length": 389.82501220703125, + "completions/mean_terminated_length": 389.82501220703125, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.13598597454074243, + "grad_norm": 0.14800910651683807, + "kl": 0.0494384765625, + "learning_rate": 6.412849137357271e-07, + "loss": -0.0077, + "num_tokens": 37378691.0, + "reward": 0.23750002682209015, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1122.0, + "completions/max_terminated_length": 1122.0, + "completions/mean_length": 434.5249938964844, + "completions/mean_terminated_length": 434.5249938964844, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.1362908758289504, + "grad_norm": 0.06896167248487473, + "kl": 0.0452880859375, + "learning_rate": 6.3624601056979e-07, + "loss": 0.0132, + "num_tokens": 37462417.0, + "reward": 0.16250000894069672, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 378.4875183105469, + "completions/mean_terminated_length": 378.4875183105469, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.1365957771171583, + "grad_norm": 0.23473790287971497, + "kl": 0.046630859375, + "learning_rate": 6.312957962878278e-07, + "loss": 0.0231, + "num_tokens": 37542628.0, + "reward": 0.17500002682209015, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 838.0, + "completions/max_terminated_length": 838.0, + "completions/mean_length": 349.0874938964844, + "completions/mean_terminated_length": 349.0874938964844, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.13690067840536627, + "grad_norm": 0.2421552687883377, + "kl": 0.0560302734375, + "learning_rate": 6.264344785909181e-07, + "loss": 0.0092, + "num_tokens": 37618047.0, + "reward": 0.17250001430511475, + "reward_std": 0.07424621284008026, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 985.0, + "completions/max_terminated_length": 985.0, + "completions/mean_length": 398.7375183105469, + "completions/mean_terminated_length": 398.7375183105469, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.1372055796935742, + "grad_norm": 0.18554161489009857, + "kl": 0.04620361328125, + "learning_rate": 6.216622614502149e-07, + "loss": 0.0162, + "num_tokens": 37697614.0, + "reward": 0.2250000238418579, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 973.0, + "completions/max_terminated_length": 973.0, + "completions/mean_length": 371.125, + "completions/mean_terminated_length": 371.125, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "epoch": 0.13751048098178215, + "grad_norm": 0.19813232123851776, + "kl": 0.04840087890625, + "learning_rate": 6.169793450983916e-07, + "loss": 0.0075, + "num_tokens": 37771176.0, + "reward": 0.1875000298023224, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 356.9750061035156, + "completions/mean_terminated_length": 356.9750061035156, + "completions/min_length": 76.0, + "completions/min_terminated_length": 76.0, + "epoch": 0.1378153822699901, + "grad_norm": 0.21376754343509674, + "kl": 0.0504150390625, + "learning_rate": 6.123859260212393e-07, + "loss": 0.013, + "num_tokens": 37853318.0, + "reward": 0.14875000715255737, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 317.6125183105469, + "completions/mean_terminated_length": 317.6125183105469, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.13812028355819803, + "grad_norm": 0.21950644254684448, + "kl": 0.0565185546875, + "learning_rate": 6.07882196949423e-07, + "loss": 0.0005, + "num_tokens": 37925903.0, + "reward": 0.27250000834465027, + "reward_std": 0.07424621284008026, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.17499999701976776, + "rewards/unicoder_reward_fn/std": 0.3823643922805786, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 934.0, + "completions/max_terminated_length": 934.0, + "completions/mean_length": 366.5625, + "completions/mean_terminated_length": 366.5625, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.13842518484640598, + "grad_norm": 0.15327180922031403, + "kl": 0.044921875, + "learning_rate": 6.034683468503948e-07, + "loss": -0.002, + "num_tokens": 38006736.0, + "reward": 0.17500001192092896, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 971.0, + "completions/max_terminated_length": 971.0, + "completions/mean_length": 375.3374938964844, + "completions/mean_terminated_length": 375.3374938964844, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.1387300861346139, + "grad_norm": 0.12953659892082214, + "kl": 0.04559326171875, + "learning_rate": 5.991445609204641e-07, + "loss": -0.0036, + "num_tokens": 38087977.0, + "reward": 0.17500002682209015, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 365.9125061035156, + "completions/mean_terminated_length": 365.9125061035156, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.13903498742282186, + "grad_norm": 0.1928936243057251, + "kl": 0.04638671875, + "learning_rate": 5.949110205770292e-07, + "loss": 0.0041, + "num_tokens": 38164564.0, + "reward": 0.21250002086162567, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1813.0, + "completions/max_terminated_length": 1813.0, + "completions/mean_length": 368.6125183105469, + "completions/mean_terminated_length": 368.6125183105469, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1393398887110298, + "grad_norm": 0.248763307929039, + "kl": 0.04815673828125, + "learning_rate": 5.90767903450964e-07, + "loss": 0.0063, + "num_tokens": 38244613.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1316.0, + "completions/max_terminated_length": 1316.0, + "completions/mean_length": 391.5249938964844, + "completions/mean_terminated_length": 391.5249938964844, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "epoch": 0.13964478999923774, + "grad_norm": 0.08757586777210236, + "kl": 0.04510498046875, + "learning_rate": 5.867153833791652e-07, + "loss": -0.0026, + "num_tokens": 38325939.0, + "reward": 0.13500002026557922, + "reward_std": 0.01767767034471035, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 328.1750183105469, + "completions/mean_terminated_length": 328.1750183105469, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "epoch": 0.1399496912874457, + "grad_norm": 0.25197768211364746, + "kl": 0.05291748046875, + "learning_rate": 5.827536303972587e-07, + "loss": 0.0076, + "num_tokens": 38399427.0, + "reward": 0.2237500250339508, + "reward_std": 0.10783378034830093, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 811.0, + "completions/max_terminated_length": 811.0, + "completions/mean_length": 348.25, + "completions/mean_terminated_length": 348.25, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "epoch": 0.14025459257565362, + "grad_norm": 0.1697019338607788, + "kl": 0.0504150390625, + "learning_rate": 5.78882810732465e-07, + "loss": -0.0017, + "num_tokens": 38476585.0, + "reward": 0.17250001430511475, + "reward_std": 0.03889087215065956, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1145.0, + "completions/max_terminated_length": 1145.0, + "completions/mean_length": 345.5, + "completions/mean_terminated_length": 345.5, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.14055949386386157, + "grad_norm": 0.18673694133758545, + "kl": 0.0472412109375, + "learning_rate": 5.75103086796625e-07, + "loss": 0.0019, + "num_tokens": 38552001.0, + "reward": 0.1862500160932541, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 801.0, + "completions/max_terminated_length": 801.0, + "completions/mean_length": 334.76251220703125, + "completions/mean_terminated_length": 334.76251220703125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "epoch": 0.14086439515206953, + "grad_norm": 0.13160358369350433, + "kl": 0.05072021484375, + "learning_rate": 5.714146171793846e-07, + "loss": -0.005, + "num_tokens": 38623824.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1443.0, + "completions/max_terminated_length": 1443.0, + "completions/mean_length": 379.9624938964844, + "completions/mean_terminated_length": 379.9624938964844, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.14116929644027745, + "grad_norm": 0.19666478037834167, + "kl": 0.046630859375, + "learning_rate": 5.678175566415422e-07, + "loss": 0.0027, + "num_tokens": 38704121.0, + "reward": 0.1875000149011612, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 919.0, + "completions/max_terminated_length": 919.0, + "completions/mean_length": 355.4750061035156, + "completions/mean_terminated_length": 355.4750061035156, + "completions/min_length": 86.0, + "completions/min_terminated_length": 86.0, + "epoch": 0.1414741977284854, + "grad_norm": 0.14412783086299896, + "kl": 0.04425048828125, + "learning_rate": 5.643120561085528e-07, + "loss": 0.003, + "num_tokens": 38779325.0, + "reward": 0.1875000149011612, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 358.8625183105469, + "completions/mean_terminated_length": 358.8625183105469, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.14177909901669333, + "grad_norm": 0.21223917603492737, + "kl": 0.0516357421875, + "learning_rate": 5.608982626641991e-07, + "loss": -0.0045, + "num_tokens": 38855108.0, + "reward": 0.10875000059604645, + "reward_std": 0.02298097126185894, + "rewards/format_reward/mean": 0.9624999761581421, + "rewards/format_reward/std": 0.1911821961402893, + "rewards/unicoder_reward_fn/mean": 0.012500000186264515, + "rewards/unicoder_reward_fn/std": 0.11180339753627777, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 354.75, + "completions/mean_terminated_length": 354.75, + "completions/min_length": 67.0, + "completions/min_terminated_length": 67.0, + "epoch": 0.1420840003049013, + "grad_norm": 0.18273437023162842, + "kl": 0.0477294921875, + "learning_rate": 5.575763195444166e-07, + "loss": -0.0039, + "num_tokens": 38932418.0, + "reward": 0.2250000238418579, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1019.0, + "completions/max_terminated_length": 1019.0, + "completions/mean_length": 340.6125183105469, + "completions/mean_terminated_length": 340.6125183105469, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "epoch": 0.14238890159310924, + "grad_norm": 0.1895855814218521, + "kl": 0.05322265625, + "learning_rate": 5.543463661312847e-07, + "loss": -0.0005, + "num_tokens": 39007663.0, + "reward": 0.17500002682209015, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 364.6499938964844, + "completions/mean_terminated_length": 364.6499938964844, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "epoch": 0.14269380288131717, + "grad_norm": 0.10075707733631134, + "kl": 0.04315185546875, + "learning_rate": 5.512085379471808e-07, + "loss": 0.0025, + "num_tokens": 39086407.0, + "reward": 0.15000002086162567, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 968.0, + "completions/max_terminated_length": 968.0, + "completions/mean_length": 356.0249938964844, + "completions/mean_terminated_length": 356.0249938964844, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "epoch": 0.14299870416952512, + "grad_norm": 0.2628437280654907, + "kl": 0.04742431640625, + "learning_rate": 5.481629666490903e-07, + "loss": 0.0207, + "num_tokens": 39160057.0, + "reward": 0.23625002801418304, + "reward_std": 0.09015611559152603, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1448.0, + "completions/max_terminated_length": 1448.0, + "completions/mean_length": 354.4750061035156, + "completions/mean_terminated_length": 354.4750061035156, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.14330360545773305, + "grad_norm": 0.24940533936023712, + "kl": 0.04864501953125, + "learning_rate": 5.452097800230853e-07, + "loss": -0.0037, + "num_tokens": 39234485.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 784.0, + "completions/max_terminated_length": 784.0, + "completions/mean_length": 324.2250061035156, + "completions/mean_terminated_length": 324.2250061035156, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "epoch": 0.143608506745941, + "grad_norm": 0.23759277164936066, + "kl": 0.0521240234375, + "learning_rate": 5.423491019789623e-07, + "loss": 0.0016, + "num_tokens": 39308599.0, + "reward": 0.1875000149011612, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.012499999999999956, + "completions/max_length": 2048.0, + "completions/max_terminated_length": 809.0, + "completions/mean_length": 337.5375061035156, + "completions/mean_terminated_length": 315.8860778808594, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.14391340803414895, + "grad_norm": 0.33607712388038635, + "kl": 0.04736328125, + "learning_rate": 5.395810525450425e-07, + "loss": 0.0124, + "num_tokens": 39379734.0, + "reward": 0.24750001728534698, + "reward_std": 0.14495688676834106, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.15000000596046448, + "rewards/unicoder_reward_fn/std": 0.35932427644729614, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 907.0, + "completions/max_terminated_length": 907.0, + "completions/mean_length": 340.9250183105469, + "completions/mean_terminated_length": 340.9250183105469, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "epoch": 0.14421830932235688, + "grad_norm": 0.13019315898418427, + "kl": 0.044921875, + "learning_rate": 5.369057478631359e-07, + "loss": -0.0017, + "num_tokens": 39455358.0, + "reward": 0.14875000715255737, + "reward_std": 0.03712310642004013, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1112.0, + "completions/max_terminated_length": 1112.0, + "completions/mean_length": 377.5500183105469, + "completions/mean_terminated_length": 377.5500183105469, + "completions/min_length": 88.0, + "completions/min_terminated_length": 88.0, + "epoch": 0.14452321061056483, + "grad_norm": 0.18562644720077515, + "kl": 0.04766845703125, + "learning_rate": 5.343233001836694e-07, + "loss": 0.01, + "num_tokens": 39536722.0, + "reward": 0.13625001907348633, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.03750000149011612, + "rewards/unicoder_reward_fn/std": 0.1911821961402893, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1605.0, + "completions/max_terminated_length": 1605.0, + "completions/mean_length": 356.63751220703125, + "completions/mean_terminated_length": 356.63751220703125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "epoch": 0.14482811189877276, + "grad_norm": 0.1613534539937973, + "kl": 0.04925537109375, + "learning_rate": 5.318338178609754e-07, + "loss": -0.0015, + "num_tokens": 39609775.0, + "reward": 0.1862500160932541, + "reward_std": 0.01944543607532978, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 795.0, + "completions/max_terminated_length": 795.0, + "completions/mean_length": 341.8999938964844, + "completions/mean_terminated_length": 341.8999938964844, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "epoch": 0.1451330131869807, + "grad_norm": 0.24990017712116241, + "kl": 0.05035400390625, + "learning_rate": 5.294374053487459e-07, + "loss": 0.0108, + "num_tokens": 39686339.0, + "reward": 0.21250002086162567, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 737.0, + "completions/max_terminated_length": 737.0, + "completions/mean_length": 344.375, + "completions/mean_terminated_length": 344.375, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.14543791447518867, + "grad_norm": 0.24874407052993774, + "kl": 0.05511474609375, + "learning_rate": 5.271341631956511e-07, + "loss": -0.0035, + "num_tokens": 39761839.0, + "reward": 0.18500001728534698, + "reward_std": 0.09192388504743576, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 794.0, + "completions/max_terminated_length": 794.0, + "completions/mean_length": 385.45001220703125, + "completions/mean_terminated_length": 385.45001220703125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "epoch": 0.1457428157633966, + "grad_norm": 0.2202742099761963, + "kl": 0.04620361328125, + "learning_rate": 5.249241880411181e-07, + "loss": 0.0121, + "num_tokens": 39843201.0, + "reward": 0.17500002682209015, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 822.0, + "completions/max_terminated_length": 822.0, + "completions/mean_length": 377.8625183105469, + "completions/mean_terminated_length": 377.8625183105469, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "epoch": 0.14604771705160455, + "grad_norm": 0.21568068861961365, + "kl": 0.04791259765625, + "learning_rate": 5.228075726112785e-07, + "loss": -0.0043, + "num_tokens": 39922578.0, + "reward": 0.1875000149011612, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434911370277405, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 991.0, + "completions/max_terminated_length": 991.0, + "completions/mean_length": 336.1125183105469, + "completions/mean_terminated_length": 336.1125183105469, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "epoch": 0.14635261833981247, + "grad_norm": 0.19107092916965485, + "kl": 0.05096435546875, + "learning_rate": 5.207844057150768e-07, + "loss": 0.0107, + "num_tokens": 39995037.0, + "reward": 0.1875000298023224, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 321.88751220703125, + "completions/mean_terminated_length": 321.88751220703125, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "epoch": 0.14665751962802043, + "grad_norm": 0.2602921724319458, + "kl": 0.0550537109375, + "learning_rate": 5.188547722405437e-07, + "loss": 0.0126, + "num_tokens": 40066734.0, + "reward": 0.21250002086162567, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1018.0, + "completions/max_terminated_length": 1018.0, + "completions/mean_length": 387.6875, + "completions/mean_terminated_length": 387.6875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "epoch": 0.14696242091622838, + "grad_norm": 0.1923561990261078, + "kl": 0.04376220703125, + "learning_rate": 5.170187531512351e-07, + "loss": -0.0073, + "num_tokens": 40148063.0, + "reward": 0.1612500250339508, + "reward_std": 0.09015611559152603, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339753627777, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 854.0, + "completions/max_terminated_length": 854.0, + "completions/mean_length": 371.9250183105469, + "completions/mean_terminated_length": 371.9250183105469, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "epoch": 0.1472673222044363, + "grad_norm": 0.12472382187843323, + "kl": 0.0458984375, + "learning_rate": 5.152764254828348e-07, + "loss": 0.0028, + "num_tokens": 40228487.0, + "reward": 0.125, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1130.0, + "completions/max_terminated_length": 1130.0, + "completions/mean_length": 335.5249938964844, + "completions/mean_terminated_length": 335.5249938964844, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "epoch": 0.14757222349264426, + "grad_norm": 0.21659334003925323, + "kl": 0.05364990234375, + "learning_rate": 5.136278623399225e-07, + "loss": 0.0077, + "num_tokens": 40302519.0, + "reward": 0.21250002086162567, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 858.0, + "completions/max_terminated_length": 858.0, + "completions/mean_length": 372.6750183105469, + "completions/mean_terminated_length": 372.6750183105469, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "epoch": 0.1478771247808522, + "grad_norm": 0.26435017585754395, + "kl": 0.0491943359375, + "learning_rate": 5.120731328929058e-07, + "loss": 0.0008, + "num_tokens": 40380843.0, + "reward": 0.21125002205371857, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 869.0, + "completions/max_terminated_length": 869.0, + "completions/mean_length": 364.2875061035156, + "completions/mean_terminated_length": 364.2875061035156, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "epoch": 0.14818202606906014, + "grad_norm": 0.22579143941402435, + "kl": 0.04638671875, + "learning_rate": 5.106123023751187e-07, + "loss": -0.0071, + "num_tokens": 40455084.0, + "reward": 0.17500002682209015, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531530380249, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1316.0, + "completions/max_terminated_length": 1316.0, + "completions/mean_length": 376.2875061035156, + "completions/mean_terminated_length": 376.2875061035156, + "completions/min_length": 79.0, + "completions/min_terminated_length": 79.0, + "epoch": 0.1484869273572681, + "grad_norm": 0.15771447122097015, + "kl": 0.04400634765625, + "learning_rate": 5.092454320800833e-07, + "loss": 0.003, + "num_tokens": 40533413.0, + "reward": 0.16250000894069672, + "reward_std": 0.0530330054461956, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 359.1875, + "completions/mean_terminated_length": 359.1875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "epoch": 0.14879182864547602, + "grad_norm": 0.2311951071023941, + "kl": 0.0474853515625, + "learning_rate": 5.079725793589405e-07, + "loss": 0.0024, + "num_tokens": 40607952.0, + "reward": 0.2250000238418579, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.125, + "rewards/unicoder_reward_fn/std": 0.33280548453330994, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 980.0, + "completions/max_terminated_length": 980.0, + "completions/mean_length": 342.7375183105469, + "completions/mean_terminated_length": 342.7375183105469, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "epoch": 0.14909672993368397, + "grad_norm": 0.18008172512054443, + "kl": 0.04840087890625, + "learning_rate": 5.067937976180407e-07, + "loss": 0.0046, + "num_tokens": 40683967.0, + "reward": 0.20000003278255463, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.10000000149011612, + "rewards/unicoder_reward_fn/std": 0.3018927276134491, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 861.0, + "completions/max_terminated_length": 861.0, + "completions/mean_length": 366.8999938964844, + "completions/mean_terminated_length": 366.8999938964844, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "epoch": 0.1494016312218919, + "grad_norm": 0.24172508716583252, + "kl": 0.04803466796875, + "learning_rate": 5.057091363167046e-07, + "loss": -0.0081, + "num_tokens": 40761483.0, + "reward": 0.1862500160932541, + "reward_std": 0.05480077490210533, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.08749999850988388, + "rewards/unicoder_reward_fn/std": 0.28434914350509644, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 364.7875061035156, + "completions/mean_terminated_length": 364.7875061035156, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "epoch": 0.14970653251009985, + "grad_norm": 0.18381540477275848, + "kl": 0.04437255859375, + "learning_rate": 5.047186409651489e-07, + "loss": 0.0102, + "num_tokens": 40838238.0, + "reward": 0.15000002086162567, + "reward_std": 0.0707106813788414, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 724.0, + "completions/max_terminated_length": 724.0, + "completions/mean_length": 346.7375183105469, + "completions/mean_terminated_length": 346.7375183105469, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "epoch": 0.1500114337983078, + "grad_norm": 0.12727724015712738, + "kl": 0.0479736328125, + "learning_rate": 5.038223531225742e-07, + "loss": -0.0009, + "num_tokens": 40914663.0, + "reward": 0.125, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.02500000037252903, + "rewards/unicoder_reward_fn/std": 0.15710999071598053, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 707.0, + "completions/max_terminated_length": 707.0, + "completions/mean_length": 353.95001220703125, + "completions/mean_terminated_length": 353.95001220703125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "epoch": 0.15031633508651573, + "grad_norm": 0.21529246866703033, + "kl": 0.05010986328125, + "learning_rate": 5.030203103954232e-07, + "loss": 0.016, + "num_tokens": 40989059.0, + "reward": 0.17500002682209015, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531530380249, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 831.0, + "completions/max_terminated_length": 831.0, + "completions/mean_length": 391.7250061035156, + "completions/mean_terminated_length": 391.7250061035156, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "epoch": 0.1506212363747237, + "grad_norm": 0.14168554544448853, + "kl": 0.04669189453125, + "learning_rate": 5.023125464358026e-07, + "loss": 0.004, + "num_tokens": 41072121.0, + "reward": 0.14875000715255737, + "reward_std": 0.03358757123351097, + "rewards/format_reward/mean": 0.987500011920929, + "rewards/format_reward/std": 0.11180339008569717, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1150.0, + "completions/max_terminated_length": 1150.0, + "completions/mean_length": 443.3000183105469, + "completions/mean_terminated_length": 443.3000183105469, + "completions/min_length": 80.0, + "completions/min_terminated_length": 80.0, + "epoch": 0.15092613766293161, + "grad_norm": 0.14296044409275055, + "kl": 0.04107666015625, + "learning_rate": 5.016990909400709e-07, + "loss": 0.0064, + "num_tokens": 41157177.0, + "reward": 0.15000000596046448, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.05000000074505806, + "rewards/unicoder_reward_fn/std": 0.21931999921798706, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 910.0, + "completions/max_terminated_length": 910.0, + "completions/mean_length": 376.1125183105469, + "completions/mean_terminated_length": 376.1125183105469, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.15123103895113957, + "grad_norm": 0.23286934196949005, + "kl": 0.04718017578125, + "learning_rate": 5.011799696475915e-07, + "loss": 0.011, + "num_tokens": 41237998.0, + "reward": 0.21000002324581146, + "reward_std": 0.05656854063272476, + "rewards/format_reward/mean": 0.9750000238418579, + "rewards/format_reward/std": 0.15710997581481934, + "rewards/unicoder_reward_fn/mean": 0.11249999701976776, + "rewards/unicoder_reward_fn/std": 0.3179742097854614, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 816.0, + "completions/max_terminated_length": 816.0, + "completions/mean_length": 385.8000183105469, + "completions/mean_terminated_length": 385.8000183105469, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "epoch": 0.15153594023934752, + "grad_norm": 0.13556820154190063, + "kl": 0.04815673828125, + "learning_rate": 5.007552043396547e-07, + "loss": 0.006, + "num_tokens": 41312662.0, + "reward": 0.17500001192092896, + "reward_std": 0.0353553406894207, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.07500000298023224, + "rewards/unicoder_reward_fn/std": 0.2650531232357025, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1462.0, + "completions/max_terminated_length": 1462.0, + "completions/mean_length": 366.8374938964844, + "completions/mean_terminated_length": 366.8374938964844, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "epoch": 0.15184084152755545, + "grad_norm": 0.24061597883701324, + "kl": 0.04522705078125, + "learning_rate": 5.004248128385618e-07, + "loss": 0.004, + "num_tokens": 41388355.0, + "reward": 0.2500000298023224, + "reward_std": 0.1060660108923912, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.15000000596046448, + "rewards/unicoder_reward_fn/std": 0.35932427644729614, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 412.2375183105469, + "completions/mean_terminated_length": 412.2375183105469, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "epoch": 0.1521457428157634, + "grad_norm": 0.1985752284526825, + "kl": 0.0421142578125, + "learning_rate": 5.001888090068784e-07, + "loss": -0.0046, + "num_tokens": 41472988.0, + "reward": 0.1625000238418579, + "reward_std": 0.0883883461356163, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.0625, + "rewards/unicoder_reward_fn/std": 0.2435886710882187, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 994.0, + "completions/max_terminated_length": 994.0, + "completions/mean_length": 380.9875183105469, + "completions/mean_terminated_length": 380.9875183105469, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "epoch": 0.15245064410397133, + "grad_norm": 0.24799692630767822, + "kl": 0.0428466796875, + "learning_rate": 5.000472027468528e-07, + "loss": 0.0084, + "num_tokens": 41550467.0, + "reward": 0.23750002682209015, + "reward_std": 0.12374367564916611, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "rewards/unicoder_reward_fn/mean": 0.13750000298023224, + "rewards/unicoder_reward_fn/std": 0.3465471565723419, + "step": 500 + }, + { + "epoch": 0.15245064410397133, + "step": 500, + "total_flos": 0.0, + "train_loss": 0.0025880077524270744, + "train_runtime": 10060.4286, + "train_samples_per_second": 3.976, + "train_steps_per_second": 0.05 + } + ], + "logging_steps": 1, + "max_steps": 500, + "num_input_tokens_seen": 41550467, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}