{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.15245064410397133, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1096.0, "completions/max_terminated_length": 1096.0, "completions/mean_length": 509.875, "completions/mean_terminated_length": 509.875, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.0003049012882079427, "grad_norm": 0.29205822944641113, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0179, "num_tokens": 89984.0, "reward": 0.008750000968575478, "reward_std": 0.01237436942756176, "rewards/format_reward/mean": 0.08749999850988388, "rewards/format_reward/std": 0.28434914350509644, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 454.3500061035156, "completions/mean_terminated_length": 454.3500061035156, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.0006098025764158854, "grad_norm": 0.17669807374477386, "kl": 0.0, "learning_rate": 3.3333333333333335e-07, "loss": 0.0108, "num_tokens": 173100.0, "reward": 0.003750000149011612, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.03750000149011612, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 524.1000366210938, "completions/mean_terminated_length": 524.1000366210938, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.000914703864623828, "grad_norm": 0.16274994611740112, "kl": 0.0002689361572265625, "learning_rate": 6.666666666666667e-07, "loss": 0.0008, "num_tokens": 264720.0, "reward": 0.003750000149011612, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.03750000149011612, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1683.0, "completions/max_terminated_length": 1683.0, "completions/mean_length": 530.7125244140625, "completions/mean_terminated_length": 530.7125244140625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "epoch": 0.0012196051528317708, "grad_norm": 0.0007452780846506357, "kl": 0.0002593994140625, "learning_rate": 1.0000000000000002e-06, "loss": 0.0, "num_tokens": 361177.0, "reward": 0.0, "reward_std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1767.0, "completions/mean_length": 514.2750244140625, "completions/mean_terminated_length": 494.86077880859375, "completions/min_length": 48.0, "completions/min_terminated_length": 48.0, "epoch": 0.0015245064410397133, "grad_norm": 0.1586974710226059, "kl": 0.00028252601623535156, "learning_rate": 1.3333333333333334e-06, "loss": 0.0063, "num_tokens": 447999.0, "reward": 0.003750000149011612, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.03750000149011612, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1261.0, "completions/mean_length": 532.2250366210938, "completions/mean_terminated_length": 513.0379638671875, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.001829407729247656, "grad_norm": 0.2026820033788681, "kl": 0.0002999305725097656, "learning_rate": 1.6666666666666667e-06, "loss": -0.0177, "num_tokens": 538403.0, "reward": 0.0062500000931322575, "reward_std": 0.008838835172355175, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.2435886710882187, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 524.6875, "completions/mean_terminated_length": 505.40509033203125, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.0021343090174555986, "grad_norm": 0.19141650199890137, "kl": 0.00039958953857421875, "learning_rate": 2.0000000000000003e-06, "loss": -0.0069, "num_tokens": 628748.0, "reward": 0.003750000149011612, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.03750000149011612, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1454.0, "completions/max_terminated_length": 1454.0, "completions/mean_length": 480.625, "completions/mean_terminated_length": 480.625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.0024392103056635416, "grad_norm": 0.29890286922454834, "kl": 0.0007905960083007812, "learning_rate": 2.3333333333333336e-06, "loss": -0.0039, "num_tokens": 714662.0, "reward": 0.010000000707805157, "reward_std": 0.014142136089503765, "rewards/format_reward/mean": 0.10000000149011612, "rewards/format_reward/std": 0.3018927574157715, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 520.4625244140625, "completions/mean_terminated_length": 501.1265869140625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.002744111593871484, "grad_norm": 0.36780887842178345, "kl": 0.0013713836669921875, "learning_rate": 2.666666666666667e-06, "loss": 0.0206, "num_tokens": 807605.0, "reward": 0.021250000223517418, "reward_std": 0.0265165064483881, "rewards/format_reward/mean": 0.21250000596046448, "rewards/format_reward/std": 0.4116576611995697, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1675.0, "completions/max_terminated_length": 1675.0, "completions/mean_length": 538.6625366210938, "completions/mean_terminated_length": 538.6625366210938, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.0030490128820794267, "grad_norm": 0.4874690771102905, "kl": 0.003643035888671875, "learning_rate": 3e-06, "loss": 0.0225, "num_tokens": 897952.0, "reward": 0.05625000223517418, "reward_std": 0.04065864160656929, "rewards/format_reward/mean": 0.5625, "rewards/format_reward/std": 0.4992082417011261, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 517.8500366210938, "completions/mean_terminated_length": 517.8500366210938, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.0033539141702873696, "grad_norm": 0.49236273765563965, "kl": 0.00574493408203125, "learning_rate": 3.3333333333333333e-06, "loss": -0.004, "num_tokens": 989026.0, "reward": 0.06625000387430191, "reward_std": 0.03712311014533043, "rewards/format_reward/mean": 0.6625000238418579, "rewards/format_reward/std": 0.47584035992622375, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 464.4875183105469, "completions/mean_terminated_length": 464.4875183105469, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.003658815458495312, "grad_norm": 0.3598475754261017, "kl": 0.01023101806640625, "learning_rate": 3.6666666666666666e-06, "loss": -0.0077, "num_tokens": 1075109.0, "reward": 0.08749999850988388, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33280548453330994, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 446.8000183105469, "completions/mean_terminated_length": 446.8000183105469, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.003963716746703255, "grad_norm": 0.37172672152519226, "kl": 0.013092041015625, "learning_rate": 4.000000000000001e-06, "loss": -0.013, "num_tokens": 1160069.0, "reward": 0.08125000447034836, "reward_std": 0.01944543793797493, "rewards/format_reward/mean": 0.8125, "rewards/format_reward/std": 0.39277493953704834, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1425.0, "completions/max_terminated_length": 1425.0, "completions/mean_length": 507.1750183105469, "completions/mean_terminated_length": 507.1750183105469, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.004268618034911197, "grad_norm": 0.1720256209373474, "kl": 0.015472412109375, "learning_rate": 4.333333333333334e-06, "loss": 0.0009, "num_tokens": 1252237.0, "reward": 0.0937500074505806, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.2435886710882187, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1569.0, "completions/max_terminated_length": 1569.0, "completions/mean_length": 400.88751220703125, "completions/mean_terminated_length": 400.88751220703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.00457351932311914, "grad_norm": 0.18945425748825073, "kl": 0.02191162109375, "learning_rate": 4.666666666666667e-06, "loss": 0.0034, "num_tokens": 1330022.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1017.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 378.8500061035156, "completions/mean_terminated_length": 378.8500061035156, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.004878420611327083, "grad_norm": 0.10855857282876968, "kl": 0.026611328125, "learning_rate": 5e-06, "loss": -0.0117, "num_tokens": 1410306.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 407.0, "completions/mean_terminated_length": 407.0, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.005183321899535026, "grad_norm": 0.1446218341588974, "kl": 0.036224365234375, "learning_rate": 4.999952797253148e-06, "loss": 0.0001, "num_tokens": 1486402.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 369.8625183105469, "completions/mean_terminated_length": 348.6202697753906, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.005488223187742968, "grad_norm": 0.21489065885543823, "kl": 0.03070068359375, "learning_rate": 4.9998111909931225e-06, "loss": -0.004, "num_tokens": 1560439.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 372.3374938964844, "completions/mean_terminated_length": 372.3374938964844, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.005793124475950911, "grad_norm": 0.04406031593680382, "kl": 0.0367431640625, "learning_rate": 4.999575187161439e-06, "loss": 0.0004, "num_tokens": 1634478.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1460.0, "completions/max_terminated_length": 1460.0, "completions/mean_length": 385.125, "completions/mean_terminated_length": 385.125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.006098025764158853, "grad_norm": 0.018597273156046867, "kl": 0.02655029296875, "learning_rate": 4.9992447956603455e-06, "loss": 0.0003, "num_tokens": 1715820.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025000000000000022, "completions/max_length": 2048.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 392.8500061035156, "completions/mean_terminated_length": 350.4102478027344, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.006402927052366796, "grad_norm": 0.19325323402881622, "kl": 0.029815673828125, "learning_rate": 4.998820030352409e-06, "loss": 0.0281, "num_tokens": 1795344.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 334.625, "completions/mean_terminated_length": 334.625, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.006707828340574739, "grad_norm": 0.13783958554267883, "kl": 0.031219482421875, "learning_rate": 4.998300909059929e-06, "loss": -0.0075, "num_tokens": 1870718.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1610.0, "completions/max_terminated_length": 1610.0, "completions/mean_length": 328.76251220703125, "completions/mean_terminated_length": 328.76251220703125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.007012729628782682, "grad_norm": 0.0586252324283123, "kl": 0.0343017578125, "learning_rate": 4.997687453564198e-06, "loss": 0.0003, "num_tokens": 1945877.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 403.4875183105469, "completions/mean_terminated_length": 403.4875183105469, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.007317630916990624, "grad_norm": 0.032569218426942825, "kl": 0.024200439453125, "learning_rate": 4.9969796896045775e-06, "loss": 0.0002, "num_tokens": 2028546.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03749999999999998, "completions/max_length": 2048.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 424.1000061035156, "completions/mean_terminated_length": 360.8311767578125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.007622532205198567, "grad_norm": 0.11784270405769348, "kl": 0.022308349609375, "learning_rate": 4.996177646877426e-06, "loss": 0.0503, "num_tokens": 2108420.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 764.0, "completions/max_terminated_length": 764.0, "completions/mean_length": 363.6125183105469, "completions/mean_terminated_length": 363.6125183105469, "completions/min_length": 57.0, "completions/min_terminated_length": 57.0, "epoch": 0.00792743349340651, "grad_norm": 0.029199976474046707, "kl": 0.022918701171875, "learning_rate": 4.995281359034851e-06, "loss": 0.0002, "num_tokens": 2184783.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 361.01251220703125, "completions/mean_terminated_length": 361.01251220703125, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.008232334781614453, "grad_norm": 0.027427662163972855, "kl": 0.024810791015625, "learning_rate": 4.994290863683296e-06, "loss": 0.0002, "num_tokens": 2260382.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 450.63751220703125, "completions/mean_terminated_length": 450.63751220703125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.008537236069822395, "grad_norm": 0.011162430979311466, "kl": 0.0190887451171875, "learning_rate": 4.99320620238196e-06, "loss": 0.0002, "num_tokens": 2345657.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1869.0, "completions/mean_length": 497.01251220703125, "completions/mean_terminated_length": 477.3797607421875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.008842137358030338, "grad_norm": 0.09861345589160919, "kl": 0.01690673828125, "learning_rate": 4.99202742064106e-06, "loss": 0.0143, "num_tokens": 2432992.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 524.5625, "completions/mean_terminated_length": 505.27850341796875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.00914703864623828, "grad_norm": 0.08128828555345535, "kl": 0.017486572265625, "learning_rate": 4.990754567919917e-06, "loss": 0.0157, "num_tokens": 2526345.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 462.7749938964844, "completions/mean_terminated_length": 442.7088623046875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.009451939934446223, "grad_norm": 0.16708894073963165, "kl": 0.01788330078125, "learning_rate": 4.989387697624881e-06, "loss": 0.021, "num_tokens": 2608671.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 542.2000122070312, "completions/mean_terminated_length": 523.1392822265625, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.009756841222654166, "grad_norm": 0.05789264664053917, "kl": 0.0167236328125, "learning_rate": 4.987926867107095e-06, "loss": 0.0154, "num_tokens": 2702183.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1897.0, "completions/max_terminated_length": 1897.0, "completions/mean_length": 499.20001220703125, "completions/mean_terminated_length": 499.20001220703125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.010061742510862108, "grad_norm": 0.009858060628175735, "kl": 0.018798828125, "learning_rate": 4.986372137660078e-06, "loss": 0.0002, "num_tokens": 2787461.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025000000000000022, "completions/max_length": 2048.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 614.0375366210938, "completions/mean_terminated_length": 577.2692260742188, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.010366643799070051, "grad_norm": 0.10538285225629807, "kl": 0.0189208984375, "learning_rate": 4.984723574517165e-06, "loss": 0.0309, "num_tokens": 2887760.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 528.1625366210938, "completions/mean_terminated_length": 528.1625366210938, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.010671545087277993, "grad_norm": 0.1755860298871994, "kl": 0.019439697265625, "learning_rate": 4.9829812468487655e-06, "loss": 0.0171, "num_tokens": 2981059.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 573.625, "completions/mean_terminated_length": 554.9620361328125, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "epoch": 0.010976446375485937, "grad_norm": 0.10435257107019424, "kl": 0.018310546875, "learning_rate": 4.981145227759457e-06, "loss": 0.0148, "num_tokens": 3077261.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 525.2625122070312, "completions/mean_terminated_length": 525.2625122070312, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.01128134766369388, "grad_norm": 0.11834096163511276, "kl": 0.01971435546875, "learning_rate": 4.979215594284924e-06, "loss": -0.0022, "num_tokens": 3168516.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1372.0, "completions/max_terminated_length": 1372.0, "completions/mean_length": 530.125, "completions/mean_terminated_length": 530.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.011586248951901822, "grad_norm": 0.1209435909986496, "kl": 0.0193939208984375, "learning_rate": 4.977192427388722e-06, "loss": 0.0082, "num_tokens": 3260722.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 550.6124877929688, "completions/mean_terminated_length": 531.6582641601562, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.011891150240109765, "grad_norm": 0.127229243516922, "kl": 0.019744873046875, "learning_rate": 4.9750758119588824e-06, "loss": 0.0111, "num_tokens": 3350507.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1225.0, "completions/max_terminated_length": 1225.0, "completions/mean_length": 476.4125061035156, "completions/mean_terminated_length": 476.4125061035156, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.012196051528317707, "grad_norm": 0.00884264800697565, "kl": 0.01934814453125, "learning_rate": 4.972865836804349e-06, "loss": 0.0002, "num_tokens": 3435038.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2025.0, "completions/max_terminated_length": 2025.0, "completions/mean_length": 523.1749877929688, "completions/mean_terminated_length": 523.1749877929688, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.01250095281652565, "grad_norm": 0.05174906179308891, "kl": 0.018310546875, "learning_rate": 4.970562594651254e-06, "loss": 0.0024, "num_tokens": 3527120.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 429.125, "completions/mean_terminated_length": 429.125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.012805854104733592, "grad_norm": 0.008613626472651958, "kl": 0.02117919921875, "learning_rate": 4.968166182139026e-06, "loss": 0.0002, "num_tokens": 3606646.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 415.57501220703125, "completions/mean_terminated_length": 415.57501220703125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.013110755392941535, "grad_norm": 0.15895198285579681, "kl": 0.021728515625, "learning_rate": 4.9656766998163306e-06, "loss": -0.0001, "num_tokens": 3686402.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 457.8374938964844, "completions/mean_terminated_length": 457.8374938964844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.013415656681149479, "grad_norm": 0.0086384741589427, "kl": 0.019500732421875, "learning_rate": 4.963094252136865e-06, "loss": 0.0002, "num_tokens": 3771961.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 392.7375183105469, "completions/mean_terminated_length": 392.7375183105469, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.01372055796935742, "grad_norm": 0.00958193838596344, "kl": 0.022064208984375, "learning_rate": 4.960418947454958e-06, "loss": 0.0002, "num_tokens": 3850288.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1287.0, "completions/max_terminated_length": 1287.0, "completions/mean_length": 470.5249938964844, "completions/mean_terminated_length": 470.5249938964844, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.014025459257565364, "grad_norm": 0.008218112401664257, "kl": 0.018829345703125, "learning_rate": 4.957650898021038e-06, "loss": 0.0002, "num_tokens": 3937080.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 383.45001220703125, "completions/mean_terminated_length": 362.3797607421875, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.014330360545773305, "grad_norm": 0.08822523057460785, "kl": 0.021636962890625, "learning_rate": 4.954790219976915e-06, "loss": 0.0225, "num_tokens": 4014978.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 438.5625, "completions/mean_terminated_length": 438.5625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.014635261833981249, "grad_norm": 0.1345287412405014, "kl": 0.0201416015625, "learning_rate": 4.95183703335091e-06, "loss": -0.0076, "num_tokens": 4095191.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 431.4125061035156, "completions/mean_terminated_length": 431.4125061035156, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.01494016312218919, "grad_norm": 0.008926213718950748, "kl": 0.019744873046875, "learning_rate": 4.948791462052819e-06, "loss": 0.0002, "num_tokens": 4175032.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1367.0, "completions/mean_length": 470.625, "completions/mean_terminated_length": 450.6582336425781, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.015245064410397134, "grad_norm": 0.08256660401821136, "kl": 0.0185546875, "learning_rate": 4.945653633868716e-06, "loss": 0.0172, "num_tokens": 4258442.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1280.0, "completions/max_terminated_length": 1280.0, "completions/mean_length": 416.125, "completions/mean_terminated_length": 416.125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.015549965698605077, "grad_norm": 0.03866208344697952, "kl": 0.021453857421875, "learning_rate": 4.942423680455584e-06, "loss": 0.0002, "num_tokens": 4338172.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 421.0874938964844, "completions/mean_terminated_length": 421.0874938964844, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.01585486698681302, "grad_norm": 0.008591040968894958, "kl": 0.021087646484375, "learning_rate": 4.939101737335802e-06, "loss": 0.0002, "num_tokens": 4421777.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1862.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 466.82501220703125, "completions/mean_terminated_length": 466.82501220703125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.016159768275020962, "grad_norm": 0.009130142629146576, "kl": 0.02008056640625, "learning_rate": 4.935687943891447e-06, "loss": 0.0002, "num_tokens": 4510599.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 418.9375, "completions/mean_terminated_length": 418.9375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.016464669563228906, "grad_norm": 0.007702388800680637, "kl": 0.01898193359375, "learning_rate": 4.932182443358458e-06, "loss": 0.0002, "num_tokens": 4589492.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 425.9375, "completions/mean_terminated_length": 425.9375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.016769570851436846, "grad_norm": 0.4790143668651581, "kl": 0.12841796875, "learning_rate": 4.928585382820616e-06, "loss": 0.0013, "num_tokens": 4673435.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1364.0, "completions/max_terminated_length": 1364.0, "completions/mean_length": 433.2875061035156, "completions/mean_terminated_length": 433.2875061035156, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.01707447213964479, "grad_norm": 0.009019332937896252, "kl": 0.021881103515625, "learning_rate": 4.924896913203376e-06, "loss": 0.0002, "num_tokens": 4752248.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 468.7124938964844, "completions/mean_terminated_length": 468.7124938964844, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.017379373427852732, "grad_norm": 0.010964120738208294, "kl": 0.02215576171875, "learning_rate": 4.921117189267535e-06, "loss": 0.0002, "num_tokens": 4833935.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 458.9750061035156, "completions/mean_terminated_length": 458.9750061035156, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.017684274716060676, "grad_norm": 0.07652813196182251, "kl": 0.026702880859375, "learning_rate": 4.917246369602742e-06, "loss": 0.0062, "num_tokens": 4917547.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1608.0, "completions/max_terminated_length": 1608.0, "completions/mean_length": 455.3125, "completions/mean_terminated_length": 455.3125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.01798917600426862, "grad_norm": 0.1456306278705597, "kl": 0.03131103515625, "learning_rate": 4.9132846166208355e-06, "loss": -0.0105, "num_tokens": 5001028.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1673.0, "completions/max_terminated_length": 1673.0, "completions/mean_length": 439.875, "completions/mean_terminated_length": 439.875, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.01829407729247656, "grad_norm": 0.013068665750324726, "kl": 0.0283203125, "learning_rate": 4.9092320965490365e-06, "loss": 0.0003, "num_tokens": 5082706.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 432.8625183105469, "completions/mean_terminated_length": 432.8625183105469, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.018598978580684503, "grad_norm": 0.1498623639345169, "kl": 0.03411865234375, "learning_rate": 4.905088979422971e-06, "loss": -0.0192, "num_tokens": 5167075.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 478.20001220703125, "completions/mean_terminated_length": 478.20001220703125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.018903879868892446, "grad_norm": 0.15401065349578857, "kl": 0.03167724609375, "learning_rate": 4.900855439079536e-06, "loss": 0.0069, "num_tokens": 5255569.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 468.125, "completions/mean_terminated_length": 468.125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.01920878115710039, "grad_norm": 0.05759025737643242, "kl": 0.029327392578125, "learning_rate": 4.8965316531496055e-06, "loss": -0.0007, "num_tokens": 5340617.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1037.0, "completions/max_terminated_length": 1037.0, "completions/mean_length": 426.7875061035156, "completions/mean_terminated_length": 426.7875061035156, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.019513682445308333, "grad_norm": 0.2939658761024475, "kl": 0.03350830078125, "learning_rate": 4.892117803050578e-06, "loss": 0.022, "num_tokens": 5423220.0, "reward": 0.0925000011920929, "reward_std": 0.01060660183429718, "rewards/format_reward/mean": 0.925000011920929, "rewards/format_reward/std": 0.2650531232357025, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 502.9875183105469, "completions/mean_terminated_length": 502.9875183105469, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.019818583733516273, "grad_norm": 0.15384595096111298, "kl": 0.0291748046875, "learning_rate": 4.887614073978761e-06, "loss": 0.0003, "num_tokens": 5513533.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 496.1499938964844, "completions/mean_terminated_length": 476.50634765625, "completions/min_length": 45.0, "completions/min_terminated_length": 45.0, "epoch": 0.020123485021724216, "grad_norm": 0.13877885043621063, "kl": 0.0316162109375, "learning_rate": 4.883020654901609e-06, "loss": 0.0111, "num_tokens": 5601189.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1679.0, "completions/max_terminated_length": 1679.0, "completions/mean_length": 439.95001220703125, "completions/mean_terminated_length": 439.95001220703125, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.02042838630993216, "grad_norm": 0.14163745939731598, "kl": 0.0322265625, "learning_rate": 4.878337738549785e-06, "loss": 0.0032, "num_tokens": 5684995.0, "reward": 0.0925000011920929, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.925000011920929, "rewards/format_reward/std": 0.2650531232357025, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 409.3625183105469, "completions/mean_terminated_length": 409.3625183105469, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.020733287598140103, "grad_norm": 0.23476682603359222, "kl": 0.034271240234375, "learning_rate": 4.873565521409082e-06, "loss": 0.0071, "num_tokens": 5763802.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 387.625, "completions/mean_terminated_length": 366.60760498046875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.021038188886348046, "grad_norm": 0.15044650435447693, "kl": 0.037353515625, "learning_rate": 4.868704203712173e-06, "loss": 0.0222, "num_tokens": 5840026.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 370.6625061035156, "completions/mean_terminated_length": 370.6625061035156, "completions/min_length": 42.0, "completions/min_terminated_length": 42.0, "epoch": 0.021343090174555986, "grad_norm": 0.012564298696815968, "kl": 0.03436279296875, "learning_rate": 4.86375398943021e-06, "loss": 0.0003, "num_tokens": 5920369.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1343.0, "completions/max_terminated_length": 1343.0, "completions/mean_length": 390.3625183105469, "completions/mean_terminated_length": 390.3625183105469, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.02164799146276393, "grad_norm": 0.0150454081594944, "kl": 0.0364990234375, "learning_rate": 4.858715086264274e-06, "loss": 0.0004, "num_tokens": 6000374.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1405.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 405.95001220703125, "completions/mean_terminated_length": 405.95001220703125, "completions/min_length": 46.0, "completions/min_terminated_length": 46.0, "epoch": 0.021952892750971873, "grad_norm": 0.17304885387420654, "kl": 0.04144287109375, "learning_rate": 4.853587705636646e-06, "loss": 0.0302, "num_tokens": 6079606.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 375.45001220703125, "completions/mean_terminated_length": 375.45001220703125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.022257794039179817, "grad_norm": 0.2335526943206787, "kl": 0.0367431640625, "learning_rate": 4.84837206268195e-06, "loss": -0.0071, "num_tokens": 6155698.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1201.0, "completions/max_terminated_length": 1201.0, "completions/mean_length": 385.4875183105469, "completions/mean_terminated_length": 385.4875183105469, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.02256269532738776, "grad_norm": 0.17186102271080017, "kl": 0.0338134765625, "learning_rate": 4.8430683762381195e-06, "loss": 0.0066, "num_tokens": 6234269.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 410.1000061035156, "completions/mean_terminated_length": 389.3670959472656, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "epoch": 0.0228675966155957, "grad_norm": 0.05610320344567299, "kl": 0.035552978515625, "learning_rate": 4.837676868837213e-06, "loss": 0.0143, "num_tokens": 6314337.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 370.7250061035156, "completions/mean_terminated_length": 370.7250061035156, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "epoch": 0.023172497903803643, "grad_norm": 0.17599275708198547, "kl": 0.03692626953125, "learning_rate": 4.832197766696085e-06, "loss": 0.013, "num_tokens": 6387825.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1283.0, "completions/mean_length": 457.0625, "completions/mean_terminated_length": 436.924072265625, "completions/min_length": 63.0, "completions/min_terminated_length": 63.0, "epoch": 0.023477399192011587, "grad_norm": 0.07211699336767197, "kl": 0.03021240234375, "learning_rate": 4.826631299706887e-06, "loss": 0.0221, "num_tokens": 6476254.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025000000000000022, "completions/max_length": 2048.0, "completions/max_terminated_length": 1299.0, "completions/mean_length": 454.0500183105469, "completions/mean_terminated_length": 413.17950439453125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.02378230048021953, "grad_norm": 0.13199199736118317, "kl": 0.028106689453125, "learning_rate": 4.820977701427424e-06, "loss": 0.0396, "num_tokens": 6559896.0, "reward": 0.0949999988079071, "reward_std": 0.0070710680447518826, "rewards/format_reward/mean": 0.949999988079071, "rewards/format_reward/std": 0.21931999921798706, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 467.07501220703125, "completions/mean_terminated_length": 467.07501220703125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.02408720176842747, "grad_norm": 0.1922425776720047, "kl": 0.04315185546875, "learning_rate": 4.81523720907136e-06, "loss": 0.0114, "num_tokens": 6644664.0, "reward": 0.0962500050663948, "reward_std": 0.00530330091714859, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 427.0500183105469, "completions/mean_terminated_length": 427.0500183105469, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "epoch": 0.024392103056635413, "grad_norm": 0.10455196350812912, "kl": 0.036407470703125, "learning_rate": 4.809410063498254e-06, "loss": 0.0061, "num_tokens": 6728988.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 461.5500183105469, "completions/mean_terminated_length": 461.5500183105469, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.024697004344843357, "grad_norm": 0.09028248488903046, "kl": 0.032958984375, "learning_rate": 4.8034965092034656e-06, "loss": -0.0011, "num_tokens": 6817160.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1799.0, "completions/max_terminated_length": 1799.0, "completions/mean_length": 499.2250061035156, "completions/mean_terminated_length": 499.2250061035156, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0250019056330513, "grad_norm": 0.010679539293050766, "kl": 0.0279541015625, "learning_rate": 4.797496794307889e-06, "loss": 0.0003, "num_tokens": 6910458.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 460.26251220703125, "completions/mean_terminated_length": 460.26251220703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.025306806921259244, "grad_norm": 0.009925225749611855, "kl": 0.027679443359375, "learning_rate": 4.791411170547545e-06, "loss": 0.0003, "num_tokens": 6997335.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1793.0, "completions/max_terminated_length": 1793.0, "completions/mean_length": 476.9375, "completions/mean_terminated_length": 476.9375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.025611708209467184, "grad_norm": 0.14034686982631683, "kl": 0.0299072265625, "learning_rate": 4.785239893263017e-06, "loss": 0.0066, "num_tokens": 7084896.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 459.01251220703125, "completions/mean_terminated_length": 459.01251220703125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.025916609497675127, "grad_norm": 0.18238796293735504, "kl": 0.03082275390625, "learning_rate": 4.778983221388742e-06, "loss": 0.0036, "num_tokens": 7169579.0, "reward": 0.0949999988079071, "reward_std": 0.0070710680447518826, "rewards/format_reward/mean": 0.949999988079071, "rewards/format_reward/std": 0.21931999921798706, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 425.125, "completions/mean_terminated_length": 425.125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.02622151078588307, "grad_norm": 0.12785503268241882, "kl": 0.03192138671875, "learning_rate": 4.77264141744214e-06, "loss": -0.0056, "num_tokens": 7255699.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 474.32501220703125, "completions/mean_terminated_length": 454.4050598144531, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.026526412074091014, "grad_norm": 0.09049192816019058, "kl": 0.029388427734375, "learning_rate": 4.766214747512603e-06, "loss": 0.017, "num_tokens": 7338231.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 412.7250061035156, "completions/mean_terminated_length": 412.7250061035156, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.026831313362298957, "grad_norm": 0.012042547576129436, "kl": 0.031829833984375, "learning_rate": 4.759703481250331e-06, "loss": 0.0003, "num_tokens": 7421839.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 380.7124938964844, "completions/mean_terminated_length": 380.7124938964844, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.027136214650506897, "grad_norm": 0.012744505889713764, "kl": 0.031402587890625, "learning_rate": 4.753107891855015e-06, "loss": 0.0003, "num_tokens": 7496678.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 421.6499938964844, "completions/mean_terminated_length": 421.6499938964844, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "epoch": 0.02744111593871484, "grad_norm": 0.09909161925315857, "kl": 0.031219482421875, "learning_rate": 4.746428256064375e-06, "loss": 0.0088, "num_tokens": 7579184.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 992.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 365.3500061035156, "completions/mean_terminated_length": 365.3500061035156, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.027746017226922784, "grad_norm": 0.011725598014891148, "kl": 0.03399658203125, "learning_rate": 4.7396648541425534e-06, "loss": 0.0003, "num_tokens": 7658790.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 378.8125, "completions/mean_terminated_length": 378.8125, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.028050918515130727, "grad_norm": 0.01128524262458086, "kl": 0.031982421875, "learning_rate": 4.732817969868348e-06, "loss": 0.0003, "num_tokens": 7736683.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 430.6875, "completions/mean_terminated_length": 430.6875, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.02835581980333867, "grad_norm": 0.01020871289074421, "kl": 0.029205322265625, "learning_rate": 4.7258878905233095e-06, "loss": 0.0003, "num_tokens": 7820178.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 434.7749938964844, "completions/mean_terminated_length": 434.7749938964844, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.02866072109154661, "grad_norm": 0.13812255859375, "kl": 0.030548095703125, "learning_rate": 4.718874906879688e-06, "loss": 0.0087, "num_tokens": 7906574.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 392.76251220703125, "completions/mean_terminated_length": 392.76251220703125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.028965622379754554, "grad_norm": 0.05963268131017685, "kl": 0.03778076171875, "learning_rate": 4.711779313188231e-06, "loss": 0.0004, "num_tokens": 7986005.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 402.0500183105469, "completions/mean_terminated_length": 402.0500183105469, "completions/min_length": 61.0, "completions/min_terminated_length": 61.0, "epoch": 0.029270523667962497, "grad_norm": 0.011644248850643635, "kl": 0.033355712890625, "learning_rate": 4.70460140716584e-06, "loss": 0.0003, "num_tokens": 8071719.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 417.5625, "completions/mean_terminated_length": 417.5625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.02957542495617044, "grad_norm": 0.01004591304808855, "kl": 0.0301513671875, "learning_rate": 4.697341489983076e-06, "loss": 0.0003, "num_tokens": 8156948.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 436.9250183105469, "completions/mean_terminated_length": 416.5316467285156, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.02988032624437838, "grad_norm": 0.09471645951271057, "kl": 0.027435302734375, "learning_rate": 4.6899998662515215e-06, "loss": 0.0162, "num_tokens": 8240926.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1559.0, "completions/max_terminated_length": 1559.0, "completions/mean_length": 369.82501220703125, "completions/mean_terminated_length": 369.82501220703125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.030185227532586324, "grad_norm": 0.010060581378638744, "kl": 0.029296875, "learning_rate": 4.682576844011007e-06, "loss": 0.0003, "num_tokens": 8318122.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 847.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 439.9125061035156, "completions/mean_terminated_length": 439.9125061035156, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.030490128820794268, "grad_norm": 0.013250144198536873, "kl": 0.02764892578125, "learning_rate": 4.675072734716678e-06, "loss": 0.0003, "num_tokens": 8399933.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 440.5375061035156, "completions/mean_terminated_length": 440.5375061035156, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "epoch": 0.03079503010900221, "grad_norm": 0.0173508208245039, "kl": 0.024932861328125, "learning_rate": 4.667487853225931e-06, "loss": 0.0002, "num_tokens": 8486574.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1100.0, "completions/max_terminated_length": 1100.0, "completions/mean_length": 436.01251220703125, "completions/mean_terminated_length": 436.01251220703125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.031099931397210154, "grad_norm": 0.010417678393423557, "kl": 0.028106689453125, "learning_rate": 4.659822517785203e-06, "loss": 0.0003, "num_tokens": 8567453.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 445.5249938964844, "completions/mean_terminated_length": 445.5249938964844, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.0314048326854181, "grad_norm": 0.00837908685207367, "kl": 0.0263671875, "learning_rate": 4.6520770500166165e-06, "loss": 0.0003, "num_tokens": 8653671.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 405.76251220703125, "completions/mean_terminated_length": 405.76251220703125, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.03170973397362604, "grad_norm": 0.008642381988465786, "kl": 0.026397705078125, "learning_rate": 4.644251774904487e-06, "loss": 0.0003, "num_tokens": 8733950.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 447.6750183105469, "completions/mean_terminated_length": 447.6750183105469, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.032014635261833985, "grad_norm": 0.010713264346122742, "kl": 0.026611328125, "learning_rate": 4.636347020781684e-06, "loss": 0.0003, "num_tokens": 8816570.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 819.0, "completions/max_terminated_length": 819.0, "completions/mean_length": 436.88751220703125, "completions/mean_terminated_length": 436.88751220703125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.032319536550041925, "grad_norm": 0.0077039930038154125, "kl": 0.02520751953125, "learning_rate": 4.6283631193158605e-06, "loss": 0.0003, "num_tokens": 8898097.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 467.5874938964844, "completions/mean_terminated_length": 467.5874938964844, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.032624437838249865, "grad_norm": 0.01947968266904354, "kl": 0.027191162109375, "learning_rate": 4.620300405495532e-06, "loss": 0.0003, "num_tokens": 8987072.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1248.0, "completions/max_terminated_length": 1248.0, "completions/mean_length": 428.8000183105469, "completions/mean_terminated_length": 428.8000183105469, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.03292933912645781, "grad_norm": 0.009133207611739635, "kl": 0.0252685546875, "learning_rate": 4.612159217616022e-06, "loss": 0.0003, "num_tokens": 9066656.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1211.0, "completions/max_terminated_length": 1211.0, "completions/mean_length": 431.13751220703125, "completions/mean_terminated_length": 431.13751220703125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.03323424041466575, "grad_norm": 0.009221093729138374, "kl": 0.027191162109375, "learning_rate": 4.603939897265268e-06, "loss": 0.0003, "num_tokens": 9148285.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 425.76251220703125, "completions/mean_terminated_length": 425.76251220703125, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.03353914170287369, "grad_norm": 0.010868406854569912, "kl": 0.027496337890625, "learning_rate": 4.595642789309492e-06, "loss": 0.0003, "num_tokens": 9233742.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 417.0249938964844, "completions/mean_terminated_length": 417.0249938964844, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.03384404299108164, "grad_norm": 0.008578047156333923, "kl": 0.0242919921875, "learning_rate": 4.587268241878724e-06, "loss": 0.0002, "num_tokens": 9314770.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 478.7375183105469, "completions/mean_terminated_length": 478.7375183105469, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.03414894427928958, "grad_norm": 0.008643762208521366, "kl": 0.024017333984375, "learning_rate": 4.578816606352205e-06, "loss": 0.0002, "num_tokens": 9403055.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 512.1625366210938, "completions/mean_terminated_length": 492.7215270996094, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.034453845567497525, "grad_norm": 0.06351924687623978, "kl": 0.023223876953125, "learning_rate": 4.570288237343632e-06, "loss": 0.0118, "num_tokens": 9494696.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 431.6625061035156, "completions/mean_terminated_length": 431.6625061035156, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.034758746855705465, "grad_norm": 0.007523035630583763, "kl": 0.0238037109375, "learning_rate": 4.561683492686289e-06, "loss": 0.0002, "num_tokens": 9578089.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 421.6625061035156, "completions/mean_terminated_length": 421.6625061035156, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.035063648143913405, "grad_norm": 0.1079186275601387, "kl": 0.02276611328125, "learning_rate": 4.5530027334180285e-06, "loss": 0.0148, "num_tokens": 9662240.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1270.0, "completions/max_terminated_length": 1270.0, "completions/mean_length": 467.4875183105469, "completions/mean_terminated_length": 467.4875183105469, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.03536854943212135, "grad_norm": 0.12511233985424042, "kl": 0.02191162109375, "learning_rate": 4.544246323766122e-06, "loss": -0.0006, "num_tokens": 9748367.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 433.3374938964844, "completions/mean_terminated_length": 433.3374938964844, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.03567345072032929, "grad_norm": 0.12101423740386963, "kl": 0.02276611328125, "learning_rate": 4.535414631131983e-06, "loss": 0.0056, "num_tokens": 9830026.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 452.8500061035156, "completions/mean_terminated_length": 452.8500061035156, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.03597835200853724, "grad_norm": 0.00823962688446045, "kl": 0.02374267578125, "learning_rate": 4.526508026075746e-06, "loss": 0.0002, "num_tokens": 9913714.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 452.6625061035156, "completions/mean_terminated_length": 432.4683532714844, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.03628325329674518, "grad_norm": 0.11805210262537003, "kl": 0.023345947265625, "learning_rate": 4.517526882300721e-06, "loss": 0.0268, "num_tokens": 10000177.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 464.2375183105469, "completions/mean_terminated_length": 464.2375183105469, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.03658815458495312, "grad_norm": 0.0064226859249174595, "kl": 0.019989013671875, "learning_rate": 4.508471576637713e-06, "loss": 0.0002, "num_tokens": 10086616.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1389.0, "completions/max_terminated_length": 1389.0, "completions/mean_length": 479.45001220703125, "completions/mean_terminated_length": 479.45001220703125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.036893055873161065, "grad_norm": 0.006146405823528767, "kl": 0.022857666015625, "learning_rate": 4.499342489029211e-06, "loss": 0.0002, "num_tokens": 10171666.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 464.1125183105469, "completions/mean_terminated_length": 464.1125183105469, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.037197957161369005, "grad_norm": 0.007421064656227827, "kl": 0.022613525390625, "learning_rate": 4.490140002513449e-06, "loss": 0.0002, "num_tokens": 10258639.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 502.25, "completions/mean_terminated_length": 502.25, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.03750285844957695, "grad_norm": 0.006498878821730614, "kl": 0.020965576171875, "learning_rate": 4.48086450320833e-06, "loss": 0.0002, "num_tokens": 10346595.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1148.0, "completions/max_terminated_length": 1148.0, "completions/mean_length": 427.3125, "completions/mean_terminated_length": 427.3125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.03780775973778489, "grad_norm": 0.007438625209033489, "kl": 0.02313232421875, "learning_rate": 4.4715163802952266e-06, "loss": 0.0002, "num_tokens": 10430592.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 515.0250244140625, "completions/mean_terminated_length": 495.6202697753906, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.03811266102599283, "grad_norm": 0.06223122030496597, "kl": 0.019805908203125, "learning_rate": 4.462096026002655e-06, "loss": 0.0203, "num_tokens": 10520862.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 481.13751220703125, "completions/mean_terminated_length": 481.13751220703125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.03841756231420078, "grad_norm": 0.005953509360551834, "kl": 0.0211181640625, "learning_rate": 4.4526038355898144e-06, "loss": 0.0002, "num_tokens": 10610079.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 435.5874938964844, "completions/mean_terminated_length": 435.5874938964844, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.03872246360240872, "grad_norm": 0.0068345870822668076, "kl": 0.0206298828125, "learning_rate": 4.4430402073300035e-06, "loss": 0.0002, "num_tokens": 10691984.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 446.7749938964844, "completions/mean_terminated_length": 446.7749938964844, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.039027364890616666, "grad_norm": 0.006349269766360521, "kl": 0.02191162109375, "learning_rate": 4.433405542493909e-06, "loss": 0.0002, "num_tokens": 10771836.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 460.25, "completions/mean_terminated_length": 460.25, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.039332266178824606, "grad_norm": 0.006095725577324629, "kl": 0.02166748046875, "learning_rate": 4.4237002453327734e-06, "loss": 0.0002, "num_tokens": 10854488.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 464.75, "completions/mean_terminated_length": 464.75, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.039637167467032546, "grad_norm": 0.006780361291021109, "kl": 0.021820068359375, "learning_rate": 4.4139247230614245e-06, "loss": 0.0002, "num_tokens": 10942168.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 443.2749938964844, "completions/mean_terminated_length": 443.2749938964844, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.03994206875524049, "grad_norm": 0.006308922544121742, "kl": 0.020904541015625, "learning_rate": 4.404079385841201e-06, "loss": 0.0002, "num_tokens": 11027114.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 498.5874938964844, "completions/mean_terminated_length": 498.5874938964844, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.04024697004344843, "grad_norm": 0.010841727256774902, "kl": 0.021697998046875, "learning_rate": 4.394164646762734e-06, "loss": 0.0002, "num_tokens": 11117615.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 403.3000183105469, "completions/mean_terminated_length": 403.3000183105469, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.04055187133165638, "grad_norm": 0.007032663561403751, "kl": 0.023834228515625, "learning_rate": 4.384180921828618e-06, "loss": 0.0002, "num_tokens": 11191853.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 448.3125, "completions/mean_terminated_length": 448.3125, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.04085677261986432, "grad_norm": 0.006273228675127029, "kl": 0.02056884765625, "learning_rate": 4.374128629935955e-06, "loss": 0.0002, "num_tokens": 11275020.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 439.76251220703125, "completions/mean_terminated_length": 439.76251220703125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.04116167390807226, "grad_norm": 0.0052821701392531395, "kl": 0.0211181640625, "learning_rate": 4.364008192858781e-06, "loss": 0.0002, "num_tokens": 11360009.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 441.2875061035156, "completions/mean_terminated_length": 441.2875061035156, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.041466575196280206, "grad_norm": 0.00741475960239768, "kl": 0.020751953125, "learning_rate": 4.353820035230366e-06, "loss": 0.0002, "num_tokens": 11444966.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1484.0, "completions/max_terminated_length": 1484.0, "completions/mean_length": 485.1125183105469, "completions/mean_terminated_length": 485.1125183105469, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.041771476484488146, "grad_norm": 0.10163833945989609, "kl": 0.020782470703125, "learning_rate": 4.3435645845254e-06, "loss": -0.0073, "num_tokens": 11527187.0, "reward": 0.09875000268220901, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 451.38751220703125, "completions/mean_terminated_length": 451.38751220703125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.04207637777269609, "grad_norm": 0.0059946151450276375, "kl": 0.020355224609375, "learning_rate": 4.333242271042054e-06, "loss": 0.0002, "num_tokens": 11607904.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 485.8500061035156, "completions/mean_terminated_length": 485.8500061035156, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.04238127906090403, "grad_norm": 0.007027114741504192, "kl": 0.02001953125, "learning_rate": 4.32285352788393e-06, "loss": 0.0002, "num_tokens": 11692268.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1380.0, "completions/max_terminated_length": 1380.0, "completions/mean_length": 472.1000061035156, "completions/mean_terminated_length": 472.1000061035156, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.04268618034911197, "grad_norm": 0.022472839802503586, "kl": 0.022674560546875, "learning_rate": 4.312398790941882e-06, "loss": 0.0002, "num_tokens": 11779880.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 468.63751220703125, "completions/mean_terminated_length": 468.63751220703125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.04299108163731992, "grad_norm": 0.004842875991016626, "kl": 0.020355224609375, "learning_rate": 4.301878498875735e-06, "loss": 0.0002, "num_tokens": 11865031.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1296.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 521.7625122070312, "completions/mean_terminated_length": 521.7625122070312, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.04329598292552786, "grad_norm": 0.005916159600019455, "kl": 0.019195556640625, "learning_rate": 4.291293093095873e-06, "loss": 0.0002, "num_tokens": 11953252.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 452.1750183105469, "completions/mean_terminated_length": 452.1750183105469, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.043600884213735806, "grad_norm": 0.0050492798909544945, "kl": 0.02130126953125, "learning_rate": 4.280643017744723e-06, "loss": 0.0002, "num_tokens": 12031248.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1363.0, "completions/max_terminated_length": 1363.0, "completions/mean_length": 465.51251220703125, "completions/mean_terminated_length": 465.51251220703125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.043905785501943746, "grad_norm": 0.005258447024971247, "kl": 0.021026611328125, "learning_rate": 4.269928719678117e-06, "loss": 0.0002, "num_tokens": 12112583.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 457.1750183105469, "completions/mean_terminated_length": 457.1750183105469, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.044210686790151686, "grad_norm": 0.004629280883818865, "kl": 0.0196533203125, "learning_rate": 4.2591506484465426e-06, "loss": 0.0002, "num_tokens": 12192925.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1470.0, "completions/max_terminated_length": 1470.0, "completions/mean_length": 493.0375061035156, "completions/mean_terminated_length": 493.0375061035156, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.04451558807835963, "grad_norm": 0.1090981587767601, "kl": 0.019500732421875, "learning_rate": 4.248309256276283e-06, "loss": -0.0041, "num_tokens": 12284868.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 449.9375, "completions/mean_terminated_length": 449.9375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.04482048936656757, "grad_norm": 0.005555329844355583, "kl": 0.019439697265625, "learning_rate": 4.23740499805044e-06, "loss": 0.0002, "num_tokens": 12368695.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 460.8625183105469, "completions/mean_terminated_length": 460.8625183105469, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.04512539065477552, "grad_norm": 0.08225194364786148, "kl": 0.023101806640625, "learning_rate": 4.22643833128985e-06, "loss": 0.0046, "num_tokens": 12453566.0, "reward": 0.09875001013278961, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1620.0, "completions/max_terminated_length": 1620.0, "completions/mean_length": 472.3000183105469, "completions/mean_terminated_length": 472.3000183105469, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.04543029194298346, "grad_norm": 0.004510453902184963, "kl": 0.0183258056640625, "learning_rate": 4.215409716133885e-06, "loss": 0.0002, "num_tokens": 12535486.0, "reward": 0.10000000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 469.6125183105469, "completions/mean_terminated_length": 449.6329345703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.0457351932311914, "grad_norm": 0.13615085184574127, "kl": 0.019378662109375, "learning_rate": 4.204319615321151e-06, "loss": 0.0102, "num_tokens": 12618469.0, "reward": 0.09750000387430191, "reward_std": 0.0035355340223759413, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0, "rewards/unicoder_reward_fn/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 468.1750183105469, "completions/mean_terminated_length": 468.1750183105469, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.04604009451939935, "grad_norm": 0.13893385231494904, "kl": 0.01971435546875, "learning_rate": 4.193168494170065e-06, "loss": -0.0089, "num_tokens": 12700391.0, "reward": 0.125, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 450.0874938964844, "completions/mean_terminated_length": 450.0874938964844, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.04634499580760729, "grad_norm": 0.24638357758522034, "kl": 0.020843505859375, "learning_rate": 4.181956820559339e-06, "loss": 0.0055, "num_tokens": 12783810.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 482.5874938964844, "completions/mean_terminated_length": 462.77215576171875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.046649897095815226, "grad_norm": 0.19727939367294312, "kl": 0.01953125, "learning_rate": 4.170685064908342e-06, "loss": 0.0268, "num_tokens": 12870267.0, "reward": 0.16124999523162842, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1430.0, "completions/max_terminated_length": 1430.0, "completions/mean_length": 474.9125061035156, "completions/mean_terminated_length": 474.9125061035156, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.04695479838402317, "grad_norm": 0.16645734012126923, "kl": 0.02166748046875, "learning_rate": 4.159353700157365e-06, "loss": -0.006, "num_tokens": 12958860.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 426.45001220703125, "completions/mean_terminated_length": 426.45001220703125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.04725969967223111, "grad_norm": 0.11824029684066772, "kl": 0.022247314453125, "learning_rate": 4.14796320174778e-06, "loss": 0.0015, "num_tokens": 13040700.0, "reward": 0.23750002682209015, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 474.4250183105469, "completions/mean_terminated_length": 474.4250183105469, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.04756460096043906, "grad_norm": 0.20552416145801544, "kl": 0.01910400390625, "learning_rate": 4.136514047602087e-06, "loss": 0.0055, "num_tokens": 13127266.0, "reward": 0.15000000596046448, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 409.7375183105469, "completions/mean_terminated_length": 409.7375183105469, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "epoch": 0.047869502248647, "grad_norm": 0.2528684735298157, "kl": 0.02264404296875, "learning_rate": 4.1250067181038635e-06, "loss": 0.0083, "num_tokens": 13205545.0, "reward": 0.13625000417232513, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 397.0500183105469, "completions/mean_terminated_length": 397.0500183105469, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.04817440353685494, "grad_norm": 0.20501503348350525, "kl": 0.021697998046875, "learning_rate": 4.113441696077608e-06, "loss": -0.0023, "num_tokens": 13284185.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1467.0, "completions/max_terminated_length": 1467.0, "completions/mean_length": 413.4250183105469, "completions/mean_terminated_length": 413.4250183105469, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.04847930482506289, "grad_norm": 0.2203351855278015, "kl": 0.025726318359375, "learning_rate": 4.101819466768484e-06, "loss": 0.0014, "num_tokens": 13358097.0, "reward": 0.2250000238418579, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 425.38751220703125, "completions/mean_terminated_length": 425.38751220703125, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.04878420611327083, "grad_norm": 0.1432938277721405, "kl": 0.026611328125, "learning_rate": 4.0901405178219535e-06, "loss": 0.0072, "num_tokens": 13440104.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 431.8374938964844, "completions/mean_terminated_length": 431.8374938964844, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.049089107401478774, "grad_norm": 0.09334322065114975, "kl": 0.025604248046875, "learning_rate": 4.078405339263326e-06, "loss": 0.0031, "num_tokens": 13520051.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 435.3625183105469, "completions/mean_terminated_length": 435.3625183105469, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.049394008689686714, "grad_norm": 0.1539635807275772, "kl": 0.0269775390625, "learning_rate": 4.06661442347719e-06, "loss": 0.0098, "num_tokens": 13602028.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1262.0, "completions/max_terminated_length": 1262.0, "completions/mean_length": 461.9125061035156, "completions/mean_terminated_length": 461.9125061035156, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.049698909977894654, "grad_norm": 0.18769684433937073, "kl": 0.02703857421875, "learning_rate": 4.054768265186758e-06, "loss": 0.0163, "num_tokens": 13687471.0, "reward": 0.1625000238418579, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1472.0, "completions/max_terminated_length": 1472.0, "completions/mean_length": 474.4624938964844, "completions/mean_terminated_length": 474.4624938964844, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.0500038112661026, "grad_norm": 0.13729074597358704, "kl": 0.02593994140625, "learning_rate": 4.0428673614331036e-06, "loss": 0.0008, "num_tokens": 13769654.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 451.1625061035156, "completions/mean_terminated_length": 451.1625061035156, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.05030871255431054, "grad_norm": 0.15735217928886414, "kl": 0.027252197265625, "learning_rate": 4.030912211554316e-06, "loss": -0.004, "num_tokens": 13847909.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 890.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 466.01251220703125, "completions/mean_terminated_length": 466.01251220703125, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.05061361384251849, "grad_norm": 0.1063733845949173, "kl": 0.026611328125, "learning_rate": 4.018903317164539e-06, "loss": 0.0017, "num_tokens": 13932246.0, "reward": 0.13750000298023224, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1003.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 464.6125183105469, "completions/mean_terminated_length": 464.6125183105469, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05091851513072643, "grad_norm": 0.18809597194194794, "kl": 0.029754638671875, "learning_rate": 4.006841182132932e-06, "loss": -0.0045, "num_tokens": 14018471.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 481.875, "completions/mean_terminated_length": 481.875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.05122341641893437, "grad_norm": 0.1885657161474228, "kl": 0.029754638671875, "learning_rate": 3.9947263125625195e-06, "loss": -0.0012, "num_tokens": 14104497.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 417.9624938964844, "completions/mean_terminated_length": 417.9624938964844, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.051528317707142314, "grad_norm": 0.28072646260261536, "kl": 0.0343017578125, "learning_rate": 3.982559216768967e-06, "loss": 0.0045, "num_tokens": 14183420.0, "reward": 0.2500000298023224, "reward_std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.15000000596046448, "rewards/unicoder_reward_fn/std": 0.35932427644729614, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 455.07501220703125, "completions/mean_terminated_length": 455.07501220703125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.051833218995350254, "grad_norm": 0.16700702905654907, "kl": 0.04742431640625, "learning_rate": 3.970340405259245e-06, "loss": -0.0007, "num_tokens": 14268502.0, "reward": 0.13625000417232513, "reward_std": 0.05126523971557617, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 435.3374938964844, "completions/mean_terminated_length": 435.3374938964844, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.0521381202835582, "grad_norm": 0.2016594558954239, "kl": 0.03515625, "learning_rate": 3.958070390710214e-06, "loss": 0.0184, "num_tokens": 14347941.0, "reward": 0.2250000238418579, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 840.0, "completions/max_terminated_length": 840.0, "completions/mean_length": 430.5500183105469, "completions/mean_terminated_length": 430.5500183105469, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.05244302157176614, "grad_norm": 0.14722523093223572, "kl": 0.033660888671875, "learning_rate": 3.945749687947109e-06, "loss": -0.0036, "num_tokens": 14429165.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 390.01251220703125, "completions/mean_terminated_length": 390.01251220703125, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.05274792285997408, "grad_norm": 0.176780104637146, "kl": 0.04071044921875, "learning_rate": 3.933378813921942e-06, "loss": -0.0041, "num_tokens": 14507066.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1310.0, "completions/max_terminated_length": 1310.0, "completions/mean_length": 426.2124938964844, "completions/mean_terminated_length": 426.2124938964844, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.05305282414818203, "grad_norm": 0.11216724663972855, "kl": 0.03741455078125, "learning_rate": 3.920958287691811e-06, "loss": -0.003, "num_tokens": 14595147.0, "reward": 0.1875000298023224, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1236.0, "completions/max_terminated_length": 1236.0, "completions/mean_length": 421.0, "completions/mean_terminated_length": 421.0, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.05335772543638997, "grad_norm": 0.012539403513073921, "kl": 0.03759765625, "learning_rate": 3.908488630397121e-06, "loss": 0.0004, "num_tokens": 14679785.0, "reward": 0.1250000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 386.1625061035156, "completions/mean_terminated_length": 386.1625061035156, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.053662626724597914, "grad_norm": 0.28521767258644104, "kl": 0.0400390625, "learning_rate": 3.8959703652397175e-06, "loss": 0.0077, "num_tokens": 14754882.0, "reward": 0.1600000113248825, "reward_std": 0.09192388504743576, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 396.88751220703125, "completions/mean_terminated_length": 396.88751220703125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.053967528012805854, "grad_norm": 0.2568298876285553, "kl": 0.04534912109375, "learning_rate": 3.883404017460935e-06, "loss": 0.0135, "num_tokens": 14835143.0, "reward": 0.1625000238418579, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1079.0, "completions/max_terminated_length": 1079.0, "completions/mean_length": 396.6000061035156, "completions/mean_terminated_length": 396.6000061035156, "completions/min_length": 60.0, "completions/min_terminated_length": 60.0, "epoch": 0.054272429301013794, "grad_norm": 0.17389893531799316, "kl": 0.041015625, "learning_rate": 3.870790114319559e-06, "loss": -0.006, "num_tokens": 14917165.0, "reward": 0.11000000685453415, "reward_std": 0.02121320180594921, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1661.0, "completions/max_terminated_length": 1661.0, "completions/mean_length": 365.9875183105469, "completions/mean_terminated_length": 365.9875183105469, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.05457733058922174, "grad_norm": 0.27765053510665894, "kl": 0.043212890625, "learning_rate": 3.858129185069701e-06, "loss": -0.0025, "num_tokens": 14992326.0, "reward": 0.14874999225139618, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 381.3125, "completions/mean_terminated_length": 381.3125, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.05488223187742968, "grad_norm": 0.1604042500257492, "kl": 0.0423583984375, "learning_rate": 3.845421760938597e-06, "loss": -0.0024, "num_tokens": 15069501.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 403.6000061035156, "completions/mean_terminated_length": 403.6000061035156, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.05518713316563763, "grad_norm": 0.11354026198387146, "kl": 0.0467529296875, "learning_rate": 3.832668375104312e-06, "loss": 0.0026, "num_tokens": 15150189.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 911.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 389.9875183105469, "completions/mean_terminated_length": 389.9875183105469, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.05549203445384557, "grad_norm": 0.07974027842283249, "kl": 0.04486083984375, "learning_rate": 3.8198695626733725e-06, "loss": 0.0057, "num_tokens": 15228680.0, "reward": 0.16250000894069672, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1088.0, "completions/max_terminated_length": 1088.0, "completions/mean_length": 422.125, "completions/mean_terminated_length": 422.125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.05579693574205351, "grad_norm": 0.13898016512393951, "kl": 0.04345703125, "learning_rate": 3.8070258606583156e-06, "loss": 0.0012, "num_tokens": 15308694.0, "reward": 0.1612500101327896, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1294.0, "completions/max_terminated_length": 1294.0, "completions/mean_length": 412.57501220703125, "completions/mean_terminated_length": 412.57501220703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.056101837030261455, "grad_norm": 0.22532965242862701, "kl": 0.04083251953125, "learning_rate": 3.7941378079551544e-06, "loss": 0.0157, "num_tokens": 15386502.0, "reward": 0.19875001907348633, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1050.0, "completions/max_terminated_length": 1050.0, "completions/mean_length": 407.45001220703125, "completions/mean_terminated_length": 407.45001220703125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.056406738318469395, "grad_norm": 0.1868802011013031, "kl": 0.0443115234375, "learning_rate": 3.7812059453207677e-06, "loss": 0.0052, "num_tokens": 15465030.0, "reward": 0.16250000894069672, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 415.6625061035156, "completions/mean_terminated_length": 415.6625061035156, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.05671163960667734, "grad_norm": 0.1819785237312317, "kl": 0.04437255859375, "learning_rate": 3.768230815350213e-06, "loss": -0.0017, "num_tokens": 15545053.0, "reward": 0.2237500250339508, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 415.26251220703125, "completions/mean_terminated_length": 415.26251220703125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.05701654089488528, "grad_norm": 0.22398653626441956, "kl": 0.04693603515625, "learning_rate": 3.7552129624539557e-06, "loss": 0.0024, "num_tokens": 15624898.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 471.25, "completions/mean_terminated_length": 471.25, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.05732144218309322, "grad_norm": 0.15245965123176575, "kl": 0.04522705078125, "learning_rate": 3.7421529328350316e-06, "loss": 0.0105, "num_tokens": 15711106.0, "reward": 0.11000000685453415, "reward_std": 0.02121320180594921, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1312.0, "completions/mean_length": 498.1625061035156, "completions/mean_terminated_length": 478.5443115234375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.05762634347130117, "grad_norm": 0.2877877652645111, "kl": 0.0426025390625, "learning_rate": 3.7290512744661274e-06, "loss": 0.0201, "num_tokens": 15802865.0, "reward": 0.14374999701976776, "reward_std": 0.07954951375722885, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.2435886710882187, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 463.7250061035156, "completions/mean_terminated_length": 463.7250061035156, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.05793124475950911, "grad_norm": 0.220382422208786, "kl": 0.044677734375, "learning_rate": 3.715908537066589e-06, "loss": -0.0025, "num_tokens": 15888049.0, "reward": 0.19875001907348633, "reward_std": 0.10783378034830093, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 464.0874938964844, "completions/mean_terminated_length": 444.0379943847656, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.058236146047717055, "grad_norm": 0.15473699569702148, "kl": 0.0445556640625, "learning_rate": 3.7027252720793538e-06, "loss": 0.0245, "num_tokens": 15971338.0, "reward": 0.1862500160932541, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1325.0, "completions/mean_length": 499.75, "completions/mean_terminated_length": 480.15191650390625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.058541047335924995, "grad_norm": 0.2423507124185562, "kl": 0.04345703125, "learning_rate": 3.689502032647817e-06, "loss": 0.0232, "num_tokens": 16060400.0, "reward": 0.17125001549720764, "reward_std": 0.0760139748454094, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531530380249, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1072.0, "completions/max_terminated_length": 1072.0, "completions/mean_length": 456.5375061035156, "completions/mean_terminated_length": 456.5375061035156, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.058845948624132935, "grad_norm": 0.14700448513031006, "kl": 0.046142578125, "learning_rate": 3.6762393735926245e-06, "loss": -0.0042, "num_tokens": 16147755.0, "reward": 0.11125000566244125, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1706.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 496.1000061035156, "completions/mean_terminated_length": 496.1000061035156, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "epoch": 0.05915084991234088, "grad_norm": 0.109529048204422, "kl": 0.0516357421875, "learning_rate": 3.6629378513883852e-06, "loss": 0.005, "num_tokens": 16239961.0, "reward": 0.17500002682209015, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 457.7749938964844, "completions/mean_terminated_length": 457.7749938964844, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.05945575120054882, "grad_norm": 0.1422124207019806, "kl": 0.051513671875, "learning_rate": 3.6495980241403307e-06, "loss": 0.0032, "num_tokens": 16321133.0, "reward": 0.125, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 506.3000183105469, "completions/mean_terminated_length": 486.7848205566406, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.05976065248875676, "grad_norm": 0.14283646643161774, "kl": 0.051513671875, "learning_rate": 3.636220451560896e-06, "loss": 0.0142, "num_tokens": 16412955.0, "reward": 0.12375000864267349, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1063.0, "completions/max_terminated_length": 1063.0, "completions/mean_length": 490.1625061035156, "completions/mean_terminated_length": 490.1625061035156, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.06006555377696471, "grad_norm": 0.1564817726612091, "kl": 0.0517578125, "learning_rate": 3.622805694946235e-06, "loss": 0.0033, "num_tokens": 16498910.0, "reward": 0.14875002205371857, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 429.9250183105469, "completions/mean_terminated_length": 429.9250183105469, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.06037045506517265, "grad_norm": 0.1235664039850235, "kl": 0.057861328125, "learning_rate": 3.609354317152667e-06, "loss": 0.0017, "num_tokens": 16578398.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 541.0499877929688, "completions/mean_terminated_length": 541.0499877929688, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.060675356353380595, "grad_norm": 0.1469779908657074, "kl": 0.050048828125, "learning_rate": 3.595866882573063e-06, "loss": 0.0093, "num_tokens": 16673246.0, "reward": 0.12375000864267349, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 492.2250061035156, "completions/mean_terminated_length": 492.2250061035156, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.060980257641588535, "grad_norm": 0.16372045874595642, "kl": 0.052734375, "learning_rate": 3.5823439571131675e-06, "loss": 0.0072, "num_tokens": 16758306.0, "reward": 0.23750002682209015, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1513.0, "completions/max_terminated_length": 1513.0, "completions/mean_length": 539.6875, "completions/mean_terminated_length": 539.6875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.061285158929796475, "grad_norm": 0.14389963448047638, "kl": 0.0560302734375, "learning_rate": 3.5687861081678477e-06, "loss": -0.0008, "num_tokens": 16850709.0, "reward": 0.125, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 499.1875, "completions/mean_terminated_length": 499.1875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.06159006021800442, "grad_norm": 0.21020865440368652, "kl": 0.0526123046875, "learning_rate": 3.555193904597291e-06, "loss": -0.0215, "num_tokens": 16938202.0, "reward": 0.1612500101327896, "reward_std": 0.09015611559152603, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1098.0, "completions/max_terminated_length": 1098.0, "completions/mean_length": 517.1875, "completions/mean_terminated_length": 517.1875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.06189496150621236, "grad_norm": 0.19664756953716278, "kl": 0.053466796875, "learning_rate": 3.541567916703138e-06, "loss": -0.0017, "num_tokens": 17028301.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 495.375, "completions/mean_terminated_length": 495.375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.06219986279442031, "grad_norm": 0.04992162436246872, "kl": 0.0518798828125, "learning_rate": 3.5279087162045517e-06, "loss": 0.0006, "num_tokens": 17116071.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1911.0, "completions/max_terminated_length": 1911.0, "completions/mean_length": 544.8624877929688, "completions/mean_terminated_length": 544.8624877929688, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "epoch": 0.06250476408262826, "grad_norm": 0.06470425426959991, "kl": 0.05096435546875, "learning_rate": 3.5142168762142265e-06, "loss": -0.0015, "num_tokens": 17205530.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 468.8625183105469, "completions/mean_terminated_length": 468.8625183105469, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.0628096653708362, "grad_norm": 0.25689446926116943, "kl": 0.0528564453125, "learning_rate": 3.500492971214347e-06, "loss": 0.0098, "num_tokens": 17287257.0, "reward": 0.2225000113248825, "reward_std": 0.14495688676834106, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 536.5625, "completions/mean_terminated_length": 536.5625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.06311456665904414, "grad_norm": 0.1306605488061905, "kl": 0.04937744140625, "learning_rate": 3.48673757703248e-06, "loss": -0.0057, "num_tokens": 17376676.0, "reward": 0.20000003278255463, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 539.6749877929688, "completions/mean_terminated_length": 539.6749877929688, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.06341946794725208, "grad_norm": 0.1695621907711029, "kl": 0.04779052734375, "learning_rate": 3.472951270817418e-06, "loss": -0.005, "num_tokens": 17472766.0, "reward": 0.12250001728534698, "reward_std": 0.03889087215065956, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 511.9624938964844, "completions/mean_terminated_length": 511.9624938964844, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.06372436923546002, "grad_norm": 0.1479700207710266, "kl": 0.0560302734375, "learning_rate": 3.4591346310149578e-06, "loss": 0.0052, "num_tokens": 17560111.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1759.0, "completions/max_terminated_length": 1759.0, "completions/mean_length": 515.7374877929688, "completions/mean_terminated_length": 515.7374877929688, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.06402927052366797, "grad_norm": 0.10783959180116653, "kl": 0.05224609375, "learning_rate": 3.445288237343632e-06, "loss": 0.0036, "num_tokens": 17646166.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1273.0, "completions/max_terminated_length": 1273.0, "completions/mean_length": 477.3625183105469, "completions/mean_terminated_length": 477.3625183105469, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.06433417181187591, "grad_norm": 0.1348573863506317, "kl": 0.051513671875, "learning_rate": 3.4314126707703895e-06, "loss": -0.0013, "num_tokens": 17733095.0, "reward": 0.1875000149011612, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 981.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 463.5, "completions/mean_terminated_length": 463.5, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.06463907310008385, "grad_norm": 0.18264122307300568, "kl": 0.05426025390625, "learning_rate": 3.4175085134862128e-06, "loss": 0.0083, "num_tokens": 17817549.0, "reward": 0.19875001907348633, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1192.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 476.375, "completions/mean_terminated_length": 476.375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.06494397438829179, "grad_norm": 0.17124944925308228, "kl": 0.0538330078125, "learning_rate": 3.4035763488816953e-06, "loss": 0.0049, "num_tokens": 17903083.0, "reward": 0.14875002205371857, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 465.2375183105469, "completions/mean_terminated_length": 465.2375183105469, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.06524887567649973, "grad_norm": 0.1577434241771698, "kl": 0.0517578125, "learning_rate": 3.3896167615225594e-06, "loss": 0.0007, "num_tokens": 17987156.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1317.0, "completions/max_terminated_length": 1317.0, "completions/mean_length": 489.1875, "completions/mean_terminated_length": 489.1875, "completions/min_length": 10.0, "completions/min_terminated_length": 10.0, "epoch": 0.06555377696470767, "grad_norm": 0.2025170624256134, "kl": 0.05126953125, "learning_rate": 3.375630337125133e-06, "loss": 0.006, "num_tokens": 18078797.0, "reward": 0.13500002026557922, "reward_std": 0.05656854063272476, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 469.8374938964844, "completions/mean_terminated_length": 469.8374938964844, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.06585867825291562, "grad_norm": 0.12381523847579956, "kl": 0.05096435546875, "learning_rate": 3.361617662531772e-06, "loss": -0.0053, "num_tokens": 18165112.0, "reward": 0.21250002086162567, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 411.3500061035156, "completions/mean_terminated_length": 411.3500061035156, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.06616357954112356, "grad_norm": 0.16251295804977417, "kl": 0.05572509765625, "learning_rate": 3.347579325686237e-06, "loss": -0.0059, "num_tokens": 18246038.0, "reward": 0.1875000149011612, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 416.6000061035156, "completions/mean_terminated_length": 416.6000061035156, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.0664684808293315, "grad_norm": 0.22227387130260468, "kl": 0.059814453125, "learning_rate": 3.333515915609027e-06, "loss": -0.0285, "num_tokens": 18330676.0, "reward": 0.1862500160932541, "reward_std": 0.09015611559152603, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 467.13751220703125, "completions/mean_terminated_length": 447.1265869140625, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.06677338211753944, "grad_norm": 0.05367077514529228, "kl": 0.0504150390625, "learning_rate": 3.3194280223726616e-06, "loss": 0.0194, "num_tokens": 18418503.0, "reward": 0.14875000715255737, "reward_std": 0.0017677670111879706, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 448.9875183105469, "completions/mean_terminated_length": 448.9875183105469, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.06707828340574738, "grad_norm": 0.015069164335727692, "kl": 0.0482177734375, "learning_rate": 3.305316237076927e-06, "loss": 0.0005, "num_tokens": 18507032.0, "reward": 0.15000002086162567, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 475.1499938964844, "completions/mean_terminated_length": 475.1499938964844, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.06738318469395534, "grad_norm": 0.20499880611896515, "kl": 0.0531005859375, "learning_rate": 3.291181151824071e-06, "loss": 0.0038, "num_tokens": 18592450.0, "reward": 0.2250000238418579, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 434.0874938964844, "completions/mean_terminated_length": 434.0874938964844, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "epoch": 0.06768808598216328, "grad_norm": 0.1476088911294937, "kl": 0.052734375, "learning_rate": 3.27702335969396e-06, "loss": -0.0044, "num_tokens": 18676199.0, "reward": 0.21250002086162567, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1681.0, "completions/mean_length": 511.5, "completions/mean_terminated_length": 492.0506591796875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.06799298727037122, "grad_norm": 0.2085094004869461, "kl": 0.05047607421875, "learning_rate": 3.2628434547191985e-06, "loss": 0.0071, "num_tokens": 18767377.0, "reward": 0.17250001430511475, "reward_std": 0.07424621284008026, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 484.45001220703125, "completions/mean_terminated_length": 484.45001220703125, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "epoch": 0.06829788855857916, "grad_norm": 0.20141257345676422, "kl": 0.0501708984375, "learning_rate": 3.2486420318601973e-06, "loss": -0.0068, "num_tokens": 18856697.0, "reward": 0.21250000596046448, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 461.0625, "completions/mean_terminated_length": 461.0625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.0686027898467871, "grad_norm": 0.21380314230918884, "kl": 0.05499267578125, "learning_rate": 3.2344196869802187e-06, "loss": 0.0102, "num_tokens": 18940914.0, "reward": 0.2250000238418579, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1515.0, "completions/max_terminated_length": 1515.0, "completions/mean_length": 479.88751220703125, "completions/mean_terminated_length": 479.88751220703125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.06890769113499505, "grad_norm": 0.2144291251897812, "kl": 0.0491943359375, "learning_rate": 3.2201770168203694e-06, "loss": 0.0036, "num_tokens": 19026411.0, "reward": 0.27500003576278687, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.17499999701976776, "rewards/unicoder_reward_fn/std": 0.3823643922805786, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 492.5249938964844, "completions/mean_terminated_length": 492.5249938964844, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.06921259242320299, "grad_norm": 0.08882104605436325, "kl": 0.04925537109375, "learning_rate": 3.205914618974563e-06, "loss": 0.0047, "num_tokens": 19112903.0, "reward": 0.20000003278255463, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 435.6625061035156, "completions/mean_terminated_length": 435.6625061035156, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.06951749371141093, "grad_norm": 0.21108779311180115, "kl": 0.05218505859375, "learning_rate": 3.1916330918644496e-06, "loss": -0.006, "num_tokens": 19190060.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 546.9249877929688, "completions/mean_terminated_length": 546.9249877929688, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.06982239499961887, "grad_norm": 0.11964229494333267, "kl": 0.0482177734375, "learning_rate": 3.177333034714303e-06, "loss": -0.0016, "num_tokens": 19287010.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1137.0, "completions/max_terminated_length": 1137.0, "completions/mean_length": 467.1625061035156, "completions/mean_terminated_length": 467.1625061035156, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.07012729628782681, "grad_norm": 0.0965242087841034, "kl": 0.0548095703125, "learning_rate": 3.1630150475258813e-06, "loss": 0.0042, "num_tokens": 19371217.0, "reward": 0.17499999701976776, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 524.3375244140625, "completions/mean_terminated_length": 524.3375244140625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.07043219757603476, "grad_norm": 0.11118588596582413, "kl": 0.04840087890625, "learning_rate": 3.148679731053252e-06, "loss": 0.0046, "num_tokens": 19466568.0, "reward": 0.125, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 543.4249877929688, "completions/mean_terminated_length": 543.4249877929688, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.0707370988642427, "grad_norm": 0.15548087656497955, "kl": 0.0535888671875, "learning_rate": 3.1343276867775805e-06, "loss": 0.0054, "num_tokens": 19557878.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 510.375, "completions/mean_terminated_length": 510.375, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.07104200015245064, "grad_norm": 0.10689180344343185, "kl": 0.050048828125, "learning_rate": 3.1199595168819043e-06, "loss": -0.0, "num_tokens": 19643374.0, "reward": 0.20000003278255463, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 474.9375, "completions/mean_terminated_length": 474.9375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.07134690144065858, "grad_norm": 0.20179383456707, "kl": 0.05975341796875, "learning_rate": 3.105575824225852e-06, "loss": -0.019, "num_tokens": 19726871.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 525.5125122070312, "completions/mean_terminated_length": 525.5125122070312, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.07165180272886652, "grad_norm": 0.1247158870100975, "kl": 0.05096435546875, "learning_rate": 3.091177212320363e-06, "loss": 0.0034, "num_tokens": 19819440.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 502.88751220703125, "completions/mean_terminated_length": 502.88751220703125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.07195670401707448, "grad_norm": 0.10479047149419785, "kl": 0.05401611328125, "learning_rate": 3.0767642853023538e-06, "loss": 0.0005, "num_tokens": 19905347.0, "reward": 0.16250000894069672, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 520.3624877929688, "completions/mean_terminated_length": 520.3624877929688, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.07226160530528242, "grad_norm": 0.13667507469654083, "kl": 0.05584716796875, "learning_rate": 3.062337647909376e-06, "loss": 0.0007, "num_tokens": 19995426.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 538.1500244140625, "completions/mean_terminated_length": 538.1500244140625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.07256650659349036, "grad_norm": 0.15229485929012299, "kl": 0.055419921875, "learning_rate": 3.04789790545424e-06, "loss": -0.004, "num_tokens": 20086754.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1696.0, "completions/max_terminated_length": 1696.0, "completions/mean_length": 592.6625366210938, "completions/mean_terminated_length": 592.6625366210938, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.0728714078816983, "grad_norm": 0.20365579426288605, "kl": 0.05059814453125, "learning_rate": 3.033445663799621e-06, "loss": -0.0056, "num_tokens": 20185295.0, "reward": 0.13625000417232513, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1354.0, "completions/max_terminated_length": 1354.0, "completions/mean_length": 504.13751220703125, "completions/mean_terminated_length": 504.13751220703125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.07317630916990624, "grad_norm": 0.13447296619415283, "kl": 0.05511474609375, "learning_rate": 3.018981529332633e-06, "loss": 0.0061, "num_tokens": 20268892.0, "reward": 0.1875000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 489.4875183105469, "completions/mean_terminated_length": 489.4875183105469, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.07348121045811419, "grad_norm": 0.18564718961715698, "kl": 0.05279541015625, "learning_rate": 3.00450610893939e-06, "loss": 0.0078, "num_tokens": 20353243.0, "reward": 0.21250002086162567, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 544.4000244140625, "completions/mean_terminated_length": 544.4000244140625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.07378611174632213, "grad_norm": 0.18029005825519562, "kl": 0.0494384765625, "learning_rate": 2.9900200099795396e-06, "loss": 0.0083, "num_tokens": 20446257.0, "reward": 0.11124999821186066, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1963.0, "completions/max_terminated_length": 1963.0, "completions/mean_length": 501.1000061035156, "completions/mean_terminated_length": 501.1000061035156, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.07409101303453007, "grad_norm": 0.06964573264122009, "kl": 0.0528564453125, "learning_rate": 2.9755238402607826e-06, "loss": -0.0022, "num_tokens": 20535433.0, "reward": 0.1625000238418579, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 435.5, "completions/mean_terminated_length": 435.5, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.07439591432273801, "grad_norm": 0.19546860456466675, "kl": 0.057861328125, "learning_rate": 2.961018208013367e-06, "loss": 0.0005, "num_tokens": 20618669.0, "reward": 0.14875002205371857, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1013.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 512.2000122070312, "completions/mean_terminated_length": 512.2000122070312, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.07470081561094595, "grad_norm": 0.1879434734582901, "kl": 0.0555419921875, "learning_rate": 2.9465037218645694e-06, "loss": 0.0154, "num_tokens": 20708125.0, "reward": 0.20000003278255463, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 459.4624938964844, "completions/mean_terminated_length": 459.4624938964844, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.0750057168991539, "grad_norm": 0.1770198494195938, "kl": 0.0557861328125, "learning_rate": 2.9319809908131604e-06, "loss": 0.0076, "num_tokens": 20795776.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1406.0, "completions/max_terminated_length": 1406.0, "completions/mean_length": 499.3625183105469, "completions/mean_terminated_length": 499.3625183105469, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.07531061818736184, "grad_norm": 0.2515522539615631, "kl": 0.05352783203125, "learning_rate": 2.917450624203847e-06, "loss": -0.001, "num_tokens": 20885125.0, "reward": 0.20000003278255463, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 467.0874938964844, "completions/mean_terminated_length": 467.0874938964844, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.07561551947556978, "grad_norm": 0.1412237584590912, "kl": 0.05340576171875, "learning_rate": 2.9029132317017118e-06, "loss": -0.0074, "num_tokens": 20971492.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 483.2250061035156, "completions/mean_terminated_length": 483.2250061035156, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.07592042076377772, "grad_norm": 0.08514195680618286, "kl": 0.05108642578125, "learning_rate": 2.888369423266629e-06, "loss": 0.0062, "num_tokens": 21055896.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1658.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 498.8374938964844, "completions/mean_terminated_length": 498.8374938964844, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.07622532205198566, "grad_norm": 0.011307273991405964, "kl": 0.05059814453125, "learning_rate": 2.8738198091276712e-06, "loss": 0.0005, "num_tokens": 21144641.0, "reward": 0.1250000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 490.13751220703125, "completions/mean_terminated_length": 490.13751220703125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.07653022334019362, "grad_norm": 0.06738625466823578, "kl": 0.05255126953125, "learning_rate": 2.859264999757509e-06, "loss": 0.0021, "num_tokens": 21236112.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 440.5500183105469, "completions/mean_terminated_length": 440.5500183105469, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.07683512462840156, "grad_norm": 0.1394670307636261, "kl": 0.0572509765625, "learning_rate": 2.8447056058467928e-06, "loss": -0.0058, "num_tokens": 21317460.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 412.1125183105469, "completions/mean_terminated_length": 412.1125183105469, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.0771400259166095, "grad_norm": 0.18536435067653656, "kl": 0.05535888671875, "learning_rate": 2.830142238278531e-06, "loss": 0.0095, "num_tokens": 21397969.0, "reward": 0.2250000238418579, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1132.0, "completions/max_terminated_length": 1132.0, "completions/mean_length": 485.13751220703125, "completions/mean_terminated_length": 485.13751220703125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.07744492720481744, "grad_norm": 0.16279473900794983, "kl": 0.05804443359375, "learning_rate": 2.81557550810246e-06, "loss": 0.0074, "num_tokens": 21483776.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1400.0, "completions/max_terminated_length": 1400.0, "completions/mean_length": 501.2124938964844, "completions/mean_terminated_length": 501.2124938964844, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.07774982849302538, "grad_norm": 0.136715367436409, "kl": 0.05419921875, "learning_rate": 2.8010060265094026e-06, "loss": -0.0002, "num_tokens": 21573965.0, "reward": 0.1625000238418579, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 516.1500244140625, "completions/mean_terminated_length": 516.1500244140625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.07805472978123333, "grad_norm": 0.23355121910572052, "kl": 0.0560302734375, "learning_rate": 2.786434404805629e-06, "loss": -0.0051, "num_tokens": 21660273.0, "reward": 0.17375002801418304, "reward_std": 0.10783378034830093, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531530380249, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 484.8999938964844, "completions/mean_terminated_length": 484.8999938964844, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.07835963106944127, "grad_norm": 0.07816074043512344, "kl": 0.05767822265625, "learning_rate": 2.771861254387199e-06, "loss": 0.0014, "num_tokens": 21750347.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1437.0, "completions/max_terminated_length": 1437.0, "completions/mean_length": 518.9000244140625, "completions/mean_terminated_length": 518.9000244140625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.07866453235764921, "grad_norm": 0.37866440415382385, "kl": 0.08160400390625, "learning_rate": 2.7572871867143204e-06, "loss": 0.0085, "num_tokens": 21846487.0, "reward": 0.14875000715255737, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 467.82501220703125, "completions/mean_terminated_length": 467.82501220703125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.07896943364585715, "grad_norm": 0.2316463440656662, "kl": 0.058837890625, "learning_rate": 2.742712813285681e-06, "loss": -0.0169, "num_tokens": 21932599.0, "reward": 0.23750002682209015, "reward_std": 0.1237436905503273, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 454.5874938964844, "completions/mean_terminated_length": 454.5874938964844, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.07927433493406509, "grad_norm": 0.21413984894752502, "kl": 0.0577392578125, "learning_rate": 2.7281387456128017e-06, "loss": 0.0008, "num_tokens": 22020246.0, "reward": 0.18250000476837158, "reward_std": 0.02474873699247837, "rewards/format_reward/mean": 0.949999988079071, "rewards/format_reward/std": 0.21931999921798706, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1379.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 490.07501220703125, "completions/mean_terminated_length": 490.07501220703125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.07957923622227304, "grad_norm": 0.2023562639951706, "kl": 0.05303955078125, "learning_rate": 2.7135655951943716e-06, "loss": -0.0012, "num_tokens": 22106920.0, "reward": 0.14750002324581146, "reward_std": 0.03889087215065956, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 454.7875061035156, "completions/mean_terminated_length": 454.7875061035156, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.07988413751048098, "grad_norm": 0.36662039160728455, "kl": 0.0594482421875, "learning_rate": 2.698993973490598e-06, "loss": 0.0226, "num_tokens": 22187965.0, "reward": 0.13375000655651093, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.8374999761581421, "rewards/format_reward/std": 0.3712363839149475, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1335.0, "completions/max_terminated_length": 1335.0, "completions/mean_length": 514.9125366210938, "completions/mean_terminated_length": 514.9125366210938, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.08018903879868892, "grad_norm": 0.3255445659160614, "kl": 0.056640625, "learning_rate": 2.6844244918975416e-06, "loss": 0.0268, "num_tokens": 22278120.0, "reward": 0.16250000894069672, "reward_std": 0.08131727576255798, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.33280548453330994, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1663.0, "completions/mean_length": 513.7374877929688, "completions/mean_terminated_length": 494.31646728515625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.08049394008689686, "grad_norm": 0.39617183804512024, "kl": 0.05194091796875, "learning_rate": 2.66985776172147e-06, "loss": 0.0428, "num_tokens": 22365541.0, "reward": 0.15125001966953278, "reward_std": 0.10429824888706207, "rewards/format_reward/mean": 0.887499988079071, "rewards/format_reward/std": 0.3179742097854614, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 497.7250061035156, "completions/mean_terminated_length": 497.7250061035156, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.0807988413751048, "grad_norm": 0.1744794100522995, "kl": 0.0535888671875, "learning_rate": 2.6552943941532088e-06, "loss": -0.0065, "num_tokens": 22453057.0, "reward": 0.12250001728534698, "reward_std": 0.03889087215065956, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 525.3250122070312, "completions/mean_terminated_length": 525.3250122070312, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.08110374266331276, "grad_norm": 0.20356491208076477, "kl": 0.052001953125, "learning_rate": 2.6407350002424927e-06, "loss": 0.0127, "num_tokens": 22544929.0, "reward": 0.14625000953674316, "reward_std": 0.04065864160656929, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1503.0, "completions/max_terminated_length": 1503.0, "completions/mean_length": 471.7749938964844, "completions/mean_terminated_length": 471.7749938964844, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.0814086439515207, "grad_norm": 0.18078124523162842, "kl": 0.04986572265625, "learning_rate": 2.626180190872329e-06, "loss": -0.01, "num_tokens": 22630733.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1374.0, "completions/max_terminated_length": 1374.0, "completions/mean_length": 462.3125, "completions/mean_terminated_length": 462.3125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.08171354523972864, "grad_norm": 0.12597592175006866, "kl": 0.05572509765625, "learning_rate": 2.611630576733372e-06, "loss": 0.0038, "num_tokens": 22715850.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 507.1499938964844, "completions/mean_terminated_length": 507.1499938964844, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.08201844652793658, "grad_norm": 0.23196038603782654, "kl": 0.06365966796875, "learning_rate": 2.5970867682982885e-06, "loss": 0.013, "num_tokens": 22803240.0, "reward": 0.13375000655651093, "reward_std": 0.05833630636334419, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1414.0, "completions/max_terminated_length": 1414.0, "completions/mean_length": 464.3374938964844, "completions/mean_terminated_length": 464.3374938964844, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.08232334781614452, "grad_norm": 0.23718927800655365, "kl": 0.05914306640625, "learning_rate": 2.582549375796154e-06, "loss": 0.0149, "num_tokens": 22891955.0, "reward": 0.14750002324581146, "reward_std": 0.07424621284008026, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1625.0, "completions/max_terminated_length": 1625.0, "completions/mean_length": 543.7000122070312, "completions/mean_terminated_length": 543.7000122070312, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.08262824910435247, "grad_norm": 0.19418348371982574, "kl": 0.0516357421875, "learning_rate": 2.568019009186841e-06, "loss": 0.0201, "num_tokens": 22988511.0, "reward": 0.17125001549720764, "reward_std": 0.07601398229598999, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 453.5874938964844, "completions/mean_terminated_length": 453.5874938964844, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.08293315039256041, "grad_norm": 0.20042388141155243, "kl": 0.05975341796875, "learning_rate": 2.5534962781354317e-06, "loss": -0.0057, "num_tokens": 23078212.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 435.8500061035156, "completions/mean_terminated_length": 435.8500061035156, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.08323805168076835, "grad_norm": 0.12753447890281677, "kl": 0.054443359375, "learning_rate": 2.538981791986634e-06, "loss": 0.0026, "num_tokens": 23165238.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1411.0, "completions/max_terminated_length": 1411.0, "completions/mean_length": 473.75, "completions/mean_terminated_length": 473.75, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.08354295296897629, "grad_norm": 0.12009397894144058, "kl": 0.052978515625, "learning_rate": 2.524476159739218e-06, "loss": 0.005, "num_tokens": 23252430.0, "reward": 0.17500002682209015, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1256.0, "completions/max_terminated_length": 1256.0, "completions/mean_length": 451.63751220703125, "completions/mean_terminated_length": 451.63751220703125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.08384785425718423, "grad_norm": 0.1936834752559662, "kl": 0.051513671875, "learning_rate": 2.5099799900204607e-06, "loss": 0.0055, "num_tokens": 23334535.0, "reward": 0.1612500250339508, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 455.9125061035156, "completions/mean_terminated_length": 455.9125061035156, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.08415275554539219, "grad_norm": 0.164746955037117, "kl": 0.052490234375, "learning_rate": 2.4954938910606108e-06, "loss": 0.005, "num_tokens": 23418978.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1540.0, "completions/max_terminated_length": 1540.0, "completions/mean_length": 451.3999938964844, "completions/mean_terminated_length": 451.3999938964844, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.08445765683360013, "grad_norm": 0.07814321666955948, "kl": 0.05133056640625, "learning_rate": 2.481018470667368e-06, "loss": 0.0041, "num_tokens": 23501772.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1570.0, "completions/max_terminated_length": 1570.0, "completions/mean_length": 468.1125183105469, "completions/mean_terminated_length": 468.1125183105469, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.08476255812180807, "grad_norm": 0.11970165371894836, "kl": 0.0550537109375, "learning_rate": 2.4665543362003802e-06, "loss": 0.0026, "num_tokens": 23588901.0, "reward": 0.19875001907348633, "reward_std": 0.03712311014533043, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 471.25, "completions/mean_terminated_length": 471.25, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.085067459410016, "grad_norm": 0.18868288397789001, "kl": 0.05419921875, "learning_rate": 2.4521020945457615e-06, "loss": 0.0044, "num_tokens": 23678053.0, "reward": 0.16250000894069672, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1797.0, "completions/max_terminated_length": 1797.0, "completions/mean_length": 431.6875, "completions/mean_terminated_length": 431.6875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.08537236069822395, "grad_norm": 0.17599807679653168, "kl": 0.0562744140625, "learning_rate": 2.4376623520906255e-06, "loss": -0.0002, "num_tokens": 23761322.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 949.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 419.5874938964844, "completions/mean_terminated_length": 419.5874938964844, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.0856772619864319, "grad_norm": 0.015127049759030342, "kl": 0.05780029296875, "learning_rate": 2.4232357146976478e-06, "loss": 0.0006, "num_tokens": 23842793.0, "reward": 0.1250000149011612, "reward_std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 405.07501220703125, "completions/mean_terminated_length": 405.07501220703125, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.08598216327463984, "grad_norm": 0.15912564098834991, "kl": 0.05450439453125, "learning_rate": 2.408822787679637e-06, "loss": 0.0003, "num_tokens": 23921019.0, "reward": 0.17375002801418304, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 465.9250183105469, "completions/mean_terminated_length": 465.9250183105469, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.08628706456284778, "grad_norm": 0.18430808186531067, "kl": 0.05889892578125, "learning_rate": 2.3944241757741475e-06, "loss": -0.0003, "num_tokens": 24009529.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 461.6750183105469, "completions/mean_terminated_length": 461.6750183105469, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.08659196585105572, "grad_norm": 0.10521125048398972, "kl": 0.054931640625, "learning_rate": 2.380040483118097e-06, "loss": 0.0058, "num_tokens": 24098463.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 431.9624938964844, "completions/mean_terminated_length": 431.9624938964844, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.08689686713926366, "grad_norm": 0.18808409571647644, "kl": 0.0546875, "learning_rate": 2.365672313222419e-06, "loss": 0.0099, "num_tokens": 24181320.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1415.0, "completions/max_terminated_length": 1415.0, "completions/mean_length": 448.2875061035156, "completions/mean_terminated_length": 448.2875061035156, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.08720176842747161, "grad_norm": 0.1849883794784546, "kl": 0.05560302734375, "learning_rate": 2.351320268946749e-06, "loss": -0.0067, "num_tokens": 24268271.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1215.0, "completions/max_terminated_length": 1215.0, "completions/mean_length": 485.70001220703125, "completions/mean_terminated_length": 485.70001220703125, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.08750666971567955, "grad_norm": 0.32291901111602783, "kl": 0.1077880859375, "learning_rate": 2.336984952474119e-06, "loss": 0.0046, "num_tokens": 24359331.0, "reward": 0.12375000864267349, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 439.2875061035156, "completions/mean_terminated_length": 439.2875061035156, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.08781157100388749, "grad_norm": 0.27845945954322815, "kl": 0.058837890625, "learning_rate": 2.322666965285697e-06, "loss": -0.0068, "num_tokens": 24440342.0, "reward": 0.21250002086162567, "reward_std": 0.12374367564916611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 394.5375061035156, "completions/mean_terminated_length": 394.5375061035156, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.08811647229209543, "grad_norm": 0.22477968037128448, "kl": 0.05712890625, "learning_rate": 2.3083669081355507e-06, "loss": 0.0006, "num_tokens": 24518207.0, "reward": 0.14875002205371857, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 493.63751220703125, "completions/mean_terminated_length": 493.63751220703125, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.08842137358030337, "grad_norm": 0.209952712059021, "kl": 0.05535888671875, "learning_rate": 2.2940853810254377e-06, "loss": -0.008, "num_tokens": 24604480.0, "reward": 0.1862500160932541, "reward_std": 0.09015611559152603, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 449.51251220703125, "completions/mean_terminated_length": 449.51251220703125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.08872627486851133, "grad_norm": 0.21413753926753998, "kl": 0.051513671875, "learning_rate": 2.2798229831796313e-06, "loss": 0.0089, "num_tokens": 24687085.0, "reward": 0.12000000476837158, "reward_std": 0.0070710680447518826, "rewards/format_reward/mean": 0.949999988079071, "rewards/format_reward/std": 0.21931999921798706, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1057.0, "completions/max_terminated_length": 1057.0, "completions/mean_length": 448.9375, "completions/mean_terminated_length": 448.9375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.08903117615671927, "grad_norm": 0.2990638315677643, "kl": 0.04931640625, "learning_rate": 2.2655803130197816e-06, "loss": -0.0008, "num_tokens": 24772748.0, "reward": 0.11500000953674316, "reward_std": 0.04949747398495674, "rewards/format_reward/mean": 0.8999999761581421, "rewards/format_reward/std": 0.3018927276134491, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 444.3625183105469, "completions/mean_terminated_length": 444.3625183105469, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.0893360774449272, "grad_norm": 0.30735084414482117, "kl": 0.051025390625, "learning_rate": 2.2513579681398034e-06, "loss": -0.01, "num_tokens": 24861053.0, "reward": 0.15375001728534698, "reward_std": 0.10076271742582321, "rewards/format_reward/mean": 0.9125000238418579, "rewards/format_reward/std": 0.28434911370277405, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1176.0, "completions/max_terminated_length": 1176.0, "completions/mean_length": 424.51251220703125, "completions/mean_terminated_length": 424.51251220703125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.08964097873313515, "grad_norm": 0.28689104318618774, "kl": 0.052001953125, "learning_rate": 2.237156545280803e-06, "loss": -0.0063, "num_tokens": 24942834.0, "reward": 0.1300000101327896, "reward_std": 0.06010407209396362, "rewards/format_reward/mean": 0.925000011920929, "rewards/format_reward/std": 0.2650531232357025, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1527.0, "completions/max_terminated_length": 1527.0, "completions/mean_length": 504.5375061035156, "completions/mean_terminated_length": 504.5375061035156, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.08994588002134309, "grad_norm": 0.25867027044296265, "kl": 0.05072021484375, "learning_rate": 2.2229766403060403e-06, "loss": -0.0047, "num_tokens": 25030547.0, "reward": 0.16500000655651093, "reward_std": 0.11313708871603012, "rewards/format_reward/mean": 0.8999999761581421, "rewards/format_reward/std": 0.3018927574157715, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1292.0, "completions/max_terminated_length": 1292.0, "completions/mean_length": 453.5249938964844, "completions/mean_terminated_length": 453.5249938964844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.09025078130955104, "grad_norm": 0.30247601866722107, "kl": 0.0528564453125, "learning_rate": 2.2088188481759305e-06, "loss": -0.0018, "num_tokens": 25109955.0, "reward": 0.17750000953674316, "reward_std": 0.06717514246702194, "rewards/format_reward/mean": 0.8999999761581421, "rewards/format_reward/std": 0.3018927276134491, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1423.0, "completions/max_terminated_length": 1423.0, "completions/mean_length": 478.95001220703125, "completions/mean_terminated_length": 478.95001220703125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.09055568259775898, "grad_norm": 0.20232892036437988, "kl": 0.0496826171875, "learning_rate": 2.194683762923073e-06, "loss": 0.0001, "num_tokens": 25195167.0, "reward": 0.12000000476837158, "reward_std": 0.04242640733718872, "rewards/format_reward/mean": 0.949999988079071, "rewards/format_reward/std": 0.21931999921798706, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 459.76251220703125, "completions/mean_terminated_length": 459.76251220703125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.09086058388596692, "grad_norm": 0.28715941309928894, "kl": 0.05517578125, "learning_rate": 2.1805719776273387e-06, "loss": -0.0145, "num_tokens": 25272854.0, "reward": 0.1925000250339508, "reward_std": 0.08131728321313858, "rewards/format_reward/mean": 0.925000011920929, "rewards/format_reward/std": 0.2650531232357025, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 474.9250183105469, "completions/mean_terminated_length": 474.9250183105469, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.09116548517417486, "grad_norm": 0.19802211225032806, "kl": 0.05157470703125, "learning_rate": 2.166484084390974e-06, "loss": 0.0061, "num_tokens": 25365288.0, "reward": 0.1862500160932541, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 489.63751220703125, "completions/mean_terminated_length": 489.63751220703125, "completions/min_length": 197.0, "completions/min_terminated_length": 197.0, "epoch": 0.0914703864623828, "grad_norm": 0.2699923813343048, "kl": 0.0726318359375, "learning_rate": 2.1524206743137636e-06, "loss": -0.0068, "num_tokens": 25452087.0, "reward": 0.18500001728534698, "reward_std": 0.12727922201156616, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 514.1500244140625, "completions/mean_terminated_length": 514.1500244140625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.09177528775059074, "grad_norm": 0.1984640210866928, "kl": 0.05224609375, "learning_rate": 2.1383823374682287e-06, "loss": 0.0169, "num_tokens": 25546801.0, "reward": 0.1612500101327896, "reward_std": 0.09015611559152603, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 491.25, "completions/mean_terminated_length": 491.25, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.0920801890387987, "grad_norm": 0.18095120787620544, "kl": 0.05096435546875, "learning_rate": 2.124369662874868e-06, "loss": 0.0081, "num_tokens": 25636455.0, "reward": 0.12375000864267349, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 464.95001220703125, "completions/mean_terminated_length": 464.95001220703125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.09238509032700663, "grad_norm": 0.1827562004327774, "kl": 0.05792236328125, "learning_rate": 2.110383238477441e-06, "loss": 0.0124, "num_tokens": 25718507.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 474.45001220703125, "completions/mean_terminated_length": 474.45001220703125, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.09268999161521457, "grad_norm": 0.2177320420742035, "kl": 0.05657958984375, "learning_rate": 2.096423651118305e-06, "loss": -0.0, "num_tokens": 25806627.0, "reward": 0.1862500160932541, "reward_std": 0.09015611559152603, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 498.0375061035156, "completions/mean_terminated_length": 498.0375061035156, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.09299489290342251, "grad_norm": 0.1999090313911438, "kl": 0.05413818359375, "learning_rate": 2.082491486513788e-06, "loss": 0.011, "num_tokens": 25894094.0, "reward": 0.17375002801418304, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1739.0, "completions/max_terminated_length": 1739.0, "completions/mean_length": 488.5625, "completions/mean_terminated_length": 488.5625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.09329979419163045, "grad_norm": 0.18753774464130402, "kl": 0.05328369140625, "learning_rate": 2.0685873292296116e-06, "loss": -0.0051, "num_tokens": 25985229.0, "reward": 0.19875001907348633, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 454.63751220703125, "completions/mean_terminated_length": 454.63751220703125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.0936046954798384, "grad_norm": 0.17221041023731232, "kl": 0.052001953125, "learning_rate": 2.054711762656369e-06, "loss": -0.0021, "num_tokens": 26069054.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1486.0, "completions/max_terminated_length": 1486.0, "completions/mean_length": 523.4874877929688, "completions/mean_terminated_length": 523.4874877929688, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.09390959676804635, "grad_norm": 0.0688505545258522, "kl": 0.0504150390625, "learning_rate": 2.040865368985044e-06, "loss": -0.0007, "num_tokens": 26164117.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1228.0, "completions/max_terminated_length": 1228.0, "completions/mean_length": 457.9624938964844, "completions/mean_terminated_length": 457.9624938964844, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.09421449805625429, "grad_norm": 0.09899485111236572, "kl": 0.05426025390625, "learning_rate": 2.027048729182583e-06, "loss": 0.0, "num_tokens": 26247982.0, "reward": 0.13750000298023224, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 460.8500061035156, "completions/mean_terminated_length": 460.8500061035156, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.09451939934446223, "grad_norm": 0.15719051659107208, "kl": 0.0552978515625, "learning_rate": 2.0132624229675205e-06, "loss": 0.0058, "num_tokens": 26335480.0, "reward": 0.1625000238418579, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1182.0, "completions/max_terminated_length": 1182.0, "completions/mean_length": 408.1750183105469, "completions/mean_terminated_length": 408.1750183105469, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.09482430063267017, "grad_norm": 0.16415221989154816, "kl": 0.05621337890625, "learning_rate": 1.9995070287856546e-06, "loss": -0.0017, "num_tokens": 26414044.0, "reward": 0.1875000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1355.0, "completions/max_terminated_length": 1355.0, "completions/mean_length": 459.1000061035156, "completions/mean_terminated_length": 459.1000061035156, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.09512920192087812, "grad_norm": 0.1567731350660324, "kl": 0.05615234375, "learning_rate": 1.985783123785774e-06, "loss": -0.0001, "num_tokens": 26499808.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 490.32501220703125, "completions/mean_terminated_length": 490.32501220703125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.09543410320908606, "grad_norm": 0.16579455137252808, "kl": 0.0517578125, "learning_rate": 1.9720912837954486e-06, "loss": -0.0028, "num_tokens": 26584350.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 443.6125183105469, "completions/mean_terminated_length": 443.6125183105469, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.095739004497294, "grad_norm": 0.1825886368751526, "kl": 0.0528564453125, "learning_rate": 1.958432083296862e-06, "loss": 0.0058, "num_tokens": 26668297.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 489.1875, "completions/mean_terminated_length": 489.1875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.09604390578550194, "grad_norm": 0.22271665930747986, "kl": 0.05328369140625, "learning_rate": 1.9448060954027093e-06, "loss": -0.0087, "num_tokens": 26753640.0, "reward": 0.2250000238418579, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 975.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 444.26251220703125, "completions/mean_terminated_length": 444.26251220703125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.09634880707370988, "grad_norm": 0.18735575675964355, "kl": 0.052734375, "learning_rate": 1.931213891832153e-06, "loss": -0.0087, "num_tokens": 26833649.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1114.0, "completions/max_terminated_length": 1114.0, "completions/mean_length": 444.4624938964844, "completions/mean_terminated_length": 444.4624938964844, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.09665370836191783, "grad_norm": 0.2518155574798584, "kl": 0.05218505859375, "learning_rate": 1.9176560428868336e-06, "loss": 0.023, "num_tokens": 26913246.0, "reward": 0.30000004172325134, "reward_std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.20000000298023224, "rewards/unicoder_reward_fn/std": 0.4025236964225769, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 397.3500061035156, "completions/mean_terminated_length": 397.3500061035156, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.09695860965012577, "grad_norm": 0.16875123977661133, "kl": 0.0560302734375, "learning_rate": 1.9041331174269373e-06, "loss": 0.0031, "num_tokens": 26990552.0, "reward": 0.1875000149011612, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 463.1499938964844, "completions/mean_terminated_length": 463.1499938964844, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.09726351093833371, "grad_norm": 0.17430393397808075, "kl": 0.05401611328125, "learning_rate": 1.8906456828473341e-06, "loss": 0.0036, "num_tokens": 27076338.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1058.0, "completions/max_terminated_length": 1058.0, "completions/mean_length": 436.9125061035156, "completions/mean_terminated_length": 436.9125061035156, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.09756841222654165, "grad_norm": 0.1987638920545578, "kl": 0.0550537109375, "learning_rate": 1.8771943050537656e-06, "loss": 0.0036, "num_tokens": 27159067.0, "reward": 0.20000003278255463, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1049.0, "completions/max_terminated_length": 1049.0, "completions/mean_length": 444.95001220703125, "completions/mean_terminated_length": 444.95001220703125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.0978733135147496, "grad_norm": 0.14804542064666748, "kl": 0.05242919921875, "learning_rate": 1.8637795484391046e-06, "loss": 0.0037, "num_tokens": 27241497.0, "reward": 0.21250002086162567, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2032.0, "completions/max_terminated_length": 2032.0, "completions/mean_length": 520.5250244140625, "completions/mean_terminated_length": 520.5250244140625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.09817821480295755, "grad_norm": 0.18733462691307068, "kl": 0.05059814453125, "learning_rate": 1.8504019758596698e-06, "loss": 0.0081, "num_tokens": 27333439.0, "reward": 0.20000003278255463, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1542.0, "completions/max_terminated_length": 1542.0, "completions/mean_length": 422.7124938964844, "completions/mean_terminated_length": 422.7124938964844, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.09848311609116549, "grad_norm": 0.19084835052490234, "kl": 0.05145263671875, "learning_rate": 1.8370621486116163e-06, "loss": 0.0045, "num_tokens": 27412614.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 480.5375061035156, "completions/mean_terminated_length": 480.5375061035156, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.09878801737937343, "grad_norm": 0.1860656887292862, "kl": 0.04840087890625, "learning_rate": 1.823760626407377e-06, "loss": 0.0028, "num_tokens": 27498657.0, "reward": 0.1862500160932541, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 472.2749938964844, "completions/mean_terminated_length": 472.2749938964844, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.09909291866758137, "grad_norm": 0.09327766299247742, "kl": 0.053955078125, "learning_rate": 1.8104979673521838e-06, "loss": -0.002, "num_tokens": 27583791.0, "reward": 0.17500002682209015, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 432.0, "completions/mean_terminated_length": 432.0, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.09939781995578931, "grad_norm": 0.2940045893192291, "kl": 0.05206298828125, "learning_rate": 1.7972747279206482e-06, "loss": -0.0095, "num_tokens": 27664373.0, "reward": 0.2500000298023224, "reward_std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.15000000596046448, "rewards/unicoder_reward_fn/std": 0.35932427644729614, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 431.9875183105469, "completions/mean_terminated_length": 431.9875183105469, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.09970272124399726, "grad_norm": 0.21221143007278442, "kl": 0.05511474609375, "learning_rate": 1.7840914629334122e-06, "loss": -0.0028, "num_tokens": 27747314.0, "reward": 0.20000003278255463, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 424.32501220703125, "completions/mean_terminated_length": 424.32501220703125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1000076225322052, "grad_norm": 0.1102011427283287, "kl": 0.05322265625, "learning_rate": 1.7709487255338731e-06, "loss": -0.0044, "num_tokens": 27829286.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 447.625, "completions/mean_terminated_length": 447.625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.10031252382041314, "grad_norm": 0.21028603613376617, "kl": 0.05322265625, "learning_rate": 1.7578470671649684e-06, "loss": 0.0188, "num_tokens": 27912630.0, "reward": 0.2237500250339508, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1119.0, "completions/max_terminated_length": 1119.0, "completions/mean_length": 422.70001220703125, "completions/mean_terminated_length": 422.70001220703125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.10061742510862108, "grad_norm": 0.09216304868459702, "kl": 0.05438232421875, "learning_rate": 1.744787037546045e-06, "loss": -0.0007, "num_tokens": 27997754.0, "reward": 0.2500000298023224, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.15000000596046448, "rewards/unicoder_reward_fn/std": 0.35932427644729614, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 414.6499938964844, "completions/mean_terminated_length": 414.6499938964844, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.10092232639682902, "grad_norm": 0.12062133103609085, "kl": 0.04913330078125, "learning_rate": 1.731769184649788e-06, "loss": -0.0032, "num_tokens": 28076546.0, "reward": 0.1875000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1300.0, "completions/mean_length": 455.8999938964844, "completions/mean_terminated_length": 435.7468566894531, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.10122722768503697, "grad_norm": 0.25286737084388733, "kl": 0.05377197265625, "learning_rate": 1.7187940546792325e-06, "loss": 0.0184, "num_tokens": 28159116.0, "reward": 0.23375001549720764, "reward_std": 0.09369164705276489, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 470.07501220703125, "completions/mean_terminated_length": 470.07501220703125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.10153212897324491, "grad_norm": 0.2345660924911499, "kl": 0.048095703125, "learning_rate": 1.7058621920448465e-06, "loss": 0.0209, "num_tokens": 28244778.0, "reward": 0.23625002801418304, "reward_std": 0.12551145255565643, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 441.3374938964844, "completions/mean_terminated_length": 441.3374938964844, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.10183703026145285, "grad_norm": 0.07905353605747223, "kl": 0.053466796875, "learning_rate": 1.6929741393416855e-06, "loss": 0.0036, "num_tokens": 28330591.0, "reward": 0.13750000298023224, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1783.0, "completions/max_terminated_length": 1783.0, "completions/mean_length": 437.6750183105469, "completions/mean_terminated_length": 437.6750183105469, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1021419315496608, "grad_norm": 0.19001714885234833, "kl": 0.04754638671875, "learning_rate": 1.6801304373266286e-06, "loss": -0.0027, "num_tokens": 28411101.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1038.0, "completions/max_terminated_length": 1038.0, "completions/mean_length": 408.375, "completions/mean_terminated_length": 408.375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.10244683283786873, "grad_norm": 0.1954682320356369, "kl": 0.052490234375, "learning_rate": 1.667331624895689e-06, "loss": 0.0003, "num_tokens": 28490713.0, "reward": 0.17500001192092896, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 436.26251220703125, "completions/mean_terminated_length": 436.26251220703125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.10275173412607669, "grad_norm": 0.16154824197292328, "kl": 0.053466796875, "learning_rate": 1.6545782390614037e-06, "loss": 0.001, "num_tokens": 28573186.0, "reward": 0.12375000864267349, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 400.07501220703125, "completions/mean_terminated_length": 400.07501220703125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.10305663541428463, "grad_norm": 0.11140483617782593, "kl": 0.0548095703125, "learning_rate": 1.6418708149302992e-06, "loss": 0.0025, "num_tokens": 28650560.0, "reward": 0.16250000894069672, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 389.75, "completions/mean_terminated_length": 389.75, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.10336153670249257, "grad_norm": 0.19081075489521027, "kl": 0.0506591796875, "learning_rate": 1.6292098856804423e-06, "loss": 0.0001, "num_tokens": 28726146.0, "reward": 0.1875000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 435.8625183105469, "completions/mean_terminated_length": 435.8625183105469, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.10366643799070051, "grad_norm": 0.16997206211090088, "kl": 0.04998779296875, "learning_rate": 1.6165959825390661e-06, "loss": 0.006, "num_tokens": 28809467.0, "reward": 0.16250000894069672, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 440.0, "completions/mean_terminated_length": 440.0, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.10397133927890845, "grad_norm": 0.10942406207323074, "kl": 0.0523681640625, "learning_rate": 1.604029634760284e-06, "loss": 0.0081, "num_tokens": 28894931.0, "reward": 0.1875000298023224, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 903.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 396.8500061035156, "completions/mean_terminated_length": 396.8500061035156, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1042762405671164, "grad_norm": 0.16135147213935852, "kl": 0.04913330078125, "learning_rate": 1.59151136960288e-06, "loss": 0.0017, "num_tokens": 28969419.0, "reward": 0.17500001192092896, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 408.3625183105469, "completions/mean_terminated_length": 408.3625183105469, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.10458114185532434, "grad_norm": 0.1693839132785797, "kl": 0.053955078125, "learning_rate": 1.5790417123081903e-06, "loss": -0.0086, "num_tokens": 29045756.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 410.7250061035156, "completions/mean_terminated_length": 410.7250061035156, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.10488604314353228, "grad_norm": 0.16823109984397888, "kl": 0.0499267578125, "learning_rate": 1.5666211860780583e-06, "loss": 0.0005, "num_tokens": 29125906.0, "reward": 0.1862500160932541, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 385.1000061035156, "completions/mean_terminated_length": 385.1000061035156, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.10519094443174022, "grad_norm": 0.13253702223300934, "kl": 0.05218505859375, "learning_rate": 1.5542503120528918e-06, "loss": -0.0031, "num_tokens": 29206558.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 393.1750183105469, "completions/mean_terminated_length": 393.1750183105469, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.10549584571994816, "grad_norm": 0.16460463404655457, "kl": 0.0513916015625, "learning_rate": 1.5419296092897866e-06, "loss": 0.0061, "num_tokens": 29285000.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 383.76251220703125, "completions/mean_terminated_length": 383.76251220703125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.10580074700815612, "grad_norm": 0.23074668645858765, "kl": 0.05499267578125, "learning_rate": 1.529659594740755e-06, "loss": -0.0055, "num_tokens": 29362279.0, "reward": 0.23625002801418304, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 982.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 395.2124938964844, "completions/mean_terminated_length": 395.2124938964844, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "epoch": 0.10610564829636406, "grad_norm": 0.13016793131828308, "kl": 0.0482177734375, "learning_rate": 1.5174407832310338e-06, "loss": 0.0076, "num_tokens": 29443256.0, "reward": 0.17500002682209015, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1482.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 392.5625, "completions/mean_terminated_length": 392.5625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.106410549584572, "grad_norm": 0.21937747299671173, "kl": 0.05255126953125, "learning_rate": 1.5052736874374815e-06, "loss": 0.0085, "num_tokens": 29518973.0, "reward": 0.17375001311302185, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 444.9624938964844, "completions/mean_terminated_length": 444.9624938964844, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.10671545087277994, "grad_norm": 0.15810927748680115, "kl": 0.04937744140625, "learning_rate": 1.4931588178670695e-06, "loss": -0.0024, "num_tokens": 29604052.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 406.4375, "completions/mean_terminated_length": 406.4375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.10702035216098787, "grad_norm": 0.11895695328712463, "kl": 0.0513916015625, "learning_rate": 1.4810966828354605e-06, "loss": 0.0007, "num_tokens": 29683913.0, "reward": 0.20000003278255463, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 422.38751220703125, "completions/mean_terminated_length": 422.38751220703125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.10732525344919583, "grad_norm": 0.2081456184387207, "kl": 0.04931640625, "learning_rate": 1.469087788445684e-06, "loss": 0.0127, "num_tokens": 29764884.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 426.8999938964844, "completions/mean_terminated_length": 426.8999938964844, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.10763015473740377, "grad_norm": 0.1928683966398239, "kl": 0.04541015625, "learning_rate": 1.4571326385668965e-06, "loss": 0.0038, "num_tokens": 29844874.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 452.3500061035156, "completions/mean_terminated_length": 432.15191650390625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.10793505602561171, "grad_norm": 0.26351362466812134, "kl": 0.05120849609375, "learning_rate": 1.4452317348132434e-06, "loss": 0.0117, "num_tokens": 29929006.0, "reward": 0.2237500250339508, "reward_std": 0.10783378034830093, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 418.01251220703125, "completions/mean_terminated_length": 418.01251220703125, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.10823995731381965, "grad_norm": 0.14276431500911713, "kl": 0.05224609375, "learning_rate": 1.4333855765228104e-06, "loss": -0.001, "num_tokens": 30014969.0, "reward": 0.1625000238418579, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 434.125, "completions/mean_terminated_length": 434.125, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.10854485860202759, "grad_norm": 0.1563921421766281, "kl": 0.04644775390625, "learning_rate": 1.421594660736675e-06, "loss": -0.0034, "num_tokens": 30097137.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.025000000000000022, "completions/max_length": 2048.0, "completions/max_terminated_length": 1260.0, "completions/mean_length": 481.4125061035156, "completions/mean_terminated_length": 441.24359130859375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.10884975989023554, "grad_norm": 0.17907285690307617, "kl": 0.04522705078125, "learning_rate": 1.4098594821780476e-06, "loss": 0.0443, "num_tokens": 30185818.0, "reward": 0.12250001728534698, "reward_std": 0.03889087215065956, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1816.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 467.26251220703125, "completions/mean_terminated_length": 467.26251220703125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.10915466117844348, "grad_norm": 0.15856198966503143, "kl": 0.04522705078125, "learning_rate": 1.3981805332315174e-06, "loss": -0.0018, "num_tokens": 30272527.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 451.0625, "completions/mean_terminated_length": 451.0625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.10945956246665142, "grad_norm": 0.23278465867042542, "kl": 0.04681396484375, "learning_rate": 1.3865583039223929e-06, "loss": 0.0174, "num_tokens": 30353838.0, "reward": 0.1875000298023224, "reward_std": 0.12374367564916611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 900.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 409.57501220703125, "completions/mean_terminated_length": 409.57501220703125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.10976446375485936, "grad_norm": 0.16648580133914948, "kl": 0.04815673828125, "learning_rate": 1.374993281896137e-06, "loss": 0.0039, "num_tokens": 30434910.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531530380249, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 399.6875, "completions/mean_terminated_length": 399.6875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.1100693650430673, "grad_norm": 0.23581717908382416, "kl": 0.05224609375, "learning_rate": 1.3634859523979134e-06, "loss": 0.0181, "num_tokens": 30514995.0, "reward": 0.1600000113248825, "reward_std": 0.05656854063272476, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 392.2749938964844, "completions/mean_terminated_length": 392.2749938964844, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.11037426633127526, "grad_norm": 0.17342065274715424, "kl": 0.04815673828125, "learning_rate": 1.3520367982522208e-06, "loss": 0.01, "num_tokens": 30592935.0, "reward": 0.14875002205371857, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 385.63751220703125, "completions/mean_terminated_length": 385.63751220703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.1106791676194832, "grad_norm": 0.17184416949748993, "kl": 0.05023193359375, "learning_rate": 1.3406462998426358e-06, "loss": 0.0057, "num_tokens": 30670002.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1234.0, "completions/max_terminated_length": 1234.0, "completions/mean_length": 424.88751220703125, "completions/mean_terminated_length": 424.88751220703125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.11098406890769114, "grad_norm": 0.2137637883424759, "kl": 0.04620361328125, "learning_rate": 1.3293149350916595e-06, "loss": 0.0155, "num_tokens": 30751821.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1108.0, "completions/max_terminated_length": 1108.0, "completions/mean_length": 383.45001220703125, "completions/mean_terminated_length": 383.45001220703125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.11128897019589908, "grad_norm": 0.2274777889251709, "kl": 0.05029296875, "learning_rate": 1.3180431794406623e-06, "loss": 0.0004, "num_tokens": 30827591.0, "reward": 0.2250000238418579, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1117.0, "completions/max_terminated_length": 1117.0, "completions/mean_length": 414.125, "completions/mean_terminated_length": 414.125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.11159387148410702, "grad_norm": 0.22330601513385773, "kl": 0.04931640625, "learning_rate": 1.3068315058299358e-06, "loss": 0.0006, "num_tokens": 30911769.0, "reward": 0.1625000238418579, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 381.7749938964844, "completions/mean_terminated_length": 381.7749938964844, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.11189877277231497, "grad_norm": 0.2143508344888687, "kl": 0.05047607421875, "learning_rate": 1.2956803846788503e-06, "loss": 0.0126, "num_tokens": 30991433.0, "reward": 0.1625000238418579, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1107.0, "completions/max_terminated_length": 1107.0, "completions/mean_length": 405.5375061035156, "completions/mean_terminated_length": 405.5375061035156, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.11220367406052291, "grad_norm": 0.1381106972694397, "kl": 0.04779052734375, "learning_rate": 1.284590283866116e-06, "loss": 0.0077, "num_tokens": 31074002.0, "reward": 0.16250000894069672, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 417.6000061035156, "completions/mean_terminated_length": 417.6000061035156, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.11250857534873085, "grad_norm": 0.1931677609682083, "kl": 0.04852294921875, "learning_rate": 1.2735616687101518e-06, "loss": 0.0044, "num_tokens": 31156898.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1339.0, "completions/max_terminated_length": 1339.0, "completions/mean_length": 402.76251220703125, "completions/mean_terminated_length": 402.76251220703125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.11281347663693879, "grad_norm": 0.2690817713737488, "kl": 0.0499267578125, "learning_rate": 1.2625950019495614e-06, "loss": 0.0277, "num_tokens": 31240057.0, "reward": 0.19875001907348633, "reward_std": 0.10783378034830093, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 389.38751220703125, "completions/mean_terminated_length": 389.38751220703125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.11311837792514673, "grad_norm": 0.1295740157365799, "kl": 0.0501708984375, "learning_rate": 1.251690743723718e-06, "loss": 0.0025, "num_tokens": 31319472.0, "reward": 0.1875000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 759.0, "completions/max_terminated_length": 759.0, "completions/mean_length": 368.63751220703125, "completions/mean_terminated_length": 368.63751220703125, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.11342327921335468, "grad_norm": 0.14982041716575623, "kl": 0.049560546875, "learning_rate": 1.2408493515534581e-06, "loss": 0.0029, "num_tokens": 31394757.0, "reward": 0.1862500160932541, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1478.0, "completions/max_terminated_length": 1478.0, "completions/mean_length": 414.5375061035156, "completions/mean_terminated_length": 414.5375061035156, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.11372818050156262, "grad_norm": 0.14369921386241913, "kl": 0.04833984375, "learning_rate": 1.2300712803218834e-06, "loss": 0.0156, "num_tokens": 31476508.0, "reward": 0.20000003278255463, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 377.6125183105469, "completions/mean_terminated_length": 377.6125183105469, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.11403308178977056, "grad_norm": 0.15167175233364105, "kl": 0.05120849609375, "learning_rate": 1.2193569822552772e-06, "loss": -0.0041, "num_tokens": 31554491.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 390.1625061035156, "completions/mean_terminated_length": 390.1625061035156, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.1143379830779785, "grad_norm": 0.1379869431257248, "kl": 0.04913330078125, "learning_rate": 1.2087069069041268e-06, "loss": -0.011, "num_tokens": 31634458.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 927.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 346.1125183105469, "completions/mean_terminated_length": 346.1125183105469, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.11464288436618644, "grad_norm": 0.19083495438098907, "kl": 0.05242919921875, "learning_rate": 1.1981215011242654e-06, "loss": 0.0079, "num_tokens": 31708299.0, "reward": 0.17500001192092896, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 353.0, "completions/mean_terminated_length": 353.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.1149477856543944, "grad_norm": 0.31387728452682495, "kl": 0.05279541015625, "learning_rate": 1.1876012090581184e-06, "loss": 0.0001, "num_tokens": 31779549.0, "reward": 0.21250002086162567, "reward_std": 0.12374367564916611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1206.0, "completions/max_terminated_length": 1206.0, "completions/mean_length": 377.7375183105469, "completions/mean_terminated_length": 377.7375183105469, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.11525268694260234, "grad_norm": 0.23074573278427124, "kl": 0.0509033203125, "learning_rate": 1.177146472116071e-06, "loss": -0.0032, "num_tokens": 31858142.0, "reward": 0.14875000715255737, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 945.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 377.0, "completions/mean_terminated_length": 377.0, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.11555758823081028, "grad_norm": 0.08496691286563873, "kl": 0.04949951171875, "learning_rate": 1.1667577289579462e-06, "loss": 0.0013, "num_tokens": 31940052.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 431.20001220703125, "completions/mean_terminated_length": 431.20001220703125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.11586248951901822, "grad_norm": 0.18607525527477264, "kl": 0.0474853515625, "learning_rate": 1.1564354154746007e-06, "loss": 0.0094, "num_tokens": 32024476.0, "reward": 0.1875000149011612, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 343.3999938964844, "completions/mean_terminated_length": 343.3999938964844, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.11616739080722616, "grad_norm": 0.16431212425231934, "kl": 0.05230712890625, "learning_rate": 1.146179964769635e-06, "loss": -0.0015, "num_tokens": 32104930.0, "reward": 0.13625000417232513, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 432.13751220703125, "completions/mean_terminated_length": 432.13751220703125, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.11647229209543411, "grad_norm": 0.16867312788963318, "kl": 0.0478515625, "learning_rate": 1.1359918071412195e-06, "loss": -0.003, "num_tokens": 32193079.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 416.9375, "completions/mean_terminated_length": 416.9375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.11677719338364205, "grad_norm": 0.2334057241678238, "kl": 0.05169677734375, "learning_rate": 1.1258713700640456e-06, "loss": 0.003, "num_tokens": 32277986.0, "reward": 0.15000000596046448, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1081.0, "completions/max_terminated_length": 1081.0, "completions/mean_length": 364.625, "completions/mean_terminated_length": 364.625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.11708209467184999, "grad_norm": 0.3081050217151642, "kl": 0.05218505859375, "learning_rate": 1.115819078171383e-06, "loss": -0.01, "num_tokens": 32352954.0, "reward": 0.19875001907348633, "reward_std": 0.14318911731243134, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 454.5500183105469, "completions/mean_terminated_length": 454.5500183105469, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.11738699596005793, "grad_norm": 0.21313853561878204, "kl": 0.04852294921875, "learning_rate": 1.1058353532372667e-06, "loss": 0.0039, "num_tokens": 32439852.0, "reward": 0.1862500160932541, "reward_std": 0.09015612304210663, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 414.13751220703125, "completions/mean_terminated_length": 414.13751220703125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.11769189724826587, "grad_norm": 0.17836220562458038, "kl": 0.0465087890625, "learning_rate": 1.0959206141587998e-06, "loss": 0.0003, "num_tokens": 32516357.0, "reward": 0.15000000596046448, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1639.0, "completions/max_terminated_length": 1639.0, "completions/mean_length": 415.2124938964844, "completions/mean_terminated_length": 415.2124938964844, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.11799679853647381, "grad_norm": 0.1351797878742218, "kl": 0.04815673828125, "learning_rate": 1.0860752769385766e-06, "loss": 0.0018, "num_tokens": 32595424.0, "reward": 0.17500002682209015, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531530380249, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 421.2749938964844, "completions/mean_terminated_length": 400.6835632324219, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.11830169982468176, "grad_norm": 0.11575304716825485, "kl": 0.047119140625, "learning_rate": 1.0762997546672279e-06, "loss": 0.0158, "num_tokens": 32679826.0, "reward": 0.11125000566244125, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1026.0, "completions/max_terminated_length": 1026.0, "completions/mean_length": 440.7124938964844, "completions/mean_terminated_length": 440.7124938964844, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.1186066011128897, "grad_norm": 0.12233486771583557, "kl": 0.0482177734375, "learning_rate": 1.0665944575060914e-06, "loss": 0.0091, "num_tokens": 32761793.0, "reward": 0.13750001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 938.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 404.7375183105469, "completions/mean_terminated_length": 404.7375183105469, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.11891150240109764, "grad_norm": 0.2841652035713196, "kl": 0.0499267578125, "learning_rate": 1.056959792669997e-06, "loss": 0.0192, "num_tokens": 32843012.0, "reward": 0.21250002086162567, "reward_std": 0.1237436905503273, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 399.38751220703125, "completions/mean_terminated_length": 399.38751220703125, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.11921640368930558, "grad_norm": 0.2231462299823761, "kl": 0.047607421875, "learning_rate": 1.0473961644101856e-06, "loss": 0.0048, "num_tokens": 32925553.0, "reward": 0.27500003576278687, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.17499999701976776, "rewards/unicoder_reward_fn/std": 0.3823643922805786, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 863.0, "completions/max_terminated_length": 863.0, "completions/mean_length": 418.5249938964844, "completions/mean_terminated_length": 418.5249938964844, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.11952130497751352, "grad_norm": 0.12139161676168442, "kl": 0.048095703125, "learning_rate": 1.037903973997345e-06, "loss": 0.0032, "num_tokens": 33008785.0, "reward": 0.17375002801418304, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 434.7749938964844, "completions/mean_terminated_length": 434.7749938964844, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.11982620626572148, "grad_norm": 0.1002897098660469, "kl": 0.0506591796875, "learning_rate": 1.0284836197047737e-06, "loss": -0.0004, "num_tokens": 33091849.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 367.88751220703125, "completions/mean_terminated_length": 367.88751220703125, "completions/min_length": 62.0, "completions/min_terminated_length": 62.0, "epoch": 0.12013110755392942, "grad_norm": 0.2262028157711029, "kl": 0.05035400390625, "learning_rate": 1.0191354967916712e-06, "loss": 0.0012, "num_tokens": 33165528.0, "reward": 0.23750002682209015, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 413.01251220703125, "completions/mean_terminated_length": 413.01251220703125, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.12043600884213736, "grad_norm": 0.3028506636619568, "kl": 0.05047607421875, "learning_rate": 1.0098599974865515e-06, "loss": 0.0182, "num_tokens": 33247377.0, "reward": 0.29750001430511475, "reward_std": 0.14495688676834106, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.20000000298023224, "rewards/unicoder_reward_fn/std": 0.4025236964225769, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 427.3999938964844, "completions/mean_terminated_length": 427.3999938964844, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.1207409101303453, "grad_norm": 0.16660848259925842, "kl": 0.04803466796875, "learning_rate": 1.0006575109707898e-06, "loss": -0.0099, "num_tokens": 33329183.0, "reward": 0.20000003278255463, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 375.51251220703125, "completions/mean_terminated_length": 375.51251220703125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.12104581141855324, "grad_norm": 0.25240859389305115, "kl": 0.05181884765625, "learning_rate": 9.915284233622877e-07, "loss": -0.0053, "num_tokens": 33405410.0, "reward": 0.17500002682209015, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 376.38751220703125, "completions/mean_terminated_length": 376.38751220703125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.12135071270676119, "grad_norm": 0.21786899864673615, "kl": 0.05023193359375, "learning_rate": 9.824731176992796e-07, "loss": -0.0137, "num_tokens": 33478769.0, "reward": 0.17500002682209015, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 442.9250183105469, "completions/mean_terminated_length": 442.9250183105469, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.12165561399496913, "grad_norm": 0.21049322187900543, "kl": 0.0450439453125, "learning_rate": 9.734919739242543e-07, "loss": 0.014, "num_tokens": 33566537.0, "reward": 0.17500002682209015, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 431.51251220703125, "completions/mean_terminated_length": 431.51251220703125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.12196051528317707, "grad_norm": 0.23430253565311432, "kl": 0.04595947265625, "learning_rate": 9.645853688680177e-07, "loss": 0.0128, "num_tokens": 33649636.0, "reward": 0.1875000149011612, "reward_std": 0.12374367564916611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 990.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 411.4750061035156, "completions/mean_terminated_length": 411.4750061035156, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.12226541657138501, "grad_norm": 0.21432843804359436, "kl": 0.04547119140625, "learning_rate": 9.557536762338786e-07, "loss": 0.0005, "num_tokens": 33727294.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1485.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 446.7124938964844, "completions/mean_terminated_length": 446.7124938964844, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.12257031785959295, "grad_norm": 0.1339891105890274, "kl": 0.044921875, "learning_rate": 9.46997266581973e-07, "loss": -0.0003, "num_tokens": 33810863.0, "reward": 0.20000003278255463, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 431.7250061035156, "completions/mean_terminated_length": 431.7250061035156, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.1228752191478009, "grad_norm": 0.2357596755027771, "kl": 0.04827880859375, "learning_rate": 9.383165073137115e-07, "loss": 0.0063, "num_tokens": 33893307.0, "reward": 0.2500000298023224, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.15000000596046448, "rewards/unicoder_reward_fn/std": 0.35932427644729614, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1068.0, "completions/max_terminated_length": 1068.0, "completions/mean_length": 452.8374938964844, "completions/mean_terminated_length": 452.8374938964844, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.12318012043600884, "grad_norm": 0.19236986339092255, "kl": 0.04608154296875, "learning_rate": 9.297117626563687e-07, "loss": 0.0161, "num_tokens": 33979250.0, "reward": 0.20000003278255463, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 438.32501220703125, "completions/mean_terminated_length": 438.32501220703125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.12348502172421678, "grad_norm": 0.20204347372055054, "kl": 0.04681396484375, "learning_rate": 9.211833936477957e-07, "loss": 0.0012, "num_tokens": 34062038.0, "reward": 0.1875000149011612, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 434.0625, "completions/mean_terminated_length": 413.6329345703125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.12378992301242472, "grad_norm": 0.14883990585803986, "kl": 0.04541015625, "learning_rate": 9.127317581212753e-07, "loss": 0.0263, "num_tokens": 34147497.0, "reward": 0.17375002801418304, "reward_std": 0.07247845083475113, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 437.5375061035156, "completions/mean_terminated_length": 437.5375061035156, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.12409482430063266, "grad_norm": 0.17637896537780762, "kl": 0.04620361328125, "learning_rate": 9.043572106905084e-07, "loss": 0.0249, "num_tokens": 34230302.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 437.8500061035156, "completions/mean_terminated_length": 437.8500061035156, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.12439972558884062, "grad_norm": 0.18132470548152924, "kl": 0.04522705078125, "learning_rate": 8.960601027347321e-07, "loss": 0.0439, "num_tokens": 34312412.0, "reward": 0.19875001907348633, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927574157715, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1449.0, "completions/max_terminated_length": 1449.0, "completions/mean_length": 454.0375061035156, "completions/mean_terminated_length": 454.0375061035156, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.12470462687704856, "grad_norm": 0.1201486811041832, "kl": 0.045166015625, "learning_rate": 8.878407823839788e-07, "loss": -0.0122, "num_tokens": 34397757.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 857.0, "completions/max_terminated_length": 857.0, "completions/mean_length": 423.7749938964844, "completions/mean_terminated_length": 423.7749938964844, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1250095281652565, "grad_norm": 0.17683278024196625, "kl": 0.05145263671875, "learning_rate": 8.796995945044689e-07, "loss": -0.0036, "num_tokens": 34478837.0, "reward": 0.22500000894069672, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 416.9125061035156, "completions/mean_terminated_length": 416.9125061035156, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.12531442945346444, "grad_norm": 0.22345435619354248, "kl": 0.0445556640625, "learning_rate": 8.716368806841405e-07, "loss": 0.0144, "num_tokens": 34557896.0, "reward": 0.1875000149011612, "reward_std": 0.12374367564916611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 397.9375, "completions/mean_terminated_length": 397.9375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.1256193307416724, "grad_norm": 0.16894420981407166, "kl": 0.0504150390625, "learning_rate": 8.636529792183171e-07, "loss": -0.0049, "num_tokens": 34637915.0, "reward": 0.15000000596046448, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 399.6875, "completions/mean_terminated_length": 399.6875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.12592423202988032, "grad_norm": 0.1841939240694046, "kl": 0.052490234375, "learning_rate": 8.557482250955144e-07, "loss": 0.0012, "num_tokens": 34720942.0, "reward": 0.17500001192092896, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 966.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 393.1499938964844, "completions/mean_terminated_length": 393.1499938964844, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.12622913331808827, "grad_norm": 0.22368940711021423, "kl": 0.04644775390625, "learning_rate": 8.479229499833844e-07, "loss": -0.0111, "num_tokens": 34798866.0, "reward": 0.27500003576278687, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.17499999701976776, "rewards/unicoder_reward_fn/std": 0.3823643922805786, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1222.0, "completions/max_terminated_length": 1222.0, "completions/mean_length": 440.0, "completions/mean_terminated_length": 440.0, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.12653403460629623, "grad_norm": 0.19953802227973938, "kl": 0.044677734375, "learning_rate": 8.401774822147976e-07, "loss": -0.0027, "num_tokens": 34883180.0, "reward": 0.16250000894069672, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1347.0, "completions/max_terminated_length": 1347.0, "completions/mean_length": 436.6750183105469, "completions/mean_terminated_length": 436.6750183105469, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.12683893589450415, "grad_norm": 0.14660529792308807, "kl": 0.045166015625, "learning_rate": 8.325121467740695e-07, "loss": 0.0062, "num_tokens": 34971670.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 415.125, "completions/mean_terminated_length": 415.125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1271438371827121, "grad_norm": 3.1824533939361572, "kl": 0.06732177734375, "learning_rate": 8.249272652833226e-07, "loss": 0.0124, "num_tokens": 35050342.0, "reward": 0.20000000298023224, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 956.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 420.4875183105469, "completions/mean_terminated_length": 420.4875183105469, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.12744873847092003, "grad_norm": 0.16419346630573273, "kl": 0.0458984375, "learning_rate": 8.174231559889931e-07, "loss": 0.0151, "num_tokens": 35132545.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 390.51251220703125, "completions/mean_terminated_length": 390.51251220703125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.12775363975912798, "grad_norm": 0.18130381405353546, "kl": 0.04888916015625, "learning_rate": 8.100001337484787e-07, "loss": 0.0067, "num_tokens": 35214504.0, "reward": 0.23750002682209015, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1199.0, "completions/max_terminated_length": 1199.0, "completions/mean_length": 409.6499938964844, "completions/mean_terminated_length": 409.6499938964844, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.12805854104733594, "grad_norm": 0.16933445632457733, "kl": 0.0457763671875, "learning_rate": 8.026585100169251e-07, "loss": 0.0005, "num_tokens": 35296306.0, "reward": 0.14875000715255737, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 425.6625061035156, "completions/mean_terminated_length": 425.6625061035156, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.12836344233554386, "grad_norm": 0.27452513575553894, "kl": 0.05029296875, "learning_rate": 7.953985928341601e-07, "loss": 0.0212, "num_tokens": 35380561.0, "reward": 0.20000003278255463, "reward_std": 0.1414213627576828, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1043.0, "completions/max_terminated_length": 1043.0, "completions/mean_length": 399.8374938964844, "completions/mean_terminated_length": 399.8374938964844, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.12866834362375182, "grad_norm": 0.19873785972595215, "kl": 0.046142578125, "learning_rate": 7.882206868117693e-07, "loss": -0.0027, "num_tokens": 35459580.0, "reward": 0.26250001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.16249999403953552, "rewards/unicoder_reward_fn/std": 0.3712363839149475, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 397.26251220703125, "completions/mean_terminated_length": 397.26251220703125, "completions/min_length": 72.0, "completions/min_terminated_length": 72.0, "epoch": 0.12897324491195974, "grad_norm": 0.14467059075832367, "kl": 0.04571533203125, "learning_rate": 7.81125093120313e-07, "loss": 0.0103, "num_tokens": 35538789.0, "reward": 0.17500002682209015, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 390.25, "completions/mean_terminated_length": 390.25, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.1292781462001677, "grad_norm": 0.1401596963405609, "kl": 0.046630859375, "learning_rate": 7.741121094766916e-07, "loss": 0.0055, "num_tokens": 35620247.0, "reward": 0.21250002086162567, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1279.0, "completions/max_terminated_length": 1279.0, "completions/mean_length": 372.9750061035156, "completions/mean_terminated_length": 372.9750061035156, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.12958304748837565, "grad_norm": 0.16129828989505768, "kl": 0.04754638671875, "learning_rate": 7.671820301316532e-07, "loss": 0.0035, "num_tokens": 35694351.0, "reward": 0.1625000238418579, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 930.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 384.51251220703125, "completions/mean_terminated_length": 384.51251220703125, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.12988794877658358, "grad_norm": 0.22551488876342773, "kl": 0.0595703125, "learning_rate": 7.603351458574474e-07, "loss": -0.0164, "num_tokens": 35772664.0, "reward": 0.17375002801418304, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1142.0, "completions/max_terminated_length": 1142.0, "completions/mean_length": 409.26251220703125, "completions/mean_terminated_length": 409.26251220703125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.13019285006479153, "grad_norm": 0.2181655764579773, "kl": 0.04736328125, "learning_rate": 7.535717439356255e-07, "loss": 0.0084, "num_tokens": 35853321.0, "reward": 0.17500002682209015, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531530380249, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 406.8999938964844, "completions/mean_terminated_length": 406.8999938964844, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.13049775135299946, "grad_norm": 0.09705975651741028, "kl": 0.0478515625, "learning_rate": 7.46892108144986e-07, "loss": 0.0028, "num_tokens": 35937717.0, "reward": 0.1250000149011612, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1916.0, "completions/max_terminated_length": 1916.0, "completions/mean_length": 414.5500183105469, "completions/mean_terminated_length": 414.5500183105469, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.1308026526412074, "grad_norm": 0.14803965389728546, "kl": 0.04669189453125, "learning_rate": 7.402965187496697e-07, "loss": 0.0065, "num_tokens": 36022223.0, "reward": 0.11124999821186066, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 396.9250183105469, "completions/mean_terminated_length": 396.9250183105469, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.13110755392941534, "grad_norm": 0.15297779440879822, "kl": 0.04736328125, "learning_rate": 7.337852524873974e-07, "loss": 0.0036, "num_tokens": 36103699.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1753.0, "completions/max_terminated_length": 1753.0, "completions/mean_length": 428.375, "completions/mean_terminated_length": 428.375, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.1314124552176233, "grad_norm": 0.16848964989185333, "kl": 0.05059814453125, "learning_rate": 7.273585825578608e-07, "loss": 0.0129, "num_tokens": 36186971.0, "reward": 0.17375002801418304, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531530380249, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1241.0, "completions/max_terminated_length": 1241.0, "completions/mean_length": 424.7250061035156, "completions/mean_terminated_length": 424.7250061035156, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.13171735650583125, "grad_norm": 0.26175057888031006, "kl": 0.04840087890625, "learning_rate": 7.21016778611259e-07, "loss": 0.0095, "num_tokens": 36270993.0, "reward": 0.19875001907348633, "reward_std": 0.10783378034830093, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 359.32501220703125, "completions/mean_terminated_length": 359.32501220703125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.13202225779403917, "grad_norm": 0.1780603528022766, "kl": 0.047607421875, "learning_rate": 7.147601067369835e-07, "loss": 0.0054, "num_tokens": 36347327.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 399.6750183105469, "completions/mean_terminated_length": 399.6750183105469, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.13232715908224713, "grad_norm": 0.09010346233844757, "kl": 0.04693603515625, "learning_rate": 7.085888294524561e-07, "loss": -0.0012, "num_tokens": 36426937.0, "reward": 0.11250000447034836, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 381.9750061035156, "completions/mean_terminated_length": 381.9750061035156, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.13263206037045505, "grad_norm": 0.24115842580795288, "kl": 0.04803466796875, "learning_rate": 7.025032056921117e-07, "loss": 0.0098, "num_tokens": 36506775.0, "reward": 0.19875001907348633, "reward_std": 0.10783378034830093, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 868.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 406.75, "completions/mean_terminated_length": 406.75, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.132936961658663, "grad_norm": 0.15974393486976624, "kl": 0.048583984375, "learning_rate": 6.965034907965349e-07, "loss": 0.0085, "num_tokens": 36588207.0, "reward": 0.1625000238418579, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 830.0, "completions/max_terminated_length": 830.0, "completions/mean_length": 341.8500061035156, "completions/mean_terminated_length": 341.8500061035156, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.13324186294687096, "grad_norm": 0.17505139112472534, "kl": 0.04583740234375, "learning_rate": 6.905899365017462e-07, "loss": 0.0065, "num_tokens": 36660923.0, "reward": 0.13750000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 965.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 390.1750183105469, "completions/mean_terminated_length": 390.1750183105469, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.13354676423507889, "grad_norm": 0.13170388340950012, "kl": 0.04925537109375, "learning_rate": 6.847627909286409e-07, "loss": 0.0008, "num_tokens": 36740415.0, "reward": 0.13625000417232513, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 837.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 403.45001220703125, "completions/mean_terminated_length": 403.45001220703125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.13385166552328684, "grad_norm": 0.19051723182201385, "kl": 0.05145263671875, "learning_rate": 6.790222985725761e-07, "loss": 0.0026, "num_tokens": 36819045.0, "reward": 0.14875002205371857, "reward_std": 0.07247844338417053, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 381.1750183105469, "completions/mean_terminated_length": 381.1750183105469, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.13415656681149477, "grad_norm": 0.16633227467536926, "kl": 0.048095703125, "learning_rate": 6.733687002931141e-07, "loss": 0.0078, "num_tokens": 36899337.0, "reward": 0.21250002086162567, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 383.6625061035156, "completions/mean_terminated_length": 383.6625061035156, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.13446146809970272, "grad_norm": 0.18865156173706055, "kl": 0.04852294921875, "learning_rate": 6.678022333039158e-07, "loss": 0.0073, "num_tokens": 36977848.0, "reward": 0.2237500250339508, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 358.82501220703125, "completions/mean_terminated_length": 358.82501220703125, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.13476636938791067, "grad_norm": 0.19032283127307892, "kl": 0.04852294921875, "learning_rate": 6.623231311627876e-07, "loss": 0.0042, "num_tokens": 37050838.0, "reward": 0.26250001788139343, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.16249999403953552, "rewards/unicoder_reward_fn/std": 0.3712363839149475, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1254.0, "completions/max_terminated_length": 1254.0, "completions/mean_length": 400.8625183105469, "completions/mean_terminated_length": 400.8625183105469, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1350712706761186, "grad_norm": 0.11788418143987656, "kl": 0.04779052734375, "learning_rate": 6.569316237618811e-07, "loss": 0.0015, "num_tokens": 37131997.0, "reward": 0.125, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 392.4750061035156, "completions/mean_terminated_length": 392.4750061035156, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.13537617196432655, "grad_norm": 0.14593246579170227, "kl": 0.05035400390625, "learning_rate": 6.516279373180499e-07, "loss": 0.0039, "num_tokens": 37214855.0, "reward": 0.19875001907348633, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 411.125, "completions/mean_terminated_length": 411.125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.13568107325253448, "grad_norm": 0.1687837392091751, "kl": 0.04461669921875, "learning_rate": 6.464122943633543e-07, "loss": 0.01, "num_tokens": 37298811.0, "reward": 0.1875000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 389.82501220703125, "completions/mean_terminated_length": 389.82501220703125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.13598597454074243, "grad_norm": 0.14800910651683807, "kl": 0.0494384765625, "learning_rate": 6.412849137357271e-07, "loss": -0.0077, "num_tokens": 37378691.0, "reward": 0.23750002682209015, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1122.0, "completions/max_terminated_length": 1122.0, "completions/mean_length": 434.5249938964844, "completions/mean_terminated_length": 434.5249938964844, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.1362908758289504, "grad_norm": 0.06896167248487473, "kl": 0.0452880859375, "learning_rate": 6.3624601056979e-07, "loss": 0.0132, "num_tokens": 37462417.0, "reward": 0.16250000894069672, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 976.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 378.4875183105469, "completions/mean_terminated_length": 378.4875183105469, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1365957771171583, "grad_norm": 0.23473790287971497, "kl": 0.046630859375, "learning_rate": 6.312957962878278e-07, "loss": 0.0231, "num_tokens": 37542628.0, "reward": 0.17500002682209015, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 838.0, "completions/max_terminated_length": 838.0, "completions/mean_length": 349.0874938964844, "completions/mean_terminated_length": 349.0874938964844, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.13690067840536627, "grad_norm": 0.2421552687883377, "kl": 0.0560302734375, "learning_rate": 6.264344785909181e-07, "loss": 0.0092, "num_tokens": 37618047.0, "reward": 0.17250001430511475, "reward_std": 0.07424621284008026, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 985.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 398.7375183105469, "completions/mean_terminated_length": 398.7375183105469, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.1372055796935742, "grad_norm": 0.18554161489009857, "kl": 0.04620361328125, "learning_rate": 6.216622614502149e-07, "loss": 0.0162, "num_tokens": 37697614.0, "reward": 0.2250000238418579, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 371.125, "completions/mean_terminated_length": 371.125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.13751048098178215, "grad_norm": 0.19813232123851776, "kl": 0.04840087890625, "learning_rate": 6.169793450983916e-07, "loss": 0.0075, "num_tokens": 37771176.0, "reward": 0.1875000298023224, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 356.9750061035156, "completions/mean_terminated_length": 356.9750061035156, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.1378153822699901, "grad_norm": 0.21376754343509674, "kl": 0.0504150390625, "learning_rate": 6.123859260212393e-07, "loss": 0.013, "num_tokens": 37853318.0, "reward": 0.14875000715255737, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 779.0, "completions/max_terminated_length": 779.0, "completions/mean_length": 317.6125183105469, "completions/mean_terminated_length": 317.6125183105469, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.13812028355819803, "grad_norm": 0.21950644254684448, "kl": 0.0565185546875, "learning_rate": 6.07882196949423e-07, "loss": 0.0005, "num_tokens": 37925903.0, "reward": 0.27250000834465027, "reward_std": 0.07424621284008026, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.17499999701976776, "rewards/unicoder_reward_fn/std": 0.3823643922805786, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 366.5625, "completions/mean_terminated_length": 366.5625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.13842518484640598, "grad_norm": 0.15327180922031403, "kl": 0.044921875, "learning_rate": 6.034683468503948e-07, "loss": -0.002, "num_tokens": 38006736.0, "reward": 0.17500001192092896, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 375.3374938964844, "completions/mean_terminated_length": 375.3374938964844, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.1387300861346139, "grad_norm": 0.12953659892082214, "kl": 0.04559326171875, "learning_rate": 5.991445609204641e-07, "loss": -0.0036, "num_tokens": 38087977.0, "reward": 0.17500002682209015, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 892.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 365.9125061035156, "completions/mean_terminated_length": 365.9125061035156, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.13903498742282186, "grad_norm": 0.1928936243057251, "kl": 0.04638671875, "learning_rate": 5.949110205770292e-07, "loss": 0.0041, "num_tokens": 38164564.0, "reward": 0.21250002086162567, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1813.0, "completions/max_terminated_length": 1813.0, "completions/mean_length": 368.6125183105469, "completions/mean_terminated_length": 368.6125183105469, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1393398887110298, "grad_norm": 0.248763307929039, "kl": 0.04815673828125, "learning_rate": 5.90767903450964e-07, "loss": 0.0063, "num_tokens": 38244613.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 391.5249938964844, "completions/mean_terminated_length": 391.5249938964844, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.13964478999923774, "grad_norm": 0.08757586777210236, "kl": 0.04510498046875, "learning_rate": 5.867153833791652e-07, "loss": -0.0026, "num_tokens": 38325939.0, "reward": 0.13500002026557922, "reward_std": 0.01767767034471035, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 886.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 328.1750183105469, "completions/mean_terminated_length": 328.1750183105469, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.1399496912874457, "grad_norm": 0.25197768211364746, "kl": 0.05291748046875, "learning_rate": 5.827536303972587e-07, "loss": 0.0076, "num_tokens": 38399427.0, "reward": 0.2237500250339508, "reward_std": 0.10783378034830093, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 348.25, "completions/mean_terminated_length": 348.25, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.14025459257565362, "grad_norm": 0.1697019338607788, "kl": 0.0504150390625, "learning_rate": 5.78882810732465e-07, "loss": -0.0017, "num_tokens": 38476585.0, "reward": 0.17250001430511475, "reward_std": 0.03889087215065956, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1145.0, "completions/max_terminated_length": 1145.0, "completions/mean_length": 345.5, "completions/mean_terminated_length": 345.5, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.14055949386386157, "grad_norm": 0.18673694133758545, "kl": 0.0472412109375, "learning_rate": 5.75103086796625e-07, "loss": 0.0019, "num_tokens": 38552001.0, "reward": 0.1862500160932541, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 801.0, "completions/max_terminated_length": 801.0, "completions/mean_length": 334.76251220703125, "completions/mean_terminated_length": 334.76251220703125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.14086439515206953, "grad_norm": 0.13160358369350433, "kl": 0.05072021484375, "learning_rate": 5.714146171793846e-07, "loss": -0.005, "num_tokens": 38623824.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1443.0, "completions/max_terminated_length": 1443.0, "completions/mean_length": 379.9624938964844, "completions/mean_terminated_length": 379.9624938964844, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.14116929644027745, "grad_norm": 0.19666478037834167, "kl": 0.046630859375, "learning_rate": 5.678175566415422e-07, "loss": 0.0027, "num_tokens": 38704121.0, "reward": 0.1875000149011612, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 919.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 355.4750061035156, "completions/mean_terminated_length": 355.4750061035156, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.1414741977284854, "grad_norm": 0.14412783086299896, "kl": 0.04425048828125, "learning_rate": 5.643120561085528e-07, "loss": 0.003, "num_tokens": 38779325.0, "reward": 0.1875000149011612, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 358.8625183105469, "completions/mean_terminated_length": 358.8625183105469, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.14177909901669333, "grad_norm": 0.21223917603492737, "kl": 0.0516357421875, "learning_rate": 5.608982626641991e-07, "loss": -0.0045, "num_tokens": 38855108.0, "reward": 0.10875000059604645, "reward_std": 0.02298097126185894, "rewards/format_reward/mean": 0.9624999761581421, "rewards/format_reward/std": 0.1911821961402893, "rewards/unicoder_reward_fn/mean": 0.012500000186264515, "rewards/unicoder_reward_fn/std": 0.11180339753627777, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 354.75, "completions/mean_terminated_length": 354.75, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.1420840003049013, "grad_norm": 0.18273437023162842, "kl": 0.0477294921875, "learning_rate": 5.575763195444166e-07, "loss": -0.0039, "num_tokens": 38932418.0, "reward": 0.2250000238418579, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 340.6125183105469, "completions/mean_terminated_length": 340.6125183105469, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.14238890159310924, "grad_norm": 0.1895855814218521, "kl": 0.05322265625, "learning_rate": 5.543463661312847e-07, "loss": -0.0005, "num_tokens": 39007663.0, "reward": 0.17500002682209015, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 364.6499938964844, "completions/mean_terminated_length": 364.6499938964844, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.14269380288131717, "grad_norm": 0.10075707733631134, "kl": 0.04315185546875, "learning_rate": 5.512085379471808e-07, "loss": 0.0025, "num_tokens": 39086407.0, "reward": 0.15000002086162567, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 968.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 356.0249938964844, "completions/mean_terminated_length": 356.0249938964844, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.14299870416952512, "grad_norm": 0.2628437280654907, "kl": 0.04742431640625, "learning_rate": 5.481629666490903e-07, "loss": 0.0207, "num_tokens": 39160057.0, "reward": 0.23625002801418304, "reward_std": 0.09015611559152603, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1448.0, "completions/max_terminated_length": 1448.0, "completions/mean_length": 354.4750061035156, "completions/mean_terminated_length": 354.4750061035156, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.14330360545773305, "grad_norm": 0.24940533936023712, "kl": 0.04864501953125, "learning_rate": 5.452097800230853e-07, "loss": -0.0037, "num_tokens": 39234485.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 324.2250061035156, "completions/mean_terminated_length": 324.2250061035156, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.143608506745941, "grad_norm": 0.23759277164936066, "kl": 0.0521240234375, "learning_rate": 5.423491019789623e-07, "loss": 0.0016, "num_tokens": 39308599.0, "reward": 0.1875000149011612, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012499999999999956, "completions/max_length": 2048.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 337.5375061035156, "completions/mean_terminated_length": 315.8860778808594, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.14391340803414895, "grad_norm": 0.33607712388038635, "kl": 0.04736328125, "learning_rate": 5.395810525450425e-07, "loss": 0.0124, "num_tokens": 39379734.0, "reward": 0.24750001728534698, "reward_std": 0.14495688676834106, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.15000000596046448, "rewards/unicoder_reward_fn/std": 0.35932427644729614, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 340.9250183105469, "completions/mean_terminated_length": 340.9250183105469, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.14421830932235688, "grad_norm": 0.13019315898418427, "kl": 0.044921875, "learning_rate": 5.369057478631359e-07, "loss": -0.0017, "num_tokens": 39455358.0, "reward": 0.14875000715255737, "reward_std": 0.03712310642004013, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1112.0, "completions/max_terminated_length": 1112.0, "completions/mean_length": 377.5500183105469, "completions/mean_terminated_length": 377.5500183105469, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.14452321061056483, "grad_norm": 0.18562644720077515, "kl": 0.04766845703125, "learning_rate": 5.343233001836694e-07, "loss": 0.01, "num_tokens": 39536722.0, "reward": 0.13625001907348633, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.03750000149011612, "rewards/unicoder_reward_fn/std": 0.1911821961402893, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1605.0, "completions/max_terminated_length": 1605.0, "completions/mean_length": 356.63751220703125, "completions/mean_terminated_length": 356.63751220703125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.14482811189877276, "grad_norm": 0.1613534539937973, "kl": 0.04925537109375, "learning_rate": 5.318338178609754e-07, "loss": -0.0015, "num_tokens": 39609775.0, "reward": 0.1862500160932541, "reward_std": 0.01944543607532978, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 341.8999938964844, "completions/mean_terminated_length": 341.8999938964844, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.1451330131869807, "grad_norm": 0.24990017712116241, "kl": 0.05035400390625, "learning_rate": 5.294374053487459e-07, "loss": 0.0108, "num_tokens": 39686339.0, "reward": 0.21250002086162567, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 344.375, "completions/mean_terminated_length": 344.375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.14543791447518867, "grad_norm": 0.24874407052993774, "kl": 0.05511474609375, "learning_rate": 5.271341631956511e-07, "loss": -0.0035, "num_tokens": 39761839.0, "reward": 0.18500001728534698, "reward_std": 0.09192388504743576, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 794.0, "completions/max_terminated_length": 794.0, "completions/mean_length": 385.45001220703125, "completions/mean_terminated_length": 385.45001220703125, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 0.1457428157633966, "grad_norm": 0.2202742099761963, "kl": 0.04620361328125, "learning_rate": 5.249241880411181e-07, "loss": 0.0121, "num_tokens": 39843201.0, "reward": 0.17500002682209015, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 377.8625183105469, "completions/mean_terminated_length": 377.8625183105469, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.14604771705160455, "grad_norm": 0.21568068861961365, "kl": 0.04791259765625, "learning_rate": 5.228075726112785e-07, "loss": -0.0043, "num_tokens": 39922578.0, "reward": 0.1875000149011612, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434911370277405, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 336.1125183105469, "completions/mean_terminated_length": 336.1125183105469, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.14635261833981247, "grad_norm": 0.19107092916965485, "kl": 0.05096435546875, "learning_rate": 5.207844057150768e-07, "loss": 0.0107, "num_tokens": 39995037.0, "reward": 0.1875000298023224, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 321.88751220703125, "completions/mean_terminated_length": 321.88751220703125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.14665751962802043, "grad_norm": 0.2602921724319458, "kl": 0.0550537109375, "learning_rate": 5.188547722405437e-07, "loss": 0.0126, "num_tokens": 40066734.0, "reward": 0.21250002086162567, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 387.6875, "completions/mean_terminated_length": 387.6875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.14696242091622838, "grad_norm": 0.1923561990261078, "kl": 0.04376220703125, "learning_rate": 5.170187531512351e-07, "loss": -0.0073, "num_tokens": 40148063.0, "reward": 0.1612500250339508, "reward_std": 0.09015611559152603, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339753627777, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 371.9250183105469, "completions/mean_terminated_length": 371.9250183105469, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1472673222044363, "grad_norm": 0.12472382187843323, "kl": 0.0458984375, "learning_rate": 5.152764254828348e-07, "loss": 0.0028, "num_tokens": 40228487.0, "reward": 0.125, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 335.5249938964844, "completions/mean_terminated_length": 335.5249938964844, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.14757222349264426, "grad_norm": 0.21659334003925323, "kl": 0.05364990234375, "learning_rate": 5.136278623399225e-07, "loss": 0.0077, "num_tokens": 40302519.0, "reward": 0.21250002086162567, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 372.6750183105469, "completions/mean_terminated_length": 372.6750183105469, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.1478771247808522, "grad_norm": 0.26435017585754395, "kl": 0.0491943359375, "learning_rate": 5.120731328929058e-07, "loss": 0.0008, "num_tokens": 40380843.0, "reward": 0.21125002205371857, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 869.0, "completions/max_terminated_length": 869.0, "completions/mean_length": 364.2875061035156, "completions/mean_terminated_length": 364.2875061035156, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.14818202606906014, "grad_norm": 0.22579143941402435, "kl": 0.04638671875, "learning_rate": 5.106123023751187e-07, "loss": -0.0071, "num_tokens": 40455084.0, "reward": 0.17500002682209015, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531530380249, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 376.2875061035156, "completions/mean_terminated_length": 376.2875061035156, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "epoch": 0.1484869273572681, "grad_norm": 0.15771447122097015, "kl": 0.04400634765625, "learning_rate": 5.092454320800833e-07, "loss": 0.003, "num_tokens": 40533413.0, "reward": 0.16250000894069672, "reward_std": 0.0530330054461956, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 359.1875, "completions/mean_terminated_length": 359.1875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.14879182864547602, "grad_norm": 0.2311951071023941, "kl": 0.0474853515625, "learning_rate": 5.079725793589405e-07, "loss": 0.0024, "num_tokens": 40607952.0, "reward": 0.2250000238418579, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.125, "rewards/unicoder_reward_fn/std": 0.33280548453330994, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 980.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 342.7375183105469, "completions/mean_terminated_length": 342.7375183105469, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.14909672993368397, "grad_norm": 0.18008172512054443, "kl": 0.04840087890625, "learning_rate": 5.067937976180407e-07, "loss": 0.0046, "num_tokens": 40683967.0, "reward": 0.20000003278255463, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.10000000149011612, "rewards/unicoder_reward_fn/std": 0.3018927276134491, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 861.0, "completions/max_terminated_length": 861.0, "completions/mean_length": 366.8999938964844, "completions/mean_terminated_length": 366.8999938964844, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.1494016312218919, "grad_norm": 0.24172508716583252, "kl": 0.04803466796875, "learning_rate": 5.057091363167046e-07, "loss": -0.0081, "num_tokens": 40761483.0, "reward": 0.1862500160932541, "reward_std": 0.05480077490210533, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.08749999850988388, "rewards/unicoder_reward_fn/std": 0.28434914350509644, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1044.0, "completions/max_terminated_length": 1044.0, "completions/mean_length": 364.7875061035156, "completions/mean_terminated_length": 364.7875061035156, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.14970653251009985, "grad_norm": 0.18381540477275848, "kl": 0.04437255859375, "learning_rate": 5.047186409651489e-07, "loss": 0.0102, "num_tokens": 40838238.0, "reward": 0.15000002086162567, "reward_std": 0.0707106813788414, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 724.0, "completions/max_terminated_length": 724.0, "completions/mean_length": 346.7375183105469, "completions/mean_terminated_length": 346.7375183105469, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.1500114337983078, "grad_norm": 0.12727724015712738, "kl": 0.0479736328125, "learning_rate": 5.038223531225742e-07, "loss": -0.0009, "num_tokens": 40914663.0, "reward": 0.125, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.02500000037252903, "rewards/unicoder_reward_fn/std": 0.15710999071598053, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 707.0, "completions/max_terminated_length": 707.0, "completions/mean_length": 353.95001220703125, "completions/mean_terminated_length": 353.95001220703125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.15031633508651573, "grad_norm": 0.21529246866703033, "kl": 0.05010986328125, "learning_rate": 5.030203103954232e-07, "loss": 0.016, "num_tokens": 40989059.0, "reward": 0.17500002682209015, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531530380249, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 831.0, "completions/max_terminated_length": 831.0, "completions/mean_length": 391.7250061035156, "completions/mean_terminated_length": 391.7250061035156, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.1506212363747237, "grad_norm": 0.14168554544448853, "kl": 0.04669189453125, "learning_rate": 5.023125464358026e-07, "loss": 0.004, "num_tokens": 41072121.0, "reward": 0.14875000715255737, "reward_std": 0.03358757123351097, "rewards/format_reward/mean": 0.987500011920929, "rewards/format_reward/std": 0.11180339008569717, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1150.0, "completions/max_terminated_length": 1150.0, "completions/mean_length": 443.3000183105469, "completions/mean_terminated_length": 443.3000183105469, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.15092613766293161, "grad_norm": 0.14296044409275055, "kl": 0.04107666015625, "learning_rate": 5.016990909400709e-07, "loss": 0.0064, "num_tokens": 41157177.0, "reward": 0.15000000596046448, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.05000000074505806, "rewards/unicoder_reward_fn/std": 0.21931999921798706, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 376.1125183105469, "completions/mean_terminated_length": 376.1125183105469, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.15123103895113957, "grad_norm": 0.23286934196949005, "kl": 0.04718017578125, "learning_rate": 5.011799696475915e-07, "loss": 0.011, "num_tokens": 41237998.0, "reward": 0.21000002324581146, "reward_std": 0.05656854063272476, "rewards/format_reward/mean": 0.9750000238418579, "rewards/format_reward/std": 0.15710997581481934, "rewards/unicoder_reward_fn/mean": 0.11249999701976776, "rewards/unicoder_reward_fn/std": 0.3179742097854614, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 385.8000183105469, "completions/mean_terminated_length": 385.8000183105469, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.15153594023934752, "grad_norm": 0.13556820154190063, "kl": 0.04815673828125, "learning_rate": 5.007552043396547e-07, "loss": 0.006, "num_tokens": 41312662.0, "reward": 0.17500001192092896, "reward_std": 0.0353553406894207, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.07500000298023224, "rewards/unicoder_reward_fn/std": 0.2650531232357025, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 366.8374938964844, "completions/mean_terminated_length": 366.8374938964844, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.15184084152755545, "grad_norm": 0.24061597883701324, "kl": 0.04522705078125, "learning_rate": 5.004248128385618e-07, "loss": 0.004, "num_tokens": 41388355.0, "reward": 0.2500000298023224, "reward_std": 0.1060660108923912, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.15000000596046448, "rewards/unicoder_reward_fn/std": 0.35932427644729614, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 412.2375183105469, "completions/mean_terminated_length": 412.2375183105469, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.1521457428157634, "grad_norm": 0.1985752284526825, "kl": 0.0421142578125, "learning_rate": 5.001888090068784e-07, "loss": -0.0046, "num_tokens": 41472988.0, "reward": 0.1625000238418579, "reward_std": 0.0883883461356163, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.0625, "rewards/unicoder_reward_fn/std": 0.2435886710882187, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 380.9875183105469, "completions/mean_terminated_length": 380.9875183105469, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.15245064410397133, "grad_norm": 0.24799692630767822, "kl": 0.0428466796875, "learning_rate": 5.000472027468528e-07, "loss": 0.0084, "num_tokens": 41550467.0, "reward": 0.23750002682209015, "reward_std": 0.12374367564916611, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/unicoder_reward_fn/mean": 0.13750000298023224, "rewards/unicoder_reward_fn/std": 0.3465471565723419, "step": 500 }, { "epoch": 0.15245064410397133, "step": 500, "total_flos": 0.0, "train_loss": 0.0025880077524270744, "train_runtime": 10060.4286, "train_samples_per_second": 3.976, "train_steps_per_second": 0.05 } ], "logging_steps": 1, "max_steps": 500, "num_input_tokens_seen": 41550467, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }