{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 391.0, "completions/max_terminated_length": 391.0, "completions/mean_length": 202.91666666666666, "completions/mean_terminated_length": 221.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.004, "format_failures": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0, "num_tokens": 18672.0, "reward": 0.0, "reward_std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 92.83333333333333, "completions/mean_terminated_length": 101.27272727272727, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.008, "format_failures": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 29988.0, "reward": 0.0, "reward_std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 52.333333333333336, "completions/mean_terminated_length": 57.09090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.012, "format_failures": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 39576.0, "reward": 0.0, "reward_std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 326.0, "completions/max_terminated_length": 326.0, "completions/mean_length": 161.75, "completions/mean_terminated_length": 176.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 18.0, "epoch": 0.016, "format_failures": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 53340.0, "reward": 0.0, "reward_std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 75.08333333333333, "completions/mean_terminated_length": 81.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.02, "format_failures": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 60420.0, "reward": 0.0, "reward_std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 374.0, "completions/max_terminated_length": 374.0, "completions/mean_length": 178.0, "completions/mean_terminated_length": 194.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.024, "format_failures": 0.0, "grad_norm": 0.0, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 75348.0, "reward": 0.0, "reward_std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 146.0, "completions/max_terminated_length": 146.0, "completions/mean_length": 99.08333333333333, "completions/mean_terminated_length": 108.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 94.0, "epoch": 0.028, "format_failures": 0.0, "grad_norm": 1.6362388134002686, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0806, "num_tokens": 83868.0, "reward": 0.5833333730697632, "reward_std": 0.5149286389350891, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 190.0, "completions/mean_terminated_length": 207.27272727272728, "completions/min_length": 0.0, "completions/min_terminated_length": 124.0, "epoch": 0.032, "format_failures": 0.0, "grad_norm": 0.0022762538865208626, "kl": 0.0005378490750445053, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 97464.0, "reward": 0.0, "reward_std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 141.16666666666666, "completions/mean_terminated_length": 154.0, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.036, "format_failures": 0.0, "grad_norm": 0.007215190213173628, "kl": 0.0019240143010392785, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 108636.0, "reward": 0.0, "reward_std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 266.0, "completions/max_terminated_length": 266.0, "completions/mean_length": 178.41666666666666, "completions/mean_terminated_length": 194.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 142.0, "epoch": 0.04, "format_failures": 0.0, "grad_norm": 0.7695807218551636, "kl": 0.014113324228674173, "learning_rate": 1e-06, "loss": -0.002, "num_tokens": 116256.0, "reward": 0.1666666716337204, "reward_std": 0.38924944400787354, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 135.0, "completions/max_terminated_length": 135.0, "completions/mean_length": 84.25, "completions/mean_terminated_length": 91.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.044, "format_failures": 0.0, "grad_norm": 1.026847243309021, "kl": 0.013075211551040411, "learning_rate": 1e-06, "loss": -0.0413, "num_tokens": 124440.0, "reward": 0.1666666716337204, "reward_std": 0.38924944400787354, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 99.25, "completions/mean_terminated_length": 108.27272727272727, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.048, "format_failures": 1.0, "grad_norm": 0.011818243190646172, "kl": 0.003624255710747093, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 132732.0, "reward": 0.0, "reward_std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 577.0, "completions/max_terminated_length": 577.0, "completions/mean_length": 246.08333333333334, "completions/mean_terminated_length": 268.45454545454544, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.052, "format_failures": 0.0, "grad_norm": 0.022241737693548203, "kl": 0.00960063119418919, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 152424.0, "reward": 0.0, "reward_std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 344.0, "completions/max_terminated_length": 344.0, "completions/mean_length": 173.0, "completions/mean_terminated_length": 188.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 117.0, "epoch": 0.056, "format_failures": 0.0, "grad_norm": 0.036054644733667374, "kl": 0.01761903613805771, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 162636.0, "reward": 0.0, "reward_std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 402.0, "completions/max_terminated_length": 402.0, "completions/mean_length": 268.1666666666667, "completions/mean_terminated_length": 292.54545454545456, "completions/min_length": 0.0, "completions/min_terminated_length": 102.0, "epoch": 0.06, "format_failures": 0.0, "grad_norm": 0.00860360637307167, "kl": 0.005409976467490196, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 176904.0, "reward": 0.0, "reward_std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 72.16666666666667, "completions/mean_terminated_length": 78.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.064, "format_failures": 0.0, "grad_norm": 0.8470466136932373, "kl": 0.09470756724476814, "learning_rate": 1e-06, "loss": 0.0173, "num_tokens": 186564.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 118.5, "completions/mean_terminated_length": 129.27272727272728, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 0.068, "format_failures": 0.0, "grad_norm": 0.10086339712142944, "kl": 0.04859759844839573, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 197484.0, "reward": 0.0, "reward_std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 57.833333333333336, "completions/mean_terminated_length": 63.09090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 12.0, "epoch": 0.072, "format_failures": 0.0, "grad_norm": 1.4592796564102173, "kl": 0.010172125417739153, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 207252.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 32.583333333333336, "completions/mean_terminated_length": 35.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.076, "format_failures": 0.0, "grad_norm": 2.4069900512695312, "kl": 0.025834742933511734, "learning_rate": 1e-06, "loss": -0.0695, "num_tokens": 214320.0, "reward": 0.6666666865348816, "reward_std": 0.4923659861087799, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 386.0, "completions/max_terminated_length": 386.0, "completions/mean_length": 192.41666666666666, "completions/mean_terminated_length": 209.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 32.0, "epoch": 0.08, "format_failures": 0.0, "grad_norm": 0.10245665162801743, "kl": 0.043199990526773036, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 228996.0, "reward": 0.0, "reward_std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 88.0, "completions/max_terminated_length": 88.0, "completions/mean_length": 67.91666666666667, "completions/mean_terminated_length": 81.5, "completions/min_length": 0.0, "completions/min_terminated_length": 74.0, "epoch": 0.084, "format_failures": 1.0, "grad_norm": 1.388899326324463, "kl": 0.07192051783204079, "learning_rate": 1e-06, "loss": -0.0112, "num_tokens": 238104.0, "reward": 0.75, "reward_std": 0.45226702094078064, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 43.583333333333336, "completions/mean_terminated_length": 47.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.088, "format_failures": 0.0, "grad_norm": 3.2448337078094482, "kl": 0.0771165993064642, "learning_rate": 1e-06, "loss": 0.0077, "num_tokens": 245280.0, "reward": 0.5, "reward_std": 0.5222329497337341, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 335.0, "completions/max_terminated_length": 335.0, "completions/mean_length": 167.83333333333334, "completions/mean_terminated_length": 183.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.092, "format_failures": 0.0, "grad_norm": 1.0195705890655518, "kl": 0.211347796022892, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 257148.0, "reward": 0.0, "reward_std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 350.0, "completions/max_terminated_length": 350.0, "completions/mean_length": 201.41666666666666, "completions/mean_terminated_length": 219.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 0.096, "format_failures": 0.0, "grad_norm": 0.20492610335350037, "kl": 0.08658944815397263, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 266304.0, "reward": 0.0, "reward_std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 312.0, "completions/max_terminated_length": 312.0, "completions/mean_length": 170.0, "completions/mean_terminated_length": 185.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.1, "format_failures": 0.0, "grad_norm": 0.0755978599190712, "kl": 0.040397679433226585, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 278760.0, "reward": 0.0, "reward_std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 411.0, "completions/max_terminated_length": 411.0, "completions/mean_length": 173.83333333333334, "completions/mean_terminated_length": 189.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.104, "format_failures": 0.0, "grad_norm": 0.04659981280565262, "kl": 0.023209942039102316, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 293628.0, "reward": 0.0, "reward_std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 92.0, "completions/max_terminated_length": 92.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 60.27272727272727, "completions/min_length": 0.0, "completions/min_terminated_length": 43.0, "epoch": 0.108, "format_failures": 0.0, "grad_norm": 21.968534469604492, "kl": 5.299874305725098, "learning_rate": 1e-06, "loss": 0.1192, "num_tokens": 301488.0, "reward": 0.5833333730697632, "reward_std": 0.5149286389350891, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 358.0, "completions/max_terminated_length": 358.0, "completions/mean_length": 165.41666666666666, "completions/mean_terminated_length": 180.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.112, "format_failures": 0.0, "grad_norm": 0.014507513493299484, "kl": 0.01523882569745183, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 314748.0, "reward": 0.0, "reward_std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 289.0, "completions/max_terminated_length": 289.0, "completions/mean_length": 193.33333333333334, "completions/mean_terminated_length": 210.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 119.0, "epoch": 0.116, "format_failures": 0.0, "grad_norm": 0.010872351005673409, "kl": 0.010655859019607306, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 328692.0, "reward": 0.0, "reward_std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 271.0, "completions/max_terminated_length": 271.0, "completions/mean_length": 167.16666666666666, "completions/mean_terminated_length": 182.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 0.12, "format_failures": 0.0, "grad_norm": 1.0025266408920288, "kl": 0.025600655004382133, "learning_rate": 1e-06, "loss": -0.0472, "num_tokens": 340752.0, "reward": 0.3333333432674408, "reward_std": 0.4923659861087799, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 79.33333333333333, "completions/mean_terminated_length": 86.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 17.0, "epoch": 0.124, "format_failures": 0.0, "grad_norm": 0.01500674244016409, "kl": 0.006932976422831416, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 366936.0, "reward": 0.0, "reward_std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 109.36363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.128, "format_failures": 0.0, "grad_norm": 0.572136640548706, "kl": 0.016836593858897686, "learning_rate": 1e-06, "loss": -0.0253, "num_tokens": 375948.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 91.0, "completions/max_terminated_length": 91.0, "completions/mean_length": 52.75, "completions/mean_terminated_length": 57.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.132, "format_failures": 0.0, "grad_norm": 2.6049137115478516, "kl": 0.08474422618746758, "learning_rate": 1e-06, "loss": -0.0534, "num_tokens": 382608.0, "reward": 0.3333333432674408, "reward_std": 0.4923659861087799, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 98.0, "completions/max_terminated_length": 98.0, "completions/mean_length": 61.416666666666664, "completions/mean_terminated_length": 67.0, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.136, "format_failures": 0.0, "grad_norm": 1.9431159496307373, "kl": 0.04839755780994892, "learning_rate": 1e-06, "loss": -0.1095, "num_tokens": 389208.0, "reward": 0.1666666716337204, "reward_std": 0.38924944400787354, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 151.0, "completions/max_terminated_length": 151.0, "completions/mean_length": 118.91666666666667, "completions/mean_terminated_length": 129.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 105.0, "epoch": 0.14, "format_failures": 0.0, "grad_norm": 0.03593799099326134, "kl": 0.03462314326316118, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 396696.0, "reward": 0.0, "reward_std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 371.0, "completions/max_terminated_length": 371.0, "completions/mean_length": 245.58333333333334, "completions/mean_terminated_length": 267.90909090909093, "completions/min_length": 0.0, "completions/min_terminated_length": 183.0, "epoch": 0.144, "format_failures": 0.0, "grad_norm": 0.025885488837957382, "kl": 0.02637413516640663, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 411372.0, "reward": 0.0, "reward_std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 296.0, "completions/max_terminated_length": 296.0, "completions/mean_length": 179.41666666666666, "completions/mean_terminated_length": 195.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 129.0, "epoch": 0.148, "format_failures": 0.0, "grad_norm": 0.11734314262866974, "kl": 0.0526489345356822, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 424404.0, "reward": 0.0, "reward_std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 332.0833333333333, "completions/mean_terminated_length": 362.27272727272725, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.152, "format_failures": 1.0, "grad_norm": 0.5079672932624817, "kl": 0.052276700269430876, "learning_rate": 1e-06, "loss": 0.0148, "num_tokens": 444576.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 148.0, "completions/max_terminated_length": 148.0, "completions/mean_length": 67.25, "completions/mean_terminated_length": 73.36363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 0.156, "format_failures": 0.0, "grad_norm": 2.6541359424591064, "kl": 0.5338308056816459, "learning_rate": 1e-06, "loss": -0.1217, "num_tokens": 453192.0, "reward": 0.25, "reward_std": 0.45226702094078064, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 245.0, "completions/max_terminated_length": 245.0, "completions/mean_length": 139.91666666666666, "completions/mean_terminated_length": 152.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.16, "format_failures": 0.0, "grad_norm": 0.3757868707180023, "kl": 0.13857688568532467, "learning_rate": 1e-06, "loss": 0.0009, "num_tokens": 467928.0, "reward": 0.0, "reward_std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 84.0, "completions/max_terminated_length": 84.0, "completions/mean_length": 48.0, "completions/mean_terminated_length": 52.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.164, "format_failures": 1.0, "grad_norm": 4.323275566101074, "kl": 0.21433213353157043, "learning_rate": 1e-06, "loss": -0.0993, "num_tokens": 473472.0, "reward": 0.25, "reward_std": 0.45226702094078064, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 311.0, "completions/max_terminated_length": 311.0, "completions/mean_length": 109.58333333333333, "completions/mean_terminated_length": 119.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.168, "format_failures": 0.0, "grad_norm": 0.22781899571418762, "kl": 0.07318945415318012, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 488148.0, "reward": 0.0, "reward_std": 0.0, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 49.0, "completions/max_terminated_length": 49.0, "completions/mean_length": 31.166666666666668, "completions/mean_terminated_length": 34.0, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.172, "format_failures": 0.0, "grad_norm": 2.492840051651001, "kl": 0.224076546728611, "learning_rate": 1e-06, "loss": -0.0283, "num_tokens": 492624.0, "reward": 0.25, "reward_std": 0.45226702094078064, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 66.0, "completions/max_terminated_length": 66.0, "completions/mean_length": 52.25, "completions/mean_terminated_length": 57.0, "completions/min_length": 0.0, "completions/min_terminated_length": 32.0, "epoch": 0.176, "format_failures": 0.0, "grad_norm": 3.037781000137329, "kl": 0.2150058075785637, "learning_rate": 1e-06, "loss": -0.013, "num_tokens": 499752.0, "reward": 0.6666666865348816, "reward_std": 0.4923659861087799, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 144.0, "completions/max_terminated_length": 144.0, "completions/mean_length": 83.0, "completions/mean_terminated_length": 90.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.18, "format_failures": 0.0, "grad_norm": 2.3224222660064697, "kl": 0.36255764216184616, "learning_rate": 1e-06, "loss": -0.0749, "num_tokens": 508428.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 160.0, "completions/max_terminated_length": 160.0, "completions/mean_length": 119.0, "completions/mean_terminated_length": 129.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 106.0, "epoch": 0.184, "format_failures": 0.0, "grad_norm": 1.5437301397323608, "kl": 0.06716796010732651, "learning_rate": 1e-06, "loss": 0.0526, "num_tokens": 517416.0, "reward": 0.75, "reward_std": 0.45226702094078064, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 86.66666666666667, "completions/mean_terminated_length": 94.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.188, "format_failures": 0.0, "grad_norm": 1.92403244972229, "kl": 0.04993921332061291, "learning_rate": 1e-06, "loss": -0.0667, "num_tokens": 525384.0, "reward": 0.25, "reward_std": 0.45226702094078064, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 263.0833333333333, "completions/mean_terminated_length": 287.0, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.192, "format_failures": 0.0, "grad_norm": 0.002583070658147335, "kl": 0.0069114591460675, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 560328.0, "reward": 0.0, "reward_std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 337.0, "completions/max_terminated_length": 337.0, "completions/mean_length": 178.16666666666666, "completions/mean_terminated_length": 194.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 9.0, "epoch": 0.196, "format_failures": 0.0, "grad_norm": 0.055018555372953415, "kl": 0.04814303293824196, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 573552.0, "reward": 0.0, "reward_std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 205.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 104.25, "completions/mean_terminated_length": 113.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 12.0, "epoch": 0.2, "format_failures": 0.0, "grad_norm": 0.10304596275091171, "kl": 0.0782565288245678, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 583980.0, "reward": 0.0, "reward_std": 0.0, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 529.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 171.25, "completions/mean_terminated_length": 186.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 0.204, "format_failures": 0.0, "grad_norm": 0.10456845909357071, "kl": 0.05266672745347023, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 606264.0, "reward": 0.0, "reward_std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 142.0, "completions/max_terminated_length": 142.0, "completions/mean_length": 86.0, "completions/mean_terminated_length": 93.81818181818181, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.208, "format_failures": 0.0, "grad_norm": 2.117820978164673, "kl": 0.12709446623921394, "learning_rate": 1e-06, "loss": -0.0977, "num_tokens": 616176.0, "reward": 0.25, "reward_std": 0.45226702094078064, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 67.66666666666667, "completions/mean_terminated_length": 73.81818181818181, "completions/min_length": 0.0, "completions/min_terminated_length": 26.0, "epoch": 0.212, "format_failures": 0.0, "grad_norm": 0.36178988218307495, "kl": 0.06635316368192434, "learning_rate": 1e-06, "loss": 0.0007, "num_tokens": 625992.0, "reward": 0.0, "reward_std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 180.5, "completions/mean_terminated_length": 196.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 88.0, "epoch": 0.216, "format_failures": 0.0, "grad_norm": 5.520895957946777, "kl": 0.6420021317899227, "learning_rate": 1e-06, "loss": 0.0043, "num_tokens": 640824.0, "reward": 0.0, "reward_std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 81.0, "completions/max_terminated_length": 81.0, "completions/mean_length": 59.5, "completions/mean_terminated_length": 64.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.22, "format_failures": 0.0, "grad_norm": 9.40858268737793, "kl": 1.514443002641201, "learning_rate": 1e-06, "loss": -0.0487, "num_tokens": 649008.0, "reward": 0.5833333730697632, "reward_std": 0.5149286389350891, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 647.0, "completions/max_terminated_length": 647.0, "completions/mean_length": 265.25, "completions/mean_terminated_length": 289.3636363636364, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.224, "format_failures": 0.0, "grad_norm": 0.12246920168399811, "kl": 0.04888852685689926, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 665112.0, "reward": 0.0, "reward_std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.16666666666666663, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 54.416666666666664, "completions/mean_terminated_length": 65.3, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.228, "format_failures": 0.0, "grad_norm": 0.4643149971961975, "kl": 0.2062125913798809, "learning_rate": 1e-06, "loss": 0.0018, "num_tokens": 671268.0, "reward": 0.0, "reward_std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 138.83333333333334, "completions/mean_terminated_length": 151.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.232, "format_failures": 0.0, "grad_norm": 0.028489232063293457, "kl": 0.028692953288555145, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 681648.0, "reward": 0.0, "reward_std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 119.33333333333333, "completions/mean_terminated_length": 130.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.236, "format_failures": 1.0, "grad_norm": 0.2943709194660187, "kl": 0.021217118948698044, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 692148.0, "reward": 0.0, "reward_std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 65.25, "completions/mean_terminated_length": 71.18181818181819, "completions/min_length": 0.0, "completions/min_terminated_length": 12.0, "epoch": 0.24, "format_failures": 1.0, "grad_norm": 0.4704815149307251, "kl": 0.1355944722890854, "learning_rate": 1e-06, "loss": 0.0016, "num_tokens": 705504.0, "reward": 0.0, "reward_std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 106.75, "completions/mean_terminated_length": 116.45454545454545, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, "epoch": 0.244, "format_failures": 0.0, "grad_norm": 0.8021370768547058, "kl": 0.06047418341040611, "learning_rate": 1e-06, "loss": 0.0229, "num_tokens": 712920.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 356.0, "completions/max_terminated_length": 356.0, "completions/mean_length": 201.83333333333334, "completions/mean_terminated_length": 220.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.248, "format_failures": 0.0, "grad_norm": 0.0354565754532814, "kl": 0.051246967166662216, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 725280.0, "reward": 0.0, "reward_std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 260.0, "completions/max_terminated_length": 260.0, "completions/mean_length": 129.0, "completions/mean_terminated_length": 140.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.252, "format_failures": 0.0, "grad_norm": 0.838399350643158, "kl": 0.03389432094991207, "learning_rate": 1e-06, "loss": 0.0256, "num_tokens": 733980.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 121.08333333333333, "completions/mean_terminated_length": 132.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.256, "format_failures": 0.0, "grad_norm": 0.008542679250240326, "kl": 0.02384038269519806, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 763224.0, "reward": 0.0, "reward_std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 302.0, "completions/max_terminated_length": 302.0, "completions/mean_length": 180.33333333333334, "completions/mean_terminated_length": 196.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 36.0, "epoch": 0.26, "format_failures": 0.0, "grad_norm": 0.01127433218061924, "kl": 0.013883833773434162, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 777984.0, "reward": 0.0, "reward_std": 0.0, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 75.83333333333333, "completions/mean_terminated_length": 82.72727272727273, "completions/min_length": 0.0, "completions/min_terminated_length": 21.0, "epoch": 0.264, "format_failures": 0.0, "grad_norm": 0.09972423315048218, "kl": 0.06396586634218693, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 785844.0, "reward": 0.0, "reward_std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 222.0, "completions/max_terminated_length": 222.0, "completions/mean_length": 140.41666666666666, "completions/mean_terminated_length": 153.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.268, "format_failures": 0.0, "grad_norm": 0.03430556878447533, "kl": 0.03857766184955835, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 796632.0, "reward": 0.0, "reward_std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 115.08333333333333, "completions/mean_terminated_length": 125.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 14.0, "epoch": 0.272, "format_failures": 0.0, "grad_norm": 1.6054855585098267, "kl": 0.020691730547696352, "learning_rate": 1e-06, "loss": 0.0511, "num_tokens": 807576.0, "reward": 0.4166666865348816, "reward_std": 0.5149286389350891, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 80.5, "completions/mean_terminated_length": 87.81818181818181, "completions/min_length": 0.0, "completions/min_terminated_length": 64.0, "epoch": 0.276, "format_failures": 0.0, "grad_norm": 1.1459321975708008, "kl": 0.017325148917734623, "learning_rate": 1e-06, "loss": 0.0067, "num_tokens": 814644.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 175.58333333333334, "completions/mean_terminated_length": 191.54545454545453, "completions/min_length": 0.0, "completions/min_terminated_length": 17.0, "epoch": 0.28, "format_failures": 0.0, "grad_norm": 0.008818876929581165, "kl": 0.012372600380331278, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 826932.0, "reward": 0.0, "reward_std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 292.0, "completions/max_terminated_length": 292.0, "completions/mean_length": 195.0, "completions/mean_terminated_length": 212.72727272727272, "completions/min_length": 0.0, "completions/min_terminated_length": 115.0, "epoch": 0.284, "format_failures": 0.0, "grad_norm": 0.014721119776368141, "kl": 0.012880454771220684, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 842268.0, "reward": 0.0, "reward_std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 145.0, "completions/max_terminated_length": 145.0, "completions/mean_length": 94.83333333333333, "completions/mean_terminated_length": 103.45454545454545, "completions/min_length": 0.0, "completions/min_terminated_length": 75.0, "epoch": 0.288, "format_failures": 0.0, "grad_norm": 0.9220354557037354, "kl": 0.046924193389713764, "learning_rate": 1e-06, "loss": 0.0118, "num_tokens": 849612.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 262.0, "completions/max_terminated_length": 262.0, "completions/mean_length": 149.5, "completions/mean_terminated_length": 163.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 12.0, "epoch": 0.292, "format_failures": 0.0, "grad_norm": 0.0295345988124609, "kl": 0.03905524965375662, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 859632.0, "reward": 0.0, "reward_std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 158.41666666666666, "completions/mean_terminated_length": 172.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 24.0, "epoch": 0.296, "format_failures": 0.0, "grad_norm": 0.11439846456050873, "kl": 0.07962214201688766, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 870756.0, "reward": 0.0, "reward_std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 223.0, "completions/max_terminated_length": 223.0, "completions/mean_length": 110.08333333333333, "completions/mean_terminated_length": 120.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.3, "format_failures": 0.0, "grad_norm": 0.04706709831953049, "kl": 0.03136777225881815, "learning_rate": 1e-06, "loss": 0.0003, "num_tokens": 887700.0, "reward": 0.0, "reward_std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 219.58333333333334, "completions/mean_terminated_length": 239.54545454545453, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 0.304, "format_failures": 0.0, "grad_norm": 0.106910839676857, "kl": 0.16153255105018616, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 899544.0, "reward": 0.0, "reward_std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 175.0, "completions/max_terminated_length": 175.0, "completions/mean_length": 118.66666666666667, "completions/mean_terminated_length": 129.45454545454547, "completions/min_length": 0.0, "completions/min_terminated_length": 48.0, "epoch": 0.308, "format_failures": 0.0, "grad_norm": 0.9582226276397705, "kl": 0.1435188725590706, "learning_rate": 1e-06, "loss": 0.0297, "num_tokens": 909816.0, "reward": 0.1666666716337204, "reward_std": 0.38924944400787354, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 200.0, "completions/max_terminated_length": 200.0, "completions/mean_length": 151.83333333333334, "completions/mean_terminated_length": 165.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 85.0, "epoch": 0.312, "format_failures": 0.0, "grad_norm": 0.6430385112762451, "kl": 0.021885435096919537, "learning_rate": 1e-06, "loss": -0.0413, "num_tokens": 919620.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 127.0, "completions/max_terminated_length": 127.0, "completions/mean_length": 87.66666666666667, "completions/mean_terminated_length": 95.63636363636364, "completions/min_length": 0.0, "completions/min_terminated_length": 72.0, "epoch": 0.316, "format_failures": 0.0, "grad_norm": 0.1316368579864502, "kl": 0.052431097254157066, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 930468.0, "reward": 0.0, "reward_std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 357.0, "completions/max_terminated_length": 357.0, "completions/mean_length": 188.66666666666666, "completions/mean_terminated_length": 205.8181818181818, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.32, "format_failures": 0.0, "grad_norm": 0.24080750346183777, "kl": 0.25305451452732086, "learning_rate": 1e-06, "loss": 0.0008, "num_tokens": 947112.0, "reward": 0.0, "reward_std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 171.0, "completions/max_terminated_length": 171.0, "completions/mean_length": 130.75, "completions/mean_terminated_length": 142.63636363636363, "completions/min_length": 0.0, "completions/min_terminated_length": 77.0, "epoch": 0.324, "format_failures": 0.0, "grad_norm": 0.9585680961608887, "kl": 0.02085646940395236, "learning_rate": 1e-06, "loss": -0.0725, "num_tokens": 956448.0, "reward": 0.5, "reward_std": 0.5222329497337341, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 130.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.328, "format_failures": 0.0, "grad_norm": 0.02760450914502144, "kl": 0.020923216827213764, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 966324.0, "reward": 0.0, "reward_std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 476.0, "completions/max_terminated_length": 476.0, "completions/mean_length": 252.33333333333334, "completions/mean_terminated_length": 275.27272727272725, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.332, "format_failures": 1.0, "grad_norm": 0.011845018714666367, "kl": 0.017354148440063, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 985296.0, "reward": 0.0, "reward_std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 234.0, "completions/max_terminated_length": 234.0, "completions/mean_length": 101.16666666666667, "completions/mean_terminated_length": 110.36363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.336, "format_failures": 1.0, "grad_norm": 0.02075113356113434, "kl": 0.013977942056953907, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 998856.0, "reward": 0.0, "reward_std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 170.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 44.0, "epoch": 0.34, "format_failures": 0.0, "grad_norm": 0.018603280186653137, "kl": 0.020112676545977592, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 1008864.0, "reward": 0.0, "reward_std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 364.0, "completions/max_terminated_length": 364.0, "completions/mean_length": 223.66666666666666, "completions/mean_terminated_length": 244.0, "completions/min_length": 0.0, "completions/min_terminated_length": 20.0, "epoch": 0.344, "format_failures": 0.0, "grad_norm": 0.011895284056663513, "kl": 0.021254747174680233, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 1022556.0, "reward": 0.0, "reward_std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 159.0, "completions/max_terminated_length": 159.0, "completions/mean_length": 117.66666666666667, "completions/mean_terminated_length": 128.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 112.0, "epoch": 0.348, "format_failures": 0.0, "grad_norm": 1.1451243162155151, "kl": 0.026615198701620102, "learning_rate": 1e-06, "loss": 0.0587, "num_tokens": 1032684.0, "reward": 0.4166666865348816, "reward_std": 0.5149286389350891, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 304.0, "completions/max_terminated_length": 304.0, "completions/mean_length": 142.25, "completions/mean_terminated_length": 155.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 12.0, "epoch": 0.352, "format_failures": 2.0, "grad_norm": 0.8502682447433472, "kl": 0.012907921802252531, "learning_rate": 1e-06, "loss": 0.079, "num_tokens": 1067328.0, "reward": 0.1666666716337204, "reward_std": 0.3892494738101959, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 114.5, "completions/mean_terminated_length": 124.9090909090909, "completions/min_length": 0.0, "completions/min_terminated_length": 78.0, "epoch": 0.356, "format_failures": 0.0, "grad_norm": 0.7634170651435852, "kl": 0.08245750516653061, "learning_rate": 1e-06, "loss": 0.0177, "num_tokens": 1074756.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 73.0, "completions/mean_terminated_length": 79.63636363636364, "completions/min_length": 0.0, "completions/min_terminated_length": 16.0, "epoch": 0.36, "format_failures": 0.0, "grad_norm": 0.750490665435791, "kl": 0.032081443816423416, "learning_rate": 1e-06, "loss": 0.0503, "num_tokens": 1083096.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 53.666666666666664, "completions/mean_terminated_length": 58.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.364, "format_failures": 0.0, "grad_norm": 0.12016791850328445, "kl": 0.04432140104472637, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 1090380.0, "reward": 0.0, "reward_std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 397.5, "completions/mean_terminated_length": 433.6363636363636, "completions/min_length": 0.0, "completions/min_terminated_length": 14.0, "epoch": 0.368, "format_failures": 0.0, "grad_norm": 0.012203319929540157, "kl": 0.009247956797480583, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 1113504.0, "reward": 0.0, "reward_std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 261.0, "completions/max_terminated_length": 261.0, "completions/mean_length": 138.75, "completions/mean_terminated_length": 151.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 23.0, "epoch": 0.372, "format_failures": 0.0, "grad_norm": 0.03371990844607353, "kl": 0.029644143767654896, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 1125492.0, "reward": 0.0, "reward_std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 89.0, "completions/max_terminated_length": 89.0, "completions/mean_length": 51.833333333333336, "completions/mean_terminated_length": 56.54545454545455, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.376, "format_failures": 0.0, "grad_norm": 2.027597427368164, "kl": 0.19823284726589918, "learning_rate": 1e-06, "loss": -0.0712, "num_tokens": 1130748.0, "reward": 0.75, "reward_std": 0.45226702094078064, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 74.33333333333333, "completions/mean_terminated_length": 81.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 0.38, "format_failures": 0.0, "grad_norm": 1.4458988904953003, "kl": 0.07499337941408157, "learning_rate": 1e-06, "loss": -0.0187, "num_tokens": 1138584.0, "reward": 0.0833333358168602, "reward_std": 0.28867512941360474, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 285.0, "completions/max_terminated_length": 285.0, "completions/mean_length": 137.66666666666666, "completions/mean_terminated_length": 150.1818181818182, "completions/min_length": 0.0, "completions/min_terminated_length": 15.0, "epoch": 0.384, "format_failures": 0.0, "grad_norm": 0.03813532739877701, "kl": 0.023914064280688763, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 1154124.0, "reward": 0.0, "reward_std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 125.66666666666667, "completions/mean_terminated_length": 137.0909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.388, "format_failures": 1.0, "grad_norm": 0.016639724373817444, "kl": 0.019042176194489002, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 1165560.0, "reward": 0.0, "reward_std": 0.0, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 209.0, "completions/max_terminated_length": 209.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 148.36363636363637, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.392, "format_failures": 0.0, "grad_norm": 0.041289571672677994, "kl": 0.025019565597176552, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 1176936.0, "reward": 0.0, "reward_std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 236.75, "completions/mean_terminated_length": 258.27272727272725, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 0.396, "format_failures": 0.0, "grad_norm": 0.029155507683753967, "kl": 0.03094907756894827, "learning_rate": 1e-06, "loss": 0.0001, "num_tokens": 1194108.0, "reward": 0.0, "reward_std": 0.0, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.08333333333333337, "completions/max_length": 31.0, "completions/max_terminated_length": 31.0, "completions/mean_length": 21.0, "completions/mean_terminated_length": 22.90909090909091, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 0.4, "format_failures": 0.0, "grad_norm": 5.876866340637207, "kl": 0.1100139394402504, "learning_rate": 1e-06, "loss": 0.0323, "num_tokens": 1202412.0, "reward": 0.75, "reward_std": 0.45226702094078064, "step": 100 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 1202412, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }