{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.384, "eval_steps": 500, "global_step": 102, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 105.5, "completions/mean_terminated_length": 120.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 16.0, "epoch": 0.064, "format_failures": 3.0, "grad_norm": 2.247725486755371, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0347, "num_tokens": 6048.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 75.625, "completions/mean_terminated_length": 86.42857142857143, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 0.128, "format_failures": 3.0, "grad_norm": 1.4242777824401855, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.1028, "num_tokens": 13280.0, "reward": 0.1875, "reward_std": 0.3720118999481201, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 213.0, "completions/max_terminated_length": 213.0, "completions/mean_length": 124.25, "completions/mean_terminated_length": 142.0, "completions/min_length": 0.0, "completions/min_terminated_length": 41.0, "epoch": 0.192, "format_failures": 1.0, "grad_norm": 0.020250532776117325, "kl": 0.0035181287967134267, "learning_rate": 1e-06, "loss": 0.0, "num_tokens": 21904.0, "reward": 0.0, "reward_std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 73.25, "completions/mean_terminated_length": 83.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 18.0, "epoch": 0.256, "format_failures": 1.0, "grad_norm": 8.061470031738281, "kl": 0.034313585492782295, "learning_rate": 1e-06, "loss": -0.2682, "num_tokens": 27552.0, "reward": 0.27916666865348816, "reward_std": 0.8364584445953369, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 385.0, "completions/max_terminated_length": 385.0, "completions/mean_length": 145.625, "completions/mean_terminated_length": 166.42857142857142, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.32, "format_failures": 1.0, "grad_norm": 1.223435401916504, "kl": 0.03014595981221646, "learning_rate": 1e-06, "loss": 0.1171, "num_tokens": 43192.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 109.625, "completions/mean_terminated_length": 125.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 56.0, "epoch": 0.384, "format_failures": 0.0, "grad_norm": 0.1720724254846573, "kl": 0.03908220527227968, "learning_rate": 1e-06, "loss": 0.0004, "num_tokens": 55448.0, "reward": 0.0, "reward_std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 74.625, "completions/mean_terminated_length": 85.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 51.0, "epoch": 0.448, "format_failures": 0.0, "grad_norm": 0.5268336534500122, "kl": 0.021530768717639148, "learning_rate": 1e-06, "loss": 0.0075, "num_tokens": 62672.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 118.0, "completions/max_terminated_length": 118.0, "completions/mean_length": 62.125, "completions/mean_terminated_length": 71.0, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.512, "format_failures": 2.0, "grad_norm": 2.541877031326294, "kl": 0.3408850164851174, "learning_rate": 1e-06, "loss": -0.1278, "num_tokens": 70896.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 117.0, "completions/max_terminated_length": 117.0, "completions/mean_length": 82.125, "completions/mean_terminated_length": 93.85714285714286, "completions/min_length": 0.0, "completions/min_terminated_length": 65.0, "epoch": 0.576, "format_failures": 2.0, "grad_norm": 1.876581072807312, "kl": 0.0260943416506052, "learning_rate": 1e-06, "loss": -0.053, "num_tokens": 78128.0, "reward": 0.4166666865348816, "reward_std": 0.49601587653160095, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 106.0, "completions/max_terminated_length": 106.0, "completions/mean_length": 59.125, "completions/mean_terminated_length": 67.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 0.64, "format_failures": 1.0, "grad_norm": 1.4804662466049194, "kl": 0.17110479215625674, "learning_rate": 1e-06, "loss": -0.0096, "num_tokens": 83696.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 90.625, "completions/mean_terminated_length": 103.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 0.704, "format_failures": 0.0, "grad_norm": 1.5350069999694824, "kl": 0.48000563448294997, "learning_rate": 1e-06, "loss": -0.0756, "num_tokens": 92216.0, "reward": 0.375, "reward_std": 0.4154745042324066, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 66.125, "completions/mean_terminated_length": 75.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 55.0, "epoch": 0.768, "format_failures": 0.0, "grad_norm": 7.105235576629639, "kl": 0.25097161275334656, "learning_rate": 1e-06, "loss": 0.1211, "num_tokens": 101288.0, "reward": 0.25, "reward_std": 0.38832157850265503, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 41.375, "completions/mean_terminated_length": 47.285714285714285, "completions/min_length": 0.0, "completions/min_terminated_length": 38.0, "epoch": 0.832, "format_failures": 1.0, "grad_norm": 8.552057266235352, "kl": 0.887442918960005, "learning_rate": 1e-06, "loss": 0.6279, "num_tokens": 108296.0, "reward": 0.625, "reward_std": 0.4520675837993622, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 169.125, "completions/mean_terminated_length": 193.28571428571428, "completions/min_length": 0.0, "completions/min_terminated_length": 73.0, "epoch": 0.896, "format_failures": 0.0, "grad_norm": 3.0173394680023193, "kl": 0.7231281753629446, "learning_rate": 1e-06, "loss": 0.0056, "num_tokens": 123336.0, "reward": 0.0535714291036129, "reward_std": 0.15152288973331451, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 89.375, "completions/mean_terminated_length": 102.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 47.0, "epoch": 0.96, "format_failures": 0.0, "grad_norm": 4.813839912414551, "kl": 1.1184300668537617, "learning_rate": 1e-06, "loss": -0.231, "num_tokens": 136000.0, "reward": -0.125, "reward_std": 0.3535533845424652, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 206.0, "completions/max_terminated_length": 206.0, "completions/mean_length": 131.875, "completions/mean_terminated_length": 150.71428571428572, "completions/min_length": 0.0, "completions/min_terminated_length": 20.0, "epoch": 1.0, "format_failures": 0.0, "grad_norm": 0.23264528810977936, "kl": 0.09705191291868687, "learning_rate": 1e-06, "loss": 0.0006, "num_tokens": 146704.0, "reward": 0.0, "reward_std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 702.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 300.875, "completions/mean_terminated_length": 343.85714285714283, "completions/min_length": 0.0, "completions/min_terminated_length": 163.0, "epoch": 1.064, "format_failures": 0.0, "grad_norm": 1.7797789573669434, "kl": 0.031833621207624674, "learning_rate": 1e-06, "loss": -0.0936, "num_tokens": 161184.0, "reward": 0.32083332538604736, "reward_std": 0.4521333873271942, "step": 17 }, { "clip_ratio/high_max": 0.000908265239559114, "clip_ratio/high_mean": 0.000908265239559114, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000908265239559114, "completions/clipped_ratio": 0.125, "completions/max_length": 73.0, "completions/max_terminated_length": 73.0, "completions/mean_length": 51.75, "completions/mean_terminated_length": 59.142857142857146, "completions/min_length": 0.0, "completions/min_terminated_length": 20.0, "epoch": 1.1280000000000001, "format_failures": 0.0, "grad_norm": 1.9153517484664917, "kl": 0.045906367246061563, "learning_rate": 1e-06, "loss": 0.2096, "num_tokens": 165496.0, "reward": 0.0, "reward_std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 354.0, "completions/max_terminated_length": 354.0, "completions/mean_length": 242.375, "completions/mean_terminated_length": 277.0, "completions/min_length": 0.0, "completions/min_terminated_length": 136.0, "epoch": 1.192, "format_failures": 1.0, "grad_norm": 0.8472970724105835, "kl": 0.020359830697998405, "learning_rate": 1e-06, "loss": -0.0893, "num_tokens": 174512.0, "reward": 0.5625, "reward_std": 0.4955156147480011, "step": 19 }, { "clip_ratio/high_max": 0.00041345093632116914, "clip_ratio/high_mean": 0.00041345093632116914, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00041345093632116914, "completions/clipped_ratio": 0.125, "completions/max_length": 115.0, "completions/max_terminated_length": 115.0, "completions/mean_length": 80.0, "completions/mean_terminated_length": 91.42857142857143, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 1.256, "format_failures": 0.0, "grad_norm": 4.18609619140625, "kl": 0.037674687220714986, "learning_rate": 1e-06, "loss": 0.5542, "num_tokens": 182664.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0023937864461913705, "clip_ratio/low_min": 0.0023937864461913705, "clip_ratio/region_mean": 0.0023937864461913705, "completions/clipped_ratio": 0.125, "completions/max_length": 189.0, "completions/max_terminated_length": 189.0, "completions/mean_length": 90.375, "completions/mean_terminated_length": 103.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 22.0, "epoch": 1.32, "format_failures": 0.0, "grad_norm": 5.047491550445557, "kl": 0.262689758092165, "learning_rate": 1e-06, "loss": -0.5506, "num_tokens": 189752.0, "reward": 0.03125, "reward_std": 0.0883883461356163, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00015489467477891594, "clip_ratio/low_min": 0.00015489467477891594, "clip_ratio/region_mean": 0.00015489467477891594, "completions/clipped_ratio": 0.125, "completions/max_length": 220.0, "completions/max_terminated_length": 220.0, "completions/mean_length": 110.25, "completions/mean_terminated_length": 126.0, "completions/min_length": 0.0, "completions/min_terminated_length": 90.0, "epoch": 1.384, "format_failures": 1.0, "grad_norm": 1.870309591293335, "kl": 0.15177738456986845, "learning_rate": 1e-06, "loss": 0.0972, "num_tokens": 197664.0, "reward": 0.36250001192092896, "reward_std": 0.4405759274959564, "step": 22 }, { "clip_ratio/high_max": 0.0007937598857097328, "clip_ratio/high_mean": 0.0007937598857097328, "clip_ratio/low_mean": 0.00033377838553860784, "clip_ratio/low_min": 0.00033377838553860784, "clip_ratio/region_mean": 0.0011275382712483406, "completions/clipped_ratio": 0.125, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 174.75, "completions/mean_terminated_length": 199.71428571428572, "completions/min_length": 0.0, "completions/min_terminated_length": 82.0, "epoch": 1.448, "format_failures": 0.0, "grad_norm": 7.092167377471924, "kl": 0.34660289715975523, "learning_rate": 1e-06, "loss": 0.8114, "num_tokens": 210072.0, "reward": 0.25275737047195435, "reward_std": 0.3869698941707611, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0025806452613323927, "clip_ratio/low_min": 0.0025806452613323927, "clip_ratio/region_mean": 0.0025806452613323927, "completions/clipped_ratio": 0.125, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 233.125, "completions/mean_terminated_length": 266.42857142857144, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, "epoch": 1.512, "format_failures": 0.0, "grad_norm": 8.143402099609375, "kl": 0.3320934564108029, "learning_rate": 1e-06, "loss": -0.9434, "num_tokens": 224336.0, "reward": 0.5197916626930237, "reward_std": 0.43734264373779297, "step": 24 }, { "clip_ratio/high_max": 0.0009831460192799568, "clip_ratio/high_mean": 0.0009831460192799568, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009831460192799568, "completions/clipped_ratio": 0.25, "completions/max_length": 169.0, "completions/max_terminated_length": 169.0, "completions/mean_length": 75.0, "completions/mean_terminated_length": 100.0, "completions/min_length": 0.0, "completions/min_terminated_length": 43.0, "epoch": 1.576, "format_failures": 0.0, "grad_norm": 12.648838996887207, "kl": 0.08752637438010424, "learning_rate": 1e-06, "loss": -1.3126, "num_tokens": 237344.0, "reward": 0.3630952537059784, "reward_std": 0.3474069833755493, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 60.25, "completions/mean_terminated_length": 80.33333333333333, "completions/min_length": 0.0, "completions/min_terminated_length": 76.0, "epoch": 1.6400000000000001, "format_failures": 0.0, "grad_norm": 22.935155868530273, "kl": 0.040498227812349796, "learning_rate": 1e-06, "loss": 2.0449, "num_tokens": 243264.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00016914748994167894, "clip_ratio/low_min": 0.00016914748994167894, "clip_ratio/region_mean": 0.00016914748994167894, "completions/clipped_ratio": 0.125, "completions/max_length": 114.0, "completions/max_terminated_length": 114.0, "completions/mean_length": 68.5, "completions/mean_terminated_length": 78.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 20.0, "epoch": 1.704, "format_failures": 0.0, "grad_norm": 6.5060811042785645, "kl": 0.05175229045562446, "learning_rate": 1e-06, "loss": -0.231, "num_tokens": 248064.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 120.0, "completions/max_terminated_length": 120.0, "completions/mean_length": 74.375, "completions/mean_terminated_length": 85.0, "completions/min_length": 0.0, "completions/min_terminated_length": 14.0, "epoch": 1.768, "format_failures": 2.0, "grad_norm": 5.602163791656494, "kl": 0.16080649592913687, "learning_rate": 1e-06, "loss": 0.4472, "num_tokens": 255520.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003766579437069595, "clip_ratio/low_min": 0.003766579437069595, "clip_ratio/region_mean": 0.003766579437069595, "completions/clipped_ratio": 0.125, "completions/max_length": 854.0, "completions/max_terminated_length": 854.0, "completions/mean_length": 186.5, "completions/mean_terminated_length": 213.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 26.0, "epoch": 1.8319999999999999, "format_failures": 1.0, "grad_norm": 8.381872177124023, "kl": 0.047735671047121286, "learning_rate": 1e-06, "loss": -1.0193, "num_tokens": 268512.0, "reward": 0.109375, "reward_std": 0.30935922265052795, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 183.0, "completions/max_terminated_length": 183.0, "completions/mean_length": 116.75, "completions/mean_terminated_length": 133.42857142857142, "completions/min_length": 0.0, "completions/min_terminated_length": 15.0, "epoch": 1.896, "format_failures": 0.0, "grad_norm": 5.668828010559082, "kl": 0.038008465664461255, "learning_rate": 1e-06, "loss": -0.7992, "num_tokens": 282112.0, "reward": 0.3333333432674408, "reward_std": 0.4714045226573944, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 7.35726862330921e-05, "clip_ratio/low_min": 7.35726862330921e-05, "clip_ratio/region_mean": 7.35726862330921e-05, "completions/clipped_ratio": 0.125, "completions/max_length": 126.0, "completions/max_terminated_length": 126.0, "completions/mean_length": 78.125, "completions/mean_terminated_length": 89.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 68.0, "epoch": 1.96, "format_failures": 1.0, "grad_norm": 11.598993301391602, "kl": 0.08647240558639169, "learning_rate": 1e-06, "loss": 1.9553, "num_tokens": 289264.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002619047649204731, "clip_ratio/low_min": 0.002619047649204731, "clip_ratio/region_mean": 0.002619047649204731, "completions/clipped_ratio": 0.125, "completions/max_length": 257.0, "completions/max_terminated_length": 257.0, "completions/mean_length": 108.25, "completions/mean_terminated_length": 123.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, "epoch": 2.0, "format_failures": 0.0, "grad_norm": 5.694812297821045, "kl": 0.039789453893899915, "learning_rate": 1e-06, "loss": -0.1238, "num_tokens": 302080.0, "reward": 0.574999988079071, "reward_std": 0.41661903262138367, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004414635943248868, "clip_ratio/low_min": 0.004414635943248868, "clip_ratio/region_mean": 0.004414635943248868, "completions/clipped_ratio": 0.125, "completions/max_length": 130.0, "completions/max_terminated_length": 130.0, "completions/mean_length": 65.75, "completions/mean_terminated_length": 75.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 2.064, "format_failures": 0.0, "grad_norm": 5.186154842376709, "kl": 0.050242609810084105, "learning_rate": 1e-06, "loss": 0.3273, "num_tokens": 308472.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 33 }, { "clip_ratio/high_max": 0.0015756364446133375, "clip_ratio/high_mean": 0.0015756364446133375, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015756364446133375, "completions/clipped_ratio": 0.25, "completions/max_length": 355.0, "completions/max_terminated_length": 355.0, "completions/mean_length": 152.375, "completions/mean_terminated_length": 203.16666666666666, "completions/min_length": 0.0, "completions/min_terminated_length": 57.0, "epoch": 2.128, "format_failures": 0.0, "grad_norm": 6.178646564483643, "kl": 0.04819304798729718, "learning_rate": 1e-06, "loss": 0.0732, "num_tokens": 321144.0, "reward": 0.0, "reward_std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 70.25, "completions/mean_terminated_length": 80.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 43.0, "epoch": 2.192, "format_failures": 0.0, "grad_norm": 9.679683685302734, "kl": 0.04483591788448393, "learning_rate": 1e-06, "loss": -0.7427, "num_tokens": 332568.0, "reward": 0.5416666865348816, "reward_std": 0.5019802451133728, "step": 35 }, { "clip_ratio/high_max": 0.0005470459582284093, "clip_ratio/high_mean": 0.0005470459582284093, "clip_ratio/low_mean": 0.0024912295630201697, "clip_ratio/low_min": 0.0024912295630201697, "clip_ratio/region_mean": 0.003038275521248579, "completions/clipped_ratio": 0.125, "completions/max_length": 123.0, "completions/max_terminated_length": 123.0, "completions/mean_length": 61.875, "completions/mean_terminated_length": 70.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 40.0, "epoch": 2.2560000000000002, "format_failures": 0.0, "grad_norm": 20.632793426513672, "kl": 0.06712129758670926, "learning_rate": 1e-06, "loss": 2.5061, "num_tokens": 338352.0, "reward": 0.2708333432674408, "reward_std": 0.39778655767440796, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0031341375433839858, "clip_ratio/low_min": 0.0031341375433839858, "clip_ratio/region_mean": 0.0031341375433839858, "completions/clipped_ratio": 0.25, "completions/max_length": 201.0, "completions/max_terminated_length": 201.0, "completions/mean_length": 97.25, "completions/mean_terminated_length": 129.66666666666666, "completions/min_length": 0.0, "completions/min_terminated_length": 93.0, "epoch": 2.32, "format_failures": 0.0, "grad_norm": 8.325549125671387, "kl": 0.07476615975610912, "learning_rate": 1e-06, "loss": 0.0014, "num_tokens": 351528.0, "reward": 0.4439394176006317, "reward_std": 0.215702623128891, "step": 37 }, { "clip_ratio/high_max": 0.0006686007836833596, "clip_ratio/high_mean": 0.0006686007836833596, "clip_ratio/low_mean": 0.004799673450179398, "clip_ratio/low_min": 0.004799673450179398, "clip_ratio/region_mean": 0.005468274233862758, "completions/clipped_ratio": 0.125, "completions/max_length": 162.0, "completions/max_terminated_length": 162.0, "completions/mean_length": 83.25, "completions/mean_terminated_length": 95.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 2.384, "format_failures": 0.0, "grad_norm": 8.579444885253906, "kl": 0.44708020030520856, "learning_rate": 1e-06, "loss": 0.7024, "num_tokens": 357048.0, "reward": 0.44583332538604736, "reward_std": 0.4876042604446411, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0014224655460566282, "clip_ratio/low_min": 0.0014224655460566282, "clip_ratio/region_mean": 0.0014224655460566282, "completions/clipped_ratio": 0.125, "completions/max_length": 168.0, "completions/max_terminated_length": 168.0, "completions/mean_length": 105.5, "completions/mean_terminated_length": 120.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 81.0, "epoch": 2.448, "format_failures": 1.0, "grad_norm": 8.134405136108398, "kl": 0.4579888880252838, "learning_rate": 1e-06, "loss": -0.7953, "num_tokens": 370592.0, "reward": 0.0, "reward_std": 0.0, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 152.0, "completions/max_terminated_length": 152.0, "completions/mean_length": 101.0, "completions/mean_terminated_length": 115.42857142857143, "completions/min_length": 0.0, "completions/min_terminated_length": 58.0, "epoch": 2.512, "format_failures": 0.0, "grad_norm": 6.998343467712402, "kl": 0.059122598730027676, "learning_rate": 1e-06, "loss": -0.4243, "num_tokens": 387824.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 40 }, { "clip_ratio/high_max": 0.002086994703859091, "clip_ratio/high_mean": 0.002086994703859091, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002086994703859091, "completions/clipped_ratio": 0.125, "completions/max_length": 249.0, "completions/max_terminated_length": 249.0, "completions/mean_length": 98.5, "completions/mean_terminated_length": 112.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 65.0, "epoch": 2.576, "format_failures": 0.0, "grad_norm": 72.67411804199219, "kl": 0.05187072162516415, "learning_rate": 1e-06, "loss": 0.3373, "num_tokens": 403648.0, "reward": 0.5208333730697632, "reward_std": 0.39276695251464844, "step": 41 }, { "clip_ratio/high_max": 0.0003625637182267383, "clip_ratio/high_mean": 0.0003625637182267383, "clip_ratio/low_mean": 0.0002896586374845356, "clip_ratio/low_min": 0.0002896586374845356, "clip_ratio/region_mean": 0.0006522223557112738, "completions/clipped_ratio": 0.125, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 73.25, "completions/mean_terminated_length": 83.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 61.0, "epoch": 2.64, "format_failures": 0.0, "grad_norm": 14.554330825805664, "kl": 0.15414534136652946, "learning_rate": 1e-06, "loss": 2.2367, "num_tokens": 409704.0, "reward": 0.4583333432674408, "reward_std": 0.501980185508728, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0007432290876749903, "clip_ratio/low_min": 0.0007432290876749903, "clip_ratio/region_mean": 0.0007432290876749903, "completions/clipped_ratio": 0.125, "completions/max_length": 174.0, "completions/max_terminated_length": 174.0, "completions/mean_length": 109.375, "completions/mean_terminated_length": 125.0, "completions/min_length": 0.0, "completions/min_terminated_length": 29.0, "epoch": 2.7039999999999997, "format_failures": 0.0, "grad_norm": 53.890411376953125, "kl": 1.3919735243543983, "learning_rate": 1e-06, "loss": -0.6229, "num_tokens": 417312.0, "reward": 0.3125, "reward_std": 0.2912411689758301, "step": 43 }, { "clip_ratio/high_max": 0.00039795115299057215, "clip_ratio/high_mean": 0.00039795115299057215, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039795115299057215, "completions/clipped_ratio": 0.25, "completions/max_length": 50.0, "completions/max_terminated_length": 50.0, "completions/mean_length": 36.375, "completions/mean_terminated_length": 48.5, "completions/min_length": 0.0, "completions/min_terminated_length": 45.0, "epoch": 2.768, "format_failures": 0.0, "grad_norm": 5.3910932540893555, "kl": 0.1744281006976962, "learning_rate": 1e-06, "loss": -0.1856, "num_tokens": 422800.0, "reward": 0.0, "reward_std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 68.0, "completions/max_terminated_length": 68.0, "completions/mean_length": 49.0, "completions/mean_terminated_length": 56.0, "completions/min_length": 0.0, "completions/min_terminated_length": 42.0, "epoch": 2.832, "format_failures": 0.0, "grad_norm": 0.309182733297348, "kl": 0.09828702360391617, "learning_rate": 1e-06, "loss": 0.0019, "num_tokens": 431080.0, "reward": 0.0, "reward_std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 56.714285714285715, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 2.896, "format_failures": 1.0, "grad_norm": 16.60660743713379, "kl": 0.11247169971466064, "learning_rate": 1e-06, "loss": -1.7005, "num_tokens": 439296.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 134.0, "completions/max_terminated_length": 134.0, "completions/mean_length": 68.75, "completions/mean_terminated_length": 78.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 46.0, "epoch": 2.96, "format_failures": 0.0, "grad_norm": 31.673078536987305, "kl": 2.0126035660505295, "learning_rate": 1e-06, "loss": 0.8165, "num_tokens": 449224.0, "reward": 0.6875, "reward_std": 0.45806270837783813, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0006863656220957637, "clip_ratio/low_min": 0.0006863656220957637, "clip_ratio/region_mean": 0.0006863656220957637, "epoch": 3.0, "grad_norm": 6.059280872344971, "kl": 0.1403810739517212, "learning_rate": 1e-06, "loss": 0.5743, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00021477663540281355, "clip_ratio/low_min": 0.00021477663540281355, "clip_ratio/region_mean": 0.00021477663540281355, "completions/clipped_ratio": 0.125, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 80.25, "completions/mean_terminated_length": 91.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 22.0, "epoch": 3.064, "format_failures": 0.0, "grad_norm": 24.96416664123535, "kl": 4.326897906605154, "learning_rate": 1e-06, "loss": 0.25, "num_tokens": 458280.0, "reward": 0.0, "reward_std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 158.0, "completions/max_terminated_length": 158.0, "completions/mean_length": 73.625, "completions/mean_terminated_length": 84.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 3.128, "format_failures": 0.0, "grad_norm": 3.976156711578369, "kl": 0.1405428946018219, "learning_rate": 1e-06, "loss": -0.6803, "num_tokens": 465592.0, "reward": 0.3125, "reward_std": 0.38253021240234375, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 52.0, "completions/max_terminated_length": 52.0, "completions/mean_length": 40.375, "completions/mean_terminated_length": 46.142857142857146, "completions/min_length": 0.0, "completions/min_terminated_length": 21.0, "epoch": 3.192, "format_failures": 0.0, "grad_norm": 767.0111694335938, "kl": 30.081211734563112, "learning_rate": 1e-06, "loss": 1.7347, "num_tokens": 470880.0, "reward": 0.5833333730697632, "reward_std": 0.49601587653160095, "step": 51 }, { "clip_ratio/high_max": 0.00039308174746111035, "clip_ratio/high_mean": 0.00039308174746111035, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00039308174746111035, "completions/clipped_ratio": 0.125, "completions/max_length": 108.0, "completions/max_terminated_length": 108.0, "completions/mean_length": 56.25, "completions/mean_terminated_length": 64.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 25.0, "epoch": 3.2560000000000002, "format_failures": 0.0, "grad_norm": 10.541399955749512, "kl": 0.2744437651708722, "learning_rate": 1e-06, "loss": -1.0422, "num_tokens": 479136.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 85.0, "completions/max_terminated_length": 85.0, "completions/mean_length": 61.75, "completions/mean_terminated_length": 70.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 60.0, "epoch": 3.32, "format_failures": 0.0, "grad_norm": 3.463606595993042, "kl": 0.10342029482126236, "learning_rate": 1e-06, "loss": 0.4301, "num_tokens": 486000.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 53 }, { "clip_ratio/high_max": 0.00014585764438379556, "clip_ratio/high_mean": 0.00014585764438379556, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00014585764438379556, "completions/clipped_ratio": 0.125, "completions/max_length": 287.0, "completions/max_terminated_length": 287.0, "completions/mean_length": 177.625, "completions/mean_terminated_length": 203.0, "completions/min_length": 0.0, "completions/min_terminated_length": 71.0, "epoch": 3.384, "format_failures": 0.0, "grad_norm": 3.68437123298645, "kl": 0.10048098210245371, "learning_rate": 1e-06, "loss": -0.7393, "num_tokens": 497136.0, "reward": 0.5583333373069763, "reward_std": 0.4766783118247986, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 101.0, "completions/max_terminated_length": 101.0, "completions/mean_length": 72.25, "completions/mean_terminated_length": 82.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 3.448, "format_failures": 0.0, "grad_norm": 7.700087070465088, "kl": 0.17961894627660513, "learning_rate": 1e-06, "loss": 1.025, "num_tokens": 502688.0, "reward": 0.44999998807907104, "reward_std": 0.4985693693161011, "step": 55 }, { "clip_ratio/high_max": 0.00020938023226335645, "clip_ratio/high_mean": 0.00020938023226335645, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00020938023226335645, "completions/clipped_ratio": 0.125, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 114.125, "completions/mean_terminated_length": 130.42857142857142, "completions/min_length": 0.0, "completions/min_terminated_length": 69.0, "epoch": 3.512, "format_failures": 1.0, "grad_norm": 6.477407455444336, "kl": 0.18405211344361305, "learning_rate": 1e-06, "loss": -0.815, "num_tokens": 509816.0, "reward": 0.28125, "reward_std": 0.33905068039894104, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 217.0, "completions/max_terminated_length": 217.0, "completions/mean_length": 144.375, "completions/mean_terminated_length": 165.0, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 3.576, "format_failures": 0.0, "grad_norm": 3.1489181518554688, "kl": 0.18948577530682087, "learning_rate": 1e-06, "loss": 0.1775, "num_tokens": 518072.0, "reward": 0.6666666865348816, "reward_std": 0.4364357590675354, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 246.0, "completions/max_terminated_length": 246.0, "completions/mean_length": 125.125, "completions/mean_terminated_length": 143.0, "completions/min_length": 0.0, "completions/min_terminated_length": 89.0, "epoch": 3.64, "format_failures": 0.0, "grad_norm": 2.2657573223114014, "kl": 0.1387784667313099, "learning_rate": 1e-06, "loss": 0.0604, "num_tokens": 531728.0, "reward": 0.25, "reward_std": 0.4629100561141968, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00014551804633811116, "clip_ratio/low_min": 0.00014551804633811116, "clip_ratio/region_mean": 0.00014551804633811116, "completions/clipped_ratio": 0.125, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 56.5, "completions/mean_terminated_length": 64.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 52.0, "epoch": 3.7039999999999997, "format_failures": 0.0, "grad_norm": 2.2649238109588623, "kl": 0.28891171142458916, "learning_rate": 1e-06, "loss": 0.2216, "num_tokens": 536768.0, "reward": 0.0, "reward_std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 227.0, "completions/max_terminated_length": 227.0, "completions/mean_length": 91.5, "completions/mean_terminated_length": 104.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 3.768, "format_failures": 0.0, "grad_norm": 3.3132457733154297, "kl": 0.096153249964118, "learning_rate": 1e-06, "loss": 0.3965, "num_tokens": 546784.0, "reward": 0.1875, "reward_std": 0.2587745785713196, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 143.0, "completions/max_terminated_length": 143.0, "completions/mean_length": 90.75, "completions/mean_terminated_length": 103.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 71.0, "epoch": 3.832, "format_failures": 0.0, "grad_norm": 4.948695182800293, "kl": 0.1259058197028935, "learning_rate": 1e-06, "loss": -0.4309, "num_tokens": 559872.0, "reward": 0.375, "reward_std": 0.5175491571426392, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0003612716682255268, "clip_ratio/low_min": 0.0003612716682255268, "clip_ratio/region_mean": 0.0003612716682255268, "completions/clipped_ratio": 0.125, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 62.125, "completions/mean_terminated_length": 71.0, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 3.896, "format_failures": 0.0, "grad_norm": 8.347101211547852, "kl": 0.8767695324495435, "learning_rate": 1e-06, "loss": -0.0695, "num_tokens": 565032.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 138.0, "completions/max_terminated_length": 138.0, "completions/mean_length": 84.625, "completions/mean_terminated_length": 96.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 47.0, "epoch": 3.96, "format_failures": 0.0, "grad_norm": 5.449214935302734, "kl": 0.26848094910383224, "learning_rate": 1e-06, "loss": -0.0353, "num_tokens": 572664.0, "reward": 0.4464285671710968, "reward_std": 0.49744242429733276, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 136.75, "completions/mean_terminated_length": 156.28571428571428, "completions/min_length": 0.0, "completions/min_terminated_length": 54.0, "epoch": 4.0, "format_failures": 0.0, "grad_norm": 5.11106014251709, "kl": 0.17550407350063324, "learning_rate": 1e-06, "loss": 0.7516, "num_tokens": 586696.0, "reward": 0.6588234901428223, "reward_std": 0.44546374678611755, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 80.0, "completions/max_terminated_length": 80.0, "completions/mean_length": 40.125, "completions/mean_terminated_length": 45.857142857142854, "completions/min_length": 0.0, "completions/min_terminated_length": 26.0, "epoch": 4.064, "format_failures": 1.0, "grad_norm": 9.72535514831543, "kl": 0.20796778332442045, "learning_rate": 1e-06, "loss": 0.8956, "num_tokens": 591600.0, "reward": 0.30000001192092896, "reward_std": 0.4535573422908783, "step": 65 }, { "clip_ratio/high_max": 0.0013570611481554806, "clip_ratio/high_mean": 0.0013570611481554806, "clip_ratio/low_mean": 0.012927594594657421, "clip_ratio/low_min": 0.012927594594657421, "clip_ratio/region_mean": 0.014284655742812902, "completions/clipped_ratio": 0.125, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 55.25, "completions/mean_terminated_length": 63.142857142857146, "completions/min_length": 0.0, "completions/min_terminated_length": 48.0, "epoch": 4.128, "format_failures": 0.0, "grad_norm": 15.252068519592285, "kl": 0.22740534879267216, "learning_rate": 1e-06, "loss": 0.8349, "num_tokens": 596880.0, "reward": 0.8125, "reward_std": 0.3720118999481201, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 110.0, "completions/max_terminated_length": 110.0, "completions/mean_length": 55.0, "completions/mean_terminated_length": 88.0, "completions/min_length": 0.0, "completions/min_terminated_length": 58.0, "epoch": 4.192, "format_failures": 1.0, "grad_norm": 16.80088233947754, "kl": 0.31182049214839935, "learning_rate": 1e-06, "loss": 0.693, "num_tokens": 603344.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004188144113868475, "clip_ratio/low_min": 0.004188144113868475, "clip_ratio/region_mean": 0.004188144113868475, "completions/clipped_ratio": 0.125, "completions/max_length": 161.0, "completions/max_terminated_length": 161.0, "completions/mean_length": 100.25, "completions/mean_terminated_length": 114.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 84.0, "epoch": 4.256, "format_failures": 0.0, "grad_norm": 13.643096923828125, "kl": 0.11746187414973974, "learning_rate": 1e-06, "loss": -1.335, "num_tokens": 610184.0, "reward": 0.5208333730697632, "reward_std": 0.46664538979530334, "step": 68 }, { "clip_ratio/high_max": 0.00018115942657459527, "clip_ratio/high_mean": 0.00018115942657459527, "clip_ratio/low_mean": 0.00037650601007044315, "clip_ratio/low_min": 0.00037650601007044315, "clip_ratio/region_mean": 0.0005576654366450384, "completions/clipped_ratio": 0.125, "completions/max_length": 199.0, "completions/max_terminated_length": 199.0, "completions/mean_length": 72.875, "completions/mean_terminated_length": 83.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 46.0, "epoch": 4.32, "format_failures": 0.0, "grad_norm": 5.553096771240234, "kl": 0.19322836678475142, "learning_rate": 1e-06, "loss": -0.6568, "num_tokens": 619440.0, "reward": 0.2916666865348816, "reward_std": 0.4520675837993622, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 61.375, "completions/mean_terminated_length": 70.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 17.0, "epoch": 4.384, "format_failures": 0.0, "grad_norm": 11.096977233886719, "kl": 0.205445297062397, "learning_rate": 1e-06, "loss": -0.9614, "num_tokens": 626024.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 70 }, { "clip_ratio/high_max": 0.0001707650226308033, "clip_ratio/high_mean": 0.0001707650226308033, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001707650226308033, "completions/clipped_ratio": 0.125, "completions/max_length": 113.0, "completions/max_terminated_length": 113.0, "completions/mean_length": 59.25, "completions/mean_terminated_length": 67.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 30.0, "epoch": 4.448, "format_failures": 0.0, "grad_norm": 13.960062026977539, "kl": 0.2419998161494732, "learning_rate": 1e-06, "loss": 1.6815, "num_tokens": 631704.0, "reward": 0.30000001192092896, "reward_std": 0.4535573720932007, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 236.0, "completions/max_terminated_length": 236.0, "completions/mean_length": 119.625, "completions/mean_terminated_length": 136.71428571428572, "completions/min_length": 0.0, "completions/min_terminated_length": 66.0, "epoch": 4.5120000000000005, "format_failures": 0.0, "grad_norm": 4.790798187255859, "kl": 0.14366307947784662, "learning_rate": 1e-06, "loss": -0.622, "num_tokens": 640736.0, "reward": 0.6041666269302368, "reward_std": 0.5034602880477905, "step": 72 }, { "clip_ratio/high_max": 0.001402775407768786, "clip_ratio/high_mean": 0.001402775407768786, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001402775407768786, "completions/clipped_ratio": 0.125, "completions/max_length": 163.0, "completions/max_terminated_length": 163.0, "completions/mean_length": 89.875, "completions/mean_terminated_length": 102.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 24.0, "epoch": 4.576, "format_failures": 0.0, "grad_norm": 24.572803497314453, "kl": 2.3969106171280146, "learning_rate": 1e-06, "loss": 0.7312, "num_tokens": 649920.0, "reward": 0.0416666679084301, "reward_std": 0.1178511381149292, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001328374055447057, "clip_ratio/low_min": 0.0001328374055447057, "clip_ratio/region_mean": 0.0001328374055447057, "completions/clipped_ratio": 0.125, "completions/max_length": 231.0, "completions/max_terminated_length": 231.0, "completions/mean_length": 118.0, "completions/mean_terminated_length": 134.85714285714286, "completions/min_length": 0.0, "completions/min_terminated_length": 101.0, "epoch": 4.64, "format_failures": 0.0, "grad_norm": 11.096585273742676, "kl": 0.14362134877592325, "learning_rate": 1e-06, "loss": 1.6126, "num_tokens": 660512.0, "reward": 0.59375, "reward_std": 0.4988826811313629, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 277.0, "completions/max_terminated_length": 277.0, "completions/mean_length": 171.625, "completions/mean_terminated_length": 196.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 42.0, "epoch": 4.704, "format_failures": 0.0, "grad_norm": 10.942404747009277, "kl": 0.09571220818907022, "learning_rate": 1e-06, "loss": -1.9221, "num_tokens": 671728.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 83.0, "completions/max_terminated_length": 83.0, "completions/mean_length": 51.25, "completions/mean_terminated_length": 58.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 33.0, "epoch": 4.768, "format_failures": 0.0, "grad_norm": 17.237686157226562, "kl": 0.1505587575957179, "learning_rate": 1e-06, "loss": -0.4979, "num_tokens": 678808.0, "reward": 0.375, "reward_std": 0.5175491571426392, "step": 76 }, { "clip_ratio/high_max": 0.0069027612917125225, "clip_ratio/high_mean": 0.0069027612917125225, "clip_ratio/low_mean": 0.00044653778604697436, "clip_ratio/low_min": 0.00044653778604697436, "clip_ratio/region_mean": 0.007349299077759497, "completions/clipped_ratio": 0.125, "completions/max_length": 95.0, "completions/max_terminated_length": 95.0, "completions/mean_length": 63.25, "completions/mean_terminated_length": 72.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 42.0, "epoch": 4.832, "format_failures": 0.0, "grad_norm": 48.175540924072266, "kl": 0.12417041137814522, "learning_rate": 1e-06, "loss": -0.3992, "num_tokens": 685376.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 105.0, "completions/max_terminated_length": 105.0, "completions/mean_length": 66.625, "completions/mean_terminated_length": 76.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 40.0, "epoch": 4.896, "format_failures": 0.0, "grad_norm": 21.25414276123047, "kl": 0.3155105458572507, "learning_rate": 1e-06, "loss": 2.8688, "num_tokens": 691504.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 428.0, "completions/max_terminated_length": 428.0, "completions/mean_length": 160.875, "completions/mean_terminated_length": 183.85714285714286, "completions/min_length": 0.0, "completions/min_terminated_length": 55.0, "epoch": 4.96, "format_failures": 0.0, "grad_norm": 11.223791122436523, "kl": 0.1855175606906414, "learning_rate": 1e-06, "loss": -1.5493, "num_tokens": 700192.0, "reward": 0.3015109896659851, "reward_std": 0.42723536491394043, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 87.875, "completions/mean_terminated_length": 100.42857142857143, "completions/min_length": 0.0, "completions/min_terminated_length": 8.0, "epoch": 5.0, "format_failures": 0.0, "grad_norm": 5.051183700561523, "kl": 0.6679443523287774, "learning_rate": 1e-06, "loss": 0.6727, "num_tokens": 715360.0, "reward": 0.3965517282485962, "reward_std": 0.503090500831604, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0012536601134343073, "clip_ratio/low_min": 0.0012536601134343073, "clip_ratio/region_mean": 0.0012536601134343073, "completions/clipped_ratio": 0.125, "completions/max_length": 295.0, "completions/max_terminated_length": 295.0, "completions/mean_length": 192.25, "completions/mean_terminated_length": 219.71428571428572, "completions/min_length": 0.0, "completions/min_terminated_length": 51.0, "epoch": 5.064, "format_failures": 0.0, "grad_norm": 6.032639503479004, "kl": 0.1344920275732875, "learning_rate": 1e-06, "loss": -0.8074, "num_tokens": 725176.0, "reward": 0.5416666865348816, "reward_std": 0.46929529309272766, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 57.0, "completions/max_terminated_length": 57.0, "completions/mean_length": 44.625, "completions/mean_terminated_length": 51.0, "completions/min_length": 0.0, "completions/min_terminated_length": 34.0, "epoch": 5.128, "format_failures": 0.0, "grad_norm": 9.636035919189453, "kl": 0.15830809529870749, "learning_rate": 1e-06, "loss": 0.9562, "num_tokens": 729984.0, "reward": 0.46875, "reward_std": 0.5077524185180664, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.000637518212897703, "clip_ratio/low_min": 0.000637518212897703, "clip_ratio/region_mean": 0.000637518212897703, "completions/clipped_ratio": 0.125, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 63.625, "completions/mean_terminated_length": 72.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 38.0, "epoch": 5.192, "format_failures": 0.0, "grad_norm": 7.9917144775390625, "kl": 0.1429830752313137, "learning_rate": 1e-06, "loss": -0.1994, "num_tokens": 736312.0, "reward": 0.375, "reward_std": 0.5175491571426392, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.001965374161954969, "clip_ratio/low_min": 0.001965374161954969, "clip_ratio/region_mean": 0.001965374161954969, "completions/clipped_ratio": 0.25, "completions/max_length": 394.0, "completions/max_terminated_length": 394.0, "completions/mean_length": 136.0, "completions/mean_terminated_length": 181.33333333333334, "completions/min_length": 0.0, "completions/min_terminated_length": 10.0, "epoch": 5.256, "format_failures": 0.0, "grad_norm": 8.276095390319824, "kl": 0.28229224402457476, "learning_rate": 1e-06, "loss": -1.1183, "num_tokens": 749648.0, "reward": 0.637499988079071, "reward_std": 0.4405759274959564, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 76.0, "completions/max_terminated_length": 76.0, "completions/mean_length": 56.125, "completions/mean_terminated_length": 64.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 52.0, "epoch": 5.32, "format_failures": 0.0, "grad_norm": 3.3836967945098877, "kl": 0.11320708272978663, "learning_rate": 1e-06, "loss": -0.262, "num_tokens": 755344.0, "reward": 0.1875, "reward_std": 0.3720118999481201, "step": 85 }, { "clip_ratio/high_max": 0.00038880249485373497, "clip_ratio/high_mean": 0.00038880249485373497, "clip_ratio/low_mean": 0.0007896393508417532, "clip_ratio/low_min": 0.0007896393508417532, "clip_ratio/region_mean": 0.0011784418456954882, "completions/clipped_ratio": 0.125, "completions/max_length": 192.0, "completions/max_terminated_length": 192.0, "completions/mean_length": 85.25, "completions/mean_terminated_length": 97.42857142857143, "completions/min_length": 0.0, "completions/min_terminated_length": 56.0, "epoch": 5.384, "format_failures": 0.0, "grad_norm": 6.297000885009766, "kl": 0.7561929021030664, "learning_rate": 1e-06, "loss": 0.5695, "num_tokens": 762432.0, "reward": 0.09375, "reward_std": 0.2651650309562683, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 153.0, "completions/max_terminated_length": 153.0, "completions/mean_length": 102.0, "completions/mean_terminated_length": 116.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 20.0, "epoch": 5.448, "format_failures": 0.0, "grad_norm": 3.523719310760498, "kl": 0.19376599509269, "learning_rate": 1e-06, "loss": -0.6165, "num_tokens": 769192.0, "reward": 0.5833333730697632, "reward_std": 0.49601587653160095, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00017908310110215098, "clip_ratio/low_min": 0.00017908310110215098, "clip_ratio/region_mean": 0.00017908310110215098, "completions/clipped_ratio": 0.25, "completions/max_length": 109.0, "completions/max_terminated_length": 109.0, "completions/mean_length": 81.75, "completions/mean_terminated_length": 109.0, "completions/min_length": 0.0, "completions/min_terminated_length": 109.0, "epoch": 5.5120000000000005, "format_failures": 0.0, "grad_norm": 4.026613235473633, "kl": 0.32431851979345083, "learning_rate": 1e-06, "loss": 0.3918, "num_tokens": 775912.0, "reward": 0.125, "reward_std": 0.3535533845424652, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0009421340801054612, "clip_ratio/low_min": 0.0009421340801054612, "clip_ratio/region_mean": 0.0009421340801054612, "completions/clipped_ratio": 0.125, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 41.625, "completions/mean_terminated_length": 47.57142857142857, "completions/min_length": 0.0, "completions/min_terminated_length": 9.0, "epoch": 5.576, "format_failures": 0.0, "grad_norm": 9.119647979736328, "kl": 0.20464181900024414, "learning_rate": 1e-06, "loss": -0.8791, "num_tokens": 781112.0, "reward": 0.5890151262283325, "reward_std": 0.4705297350883484, "step": 89 }, { "clip_ratio/high_max": 0.00911893486045301, "clip_ratio/high_mean": 0.00911893486045301, "clip_ratio/low_mean": 0.000877421407494694, "clip_ratio/low_min": 0.000877421407494694, "clip_ratio/region_mean": 0.009996356267947704, "completions/clipped_ratio": 0.375, "completions/max_length": 586.0, "completions/max_terminated_length": 586.0, "completions/mean_length": 143.375, "completions/mean_terminated_length": 229.4, "completions/min_length": 0.0, "completions/min_terminated_length": 36.0, "epoch": 5.64, "format_failures": 0.0, "grad_norm": 32.418331146240234, "kl": 0.3150494508445263, "learning_rate": 1e-06, "loss": 0.7753, "num_tokens": 793976.0, "reward": 0.02777777798473835, "reward_std": 0.07856742292642593, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 181.0, "completions/max_terminated_length": 181.0, "completions/mean_length": 85.5, "completions/mean_terminated_length": 97.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 14.0, "epoch": 5.704, "format_failures": 0.0, "grad_norm": 1.2142812013626099, "kl": 0.1678312411531806, "learning_rate": 1e-06, "loss": -0.2285, "num_tokens": 800528.0, "reward": 0.5806547999382019, "reward_std": 0.40297815203666687, "step": 91 }, { "clip_ratio/high_max": 0.008678364916704595, "clip_ratio/high_mean": 0.008678364916704595, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008678364916704595, "completions/clipped_ratio": 0.125, "completions/max_length": 195.0, "completions/max_terminated_length": 195.0, "completions/mean_length": 119.875, "completions/mean_terminated_length": 137.0, "completions/min_length": 0.0, "completions/min_terminated_length": 62.0, "epoch": 5.768, "format_failures": 0.0, "grad_norm": 8.557031631469727, "kl": 0.26262282859534025, "learning_rate": 1e-06, "loss": -0.786, "num_tokens": 811680.0, "reward": 0.375, "reward_std": 0.5175491571426392, "step": 92 }, { "clip_ratio/high_max": 0.0003788308094954118, "clip_ratio/high_mean": 0.0003788308094954118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003788308094954118, "completions/clipped_ratio": 0.125, "completions/max_length": 387.0, "completions/max_terminated_length": 387.0, "completions/mean_length": 156.0, "completions/mean_terminated_length": 178.28571428571428, "completions/min_length": 0.0, "completions/min_terminated_length": 82.0, "epoch": 5.832, "format_failures": 0.0, "grad_norm": 8.220466613769531, "kl": 0.14401236828416586, "learning_rate": 1e-06, "loss": 1.0757, "num_tokens": 824400.0, "reward": 0.3035714328289032, "reward_std": 0.45456862449645996, "step": 93 }, { "clip_ratio/high_max": 0.00015356265066657215, "clip_ratio/high_mean": 0.00015356265066657215, "clip_ratio/low_mean": 0.0011615749244811013, "clip_ratio/low_min": 0.0011615749244811013, "clip_ratio/region_mean": 0.0013151375751476735, "completions/clipped_ratio": 0.125, "completions/max_length": 177.0, "completions/max_terminated_length": 177.0, "completions/mean_length": 77.625, "completions/mean_terminated_length": 88.71428571428571, "completions/min_length": 0.0, "completions/min_terminated_length": 58.0, "epoch": 5.896, "format_failures": 0.0, "grad_norm": 5.23447322845459, "kl": 0.21856553480029106, "learning_rate": 1e-06, "loss": -0.4024, "num_tokens": 838072.0, "reward": 0.359375, "reward_std": 0.469790518283844, "step": 94 }, { "clip_ratio/high_max": 0.002261076238937676, "clip_ratio/high_mean": 0.002261076238937676, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002261076238937676, "completions/clipped_ratio": 0.125, "completions/max_length": 87.0, "completions/max_terminated_length": 87.0, "completions/mean_length": 49.625, "completions/mean_terminated_length": 56.714285714285715, "completions/min_length": 0.0, "completions/min_terminated_length": 11.0, "epoch": 5.96, "format_failures": 0.0, "grad_norm": 16.173349380493164, "kl": 0.41087135300040245, "learning_rate": 1e-06, "loss": 0.8071, "num_tokens": 846120.0, "reward": 0.0, "reward_std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "epoch": 6.0, "grad_norm": 2.644404888153076, "kl": 0.6906098246574401, "learning_rate": 1e-06, "loss": 0.0081, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 131.0, "completions/max_terminated_length": 131.0, "completions/mean_length": 100.875, "completions/mean_terminated_length": 115.28571428571429, "completions/min_length": 0.0, "completions/min_terminated_length": 108.0, "epoch": 6.064, "format_failures": 0.0, "grad_norm": 10.684069633483887, "kl": 1.1826152130961418, "learning_rate": 1e-06, "loss": 0.7069, "num_tokens": 854688.0, "reward": 0.75, "reward_std": 0.4629100561141968, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 141.0, "completions/max_terminated_length": 141.0, "completions/mean_length": 89.375, "completions/mean_terminated_length": 102.14285714285714, "completions/min_length": 0.0, "completions/min_terminated_length": 13.0, "epoch": 6.128, "format_failures": 0.0, "grad_norm": 6.4194746017456055, "kl": 0.12883292511105537, "learning_rate": 1e-06, "loss": -1.1247, "num_tokens": 863240.0, "reward": 0.5625, "reward_std": 0.3720118999481201, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 72.0, "completions/max_terminated_length": 72.0, "completions/mean_length": 36.0, "completions/mean_terminated_length": 72.0, "completions/min_length": 0.0, "completions/min_terminated_length": 72.0, "epoch": 6.192, "format_failures": 0.0, "grad_norm": 7.2926411628723145, "kl": 0.1561364121735096, "learning_rate": 1e-06, "loss": 1.0999, "num_tokens": 869160.0, "reward": 0.875, "reward_std": 0.3535533845424652, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.75, "completions/max_length": 48.0, "completions/max_terminated_length": 48.0, "completions/mean_length": 11.25, "completions/mean_terminated_length": 45.0, "completions/min_length": 0.0, "completions/min_terminated_length": 42.0, "epoch": 6.256, "format_failures": 0.0, "grad_norm": 68.3973159790039, "kl": 0.22240112535655499, "learning_rate": 1e-06, "loss": 6.8633, "num_tokens": 874632.0, "reward": 0.5, "reward_std": 0.5345224738121033, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 90.0, "completions/max_terminated_length": 90.0, "completions/mean_length": 59.625, "completions/mean_terminated_length": 79.5, "completions/min_length": 0.0, "completions/min_terminated_length": 75.0, "epoch": 6.32, "format_failures": 0.0, "grad_norm": 61.82405090332031, "kl": 0.04734344594180584, "learning_rate": 1e-06, "loss": -7.4582, "num_tokens": 881504.0, "reward": 1.0, "reward_std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 154.0, "completions/max_terminated_length": 154.0, "completions/mean_length": 127.625, "completions/mean_terminated_length": 145.85714285714286, "completions/min_length": 0.0, "completions/min_terminated_length": 97.0, "epoch": 6.384, "format_failures": 1.0, "grad_norm": 0.07336875051259995, "kl": 0.056114144157618284, "learning_rate": 1e-06, "loss": 0.0005, "num_tokens": 889832.0, "reward": 0.0, "reward_std": 0.0, "step": 102 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 889832, "num_train_epochs": 63, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }