lcb_test_generator_1.5b_100steps / trainer_state.json
Harryllh's picture
Upload folder using huggingface_hub
c64e8ad verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 6.384,
"eval_steps": 500,
"global_step": 102,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 177.0,
"completions/max_terminated_length": 177.0,
"completions/mean_length": 105.5,
"completions/mean_terminated_length": 120.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.064,
"format_failures": 3.0,
"grad_norm": 2.247725486755371,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0347,
"num_tokens": 6048.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 131.0,
"completions/max_terminated_length": 131.0,
"completions/mean_length": 75.625,
"completions/mean_terminated_length": 86.42857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.128,
"format_failures": 3.0,
"grad_norm": 1.4242777824401855,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": -0.1028,
"num_tokens": 13280.0,
"reward": 0.1875,
"reward_std": 0.3720118999481201,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 213.0,
"completions/max_terminated_length": 213.0,
"completions/mean_length": 124.25,
"completions/mean_terminated_length": 142.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.192,
"format_failures": 1.0,
"grad_norm": 0.020250532776117325,
"kl": 0.0035181287967134267,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 21904.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 249.0,
"completions/max_terminated_length": 249.0,
"completions/mean_length": 73.25,
"completions/mean_terminated_length": 83.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.256,
"format_failures": 1.0,
"grad_norm": 8.061470031738281,
"kl": 0.034313585492782295,
"learning_rate": 1e-06,
"loss": -0.2682,
"num_tokens": 27552.0,
"reward": 0.27916666865348816,
"reward_std": 0.8364584445953369,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 385.0,
"completions/max_terminated_length": 385.0,
"completions/mean_length": 145.625,
"completions/mean_terminated_length": 166.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.32,
"format_failures": 1.0,
"grad_norm": 1.223435401916504,
"kl": 0.03014595981221646,
"learning_rate": 1e-06,
"loss": 0.1171,
"num_tokens": 43192.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 277.0,
"completions/max_terminated_length": 277.0,
"completions/mean_length": 109.625,
"completions/mean_terminated_length": 125.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.384,
"format_failures": 0.0,
"grad_norm": 0.1720724254846573,
"kl": 0.03908220527227968,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 55448.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 126.0,
"completions/max_terminated_length": 126.0,
"completions/mean_length": 74.625,
"completions/mean_terminated_length": 85.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.448,
"format_failures": 0.0,
"grad_norm": 0.5268336534500122,
"kl": 0.021530768717639148,
"learning_rate": 1e-06,
"loss": 0.0075,
"num_tokens": 62672.0,
"reward": 0.03125,
"reward_std": 0.0883883461356163,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 118.0,
"completions/max_terminated_length": 118.0,
"completions/mean_length": 62.125,
"completions/mean_terminated_length": 71.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.512,
"format_failures": 2.0,
"grad_norm": 2.541877031326294,
"kl": 0.3408850164851174,
"learning_rate": 1e-06,
"loss": -0.1278,
"num_tokens": 70896.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 117.0,
"completions/max_terminated_length": 117.0,
"completions/mean_length": 82.125,
"completions/mean_terminated_length": 93.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.576,
"format_failures": 2.0,
"grad_norm": 1.876581072807312,
"kl": 0.0260943416506052,
"learning_rate": 1e-06,
"loss": -0.053,
"num_tokens": 78128.0,
"reward": 0.4166666865348816,
"reward_std": 0.49601587653160095,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 106.0,
"completions/max_terminated_length": 106.0,
"completions/mean_length": 59.125,
"completions/mean_terminated_length": 67.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.64,
"format_failures": 1.0,
"grad_norm": 1.4804662466049194,
"kl": 0.17110479215625674,
"learning_rate": 1e-06,
"loss": -0.0096,
"num_tokens": 83696.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 90.625,
"completions/mean_terminated_length": 103.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.704,
"format_failures": 0.0,
"grad_norm": 1.5350069999694824,
"kl": 0.48000563448294997,
"learning_rate": 1e-06,
"loss": -0.0756,
"num_tokens": 92216.0,
"reward": 0.375,
"reward_std": 0.4154745042324066,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 130.0,
"completions/max_terminated_length": 130.0,
"completions/mean_length": 66.125,
"completions/mean_terminated_length": 75.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.768,
"format_failures": 0.0,
"grad_norm": 7.105235576629639,
"kl": 0.25097161275334656,
"learning_rate": 1e-06,
"loss": 0.1211,
"num_tokens": 101288.0,
"reward": 0.25,
"reward_std": 0.38832157850265503,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 76.0,
"completions/max_terminated_length": 76.0,
"completions/mean_length": 41.375,
"completions/mean_terminated_length": 47.285714285714285,
"completions/min_length": 0.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.832,
"format_failures": 1.0,
"grad_norm": 8.552057266235352,
"kl": 0.887442918960005,
"learning_rate": 1e-06,
"loss": 0.6279,
"num_tokens": 108296.0,
"reward": 0.625,
"reward_std": 0.4520675837993622,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 508.0,
"completions/max_terminated_length": 508.0,
"completions/mean_length": 169.125,
"completions/mean_terminated_length": 193.28571428571428,
"completions/min_length": 0.0,
"completions/min_terminated_length": 73.0,
"epoch": 0.896,
"format_failures": 0.0,
"grad_norm": 3.0173394680023193,
"kl": 0.7231281753629446,
"learning_rate": 1e-06,
"loss": 0.0056,
"num_tokens": 123336.0,
"reward": 0.0535714291036129,
"reward_std": 0.15152288973331451,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 161.0,
"completions/max_terminated_length": 161.0,
"completions/mean_length": 89.375,
"completions/mean_terminated_length": 102.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.96,
"format_failures": 0.0,
"grad_norm": 4.813839912414551,
"kl": 1.1184300668537617,
"learning_rate": 1e-06,
"loss": -0.231,
"num_tokens": 136000.0,
"reward": -0.125,
"reward_std": 0.3535533845424652,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 206.0,
"completions/max_terminated_length": 206.0,
"completions/mean_length": 131.875,
"completions/mean_terminated_length": 150.71428571428572,
"completions/min_length": 0.0,
"completions/min_terminated_length": 20.0,
"epoch": 1.0,
"format_failures": 0.0,
"grad_norm": 0.23264528810977936,
"kl": 0.09705191291868687,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 146704.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 702.0,
"completions/max_terminated_length": 702.0,
"completions/mean_length": 300.875,
"completions/mean_terminated_length": 343.85714285714283,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 1.064,
"format_failures": 0.0,
"grad_norm": 1.7797789573669434,
"kl": 0.031833621207624674,
"learning_rate": 1e-06,
"loss": -0.0936,
"num_tokens": 161184.0,
"reward": 0.32083332538604736,
"reward_std": 0.4521333873271942,
"step": 17
},
{
"clip_ratio/high_max": 0.000908265239559114,
"clip_ratio/high_mean": 0.000908265239559114,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.000908265239559114,
"completions/clipped_ratio": 0.125,
"completions/max_length": 73.0,
"completions/max_terminated_length": 73.0,
"completions/mean_length": 51.75,
"completions/mean_terminated_length": 59.142857142857146,
"completions/min_length": 0.0,
"completions/min_terminated_length": 20.0,
"epoch": 1.1280000000000001,
"format_failures": 0.0,
"grad_norm": 1.9153517484664917,
"kl": 0.045906367246061563,
"learning_rate": 1e-06,
"loss": 0.2096,
"num_tokens": 165496.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 354.0,
"completions/max_terminated_length": 354.0,
"completions/mean_length": 242.375,
"completions/mean_terminated_length": 277.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 1.192,
"format_failures": 1.0,
"grad_norm": 0.8472970724105835,
"kl": 0.020359830697998405,
"learning_rate": 1e-06,
"loss": -0.0893,
"num_tokens": 174512.0,
"reward": 0.5625,
"reward_std": 0.4955156147480011,
"step": 19
},
{
"clip_ratio/high_max": 0.00041345093632116914,
"clip_ratio/high_mean": 0.00041345093632116914,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00041345093632116914,
"completions/clipped_ratio": 0.125,
"completions/max_length": 115.0,
"completions/max_terminated_length": 115.0,
"completions/mean_length": 80.0,
"completions/mean_terminated_length": 91.42857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 1.256,
"format_failures": 0.0,
"grad_norm": 4.18609619140625,
"kl": 0.037674687220714986,
"learning_rate": 1e-06,
"loss": 0.5542,
"num_tokens": 182664.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0023937864461913705,
"clip_ratio/low_min": 0.0023937864461913705,
"clip_ratio/region_mean": 0.0023937864461913705,
"completions/clipped_ratio": 0.125,
"completions/max_length": 189.0,
"completions/max_terminated_length": 189.0,
"completions/mean_length": 90.375,
"completions/mean_terminated_length": 103.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 22.0,
"epoch": 1.32,
"format_failures": 0.0,
"grad_norm": 5.047491550445557,
"kl": 0.262689758092165,
"learning_rate": 1e-06,
"loss": -0.5506,
"num_tokens": 189752.0,
"reward": 0.03125,
"reward_std": 0.0883883461356163,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00015489467477891594,
"clip_ratio/low_min": 0.00015489467477891594,
"clip_ratio/region_mean": 0.00015489467477891594,
"completions/clipped_ratio": 0.125,
"completions/max_length": 220.0,
"completions/max_terminated_length": 220.0,
"completions/mean_length": 110.25,
"completions/mean_terminated_length": 126.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 1.384,
"format_failures": 1.0,
"grad_norm": 1.870309591293335,
"kl": 0.15177738456986845,
"learning_rate": 1e-06,
"loss": 0.0972,
"num_tokens": 197664.0,
"reward": 0.36250001192092896,
"reward_std": 0.4405759274959564,
"step": 22
},
{
"clip_ratio/high_max": 0.0007937598857097328,
"clip_ratio/high_mean": 0.0007937598857097328,
"clip_ratio/low_mean": 0.00033377838553860784,
"clip_ratio/low_min": 0.00033377838553860784,
"clip_ratio/region_mean": 0.0011275382712483406,
"completions/clipped_ratio": 0.125,
"completions/max_length": 552.0,
"completions/max_terminated_length": 552.0,
"completions/mean_length": 174.75,
"completions/mean_terminated_length": 199.71428571428572,
"completions/min_length": 0.0,
"completions/min_terminated_length": 82.0,
"epoch": 1.448,
"format_failures": 0.0,
"grad_norm": 7.092167377471924,
"kl": 0.34660289715975523,
"learning_rate": 1e-06,
"loss": 0.8114,
"num_tokens": 210072.0,
"reward": 0.25275737047195435,
"reward_std": 0.3869698941707611,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0025806452613323927,
"clip_ratio/low_min": 0.0025806452613323927,
"clip_ratio/region_mean": 0.0025806452613323927,
"completions/clipped_ratio": 0.125,
"completions/max_length": 953.0,
"completions/max_terminated_length": 953.0,
"completions/mean_length": 233.125,
"completions/mean_terminated_length": 266.42857142857144,
"completions/min_length": 0.0,
"completions/min_terminated_length": 61.0,
"epoch": 1.512,
"format_failures": 0.0,
"grad_norm": 8.143402099609375,
"kl": 0.3320934564108029,
"learning_rate": 1e-06,
"loss": -0.9434,
"num_tokens": 224336.0,
"reward": 0.5197916626930237,
"reward_std": 0.43734264373779297,
"step": 24
},
{
"clip_ratio/high_max": 0.0009831460192799568,
"clip_ratio/high_mean": 0.0009831460192799568,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0009831460192799568,
"completions/clipped_ratio": 0.25,
"completions/max_length": 169.0,
"completions/max_terminated_length": 169.0,
"completions/mean_length": 75.0,
"completions/mean_terminated_length": 100.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 43.0,
"epoch": 1.576,
"format_failures": 0.0,
"grad_norm": 12.648838996887207,
"kl": 0.08752637438010424,
"learning_rate": 1e-06,
"loss": -1.3126,
"num_tokens": 237344.0,
"reward": 0.3630952537059784,
"reward_std": 0.3474069833755493,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 85.0,
"completions/max_terminated_length": 85.0,
"completions/mean_length": 60.25,
"completions/mean_terminated_length": 80.33333333333333,
"completions/min_length": 0.0,
"completions/min_terminated_length": 76.0,
"epoch": 1.6400000000000001,
"format_failures": 0.0,
"grad_norm": 22.935155868530273,
"kl": 0.040498227812349796,
"learning_rate": 1e-06,
"loss": 2.0449,
"num_tokens": 243264.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00016914748994167894,
"clip_ratio/low_min": 0.00016914748994167894,
"clip_ratio/region_mean": 0.00016914748994167894,
"completions/clipped_ratio": 0.125,
"completions/max_length": 114.0,
"completions/max_terminated_length": 114.0,
"completions/mean_length": 68.5,
"completions/mean_terminated_length": 78.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 20.0,
"epoch": 1.704,
"format_failures": 0.0,
"grad_norm": 6.5060811042785645,
"kl": 0.05175229045562446,
"learning_rate": 1e-06,
"loss": -0.231,
"num_tokens": 248064.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 120.0,
"completions/max_terminated_length": 120.0,
"completions/mean_length": 74.375,
"completions/mean_terminated_length": 85.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 14.0,
"epoch": 1.768,
"format_failures": 2.0,
"grad_norm": 5.602163791656494,
"kl": 0.16080649592913687,
"learning_rate": 1e-06,
"loss": 0.4472,
"num_tokens": 255520.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.003766579437069595,
"clip_ratio/low_min": 0.003766579437069595,
"clip_ratio/region_mean": 0.003766579437069595,
"completions/clipped_ratio": 0.125,
"completions/max_length": 854.0,
"completions/max_terminated_length": 854.0,
"completions/mean_length": 186.5,
"completions/mean_terminated_length": 213.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 26.0,
"epoch": 1.8319999999999999,
"format_failures": 1.0,
"grad_norm": 8.381872177124023,
"kl": 0.047735671047121286,
"learning_rate": 1e-06,
"loss": -1.0193,
"num_tokens": 268512.0,
"reward": 0.109375,
"reward_std": 0.30935922265052795,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 183.0,
"completions/max_terminated_length": 183.0,
"completions/mean_length": 116.75,
"completions/mean_terminated_length": 133.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 15.0,
"epoch": 1.896,
"format_failures": 0.0,
"grad_norm": 5.668828010559082,
"kl": 0.038008465664461255,
"learning_rate": 1e-06,
"loss": -0.7992,
"num_tokens": 282112.0,
"reward": 0.3333333432674408,
"reward_std": 0.4714045226573944,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 7.35726862330921e-05,
"clip_ratio/low_min": 7.35726862330921e-05,
"clip_ratio/region_mean": 7.35726862330921e-05,
"completions/clipped_ratio": 0.125,
"completions/max_length": 126.0,
"completions/max_terminated_length": 126.0,
"completions/mean_length": 78.125,
"completions/mean_terminated_length": 89.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 68.0,
"epoch": 1.96,
"format_failures": 1.0,
"grad_norm": 11.598993301391602,
"kl": 0.08647240558639169,
"learning_rate": 1e-06,
"loss": 1.9553,
"num_tokens": 289264.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.002619047649204731,
"clip_ratio/low_min": 0.002619047649204731,
"clip_ratio/region_mean": 0.002619047649204731,
"completions/clipped_ratio": 0.125,
"completions/max_length": 257.0,
"completions/max_terminated_length": 257.0,
"completions/mean_length": 108.25,
"completions/mean_terminated_length": 123.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 61.0,
"epoch": 2.0,
"format_failures": 0.0,
"grad_norm": 5.694812297821045,
"kl": 0.039789453893899915,
"learning_rate": 1e-06,
"loss": -0.1238,
"num_tokens": 302080.0,
"reward": 0.574999988079071,
"reward_std": 0.41661903262138367,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.004414635943248868,
"clip_ratio/low_min": 0.004414635943248868,
"clip_ratio/region_mean": 0.004414635943248868,
"completions/clipped_ratio": 0.125,
"completions/max_length": 130.0,
"completions/max_terminated_length": 130.0,
"completions/mean_length": 65.75,
"completions/mean_terminated_length": 75.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 2.064,
"format_failures": 0.0,
"grad_norm": 5.186154842376709,
"kl": 0.050242609810084105,
"learning_rate": 1e-06,
"loss": 0.3273,
"num_tokens": 308472.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 33
},
{
"clip_ratio/high_max": 0.0015756364446133375,
"clip_ratio/high_mean": 0.0015756364446133375,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0015756364446133375,
"completions/clipped_ratio": 0.25,
"completions/max_length": 355.0,
"completions/max_terminated_length": 355.0,
"completions/mean_length": 152.375,
"completions/mean_terminated_length": 203.16666666666666,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 2.128,
"format_failures": 0.0,
"grad_norm": 6.178646564483643,
"kl": 0.04819304798729718,
"learning_rate": 1e-06,
"loss": 0.0732,
"num_tokens": 321144.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 162.0,
"completions/max_terminated_length": 162.0,
"completions/mean_length": 70.25,
"completions/mean_terminated_length": 80.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 43.0,
"epoch": 2.192,
"format_failures": 0.0,
"grad_norm": 9.679683685302734,
"kl": 0.04483591788448393,
"learning_rate": 1e-06,
"loss": -0.7427,
"num_tokens": 332568.0,
"reward": 0.5416666865348816,
"reward_std": 0.5019802451133728,
"step": 35
},
{
"clip_ratio/high_max": 0.0005470459582284093,
"clip_ratio/high_mean": 0.0005470459582284093,
"clip_ratio/low_mean": 0.0024912295630201697,
"clip_ratio/low_min": 0.0024912295630201697,
"clip_ratio/region_mean": 0.003038275521248579,
"completions/clipped_ratio": 0.125,
"completions/max_length": 123.0,
"completions/max_terminated_length": 123.0,
"completions/mean_length": 61.875,
"completions/mean_terminated_length": 70.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 40.0,
"epoch": 2.2560000000000002,
"format_failures": 0.0,
"grad_norm": 20.632793426513672,
"kl": 0.06712129758670926,
"learning_rate": 1e-06,
"loss": 2.5061,
"num_tokens": 338352.0,
"reward": 0.2708333432674408,
"reward_std": 0.39778655767440796,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0031341375433839858,
"clip_ratio/low_min": 0.0031341375433839858,
"clip_ratio/region_mean": 0.0031341375433839858,
"completions/clipped_ratio": 0.25,
"completions/max_length": 201.0,
"completions/max_terminated_length": 201.0,
"completions/mean_length": 97.25,
"completions/mean_terminated_length": 129.66666666666666,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 2.32,
"format_failures": 0.0,
"grad_norm": 8.325549125671387,
"kl": 0.07476615975610912,
"learning_rate": 1e-06,
"loss": 0.0014,
"num_tokens": 351528.0,
"reward": 0.4439394176006317,
"reward_std": 0.215702623128891,
"step": 37
},
{
"clip_ratio/high_max": 0.0006686007836833596,
"clip_ratio/high_mean": 0.0006686007836833596,
"clip_ratio/low_mean": 0.004799673450179398,
"clip_ratio/low_min": 0.004799673450179398,
"clip_ratio/region_mean": 0.005468274233862758,
"completions/clipped_ratio": 0.125,
"completions/max_length": 162.0,
"completions/max_terminated_length": 162.0,
"completions/mean_length": 83.25,
"completions/mean_terminated_length": 95.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 2.384,
"format_failures": 0.0,
"grad_norm": 8.579444885253906,
"kl": 0.44708020030520856,
"learning_rate": 1e-06,
"loss": 0.7024,
"num_tokens": 357048.0,
"reward": 0.44583332538604736,
"reward_std": 0.4876042604446411,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0014224655460566282,
"clip_ratio/low_min": 0.0014224655460566282,
"clip_ratio/region_mean": 0.0014224655460566282,
"completions/clipped_ratio": 0.125,
"completions/max_length": 168.0,
"completions/max_terminated_length": 168.0,
"completions/mean_length": 105.5,
"completions/mean_terminated_length": 120.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 81.0,
"epoch": 2.448,
"format_failures": 1.0,
"grad_norm": 8.134405136108398,
"kl": 0.4579888880252838,
"learning_rate": 1e-06,
"loss": -0.7953,
"num_tokens": 370592.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 152.0,
"completions/max_terminated_length": 152.0,
"completions/mean_length": 101.0,
"completions/mean_terminated_length": 115.42857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 58.0,
"epoch": 2.512,
"format_failures": 0.0,
"grad_norm": 6.998343467712402,
"kl": 0.059122598730027676,
"learning_rate": 1e-06,
"loss": -0.4243,
"num_tokens": 387824.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"step": 40
},
{
"clip_ratio/high_max": 0.002086994703859091,
"clip_ratio/high_mean": 0.002086994703859091,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002086994703859091,
"completions/clipped_ratio": 0.125,
"completions/max_length": 249.0,
"completions/max_terminated_length": 249.0,
"completions/mean_length": 98.5,
"completions/mean_terminated_length": 112.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 65.0,
"epoch": 2.576,
"format_failures": 0.0,
"grad_norm": 72.67411804199219,
"kl": 0.05187072162516415,
"learning_rate": 1e-06,
"loss": 0.3373,
"num_tokens": 403648.0,
"reward": 0.5208333730697632,
"reward_std": 0.39276695251464844,
"step": 41
},
{
"clip_ratio/high_max": 0.0003625637182267383,
"clip_ratio/high_mean": 0.0003625637182267383,
"clip_ratio/low_mean": 0.0002896586374845356,
"clip_ratio/low_min": 0.0002896586374845356,
"clip_ratio/region_mean": 0.0006522223557112738,
"completions/clipped_ratio": 0.125,
"completions/max_length": 105.0,
"completions/max_terminated_length": 105.0,
"completions/mean_length": 73.25,
"completions/mean_terminated_length": 83.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 61.0,
"epoch": 2.64,
"format_failures": 0.0,
"grad_norm": 14.554330825805664,
"kl": 0.15414534136652946,
"learning_rate": 1e-06,
"loss": 2.2367,
"num_tokens": 409704.0,
"reward": 0.4583333432674408,
"reward_std": 0.501980185508728,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0007432290876749903,
"clip_ratio/low_min": 0.0007432290876749903,
"clip_ratio/region_mean": 0.0007432290876749903,
"completions/clipped_ratio": 0.125,
"completions/max_length": 174.0,
"completions/max_terminated_length": 174.0,
"completions/mean_length": 109.375,
"completions/mean_terminated_length": 125.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 29.0,
"epoch": 2.7039999999999997,
"format_failures": 0.0,
"grad_norm": 53.890411376953125,
"kl": 1.3919735243543983,
"learning_rate": 1e-06,
"loss": -0.6229,
"num_tokens": 417312.0,
"reward": 0.3125,
"reward_std": 0.2912411689758301,
"step": 43
},
{
"clip_ratio/high_max": 0.00039795115299057215,
"clip_ratio/high_mean": 0.00039795115299057215,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00039795115299057215,
"completions/clipped_ratio": 0.25,
"completions/max_length": 50.0,
"completions/max_terminated_length": 50.0,
"completions/mean_length": 36.375,
"completions/mean_terminated_length": 48.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 45.0,
"epoch": 2.768,
"format_failures": 0.0,
"grad_norm": 5.3910932540893555,
"kl": 0.1744281006976962,
"learning_rate": 1e-06,
"loss": -0.1856,
"num_tokens": 422800.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 68.0,
"completions/max_terminated_length": 68.0,
"completions/mean_length": 49.0,
"completions/mean_terminated_length": 56.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 42.0,
"epoch": 2.832,
"format_failures": 0.0,
"grad_norm": 0.309182733297348,
"kl": 0.09828702360391617,
"learning_rate": 1e-06,
"loss": 0.0019,
"num_tokens": 431080.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 95.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 49.625,
"completions/mean_terminated_length": 56.714285714285715,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 2.896,
"format_failures": 1.0,
"grad_norm": 16.60660743713379,
"kl": 0.11247169971466064,
"learning_rate": 1e-06,
"loss": -1.7005,
"num_tokens": 439296.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 134.0,
"completions/max_terminated_length": 134.0,
"completions/mean_length": 68.75,
"completions/mean_terminated_length": 78.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 46.0,
"epoch": 2.96,
"format_failures": 0.0,
"grad_norm": 31.673078536987305,
"kl": 2.0126035660505295,
"learning_rate": 1e-06,
"loss": 0.8165,
"num_tokens": 449224.0,
"reward": 0.6875,
"reward_std": 0.45806270837783813,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0006863656220957637,
"clip_ratio/low_min": 0.0006863656220957637,
"clip_ratio/region_mean": 0.0006863656220957637,
"epoch": 3.0,
"grad_norm": 6.059280872344971,
"kl": 0.1403810739517212,
"learning_rate": 1e-06,
"loss": 0.5743,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00021477663540281355,
"clip_ratio/low_min": 0.00021477663540281355,
"clip_ratio/region_mean": 0.00021477663540281355,
"completions/clipped_ratio": 0.125,
"completions/max_length": 154.0,
"completions/max_terminated_length": 154.0,
"completions/mean_length": 80.25,
"completions/mean_terminated_length": 91.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 22.0,
"epoch": 3.064,
"format_failures": 0.0,
"grad_norm": 24.96416664123535,
"kl": 4.326897906605154,
"learning_rate": 1e-06,
"loss": 0.25,
"num_tokens": 458280.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 158.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 73.625,
"completions/mean_terminated_length": 84.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 3.128,
"format_failures": 0.0,
"grad_norm": 3.976156711578369,
"kl": 0.1405428946018219,
"learning_rate": 1e-06,
"loss": -0.6803,
"num_tokens": 465592.0,
"reward": 0.3125,
"reward_std": 0.38253021240234375,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 52.0,
"completions/max_terminated_length": 52.0,
"completions/mean_length": 40.375,
"completions/mean_terminated_length": 46.142857142857146,
"completions/min_length": 0.0,
"completions/min_terminated_length": 21.0,
"epoch": 3.192,
"format_failures": 0.0,
"grad_norm": 767.0111694335938,
"kl": 30.081211734563112,
"learning_rate": 1e-06,
"loss": 1.7347,
"num_tokens": 470880.0,
"reward": 0.5833333730697632,
"reward_std": 0.49601587653160095,
"step": 51
},
{
"clip_ratio/high_max": 0.00039308174746111035,
"clip_ratio/high_mean": 0.00039308174746111035,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00039308174746111035,
"completions/clipped_ratio": 0.125,
"completions/max_length": 108.0,
"completions/max_terminated_length": 108.0,
"completions/mean_length": 56.25,
"completions/mean_terminated_length": 64.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 25.0,
"epoch": 3.2560000000000002,
"format_failures": 0.0,
"grad_norm": 10.541399955749512,
"kl": 0.2744437651708722,
"learning_rate": 1e-06,
"loss": -1.0422,
"num_tokens": 479136.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 85.0,
"completions/max_terminated_length": 85.0,
"completions/mean_length": 61.75,
"completions/mean_terminated_length": 70.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 3.32,
"format_failures": 0.0,
"grad_norm": 3.463606595993042,
"kl": 0.10342029482126236,
"learning_rate": 1e-06,
"loss": 0.4301,
"num_tokens": 486000.0,
"reward": 0.875,
"reward_std": 0.3535533845424652,
"step": 53
},
{
"clip_ratio/high_max": 0.00014585764438379556,
"clip_ratio/high_mean": 0.00014585764438379556,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00014585764438379556,
"completions/clipped_ratio": 0.125,
"completions/max_length": 287.0,
"completions/max_terminated_length": 287.0,
"completions/mean_length": 177.625,
"completions/mean_terminated_length": 203.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 71.0,
"epoch": 3.384,
"format_failures": 0.0,
"grad_norm": 3.68437123298645,
"kl": 0.10048098210245371,
"learning_rate": 1e-06,
"loss": -0.7393,
"num_tokens": 497136.0,
"reward": 0.5583333373069763,
"reward_std": 0.4766783118247986,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 101.0,
"completions/max_terminated_length": 101.0,
"completions/mean_length": 72.25,
"completions/mean_terminated_length": 82.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 3.448,
"format_failures": 0.0,
"grad_norm": 7.700087070465088,
"kl": 0.17961894627660513,
"learning_rate": 1e-06,
"loss": 1.025,
"num_tokens": 502688.0,
"reward": 0.44999998807907104,
"reward_std": 0.4985693693161011,
"step": 55
},
{
"clip_ratio/high_max": 0.00020938023226335645,
"clip_ratio/high_mean": 0.00020938023226335645,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00020938023226335645,
"completions/clipped_ratio": 0.125,
"completions/max_length": 181.0,
"completions/max_terminated_length": 181.0,
"completions/mean_length": 114.125,
"completions/mean_terminated_length": 130.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 69.0,
"epoch": 3.512,
"format_failures": 1.0,
"grad_norm": 6.477407455444336,
"kl": 0.18405211344361305,
"learning_rate": 1e-06,
"loss": -0.815,
"num_tokens": 509816.0,
"reward": 0.28125,
"reward_std": 0.33905068039894104,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 217.0,
"completions/max_terminated_length": 217.0,
"completions/mean_length": 144.375,
"completions/mean_terminated_length": 165.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 3.576,
"format_failures": 0.0,
"grad_norm": 3.1489181518554688,
"kl": 0.18948577530682087,
"learning_rate": 1e-06,
"loss": 0.1775,
"num_tokens": 518072.0,
"reward": 0.6666666865348816,
"reward_std": 0.4364357590675354,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 246.0,
"completions/max_terminated_length": 246.0,
"completions/mean_length": 125.125,
"completions/mean_terminated_length": 143.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 89.0,
"epoch": 3.64,
"format_failures": 0.0,
"grad_norm": 2.2657573223114014,
"kl": 0.1387784667313099,
"learning_rate": 1e-06,
"loss": 0.0604,
"num_tokens": 531728.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00014551804633811116,
"clip_ratio/low_min": 0.00014551804633811116,
"clip_ratio/region_mean": 0.00014551804633811116,
"completions/clipped_ratio": 0.125,
"completions/max_length": 83.0,
"completions/max_terminated_length": 83.0,
"completions/mean_length": 56.5,
"completions/mean_terminated_length": 64.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 52.0,
"epoch": 3.7039999999999997,
"format_failures": 0.0,
"grad_norm": 2.2649238109588623,
"kl": 0.28891171142458916,
"learning_rate": 1e-06,
"loss": 0.2216,
"num_tokens": 536768.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 227.0,
"completions/max_terminated_length": 227.0,
"completions/mean_length": 91.5,
"completions/mean_terminated_length": 104.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 33.0,
"epoch": 3.768,
"format_failures": 0.0,
"grad_norm": 3.3132457733154297,
"kl": 0.096153249964118,
"learning_rate": 1e-06,
"loss": 0.3965,
"num_tokens": 546784.0,
"reward": 0.1875,
"reward_std": 0.2587745785713196,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 143.0,
"completions/max_terminated_length": 143.0,
"completions/mean_length": 90.75,
"completions/mean_terminated_length": 103.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 71.0,
"epoch": 3.832,
"format_failures": 0.0,
"grad_norm": 4.948695182800293,
"kl": 0.1259058197028935,
"learning_rate": 1e-06,
"loss": -0.4309,
"num_tokens": 559872.0,
"reward": 0.375,
"reward_std": 0.5175491571426392,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0003612716682255268,
"clip_ratio/low_min": 0.0003612716682255268,
"clip_ratio/region_mean": 0.0003612716682255268,
"completions/clipped_ratio": 0.125,
"completions/max_length": 131.0,
"completions/max_terminated_length": 131.0,
"completions/mean_length": 62.125,
"completions/mean_terminated_length": 71.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 3.896,
"format_failures": 0.0,
"grad_norm": 8.347101211547852,
"kl": 0.8767695324495435,
"learning_rate": 1e-06,
"loss": -0.0695,
"num_tokens": 565032.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 138.0,
"completions/max_terminated_length": 138.0,
"completions/mean_length": 84.625,
"completions/mean_terminated_length": 96.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 47.0,
"epoch": 3.96,
"format_failures": 0.0,
"grad_norm": 5.449214935302734,
"kl": 0.26848094910383224,
"learning_rate": 1e-06,
"loss": -0.0353,
"num_tokens": 572664.0,
"reward": 0.4464285671710968,
"reward_std": 0.49744242429733276,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 643.0,
"completions/max_terminated_length": 643.0,
"completions/mean_length": 136.75,
"completions/mean_terminated_length": 156.28571428571428,
"completions/min_length": 0.0,
"completions/min_terminated_length": 54.0,
"epoch": 4.0,
"format_failures": 0.0,
"grad_norm": 5.11106014251709,
"kl": 0.17550407350063324,
"learning_rate": 1e-06,
"loss": 0.7516,
"num_tokens": 586696.0,
"reward": 0.6588234901428223,
"reward_std": 0.44546374678611755,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 80.0,
"completions/max_terminated_length": 80.0,
"completions/mean_length": 40.125,
"completions/mean_terminated_length": 45.857142857142854,
"completions/min_length": 0.0,
"completions/min_terminated_length": 26.0,
"epoch": 4.064,
"format_failures": 1.0,
"grad_norm": 9.72535514831543,
"kl": 0.20796778332442045,
"learning_rate": 1e-06,
"loss": 0.8956,
"num_tokens": 591600.0,
"reward": 0.30000001192092896,
"reward_std": 0.4535573422908783,
"step": 65
},
{
"clip_ratio/high_max": 0.0013570611481554806,
"clip_ratio/high_mean": 0.0013570611481554806,
"clip_ratio/low_mean": 0.012927594594657421,
"clip_ratio/low_min": 0.012927594594657421,
"clip_ratio/region_mean": 0.014284655742812902,
"completions/clipped_ratio": 0.125,
"completions/max_length": 87.0,
"completions/max_terminated_length": 87.0,
"completions/mean_length": 55.25,
"completions/mean_terminated_length": 63.142857142857146,
"completions/min_length": 0.0,
"completions/min_terminated_length": 48.0,
"epoch": 4.128,
"format_failures": 0.0,
"grad_norm": 15.252068519592285,
"kl": 0.22740534879267216,
"learning_rate": 1e-06,
"loss": 0.8349,
"num_tokens": 596880.0,
"reward": 0.8125,
"reward_std": 0.3720118999481201,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 110.0,
"completions/max_terminated_length": 110.0,
"completions/mean_length": 55.0,
"completions/mean_terminated_length": 88.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 58.0,
"epoch": 4.192,
"format_failures": 1.0,
"grad_norm": 16.80088233947754,
"kl": 0.31182049214839935,
"learning_rate": 1e-06,
"loss": 0.693,
"num_tokens": 603344.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.004188144113868475,
"clip_ratio/low_min": 0.004188144113868475,
"clip_ratio/region_mean": 0.004188144113868475,
"completions/clipped_ratio": 0.125,
"completions/max_length": 161.0,
"completions/max_terminated_length": 161.0,
"completions/mean_length": 100.25,
"completions/mean_terminated_length": 114.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 84.0,
"epoch": 4.256,
"format_failures": 0.0,
"grad_norm": 13.643096923828125,
"kl": 0.11746187414973974,
"learning_rate": 1e-06,
"loss": -1.335,
"num_tokens": 610184.0,
"reward": 0.5208333730697632,
"reward_std": 0.46664538979530334,
"step": 68
},
{
"clip_ratio/high_max": 0.00018115942657459527,
"clip_ratio/high_mean": 0.00018115942657459527,
"clip_ratio/low_mean": 0.00037650601007044315,
"clip_ratio/low_min": 0.00037650601007044315,
"clip_ratio/region_mean": 0.0005576654366450384,
"completions/clipped_ratio": 0.125,
"completions/max_length": 199.0,
"completions/max_terminated_length": 199.0,
"completions/mean_length": 72.875,
"completions/mean_terminated_length": 83.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 46.0,
"epoch": 4.32,
"format_failures": 0.0,
"grad_norm": 5.553096771240234,
"kl": 0.19322836678475142,
"learning_rate": 1e-06,
"loss": -0.6568,
"num_tokens": 619440.0,
"reward": 0.2916666865348816,
"reward_std": 0.4520675837993622,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 87.0,
"completions/max_terminated_length": 87.0,
"completions/mean_length": 61.375,
"completions/mean_terminated_length": 70.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 17.0,
"epoch": 4.384,
"format_failures": 0.0,
"grad_norm": 11.096977233886719,
"kl": 0.205445297062397,
"learning_rate": 1e-06,
"loss": -0.9614,
"num_tokens": 626024.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 70
},
{
"clip_ratio/high_max": 0.0001707650226308033,
"clip_ratio/high_mean": 0.0001707650226308033,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0001707650226308033,
"completions/clipped_ratio": 0.125,
"completions/max_length": 113.0,
"completions/max_terminated_length": 113.0,
"completions/mean_length": 59.25,
"completions/mean_terminated_length": 67.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 30.0,
"epoch": 4.448,
"format_failures": 0.0,
"grad_norm": 13.960062026977539,
"kl": 0.2419998161494732,
"learning_rate": 1e-06,
"loss": 1.6815,
"num_tokens": 631704.0,
"reward": 0.30000001192092896,
"reward_std": 0.4535573720932007,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 236.0,
"completions/max_terminated_length": 236.0,
"completions/mean_length": 119.625,
"completions/mean_terminated_length": 136.71428571428572,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 4.5120000000000005,
"format_failures": 0.0,
"grad_norm": 4.790798187255859,
"kl": 0.14366307947784662,
"learning_rate": 1e-06,
"loss": -0.622,
"num_tokens": 640736.0,
"reward": 0.6041666269302368,
"reward_std": 0.5034602880477905,
"step": 72
},
{
"clip_ratio/high_max": 0.001402775407768786,
"clip_ratio/high_mean": 0.001402775407768786,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.001402775407768786,
"completions/clipped_ratio": 0.125,
"completions/max_length": 163.0,
"completions/max_terminated_length": 163.0,
"completions/mean_length": 89.875,
"completions/mean_terminated_length": 102.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 24.0,
"epoch": 4.576,
"format_failures": 0.0,
"grad_norm": 24.572803497314453,
"kl": 2.3969106171280146,
"learning_rate": 1e-06,
"loss": 0.7312,
"num_tokens": 649920.0,
"reward": 0.0416666679084301,
"reward_std": 0.1178511381149292,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0001328374055447057,
"clip_ratio/low_min": 0.0001328374055447057,
"clip_ratio/region_mean": 0.0001328374055447057,
"completions/clipped_ratio": 0.125,
"completions/max_length": 231.0,
"completions/max_terminated_length": 231.0,
"completions/mean_length": 118.0,
"completions/mean_terminated_length": 134.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 4.64,
"format_failures": 0.0,
"grad_norm": 11.096585273742676,
"kl": 0.14362134877592325,
"learning_rate": 1e-06,
"loss": 1.6126,
"num_tokens": 660512.0,
"reward": 0.59375,
"reward_std": 0.4988826811313629,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 277.0,
"completions/max_terminated_length": 277.0,
"completions/mean_length": 171.625,
"completions/mean_terminated_length": 196.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 42.0,
"epoch": 4.704,
"format_failures": 0.0,
"grad_norm": 10.942404747009277,
"kl": 0.09571220818907022,
"learning_rate": 1e-06,
"loss": -1.9221,
"num_tokens": 671728.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 83.0,
"completions/max_terminated_length": 83.0,
"completions/mean_length": 51.25,
"completions/mean_terminated_length": 58.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 33.0,
"epoch": 4.768,
"format_failures": 0.0,
"grad_norm": 17.237686157226562,
"kl": 0.1505587575957179,
"learning_rate": 1e-06,
"loss": -0.4979,
"num_tokens": 678808.0,
"reward": 0.375,
"reward_std": 0.5175491571426392,
"step": 76
},
{
"clip_ratio/high_max": 0.0069027612917125225,
"clip_ratio/high_mean": 0.0069027612917125225,
"clip_ratio/low_mean": 0.00044653778604697436,
"clip_ratio/low_min": 0.00044653778604697436,
"clip_ratio/region_mean": 0.007349299077759497,
"completions/clipped_ratio": 0.125,
"completions/max_length": 95.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 63.25,
"completions/mean_terminated_length": 72.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 42.0,
"epoch": 4.832,
"format_failures": 0.0,
"grad_norm": 48.175540924072266,
"kl": 0.12417041137814522,
"learning_rate": 1e-06,
"loss": -0.3992,
"num_tokens": 685376.0,
"reward": 0.75,
"reward_std": 0.4629100561141968,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 105.0,
"completions/max_terminated_length": 105.0,
"completions/mean_length": 66.625,
"completions/mean_terminated_length": 76.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 40.0,
"epoch": 4.896,
"format_failures": 0.0,
"grad_norm": 21.25414276123047,
"kl": 0.3155105458572507,
"learning_rate": 1e-06,
"loss": 2.8688,
"num_tokens": 691504.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 428.0,
"completions/max_terminated_length": 428.0,
"completions/mean_length": 160.875,
"completions/mean_terminated_length": 183.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 55.0,
"epoch": 4.96,
"format_failures": 0.0,
"grad_norm": 11.223791122436523,
"kl": 0.1855175606906414,
"learning_rate": 1e-06,
"loss": -1.5493,
"num_tokens": 700192.0,
"reward": 0.3015109896659851,
"reward_std": 0.42723536491394043,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 192.0,
"completions/max_terminated_length": 192.0,
"completions/mean_length": 87.875,
"completions/mean_terminated_length": 100.42857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 5.0,
"format_failures": 0.0,
"grad_norm": 5.051183700561523,
"kl": 0.6679443523287774,
"learning_rate": 1e-06,
"loss": 0.6727,
"num_tokens": 715360.0,
"reward": 0.3965517282485962,
"reward_std": 0.503090500831604,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0012536601134343073,
"clip_ratio/low_min": 0.0012536601134343073,
"clip_ratio/region_mean": 0.0012536601134343073,
"completions/clipped_ratio": 0.125,
"completions/max_length": 295.0,
"completions/max_terminated_length": 295.0,
"completions/mean_length": 192.25,
"completions/mean_terminated_length": 219.71428571428572,
"completions/min_length": 0.0,
"completions/min_terminated_length": 51.0,
"epoch": 5.064,
"format_failures": 0.0,
"grad_norm": 6.032639503479004,
"kl": 0.1344920275732875,
"learning_rate": 1e-06,
"loss": -0.8074,
"num_tokens": 725176.0,
"reward": 0.5416666865348816,
"reward_std": 0.46929529309272766,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 57.0,
"completions/max_terminated_length": 57.0,
"completions/mean_length": 44.625,
"completions/mean_terminated_length": 51.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 5.128,
"format_failures": 0.0,
"grad_norm": 9.636035919189453,
"kl": 0.15830809529870749,
"learning_rate": 1e-06,
"loss": 0.9562,
"num_tokens": 729984.0,
"reward": 0.46875,
"reward_std": 0.5077524185180664,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.000637518212897703,
"clip_ratio/low_min": 0.000637518212897703,
"clip_ratio/region_mean": 0.000637518212897703,
"completions/clipped_ratio": 0.125,
"completions/max_length": 90.0,
"completions/max_terminated_length": 90.0,
"completions/mean_length": 63.625,
"completions/mean_terminated_length": 72.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 38.0,
"epoch": 5.192,
"format_failures": 0.0,
"grad_norm": 7.9917144775390625,
"kl": 0.1429830752313137,
"learning_rate": 1e-06,
"loss": -0.1994,
"num_tokens": 736312.0,
"reward": 0.375,
"reward_std": 0.5175491571426392,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.001965374161954969,
"clip_ratio/low_min": 0.001965374161954969,
"clip_ratio/region_mean": 0.001965374161954969,
"completions/clipped_ratio": 0.25,
"completions/max_length": 394.0,
"completions/max_terminated_length": 394.0,
"completions/mean_length": 136.0,
"completions/mean_terminated_length": 181.33333333333334,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 5.256,
"format_failures": 0.0,
"grad_norm": 8.276095390319824,
"kl": 0.28229224402457476,
"learning_rate": 1e-06,
"loss": -1.1183,
"num_tokens": 749648.0,
"reward": 0.637499988079071,
"reward_std": 0.4405759274959564,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 76.0,
"completions/max_terminated_length": 76.0,
"completions/mean_length": 56.125,
"completions/mean_terminated_length": 64.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 52.0,
"epoch": 5.32,
"format_failures": 0.0,
"grad_norm": 3.3836967945098877,
"kl": 0.11320708272978663,
"learning_rate": 1e-06,
"loss": -0.262,
"num_tokens": 755344.0,
"reward": 0.1875,
"reward_std": 0.3720118999481201,
"step": 85
},
{
"clip_ratio/high_max": 0.00038880249485373497,
"clip_ratio/high_mean": 0.00038880249485373497,
"clip_ratio/low_mean": 0.0007896393508417532,
"clip_ratio/low_min": 0.0007896393508417532,
"clip_ratio/region_mean": 0.0011784418456954882,
"completions/clipped_ratio": 0.125,
"completions/max_length": 192.0,
"completions/max_terminated_length": 192.0,
"completions/mean_length": 85.25,
"completions/mean_terminated_length": 97.42857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 56.0,
"epoch": 5.384,
"format_failures": 0.0,
"grad_norm": 6.297000885009766,
"kl": 0.7561929021030664,
"learning_rate": 1e-06,
"loss": 0.5695,
"num_tokens": 762432.0,
"reward": 0.09375,
"reward_std": 0.2651650309562683,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 153.0,
"completions/max_terminated_length": 153.0,
"completions/mean_length": 102.0,
"completions/mean_terminated_length": 116.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 20.0,
"epoch": 5.448,
"format_failures": 0.0,
"grad_norm": 3.523719310760498,
"kl": 0.19376599509269,
"learning_rate": 1e-06,
"loss": -0.6165,
"num_tokens": 769192.0,
"reward": 0.5833333730697632,
"reward_std": 0.49601587653160095,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00017908310110215098,
"clip_ratio/low_min": 0.00017908310110215098,
"clip_ratio/region_mean": 0.00017908310110215098,
"completions/clipped_ratio": 0.25,
"completions/max_length": 109.0,
"completions/max_terminated_length": 109.0,
"completions/mean_length": 81.75,
"completions/mean_terminated_length": 109.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 5.5120000000000005,
"format_failures": 0.0,
"grad_norm": 4.026613235473633,
"kl": 0.32431851979345083,
"learning_rate": 1e-06,
"loss": 0.3918,
"num_tokens": 775912.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0009421340801054612,
"clip_ratio/low_min": 0.0009421340801054612,
"clip_ratio/region_mean": 0.0009421340801054612,
"completions/clipped_ratio": 0.125,
"completions/max_length": 87.0,
"completions/max_terminated_length": 87.0,
"completions/mean_length": 41.625,
"completions/mean_terminated_length": 47.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 9.0,
"epoch": 5.576,
"format_failures": 0.0,
"grad_norm": 9.119647979736328,
"kl": 0.20464181900024414,
"learning_rate": 1e-06,
"loss": -0.8791,
"num_tokens": 781112.0,
"reward": 0.5890151262283325,
"reward_std": 0.4705297350883484,
"step": 89
},
{
"clip_ratio/high_max": 0.00911893486045301,
"clip_ratio/high_mean": 0.00911893486045301,
"clip_ratio/low_mean": 0.000877421407494694,
"clip_ratio/low_min": 0.000877421407494694,
"clip_ratio/region_mean": 0.009996356267947704,
"completions/clipped_ratio": 0.375,
"completions/max_length": 586.0,
"completions/max_terminated_length": 586.0,
"completions/mean_length": 143.375,
"completions/mean_terminated_length": 229.4,
"completions/min_length": 0.0,
"completions/min_terminated_length": 36.0,
"epoch": 5.64,
"format_failures": 0.0,
"grad_norm": 32.418331146240234,
"kl": 0.3150494508445263,
"learning_rate": 1e-06,
"loss": 0.7753,
"num_tokens": 793976.0,
"reward": 0.02777777798473835,
"reward_std": 0.07856742292642593,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 181.0,
"completions/max_terminated_length": 181.0,
"completions/mean_length": 85.5,
"completions/mean_terminated_length": 97.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 14.0,
"epoch": 5.704,
"format_failures": 0.0,
"grad_norm": 1.2142812013626099,
"kl": 0.1678312411531806,
"learning_rate": 1e-06,
"loss": -0.2285,
"num_tokens": 800528.0,
"reward": 0.5806547999382019,
"reward_std": 0.40297815203666687,
"step": 91
},
{
"clip_ratio/high_max": 0.008678364916704595,
"clip_ratio/high_mean": 0.008678364916704595,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.008678364916704595,
"completions/clipped_ratio": 0.125,
"completions/max_length": 195.0,
"completions/max_terminated_length": 195.0,
"completions/mean_length": 119.875,
"completions/mean_terminated_length": 137.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 5.768,
"format_failures": 0.0,
"grad_norm": 8.557031631469727,
"kl": 0.26262282859534025,
"learning_rate": 1e-06,
"loss": -0.786,
"num_tokens": 811680.0,
"reward": 0.375,
"reward_std": 0.5175491571426392,
"step": 92
},
{
"clip_ratio/high_max": 0.0003788308094954118,
"clip_ratio/high_mean": 0.0003788308094954118,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0003788308094954118,
"completions/clipped_ratio": 0.125,
"completions/max_length": 387.0,
"completions/max_terminated_length": 387.0,
"completions/mean_length": 156.0,
"completions/mean_terminated_length": 178.28571428571428,
"completions/min_length": 0.0,
"completions/min_terminated_length": 82.0,
"epoch": 5.832,
"format_failures": 0.0,
"grad_norm": 8.220466613769531,
"kl": 0.14401236828416586,
"learning_rate": 1e-06,
"loss": 1.0757,
"num_tokens": 824400.0,
"reward": 0.3035714328289032,
"reward_std": 0.45456862449645996,
"step": 93
},
{
"clip_ratio/high_max": 0.00015356265066657215,
"clip_ratio/high_mean": 0.00015356265066657215,
"clip_ratio/low_mean": 0.0011615749244811013,
"clip_ratio/low_min": 0.0011615749244811013,
"clip_ratio/region_mean": 0.0013151375751476735,
"completions/clipped_ratio": 0.125,
"completions/max_length": 177.0,
"completions/max_terminated_length": 177.0,
"completions/mean_length": 77.625,
"completions/mean_terminated_length": 88.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 58.0,
"epoch": 5.896,
"format_failures": 0.0,
"grad_norm": 5.23447322845459,
"kl": 0.21856553480029106,
"learning_rate": 1e-06,
"loss": -0.4024,
"num_tokens": 838072.0,
"reward": 0.359375,
"reward_std": 0.469790518283844,
"step": 94
},
{
"clip_ratio/high_max": 0.002261076238937676,
"clip_ratio/high_mean": 0.002261076238937676,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.002261076238937676,
"completions/clipped_ratio": 0.125,
"completions/max_length": 87.0,
"completions/max_terminated_length": 87.0,
"completions/mean_length": 49.625,
"completions/mean_terminated_length": 56.714285714285715,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 5.96,
"format_failures": 0.0,
"grad_norm": 16.173349380493164,
"kl": 0.41087135300040245,
"learning_rate": 1e-06,
"loss": 0.8071,
"num_tokens": 846120.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"epoch": 6.0,
"grad_norm": 2.644404888153076,
"kl": 0.6906098246574401,
"learning_rate": 1e-06,
"loss": 0.0081,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 131.0,
"completions/max_terminated_length": 131.0,
"completions/mean_length": 100.875,
"completions/mean_terminated_length": 115.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 108.0,
"epoch": 6.064,
"format_failures": 0.0,
"grad_norm": 10.684069633483887,
"kl": 1.1826152130961418,
"learning_rate": 1e-06,
"loss": 0.7069,
"num_tokens": 854688.0,
"reward": 0.75,
"reward_std": 0.4629100561141968,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 89.375,
"completions/mean_terminated_length": 102.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 6.128,
"format_failures": 0.0,
"grad_norm": 6.4194746017456055,
"kl": 0.12883292511105537,
"learning_rate": 1e-06,
"loss": -1.1247,
"num_tokens": 863240.0,
"reward": 0.5625,
"reward_std": 0.3720118999481201,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 72.0,
"completions/max_terminated_length": 72.0,
"completions/mean_length": 36.0,
"completions/mean_terminated_length": 72.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 72.0,
"epoch": 6.192,
"format_failures": 0.0,
"grad_norm": 7.2926411628723145,
"kl": 0.1561364121735096,
"learning_rate": 1e-06,
"loss": 1.0999,
"num_tokens": 869160.0,
"reward": 0.875,
"reward_std": 0.3535533845424652,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 48.0,
"completions/max_terminated_length": 48.0,
"completions/mean_length": 11.25,
"completions/mean_terminated_length": 45.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 42.0,
"epoch": 6.256,
"format_failures": 0.0,
"grad_norm": 68.3973159790039,
"kl": 0.22240112535655499,
"learning_rate": 1e-06,
"loss": 6.8633,
"num_tokens": 874632.0,
"reward": 0.5,
"reward_std": 0.5345224738121033,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 90.0,
"completions/max_terminated_length": 90.0,
"completions/mean_length": 59.625,
"completions/mean_terminated_length": 79.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 75.0,
"epoch": 6.32,
"format_failures": 0.0,
"grad_norm": 61.82405090332031,
"kl": 0.04734344594180584,
"learning_rate": 1e-06,
"loss": -7.4582,
"num_tokens": 881504.0,
"reward": 1.0,
"reward_std": 0.0,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 154.0,
"completions/max_terminated_length": 154.0,
"completions/mean_length": 127.625,
"completions/mean_terminated_length": 145.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 6.384,
"format_failures": 1.0,
"grad_norm": 0.07336875051259995,
"kl": 0.056114144157618284,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 889832.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 102
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 889832,
"num_train_epochs": 63,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}