lcb_prog_generator_3b_100steps / trainer_state.json
Harryllh's picture
Upload folder using huggingface_hub
3f2216f verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 391.0,
"completions/max_terminated_length": 391.0,
"completions/mean_length": 202.91666666666666,
"completions/mean_terminated_length": 221.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.004,
"format_failures": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0,
"num_tokens": 18672.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 161.0,
"completions/max_terminated_length": 161.0,
"completions/mean_length": 92.83333333333333,
"completions/mean_terminated_length": 101.27272727272727,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.008,
"format_failures": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 29988.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 151.0,
"completions/max_terminated_length": 151.0,
"completions/mean_length": 52.333333333333336,
"completions/mean_terminated_length": 57.09090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.012,
"format_failures": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 39576.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 326.0,
"completions/max_terminated_length": 326.0,
"completions/mean_length": 161.75,
"completions/mean_terminated_length": 176.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.016,
"format_failures": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 53340.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 120.0,
"completions/max_terminated_length": 120.0,
"completions/mean_length": 75.08333333333333,
"completions/mean_terminated_length": 81.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.02,
"format_failures": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 60420.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 374.0,
"completions/max_terminated_length": 374.0,
"completions/mean_length": 178.0,
"completions/mean_terminated_length": 194.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.024,
"format_failures": 0.0,
"grad_norm": 0.0,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 75348.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 146.0,
"completions/max_terminated_length": 146.0,
"completions/mean_length": 99.08333333333333,
"completions/mean_terminated_length": 108.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.028,
"format_failures": 0.0,
"grad_norm": 1.6362388134002686,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0806,
"num_tokens": 83868.0,
"reward": 0.5833333730697632,
"reward_std": 0.5149286389350891,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 358.0,
"completions/max_terminated_length": 358.0,
"completions/mean_length": 190.0,
"completions/mean_terminated_length": 207.27272727272728,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.032,
"format_failures": 0.0,
"grad_norm": 0.0022762538865208626,
"kl": 0.0005378490750445053,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 97464.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 245.0,
"completions/max_terminated_length": 245.0,
"completions/mean_length": 141.16666666666666,
"completions/mean_terminated_length": 154.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.036,
"format_failures": 0.0,
"grad_norm": 0.007215190213173628,
"kl": 0.0019240143010392785,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 108636.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 266.0,
"completions/max_terminated_length": 266.0,
"completions/mean_length": 178.41666666666666,
"completions/mean_terminated_length": 194.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.04,
"format_failures": 0.0,
"grad_norm": 0.7695807218551636,
"kl": 0.014113324228674173,
"learning_rate": 1e-06,
"loss": -0.002,
"num_tokens": 116256.0,
"reward": 0.1666666716337204,
"reward_std": 0.38924944400787354,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 135.0,
"completions/max_terminated_length": 135.0,
"completions/mean_length": 84.25,
"completions/mean_terminated_length": 91.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.044,
"format_failures": 0.0,
"grad_norm": 1.026847243309021,
"kl": 0.013075211551040411,
"learning_rate": 1e-06,
"loss": -0.0413,
"num_tokens": 124440.0,
"reward": 0.1666666716337204,
"reward_std": 0.38924944400787354,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 199.0,
"completions/max_terminated_length": 199.0,
"completions/mean_length": 99.25,
"completions/mean_terminated_length": 108.27272727272727,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.048,
"format_failures": 1.0,
"grad_norm": 0.011818243190646172,
"kl": 0.003624255710747093,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 132732.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 577.0,
"completions/max_terminated_length": 577.0,
"completions/mean_length": 246.08333333333334,
"completions/mean_terminated_length": 268.45454545454544,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.052,
"format_failures": 0.0,
"grad_norm": 0.022241737693548203,
"kl": 0.00960063119418919,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 152424.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 344.0,
"completions/max_terminated_length": 344.0,
"completions/mean_length": 173.0,
"completions/mean_terminated_length": 188.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.056,
"format_failures": 0.0,
"grad_norm": 0.036054644733667374,
"kl": 0.01761903613805771,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 162636.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 402.0,
"completions/max_terminated_length": 402.0,
"completions/mean_length": 268.1666666666667,
"completions/mean_terminated_length": 292.54545454545456,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.06,
"format_failures": 0.0,
"grad_norm": 0.00860360637307167,
"kl": 0.005409976467490196,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 176904.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 127.0,
"completions/max_terminated_length": 127.0,
"completions/mean_length": 72.16666666666667,
"completions/mean_terminated_length": 78.72727272727273,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.064,
"format_failures": 0.0,
"grad_norm": 0.8470466136932373,
"kl": 0.09470756724476814,
"learning_rate": 1e-06,
"loss": 0.0173,
"num_tokens": 186564.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 245.0,
"completions/max_terminated_length": 245.0,
"completions/mean_length": 118.5,
"completions/mean_terminated_length": 129.27272727272728,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.068,
"format_failures": 0.0,
"grad_norm": 0.10086339712142944,
"kl": 0.04859759844839573,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 197484.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 88.0,
"completions/max_terminated_length": 88.0,
"completions/mean_length": 57.833333333333336,
"completions/mean_terminated_length": 63.09090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.072,
"format_failures": 0.0,
"grad_norm": 1.4592796564102173,
"kl": 0.010172125417739153,
"learning_rate": 1e-06,
"loss": 0.0087,
"num_tokens": 207252.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 57.0,
"completions/max_terminated_length": 57.0,
"completions/mean_length": 32.583333333333336,
"completions/mean_terminated_length": 35.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.076,
"format_failures": 0.0,
"grad_norm": 2.4069900512695312,
"kl": 0.025834742933511734,
"learning_rate": 1e-06,
"loss": -0.0695,
"num_tokens": 214320.0,
"reward": 0.6666666865348816,
"reward_std": 0.4923659861087799,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 386.0,
"completions/max_terminated_length": 386.0,
"completions/mean_length": 192.41666666666666,
"completions/mean_terminated_length": 209.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.08,
"format_failures": 0.0,
"grad_norm": 0.10245665162801743,
"kl": 0.043199990526773036,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 228996.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666666666663,
"completions/max_length": 88.0,
"completions/max_terminated_length": 88.0,
"completions/mean_length": 67.91666666666667,
"completions/mean_terminated_length": 81.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.084,
"format_failures": 1.0,
"grad_norm": 1.388899326324463,
"kl": 0.07192051783204079,
"learning_rate": 1e-06,
"loss": -0.0112,
"num_tokens": 238104.0,
"reward": 0.75,
"reward_std": 0.45226702094078064,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 73.0,
"completions/max_terminated_length": 73.0,
"completions/mean_length": 43.583333333333336,
"completions/mean_terminated_length": 47.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.088,
"format_failures": 0.0,
"grad_norm": 3.2448337078094482,
"kl": 0.0771165993064642,
"learning_rate": 1e-06,
"loss": 0.0077,
"num_tokens": 245280.0,
"reward": 0.5,
"reward_std": 0.5222329497337341,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 335.0,
"completions/max_terminated_length": 335.0,
"completions/mean_length": 167.83333333333334,
"completions/mean_terminated_length": 183.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.092,
"format_failures": 0.0,
"grad_norm": 1.0195705890655518,
"kl": 0.211347796022892,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 257148.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 350.0,
"completions/max_terminated_length": 350.0,
"completions/mean_length": 201.41666666666666,
"completions/mean_terminated_length": 219.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.096,
"format_failures": 0.0,
"grad_norm": 0.20492610335350037,
"kl": 0.08658944815397263,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 266304.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 312.0,
"completions/max_terminated_length": 312.0,
"completions/mean_length": 170.0,
"completions/mean_terminated_length": 185.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.1,
"format_failures": 0.0,
"grad_norm": 0.0755978599190712,
"kl": 0.040397679433226585,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 278760.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 411.0,
"completions/max_terminated_length": 411.0,
"completions/mean_length": 173.83333333333334,
"completions/mean_terminated_length": 189.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.104,
"format_failures": 0.0,
"grad_norm": 0.04659981280565262,
"kl": 0.023209942039102316,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 293628.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 92.0,
"completions/max_terminated_length": 92.0,
"completions/mean_length": 55.25,
"completions/mean_terminated_length": 60.27272727272727,
"completions/min_length": 0.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.108,
"format_failures": 0.0,
"grad_norm": 21.968534469604492,
"kl": 5.299874305725098,
"learning_rate": 1e-06,
"loss": 0.1192,
"num_tokens": 301488.0,
"reward": 0.5833333730697632,
"reward_std": 0.5149286389350891,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 358.0,
"completions/max_terminated_length": 358.0,
"completions/mean_length": 165.41666666666666,
"completions/mean_terminated_length": 180.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.112,
"format_failures": 0.0,
"grad_norm": 0.014507513493299484,
"kl": 0.01523882569745183,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 314748.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 289.0,
"completions/max_terminated_length": 289.0,
"completions/mean_length": 193.33333333333334,
"completions/mean_terminated_length": 210.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 119.0,
"epoch": 0.116,
"format_failures": 0.0,
"grad_norm": 0.010872351005673409,
"kl": 0.010655859019607306,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 328692.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 271.0,
"completions/max_terminated_length": 271.0,
"completions/mean_length": 167.16666666666666,
"completions/mean_terminated_length": 182.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.12,
"format_failures": 0.0,
"grad_norm": 1.0025266408920288,
"kl": 0.025600655004382133,
"learning_rate": 1e-06,
"loss": -0.0472,
"num_tokens": 340752.0,
"reward": 0.3333333432674408,
"reward_std": 0.4923659861087799,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 162.0,
"completions/max_terminated_length": 162.0,
"completions/mean_length": 79.33333333333333,
"completions/mean_terminated_length": 86.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.124,
"format_failures": 0.0,
"grad_norm": 0.01500674244016409,
"kl": 0.006932976422831416,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 366936.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 153.0,
"completions/max_terminated_length": 153.0,
"completions/mean_length": 100.25,
"completions/mean_terminated_length": 109.36363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.128,
"format_failures": 0.0,
"grad_norm": 0.572136640548706,
"kl": 0.016836593858897686,
"learning_rate": 1e-06,
"loss": -0.0253,
"num_tokens": 375948.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 91.0,
"completions/max_terminated_length": 91.0,
"completions/mean_length": 52.75,
"completions/mean_terminated_length": 57.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.132,
"format_failures": 0.0,
"grad_norm": 2.6049137115478516,
"kl": 0.08474422618746758,
"learning_rate": 1e-06,
"loss": -0.0534,
"num_tokens": 382608.0,
"reward": 0.3333333432674408,
"reward_std": 0.4923659861087799,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 98.0,
"completions/max_terminated_length": 98.0,
"completions/mean_length": 61.416666666666664,
"completions/mean_terminated_length": 67.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.136,
"format_failures": 0.0,
"grad_norm": 1.9431159496307373,
"kl": 0.04839755780994892,
"learning_rate": 1e-06,
"loss": -0.1095,
"num_tokens": 389208.0,
"reward": 0.1666666716337204,
"reward_std": 0.38924944400787354,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 151.0,
"completions/max_terminated_length": 151.0,
"completions/mean_length": 118.91666666666667,
"completions/mean_terminated_length": 129.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 105.0,
"epoch": 0.14,
"format_failures": 0.0,
"grad_norm": 0.03593799099326134,
"kl": 0.03462314326316118,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 396696.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 371.0,
"completions/max_terminated_length": 371.0,
"completions/mean_length": 245.58333333333334,
"completions/mean_terminated_length": 267.90909090909093,
"completions/min_length": 0.0,
"completions/min_terminated_length": 183.0,
"epoch": 0.144,
"format_failures": 0.0,
"grad_norm": 0.025885488837957382,
"kl": 0.02637413516640663,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 411372.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 296.0,
"completions/max_terminated_length": 296.0,
"completions/mean_length": 179.41666666666666,
"completions/mean_terminated_length": 195.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.148,
"format_failures": 0.0,
"grad_norm": 0.11734314262866974,
"kl": 0.0526489345356822,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 424404.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 630.0,
"completions/max_terminated_length": 630.0,
"completions/mean_length": 332.0833333333333,
"completions/mean_terminated_length": 362.27272727272725,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.152,
"format_failures": 1.0,
"grad_norm": 0.5079672932624817,
"kl": 0.052276700269430876,
"learning_rate": 1e-06,
"loss": 0.0148,
"num_tokens": 444576.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 148.0,
"completions/max_terminated_length": 148.0,
"completions/mean_length": 67.25,
"completions/mean_terminated_length": 73.36363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.156,
"format_failures": 0.0,
"grad_norm": 2.6541359424591064,
"kl": 0.5338308056816459,
"learning_rate": 1e-06,
"loss": -0.1217,
"num_tokens": 453192.0,
"reward": 0.25,
"reward_std": 0.45226702094078064,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 245.0,
"completions/max_terminated_length": 245.0,
"completions/mean_length": 139.91666666666666,
"completions/mean_terminated_length": 152.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.16,
"format_failures": 0.0,
"grad_norm": 0.3757868707180023,
"kl": 0.13857688568532467,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 467928.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 84.0,
"completions/max_terminated_length": 84.0,
"completions/mean_length": 48.0,
"completions/mean_terminated_length": 52.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.164,
"format_failures": 1.0,
"grad_norm": 4.323275566101074,
"kl": 0.21433213353157043,
"learning_rate": 1e-06,
"loss": -0.0993,
"num_tokens": 473472.0,
"reward": 0.25,
"reward_std": 0.45226702094078064,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 311.0,
"completions/max_terminated_length": 311.0,
"completions/mean_length": 109.58333333333333,
"completions/mean_terminated_length": 119.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.168,
"format_failures": 0.0,
"grad_norm": 0.22781899571418762,
"kl": 0.07318945415318012,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 488148.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 49.0,
"completions/max_terminated_length": 49.0,
"completions/mean_length": 31.166666666666668,
"completions/mean_terminated_length": 34.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.172,
"format_failures": 0.0,
"grad_norm": 2.492840051651001,
"kl": 0.224076546728611,
"learning_rate": 1e-06,
"loss": -0.0283,
"num_tokens": 492624.0,
"reward": 0.25,
"reward_std": 0.45226702094078064,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 66.0,
"completions/max_terminated_length": 66.0,
"completions/mean_length": 52.25,
"completions/mean_terminated_length": 57.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.176,
"format_failures": 0.0,
"grad_norm": 3.037781000137329,
"kl": 0.2150058075785637,
"learning_rate": 1e-06,
"loss": -0.013,
"num_tokens": 499752.0,
"reward": 0.6666666865348816,
"reward_std": 0.4923659861087799,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 144.0,
"completions/max_terminated_length": 144.0,
"completions/mean_length": 83.0,
"completions/mean_terminated_length": 90.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.18,
"format_failures": 0.0,
"grad_norm": 2.3224222660064697,
"kl": 0.36255764216184616,
"learning_rate": 1e-06,
"loss": -0.0749,
"num_tokens": 508428.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 160.0,
"completions/max_terminated_length": 160.0,
"completions/mean_length": 119.0,
"completions/mean_terminated_length": 129.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.184,
"format_failures": 0.0,
"grad_norm": 1.5437301397323608,
"kl": 0.06716796010732651,
"learning_rate": 1e-06,
"loss": 0.0526,
"num_tokens": 517416.0,
"reward": 0.75,
"reward_std": 0.45226702094078064,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 162.0,
"completions/max_terminated_length": 162.0,
"completions/mean_length": 86.66666666666667,
"completions/mean_terminated_length": 94.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.188,
"format_failures": 0.0,
"grad_norm": 1.92403244972229,
"kl": 0.04993921332061291,
"learning_rate": 1e-06,
"loss": -0.0667,
"num_tokens": 525384.0,
"reward": 0.25,
"reward_std": 0.45226702094078064,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 607.0,
"completions/max_terminated_length": 607.0,
"completions/mean_length": 263.0833333333333,
"completions/mean_terminated_length": 287.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.192,
"format_failures": 0.0,
"grad_norm": 0.002583070658147335,
"kl": 0.0069114591460675,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 560328.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 337.0,
"completions/max_terminated_length": 337.0,
"completions/mean_length": 178.16666666666666,
"completions/mean_terminated_length": 194.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 9.0,
"epoch": 0.196,
"format_failures": 0.0,
"grad_norm": 0.055018555372953415,
"kl": 0.04814303293824196,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 573552.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 205.0,
"completions/max_terminated_length": 205.0,
"completions/mean_length": 104.25,
"completions/mean_terminated_length": 113.72727272727273,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.2,
"format_failures": 0.0,
"grad_norm": 0.10304596275091171,
"kl": 0.0782565288245678,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 583980.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 529.0,
"completions/max_terminated_length": 529.0,
"completions/mean_length": 171.25,
"completions/mean_terminated_length": 186.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.204,
"format_failures": 0.0,
"grad_norm": 0.10456845909357071,
"kl": 0.05266672745347023,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 606264.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 142.0,
"completions/max_terminated_length": 142.0,
"completions/mean_length": 86.0,
"completions/mean_terminated_length": 93.81818181818181,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.208,
"format_failures": 0.0,
"grad_norm": 2.117820978164673,
"kl": 0.12709446623921394,
"learning_rate": 1e-06,
"loss": -0.0977,
"num_tokens": 616176.0,
"reward": 0.25,
"reward_std": 0.45226702094078064,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 169.0,
"completions/max_terminated_length": 169.0,
"completions/mean_length": 67.66666666666667,
"completions/mean_terminated_length": 73.81818181818181,
"completions/min_length": 0.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.212,
"format_failures": 0.0,
"grad_norm": 0.36178988218307495,
"kl": 0.06635316368192434,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 625992.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 292.0,
"completions/max_terminated_length": 292.0,
"completions/mean_length": 180.5,
"completions/mean_terminated_length": 196.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.216,
"format_failures": 0.0,
"grad_norm": 5.520895957946777,
"kl": 0.6420021317899227,
"learning_rate": 1e-06,
"loss": 0.0043,
"num_tokens": 640824.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 81.0,
"completions/max_terminated_length": 81.0,
"completions/mean_length": 59.5,
"completions/mean_terminated_length": 64.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.22,
"format_failures": 0.0,
"grad_norm": 9.40858268737793,
"kl": 1.514443002641201,
"learning_rate": 1e-06,
"loss": -0.0487,
"num_tokens": 649008.0,
"reward": 0.5833333730697632,
"reward_std": 0.5149286389350891,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 647.0,
"completions/max_terminated_length": 647.0,
"completions/mean_length": 265.25,
"completions/mean_terminated_length": 289.3636363636364,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.224,
"format_failures": 0.0,
"grad_norm": 0.12246920168399811,
"kl": 0.04888852685689926,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 665112.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666666666663,
"completions/max_length": 90.0,
"completions/max_terminated_length": 90.0,
"completions/mean_length": 54.416666666666664,
"completions/mean_terminated_length": 65.3,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.228,
"format_failures": 0.0,
"grad_norm": 0.4643149971961975,
"kl": 0.2062125913798809,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 671268.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 285.0,
"completions/max_terminated_length": 285.0,
"completions/mean_length": 138.83333333333334,
"completions/mean_terminated_length": 151.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.232,
"format_failures": 0.0,
"grad_norm": 0.028489232063293457,
"kl": 0.028692953288555145,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 681648.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 189.0,
"completions/max_terminated_length": 189.0,
"completions/mean_length": 119.33333333333333,
"completions/mean_terminated_length": 130.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.236,
"format_failures": 1.0,
"grad_norm": 0.2943709194660187,
"kl": 0.021217118948698044,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 692148.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 123.0,
"completions/max_terminated_length": 123.0,
"completions/mean_length": 65.25,
"completions/mean_terminated_length": 71.18181818181819,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.24,
"format_failures": 1.0,
"grad_norm": 0.4704815149307251,
"kl": 0.1355944722890854,
"learning_rate": 1e-06,
"loss": 0.0016,
"num_tokens": 705504.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 209.0,
"completions/max_terminated_length": 209.0,
"completions/mean_length": 106.75,
"completions/mean_terminated_length": 116.45454545454545,
"completions/min_length": 0.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.244,
"format_failures": 0.0,
"grad_norm": 0.8021370768547058,
"kl": 0.06047418341040611,
"learning_rate": 1e-06,
"loss": 0.0229,
"num_tokens": 712920.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 356.0,
"completions/max_terminated_length": 356.0,
"completions/mean_length": 201.83333333333334,
"completions/mean_terminated_length": 220.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.248,
"format_failures": 0.0,
"grad_norm": 0.0354565754532814,
"kl": 0.051246967166662216,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 725280.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 260.0,
"completions/max_terminated_length": 260.0,
"completions/mean_length": 129.0,
"completions/mean_terminated_length": 140.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.252,
"format_failures": 0.0,
"grad_norm": 0.838399350643158,
"kl": 0.03389432094991207,
"learning_rate": 1e-06,
"loss": 0.0256,
"num_tokens": 733980.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 227.0,
"completions/max_terminated_length": 227.0,
"completions/mean_length": 121.08333333333333,
"completions/mean_terminated_length": 132.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.256,
"format_failures": 0.0,
"grad_norm": 0.008542679250240326,
"kl": 0.02384038269519806,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 763224.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 302.0,
"completions/max_terminated_length": 302.0,
"completions/mean_length": 180.33333333333334,
"completions/mean_terminated_length": 196.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.26,
"format_failures": 0.0,
"grad_norm": 0.01127433218061924,
"kl": 0.013883833773434162,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 777984.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 75.83333333333333,
"completions/mean_terminated_length": 82.72727272727273,
"completions/min_length": 0.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.264,
"format_failures": 0.0,
"grad_norm": 0.09972423315048218,
"kl": 0.06396586634218693,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 785844.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 222.0,
"completions/max_terminated_length": 222.0,
"completions/mean_length": 140.41666666666666,
"completions/mean_terminated_length": 153.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.268,
"format_failures": 0.0,
"grad_norm": 0.03430556878447533,
"kl": 0.03857766184955835,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 796632.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 209.0,
"completions/max_terminated_length": 209.0,
"completions/mean_length": 115.08333333333333,
"completions/mean_terminated_length": 125.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.272,
"format_failures": 0.0,
"grad_norm": 1.6054855585098267,
"kl": 0.020691730547696352,
"learning_rate": 1e-06,
"loss": 0.0511,
"num_tokens": 807576.0,
"reward": 0.4166666865348816,
"reward_std": 0.5149286389350891,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 106.0,
"completions/max_terminated_length": 106.0,
"completions/mean_length": 80.5,
"completions/mean_terminated_length": 87.81818181818181,
"completions/min_length": 0.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.276,
"format_failures": 0.0,
"grad_norm": 1.1459321975708008,
"kl": 0.017325148917734623,
"learning_rate": 1e-06,
"loss": 0.0067,
"num_tokens": 814644.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 304.0,
"completions/max_terminated_length": 304.0,
"completions/mean_length": 175.58333333333334,
"completions/mean_terminated_length": 191.54545454545453,
"completions/min_length": 0.0,
"completions/min_terminated_length": 17.0,
"epoch": 0.28,
"format_failures": 0.0,
"grad_norm": 0.008818876929581165,
"kl": 0.012372600380331278,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 826932.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 292.0,
"completions/max_terminated_length": 292.0,
"completions/mean_length": 195.0,
"completions/mean_terminated_length": 212.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.284,
"format_failures": 0.0,
"grad_norm": 0.014721119776368141,
"kl": 0.012880454771220684,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 842268.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 145.0,
"completions/max_terminated_length": 145.0,
"completions/mean_length": 94.83333333333333,
"completions/mean_terminated_length": 103.45454545454545,
"completions/min_length": 0.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.288,
"format_failures": 0.0,
"grad_norm": 0.9220354557037354,
"kl": 0.046924193389713764,
"learning_rate": 1e-06,
"loss": 0.0118,
"num_tokens": 849612.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 262.0,
"completions/max_terminated_length": 262.0,
"completions/mean_length": 149.5,
"completions/mean_terminated_length": 163.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.292,
"format_failures": 0.0,
"grad_norm": 0.0295345988124609,
"kl": 0.03905524965375662,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 859632.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 220.0,
"completions/max_terminated_length": 220.0,
"completions/mean_length": 158.41666666666666,
"completions/mean_terminated_length": 172.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.296,
"format_failures": 0.0,
"grad_norm": 0.11439846456050873,
"kl": 0.07962214201688766,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 870756.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 223.0,
"completions/max_terminated_length": 223.0,
"completions/mean_length": 110.08333333333333,
"completions/mean_terminated_length": 120.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.3,
"format_failures": 0.0,
"grad_norm": 0.04706709831953049,
"kl": 0.03136777225881815,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 887700.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 544.0,
"completions/max_terminated_length": 544.0,
"completions/mean_length": 219.58333333333334,
"completions/mean_terminated_length": 239.54545454545453,
"completions/min_length": 0.0,
"completions/min_terminated_length": 8.0,
"epoch": 0.304,
"format_failures": 0.0,
"grad_norm": 0.106910839676857,
"kl": 0.16153255105018616,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 899544.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 175.0,
"completions/max_terminated_length": 175.0,
"completions/mean_length": 118.66666666666667,
"completions/mean_terminated_length": 129.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.308,
"format_failures": 0.0,
"grad_norm": 0.9582226276397705,
"kl": 0.1435188725590706,
"learning_rate": 1e-06,
"loss": 0.0297,
"num_tokens": 909816.0,
"reward": 0.1666666716337204,
"reward_std": 0.38924944400787354,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 200.0,
"completions/max_terminated_length": 200.0,
"completions/mean_length": 151.83333333333334,
"completions/mean_terminated_length": 165.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.312,
"format_failures": 0.0,
"grad_norm": 0.6430385112762451,
"kl": 0.021885435096919537,
"learning_rate": 1e-06,
"loss": -0.0413,
"num_tokens": 919620.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 127.0,
"completions/max_terminated_length": 127.0,
"completions/mean_length": 87.66666666666667,
"completions/mean_terminated_length": 95.63636363636364,
"completions/min_length": 0.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.316,
"format_failures": 0.0,
"grad_norm": 0.1316368579864502,
"kl": 0.052431097254157066,
"learning_rate": 1e-06,
"loss": 0.0005,
"num_tokens": 930468.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 357.0,
"completions/max_terminated_length": 357.0,
"completions/mean_length": 188.66666666666666,
"completions/mean_terminated_length": 205.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.32,
"format_failures": 0.0,
"grad_norm": 0.24080750346183777,
"kl": 0.25305451452732086,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 947112.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 171.0,
"completions/max_terminated_length": 171.0,
"completions/mean_length": 130.75,
"completions/mean_terminated_length": 142.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.324,
"format_failures": 0.0,
"grad_norm": 0.9585680961608887,
"kl": 0.02085646940395236,
"learning_rate": 1e-06,
"loss": -0.0725,
"num_tokens": 956448.0,
"reward": 0.5,
"reward_std": 0.5222329497337341,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 220.0,
"completions/max_terminated_length": 220.0,
"completions/mean_length": 119.25,
"completions/mean_terminated_length": 130.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.328,
"format_failures": 0.0,
"grad_norm": 0.02760450914502144,
"kl": 0.020923216827213764,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 966324.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 476.0,
"completions/max_terminated_length": 476.0,
"completions/mean_length": 252.33333333333334,
"completions/mean_terminated_length": 275.27272727272725,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.332,
"format_failures": 1.0,
"grad_norm": 0.011845018714666367,
"kl": 0.017354148440063,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 985296.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 234.0,
"completions/max_terminated_length": 234.0,
"completions/mean_length": 101.16666666666667,
"completions/mean_terminated_length": 110.36363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.336,
"format_failures": 1.0,
"grad_norm": 0.02075113356113434,
"kl": 0.013977942056953907,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 998856.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 217.0,
"completions/max_terminated_length": 217.0,
"completions/mean_length": 156.0,
"completions/mean_terminated_length": 170.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 44.0,
"epoch": 0.34,
"format_failures": 0.0,
"grad_norm": 0.018603280186653137,
"kl": 0.020112676545977592,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 1008864.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 364.0,
"completions/max_terminated_length": 364.0,
"completions/mean_length": 223.66666666666666,
"completions/mean_terminated_length": 244.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.344,
"format_failures": 0.0,
"grad_norm": 0.011895284056663513,
"kl": 0.021254747174680233,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 1022556.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 159.0,
"completions/max_terminated_length": 159.0,
"completions/mean_length": 117.66666666666667,
"completions/mean_terminated_length": 128.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.348,
"format_failures": 0.0,
"grad_norm": 1.1451243162155151,
"kl": 0.026615198701620102,
"learning_rate": 1e-06,
"loss": 0.0587,
"num_tokens": 1032684.0,
"reward": 0.4166666865348816,
"reward_std": 0.5149286389350891,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 304.0,
"completions/max_terminated_length": 304.0,
"completions/mean_length": 142.25,
"completions/mean_terminated_length": 155.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.352,
"format_failures": 2.0,
"grad_norm": 0.8502682447433472,
"kl": 0.012907921802252531,
"learning_rate": 1e-06,
"loss": 0.079,
"num_tokens": 1067328.0,
"reward": 0.1666666716337204,
"reward_std": 0.3892494738101959,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 177.0,
"completions/max_terminated_length": 177.0,
"completions/mean_length": 114.5,
"completions/mean_terminated_length": 124.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 78.0,
"epoch": 0.356,
"format_failures": 0.0,
"grad_norm": 0.7634170651435852,
"kl": 0.08245750516653061,
"learning_rate": 1e-06,
"loss": 0.0177,
"num_tokens": 1074756.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 73.0,
"completions/mean_terminated_length": 79.63636363636364,
"completions/min_length": 0.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.36,
"format_failures": 0.0,
"grad_norm": 0.750490665435791,
"kl": 0.032081443816423416,
"learning_rate": 1e-06,
"loss": 0.0503,
"num_tokens": 1083096.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 138.0,
"completions/max_terminated_length": 138.0,
"completions/mean_length": 53.666666666666664,
"completions/mean_terminated_length": 58.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.364,
"format_failures": 0.0,
"grad_norm": 0.12016791850328445,
"kl": 0.04432140104472637,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 1090380.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 800.0,
"completions/max_terminated_length": 800.0,
"completions/mean_length": 397.5,
"completions/mean_terminated_length": 433.6363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.368,
"format_failures": 0.0,
"grad_norm": 0.012203319929540157,
"kl": 0.009247956797480583,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 1113504.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 261.0,
"completions/max_terminated_length": 261.0,
"completions/mean_length": 138.75,
"completions/mean_terminated_length": 151.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.372,
"format_failures": 0.0,
"grad_norm": 0.03371990844607353,
"kl": 0.029644143767654896,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 1125492.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 89.0,
"completions/max_terminated_length": 89.0,
"completions/mean_length": 51.833333333333336,
"completions/mean_terminated_length": 56.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.376,
"format_failures": 0.0,
"grad_norm": 2.027597427368164,
"kl": 0.19823284726589918,
"learning_rate": 1e-06,
"loss": -0.0712,
"num_tokens": 1130748.0,
"reward": 0.75,
"reward_std": 0.45226702094078064,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 118.0,
"completions/max_terminated_length": 118.0,
"completions/mean_length": 74.33333333333333,
"completions/mean_terminated_length": 81.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.38,
"format_failures": 0.0,
"grad_norm": 1.4458988904953003,
"kl": 0.07499337941408157,
"learning_rate": 1e-06,
"loss": -0.0187,
"num_tokens": 1138584.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 285.0,
"completions/max_terminated_length": 285.0,
"completions/mean_length": 137.66666666666666,
"completions/mean_terminated_length": 150.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.384,
"format_failures": 0.0,
"grad_norm": 0.03813532739877701,
"kl": 0.023914064280688763,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 1154124.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 206.0,
"completions/max_terminated_length": 206.0,
"completions/mean_length": 125.66666666666667,
"completions/mean_terminated_length": 137.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.388,
"format_failures": 1.0,
"grad_norm": 0.016639724373817444,
"kl": 0.019042176194489002,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 1165560.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 209.0,
"completions/max_terminated_length": 209.0,
"completions/mean_length": 136.0,
"completions/mean_terminated_length": 148.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.392,
"format_failures": 0.0,
"grad_norm": 0.041289571672677994,
"kl": 0.025019565597176552,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 1176936.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 539.0,
"completions/max_terminated_length": 539.0,
"completions/mean_length": 236.75,
"completions/mean_terminated_length": 258.27272727272725,
"completions/min_length": 0.0,
"completions/min_terminated_length": 10.0,
"epoch": 0.396,
"format_failures": 0.0,
"grad_norm": 0.029155507683753967,
"kl": 0.03094907756894827,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 1194108.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 31.0,
"completions/max_terminated_length": 31.0,
"completions/mean_length": 21.0,
"completions/mean_terminated_length": 22.90909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.4,
"format_failures": 0.0,
"grad_norm": 5.876866340637207,
"kl": 0.1100139394402504,
"learning_rate": 1e-06,
"loss": 0.0323,
"num_tokens": 1202412.0,
"reward": 0.75,
"reward_std": 0.45226702094078064,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 1202412,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}