lcb_test_generator_3b_100steps / trainer_state.json
Harryllh's picture
Upload folder using huggingface_hub
372bae8 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.4,
"eval_steps": 500,
"global_step": 100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 434.0,
"completions/max_terminated_length": 434.0,
"completions/mean_length": 293.75,
"completions/mean_terminated_length": 335.7142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 184.0,
"epoch": 0.004,
"format_failures": 0.0,
"grad_norm": 0.5197089910507202,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.0278,
"num_tokens": 9800.0,
"reward": 0.3660714328289032,
"reward_std": 0.36236491799354553,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 278.0,
"completions/max_terminated_length": 278.0,
"completions/mean_length": 134.875,
"completions/mean_terminated_length": 154.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 51.0,
"epoch": 0.008,
"format_failures": 0.0,
"grad_norm": 1.8656461238861084,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.1584,
"num_tokens": 19920.0,
"reward": 0.34375,
"reward_std": 0.48065245151519775,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 261.0,
"completions/max_terminated_length": 261.0,
"completions/mean_length": 176.625,
"completions/mean_terminated_length": 201.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 138.0,
"epoch": 0.012,
"format_failures": 0.0,
"grad_norm": 7.7805867195129395,
"kl": 1.0173164680600166,
"learning_rate": 1e-06,
"loss": 0.0063,
"num_tokens": 28896.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 332.0,
"completions/max_terminated_length": 332.0,
"completions/mean_length": 216.625,
"completions/mean_terminated_length": 247.57142857142858,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.016,
"format_failures": 0.0,
"grad_norm": 0.34460729360580444,
"kl": 0.005293647991493344,
"learning_rate": 1e-06,
"loss": 0.0149,
"num_tokens": 35688.0,
"reward": 0.316850483417511,
"reward_std": 0.19629573822021484,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 107.75,
"completions/mean_terminated_length": 123.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.02,
"format_failures": 0.0,
"grad_norm": 1.950016975402832,
"kl": 0.19140876829624176,
"learning_rate": 1e-06,
"loss": -0.0265,
"num_tokens": 44320.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 480.0,
"completions/max_terminated_length": 480.0,
"completions/mean_length": 347.375,
"completions/mean_terminated_length": 397.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 316.0,
"epoch": 0.024,
"format_failures": 0.0,
"grad_norm": 0.27606070041656494,
"kl": 0.004609360825270414,
"learning_rate": 1e-06,
"loss": 0.019,
"num_tokens": 55480.0,
"reward": 0.20555555820465088,
"reward_std": 0.22662308812141418,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 98.0,
"completions/max_terminated_length": 98.0,
"completions/mean_length": 54.75,
"completions/mean_terminated_length": 62.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.028,
"format_failures": 0.0,
"grad_norm": 1.512669563293457,
"kl": 0.0004560185334412381,
"learning_rate": 1e-06,
"loss": 0.1926,
"num_tokens": 76568.0,
"reward": 0.0416666679084301,
"reward_std": 0.1178511381149292,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 380.0,
"completions/max_terminated_length": 380.0,
"completions/mean_length": 189.75,
"completions/mean_terminated_length": 216.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.032,
"format_failures": 0.0,
"grad_norm": 1.6258090734481812,
"kl": 0.133640818297863,
"learning_rate": 1e-06,
"loss": 0.0094,
"num_tokens": 88120.0,
"reward": 0.05000000074505806,
"reward_std": 0.1414213478565216,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1412.0,
"completions/max_terminated_length": 1412.0,
"completions/mean_length": 426.125,
"completions/mean_terminated_length": 487.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.036,
"format_failures": 1.0,
"grad_norm": 0.3745494782924652,
"kl": 0.0010488361003808677,
"learning_rate": 1e-06,
"loss": -0.1003,
"num_tokens": 110584.0,
"reward": 0.05859375,
"reward_std": 0.1657281517982483,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 62.0,
"completions/max_terminated_length": 62.0,
"completions/mean_length": 41.25,
"completions/mean_terminated_length": 47.142857142857146,
"completions/min_length": 0.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.04,
"format_failures": 0.0,
"grad_norm": 6.635150909423828,
"kl": 1.000607669353485,
"learning_rate": 1e-06,
"loss": -0.0558,
"num_tokens": 115888.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 126.0,
"completions/max_terminated_length": 126.0,
"completions/mean_length": 60.25,
"completions/mean_terminated_length": 96.4,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.044,
"format_failures": 0.0,
"grad_norm": 5.5436906814575195,
"kl": 0.534478023648262,
"learning_rate": 1e-06,
"loss": -0.1301,
"num_tokens": 123984.0,
"reward": 0.375,
"reward_std": 0.5175491571426392,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2047.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 702.625,
"completions/mean_terminated_length": 936.8333333333334,
"completions/min_length": 0.0,
"completions/min_terminated_length": 341.0,
"epoch": 0.048,
"format_failures": 0.0,
"grad_norm": 0.34704723954200745,
"kl": 0.0009783765999600291,
"learning_rate": 1e-06,
"loss": 0.0431,
"num_tokens": 146192.0,
"reward": 0.38749998807907104,
"reward_std": 0.4181165099143982,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 122.0,
"completions/max_terminated_length": 122.0,
"completions/mean_length": 40.375,
"completions/mean_terminated_length": 46.142857142857146,
"completions/min_length": 0.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.052,
"format_failures": 0.0,
"grad_norm": 0.004240340553224087,
"kl": 0.004628603579476476,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 166896.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 973.0,
"completions/max_terminated_length": 973.0,
"completions/mean_length": 452.5,
"completions/mean_terminated_length": 517.1428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 275.0,
"epoch": 0.056,
"format_failures": 0.0,
"grad_norm": 0.18779706954956055,
"kl": 0.0052806169260293245,
"learning_rate": 1e-06,
"loss": 0.0313,
"num_tokens": 185392.0,
"reward": 0.11513157933950424,
"reward_std": 0.16955535113811493,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 304.0,
"completions/max_terminated_length": 304.0,
"completions/mean_length": 202.0,
"completions/mean_terminated_length": 230.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.06,
"format_failures": 0.0,
"grad_norm": 0.6387383341789246,
"kl": 0.02643415331840515,
"learning_rate": 1e-06,
"loss": 0.0717,
"num_tokens": 193056.0,
"reward": 0.53125,
"reward_std": 0.31045761704444885,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 203.0,
"completions/max_terminated_length": 203.0,
"completions/mean_length": 151.25,
"completions/mean_terminated_length": 172.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.064,
"format_failures": 0.0,
"grad_norm": 0.2569343149662018,
"kl": 0.09986447170376778,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 201256.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 295.0,
"completions/max_terminated_length": 295.0,
"completions/mean_length": 192.0,
"completions/mean_terminated_length": 219.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.068,
"format_failures": 1.0,
"grad_norm": 0.04395958036184311,
"kl": 0.027548893354833126,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 209920.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 44.0,
"completions/max_terminated_length": 44.0,
"completions/mean_length": 20.125,
"completions/mean_terminated_length": 40.25,
"completions/min_length": 0.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.072,
"format_failures": 0.0,
"grad_norm": 0.16681237518787384,
"kl": 0.03394318092614412,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 214144.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 527.0,
"completions/max_terminated_length": 527.0,
"completions/mean_length": 215.75,
"completions/mean_terminated_length": 246.57142857142858,
"completions/min_length": 0.0,
"completions/min_terminated_length": 18.0,
"epoch": 0.076,
"format_failures": 0.0,
"grad_norm": 0.5867045521736145,
"kl": 0.00954199954867363,
"learning_rate": 1e-06,
"loss": -0.2047,
"num_tokens": 234096.0,
"reward": 0.1666666716337204,
"reward_std": 0.35634833574295044,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 169.0,
"completions/max_terminated_length": 169.0,
"completions/mean_length": 91.75,
"completions/mean_terminated_length": 104.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.08,
"format_failures": 0.0,
"grad_norm": 2.331188917160034,
"kl": 0.05314544588327408,
"learning_rate": 1e-06,
"loss": 0.048,
"num_tokens": 243464.0,
"reward": 0.21875,
"reward_std": 0.36443448066711426,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 130.0,
"completions/max_terminated_length": 130.0,
"completions/mean_length": 81.25,
"completions/mean_terminated_length": 92.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.084,
"format_failures": 0.0,
"grad_norm": 1.2006300687789917,
"kl": 0.07363329455256462,
"learning_rate": 1e-06,
"loss": 0.0094,
"num_tokens": 250720.0,
"reward": 0.21875,
"reward_std": 0.33905068039894104,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 197.0,
"completions/max_terminated_length": 197.0,
"completions/mean_length": 82.0,
"completions/mean_terminated_length": 93.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 27.0,
"epoch": 0.088,
"format_failures": 0.0,
"grad_norm": 1.3736180067062378,
"kl": 0.04446508176624775,
"learning_rate": 1e-06,
"loss": -0.0541,
"num_tokens": 257944.0,
"reward": 0.0535714291036129,
"reward_std": 0.15152288973331451,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 544.0,
"completions/max_terminated_length": 544.0,
"completions/mean_length": 242.75,
"completions/mean_terminated_length": 277.42857142857144,
"completions/min_length": 0.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.092,
"format_failures": 0.0,
"grad_norm": 0.9332400560379028,
"kl": 0.026759919710457325,
"learning_rate": 1e-06,
"loss": -0.0979,
"num_tokens": 270512.0,
"reward": 0.17383432388305664,
"reward_std": 0.5423066020011902,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 334.0,
"completions/max_terminated_length": 334.0,
"completions/mean_length": 193.875,
"completions/mean_terminated_length": 221.57142857142858,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.096,
"format_failures": 0.0,
"grad_norm": 0.5741273164749146,
"kl": 0.061491173692047596,
"learning_rate": 1e-06,
"loss": 0.0724,
"num_tokens": 279544.0,
"reward": 0.3214285969734192,
"reward_std": 0.3162277638912201,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 191.0,
"completions/max_terminated_length": 191.0,
"completions/mean_length": 131.625,
"completions/mean_terminated_length": 150.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.1,
"format_failures": 0.0,
"grad_norm": 0.8438379168510437,
"kl": 0.10757053177803755,
"learning_rate": 1e-06,
"loss": -0.0168,
"num_tokens": 285872.0,
"reward": 0.3083333373069763,
"reward_std": 0.3443548381328583,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 345.0,
"completions/max_terminated_length": 345.0,
"completions/mean_length": 224.0,
"completions/mean_terminated_length": 256.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 151.0,
"epoch": 0.104,
"format_failures": 0.0,
"grad_norm": 0.6450461149215698,
"kl": 0.04460714943706989,
"learning_rate": 1e-06,
"loss": 0.0276,
"num_tokens": 293816.0,
"reward": 0.3494505286216736,
"reward_std": 0.3268265724182129,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 210.0,
"completions/max_terminated_length": 210.0,
"completions/mean_length": 110.375,
"completions/mean_terminated_length": 126.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.108,
"format_failures": 0.0,
"grad_norm": 0.17123964428901672,
"kl": 0.09914526715874672,
"learning_rate": 1e-06,
"loss": 0.0006,
"num_tokens": 300160.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 128.0,
"completions/max_terminated_length": 128.0,
"completions/mean_length": 82.5,
"completions/mean_terminated_length": 94.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.112,
"format_failures": 0.0,
"grad_norm": 0.9953401684761047,
"kl": 0.18897472321987152,
"learning_rate": 1e-06,
"loss": 0.002,
"num_tokens": 307720.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 335.0,
"completions/max_terminated_length": 335.0,
"completions/mean_length": 229.375,
"completions/mean_terminated_length": 262.14285714285717,
"completions/min_length": 0.0,
"completions/min_terminated_length": 187.0,
"epoch": 0.116,
"format_failures": 0.0,
"grad_norm": 2.1179044246673584,
"kl": 0.013377793598920107,
"learning_rate": 1e-06,
"loss": 0.3156,
"num_tokens": 328920.0,
"reward": 0.3519230782985687,
"reward_std": 0.3794543743133545,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 184.0,
"completions/max_terminated_length": 184.0,
"completions/mean_length": 131.375,
"completions/mean_terminated_length": 150.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 14.0,
"epoch": 0.12,
"format_failures": 0.0,
"grad_norm": 1.2885483503341675,
"kl": 0.009146903175860643,
"learning_rate": 1e-06,
"loss": -0.0387,
"num_tokens": 335880.0,
"reward": 0.25,
"reward_std": 0.4629100561141968,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1936.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 410.0,
"completions/mean_terminated_length": 468.57142857142856,
"completions/min_length": 0.0,
"completions/min_terminated_length": 15.0,
"epoch": 0.124,
"format_failures": 1.0,
"grad_norm": 1.5897152423858643,
"kl": 0.06828754395246506,
"learning_rate": 1e-06,
"loss": 0.0215,
"num_tokens": 358104.0,
"reward": 0.45494991540908813,
"reward_std": 0.48848965764045715,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 366.0,
"completions/max_terminated_length": 366.0,
"completions/mean_length": 202.375,
"completions/mean_terminated_length": 231.28571428571428,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.128,
"format_failures": 0.0,
"grad_norm": 0.8364682793617249,
"kl": 0.12048156931996346,
"learning_rate": 1e-06,
"loss": 0.0898,
"num_tokens": 365656.0,
"reward": 0.4521104097366333,
"reward_std": 0.2924821972846985,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 68.0,
"completions/max_terminated_length": 68.0,
"completions/mean_length": 48.875,
"completions/mean_terminated_length": 55.857142857142854,
"completions/min_length": 0.0,
"completions/min_terminated_length": 46.0,
"epoch": 0.132,
"format_failures": 0.0,
"grad_norm": 1.7178492546081543,
"kl": 0.13572826609015465,
"learning_rate": 1e-06,
"loss": -0.0249,
"num_tokens": 371392.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 435.0,
"completions/max_terminated_length": 435.0,
"completions/mean_length": 293.5,
"completions/mean_terminated_length": 335.42857142857144,
"completions/min_length": 0.0,
"completions/min_terminated_length": 210.0,
"epoch": 0.136,
"format_failures": 1.0,
"grad_norm": 0.9806227087974548,
"kl": 0.012222900055348873,
"learning_rate": 1e-06,
"loss": 0.3233,
"num_tokens": 392240.0,
"reward": 0.47658732533454895,
"reward_std": 0.4081757962703705,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 97.0,
"completions/max_terminated_length": 97.0,
"completions/mean_length": 64.875,
"completions/mean_terminated_length": 74.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 43.0,
"epoch": 0.14,
"format_failures": 0.0,
"grad_norm": 0.8304542303085327,
"kl": 0.031799230724573135,
"learning_rate": 1e-06,
"loss": 0.0113,
"num_tokens": 396792.0,
"reward": 0.6166666746139526,
"reward_std": 0.31773003935813904,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 265.0,
"completions/max_terminated_length": 265.0,
"completions/mean_length": 114.25,
"completions/mean_terminated_length": 130.57142857142858,
"completions/min_length": 0.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.144,
"format_failures": 0.0,
"grad_norm": 1.793579339981079,
"kl": 0.6158746182918549,
"learning_rate": 1e-06,
"loss": 0.0043,
"num_tokens": 404472.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 233.0,
"completions/max_terminated_length": 233.0,
"completions/mean_length": 169.75,
"completions/mean_terminated_length": 194.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 122.0,
"epoch": 0.148,
"format_failures": 0.0,
"grad_norm": 0.3936280906200409,
"kl": 0.04245052766054869,
"learning_rate": 1e-06,
"loss": -0.0153,
"num_tokens": 411600.0,
"reward": 0.5294643044471741,
"reward_std": 0.21430060267448425,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 152.0,
"completions/max_terminated_length": 152.0,
"completions/mean_length": 74.625,
"completions/mean_terminated_length": 85.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.152,
"format_failures": 0.0,
"grad_norm": 0.592628002166748,
"kl": 0.14406441897153854,
"learning_rate": 1e-06,
"loss": -0.0363,
"num_tokens": 417456.0,
"reward": 0.0555555559694767,
"reward_std": 0.11878278106451035,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 726.0,
"completions/max_terminated_length": 726.0,
"completions/mean_length": 330.25,
"completions/mean_terminated_length": 377.42857142857144,
"completions/min_length": 0.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.156,
"format_failures": 0.0,
"grad_norm": 0.7340777516365051,
"kl": 0.02144559659063816,
"learning_rate": 1e-06,
"loss": 0.0557,
"num_tokens": 439208.0,
"reward": 0.10000000149011612,
"reward_std": 0.2828426957130432,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 625.0,
"completions/max_terminated_length": 625.0,
"completions/mean_length": 336.0,
"completions/mean_terminated_length": 384.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.16,
"format_failures": 0.0,
"grad_norm": 0.32950443029403687,
"kl": 0.018678720109164715,
"learning_rate": 1e-06,
"loss": 0.1579,
"num_tokens": 464616.0,
"reward": 0.68376624584198,
"reward_std": 0.16028425097465515,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 91.0,
"completions/max_terminated_length": 91.0,
"completions/mean_length": 53.75,
"completions/mean_terminated_length": 61.42857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 39.0,
"epoch": 0.164,
"format_failures": 0.0,
"grad_norm": 15.617924690246582,
"kl": 2.1802964210510254,
"learning_rate": 1e-06,
"loss": -0.1623,
"num_tokens": 473272.0,
"reward": 0.4464285671710968,
"reward_std": 0.49744242429733276,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 74.0,
"completions/max_terminated_length": 74.0,
"completions/mean_length": 62.625,
"completions/mean_terminated_length": 71.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.168,
"format_failures": 0.0,
"grad_norm": 0.5167672634124756,
"kl": 0.192179337143898,
"learning_rate": 1e-06,
"loss": 0.0018,
"num_tokens": 477896.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 187.0,
"completions/max_terminated_length": 187.0,
"completions/mean_length": 124.625,
"completions/mean_terminated_length": 142.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.172,
"format_failures": 1.0,
"grad_norm": 1.7434178590774536,
"kl": 0.43839313089847565,
"learning_rate": 1e-06,
"loss": -0.0081,
"num_tokens": 485584.0,
"reward": 0.1041666716337204,
"reward_std": 0.19795581698417664,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 53.0,
"completions/max_terminated_length": 53.0,
"completions/mean_length": 21.5,
"completions/mean_terminated_length": 43.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 33.0,
"epoch": 0.176,
"format_failures": 0.0,
"grad_norm": 0.19118274748325348,
"kl": 0.021482082083821297,
"learning_rate": 1e-06,
"loss": 0.0007,
"num_tokens": 491072.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 184.0,
"completions/max_terminated_length": 184.0,
"completions/mean_length": 101.375,
"completions/mean_terminated_length": 115.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.18,
"format_failures": 0.0,
"grad_norm": 0.5414936542510986,
"kl": 0.23846322298049927,
"learning_rate": 1e-06,
"loss": 0.0026,
"num_tokens": 501048.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 184.0,
"completions/max_terminated_length": 184.0,
"completions/mean_length": 105.25,
"completions/mean_terminated_length": 120.28571428571429,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.184,
"format_failures": 0.0,
"grad_norm": 1.3124736547470093,
"kl": 0.02640421688556671,
"learning_rate": 1e-06,
"loss": 0.0418,
"num_tokens": 509688.0,
"reward": 0.3333333432674408,
"reward_std": 0.35634833574295044,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 308.0,
"completions/max_terminated_length": 308.0,
"completions/mean_length": 222.625,
"completions/mean_terminated_length": 254.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.188,
"format_failures": 0.0,
"grad_norm": 0.6642023324966431,
"kl": 0.038137754425406456,
"learning_rate": 1e-06,
"loss": -0.0281,
"num_tokens": 516136.0,
"reward": 0.5722222328186035,
"reward_std": 0.3752013146877289,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 220.0,
"completions/max_terminated_length": 220.0,
"completions/mean_length": 139.0,
"completions/mean_terminated_length": 158.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.192,
"format_failures": 0.0,
"grad_norm": 1.5801048278808594,
"kl": 0.31588232330977917,
"learning_rate": 1e-06,
"loss": -0.0356,
"num_tokens": 525216.0,
"reward": 0.16785714030265808,
"reward_std": 0.3453776240348816,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 142.0,
"completions/max_terminated_length": 142.0,
"completions/mean_length": 103.0,
"completions/mean_terminated_length": 117.71428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.196,
"format_failures": 0.0,
"grad_norm": 1.5228773355484009,
"kl": 0.3656068593263626,
"learning_rate": 1e-06,
"loss": -0.0299,
"num_tokens": 532920.0,
"reward": 0.0833333358168602,
"reward_std": 0.15430335700511932,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 167.0,
"completions/max_terminated_length": 167.0,
"completions/mean_length": 58.625,
"completions/mean_terminated_length": 67.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 23.0,
"epoch": 0.2,
"format_failures": 0.0,
"grad_norm": 2.357253074645996,
"kl": 0.021084215957671404,
"learning_rate": 1e-06,
"loss": -0.1241,
"num_tokens": 539800.0,
"reward": 0.24715909361839294,
"reward_std": 0.3969031274318695,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 79.0,
"completions/max_terminated_length": 79.0,
"completions/mean_length": 47.5,
"completions/mean_terminated_length": 76.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.204,
"format_failures": 0.0,
"grad_norm": 3.9780025482177734,
"kl": 0.04299665614962578,
"learning_rate": 1e-06,
"loss": -0.0066,
"num_tokens": 547080.0,
"reward": 0.75,
"reward_std": 0.38832157850265503,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 377.0,
"completions/max_terminated_length": 377.0,
"completions/mean_length": 245.0,
"completions/mean_terminated_length": 280.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 236.0,
"epoch": 0.208,
"format_failures": 0.0,
"grad_norm": 0.824322521686554,
"kl": 0.04343542829155922,
"learning_rate": 1e-06,
"loss": -0.394,
"num_tokens": 565368.0,
"reward": 0.3678571581840515,
"reward_std": 0.38505232334136963,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 313.0,
"completions/max_terminated_length": 313.0,
"completions/mean_length": 223.5,
"completions/mean_terminated_length": 255.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 229.0,
"epoch": 0.212,
"format_failures": 0.0,
"grad_norm": 0.8966130018234253,
"kl": 0.022847690619528294,
"learning_rate": 1e-06,
"loss": 0.0523,
"num_tokens": 584552.0,
"reward": 0.09375,
"reward_std": 0.2651650309562683,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 463.0,
"completions/max_terminated_length": 463.0,
"completions/mean_length": 301.75,
"completions/mean_terminated_length": 344.85714285714283,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.216,
"format_failures": 0.0,
"grad_norm": 0.5948707461357117,
"kl": 0.0344517957419157,
"learning_rate": 1e-06,
"loss": -0.0372,
"num_tokens": 605144.0,
"reward": 0.3611606955528259,
"reward_std": 0.24707795679569244,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 183.0,
"completions/max_terminated_length": 183.0,
"completions/mean_length": 99.75,
"completions/mean_terminated_length": 114.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.22,
"format_failures": 0.0,
"grad_norm": 2.431544065475464,
"kl": 0.39844033867120743,
"learning_rate": 1e-06,
"loss": 0.0435,
"num_tokens": 612304.0,
"reward": 0.3895833492279053,
"reward_std": 0.4363391399383545,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 189.0,
"completions/max_terminated_length": 189.0,
"completions/mean_length": 158.875,
"completions/mean_terminated_length": 181.57142857142858,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.224,
"format_failures": 0.0,
"grad_norm": 3.419069528579712,
"kl": 0.18863588571548462,
"learning_rate": 1e-06,
"loss": -0.0102,
"num_tokens": 619832.0,
"reward": 0.3333333432674408,
"reward_std": 0.4714045226573944,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 560.0,
"completions/max_terminated_length": 560.0,
"completions/mean_length": 250.5,
"completions/mean_terminated_length": 286.2857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.228,
"format_failures": 0.0,
"grad_norm": 0.0427495501935482,
"kl": 0.06415125727653503,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 632688.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 95.0,
"completions/max_terminated_length": 95.0,
"completions/mean_length": 62.0,
"completions/mean_terminated_length": 70.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.232,
"format_failures": 0.0,
"grad_norm": 1.9774202108383179,
"kl": 0.05197676923125982,
"learning_rate": 1e-06,
"loss": -0.0204,
"num_tokens": 637680.0,
"reward": 0.125,
"reward_std": 0.3535533845424652,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1235.0,
"completions/max_terminated_length": 1235.0,
"completions/mean_length": 317.5,
"completions/mean_terminated_length": 362.85714285714283,
"completions/min_length": 0.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.236,
"format_failures": 0.0,
"grad_norm": 0.3588317036628723,
"kl": 0.008119639242067933,
"learning_rate": 1e-06,
"loss": 0.0679,
"num_tokens": 662240.0,
"reward": 0.0625,
"reward_std": 0.1767766922712326,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 482.0,
"completions/max_terminated_length": 482.0,
"completions/mean_length": 302.625,
"completions/mean_terminated_length": 345.85714285714283,
"completions/min_length": 0.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.24,
"format_failures": 0.0,
"grad_norm": 0.43694156408309937,
"kl": 0.13442928344011307,
"learning_rate": 1e-06,
"loss": 0.035,
"num_tokens": 671136.0,
"reward": 0.4389880895614624,
"reward_std": 0.314676970243454,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 155.0,
"completions/max_terminated_length": 155.0,
"completions/mean_length": 76.625,
"completions/mean_terminated_length": 87.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.244,
"format_failures": 0.0,
"grad_norm": 2.0356831550598145,
"kl": 0.10412658751010895,
"learning_rate": 1e-06,
"loss": 0.0941,
"num_tokens": 678296.0,
"reward": 0.2856481671333313,
"reward_std": 0.44585946202278137,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 526.0,
"completions/max_terminated_length": 526.0,
"completions/mean_length": 302.125,
"completions/mean_terminated_length": 345.2857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 173.0,
"epoch": 0.248,
"format_failures": 0.0,
"grad_norm": 0.2828364074230194,
"kl": 0.06026838719844818,
"learning_rate": 1e-06,
"loss": 0.0307,
"num_tokens": 688328.0,
"reward": 0.37730082869529724,
"reward_std": 0.22057875990867615,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 1564.0,
"completions/max_terminated_length": 1564.0,
"completions/mean_length": 436.5,
"completions/mean_terminated_length": 498.85714285714283,
"completions/min_length": 0.0,
"completions/min_terminated_length": 278.0,
"epoch": 0.252,
"format_failures": 0.0,
"grad_norm": 0.460735559463501,
"kl": 0.03187366481870413,
"learning_rate": 1e-06,
"loss": 0.3464,
"num_tokens": 710552.0,
"reward": 0.7753968238830566,
"reward_std": 0.3274153470993042,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 183.0,
"completions/max_terminated_length": 183.0,
"completions/mean_length": 112.0,
"completions/mean_terminated_length": 128.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.256,
"format_failures": 0.0,
"grad_norm": 0.9710547924041748,
"kl": 0.056045059114694595,
"learning_rate": 1e-06,
"loss": 0.397,
"num_tokens": 730936.0,
"reward": 0.4721861779689789,
"reward_std": 0.31307727098464966,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 282.0,
"completions/max_terminated_length": 282.0,
"completions/mean_length": 181.25,
"completions/mean_terminated_length": 207.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 115.0,
"epoch": 0.26,
"format_failures": 0.0,
"grad_norm": 0.5494914054870605,
"kl": 0.17688407003879547,
"learning_rate": 1e-06,
"loss": 0.0636,
"num_tokens": 737640.0,
"reward": 0.4345238208770752,
"reward_std": 0.24914170801639557,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 782.0,
"completions/max_terminated_length": 782.0,
"completions/mean_length": 442.625,
"completions/mean_terminated_length": 505.85714285714283,
"completions/min_length": 0.0,
"completions/min_terminated_length": 371.0,
"epoch": 0.264,
"format_failures": 0.0,
"grad_norm": 0.2535926103591919,
"kl": 0.027257385663688183,
"learning_rate": 1e-06,
"loss": 0.0455,
"num_tokens": 749424.0,
"reward": 0.4035714268684387,
"reward_std": 0.21609759330749512,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 536.0,
"completions/max_terminated_length": 536.0,
"completions/mean_length": 360.375,
"completions/mean_terminated_length": 411.85714285714283,
"completions/min_length": 0.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.268,
"format_failures": 0.0,
"grad_norm": 0.2211979627609253,
"kl": 0.03450755029916763,
"learning_rate": 1e-06,
"loss": -0.0173,
"num_tokens": 758368.0,
"reward": 0.26453372836112976,
"reward_std": 0.18241503834724426,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 324.0,
"completions/max_terminated_length": 324.0,
"completions/mean_length": 171.0,
"completions/mean_terminated_length": 195.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.272,
"format_failures": 0.0,
"grad_norm": 1.1518077850341797,
"kl": 0.7764540687203407,
"learning_rate": 1e-06,
"loss": 0.0543,
"num_tokens": 769808.0,
"reward": 0.20863094925880432,
"reward_std": 0.1800907701253891,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 255.0,
"completions/max_terminated_length": 255.0,
"completions/mean_length": 146.875,
"completions/mean_terminated_length": 167.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 90.0,
"epoch": 0.276,
"format_failures": 0.0,
"grad_norm": 1.4199182987213135,
"kl": 0.03853025659918785,
"learning_rate": 1e-06,
"loss": -0.3424,
"num_tokens": 787960.0,
"reward": 0.29305553436279297,
"reward_std": 0.3426187038421631,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 497.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 260.25,
"completions/mean_terminated_length": 297.42857142857144,
"completions/min_length": 0.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.28,
"format_failures": 0.0,
"grad_norm": 0.95790034532547,
"kl": 0.04087948985397816,
"learning_rate": 1e-06,
"loss": -0.0072,
"num_tokens": 808840.0,
"reward": 0.30420100688934326,
"reward_std": 0.21492989361286163,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 457.0,
"completions/max_terminated_length": 457.0,
"completions/mean_length": 277.125,
"completions/mean_terminated_length": 316.7142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.284,
"format_failures": 0.0,
"grad_norm": 0.6122504472732544,
"kl": 0.043809447437524796,
"learning_rate": 1e-06,
"loss": 0.0844,
"num_tokens": 820184.0,
"reward": 0.4826388657093048,
"reward_std": 0.40854451060295105,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 130.0,
"completions/max_terminated_length": 130.0,
"completions/mean_length": 31.875,
"completions/mean_terminated_length": 85.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.288,
"format_failures": 0.0,
"grad_norm": 3.6429221630096436,
"kl": 0.14530150592327118,
"learning_rate": 1e-06,
"loss": -0.3358,
"num_tokens": 828280.0,
"reward": 0.625,
"reward_std": 0.41547447443008423,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2028.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 568.375,
"completions/mean_terminated_length": 649.5714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 233.0,
"epoch": 0.292,
"format_failures": 0.0,
"grad_norm": 0.340351402759552,
"kl": 0.04210643842816353,
"learning_rate": 1e-06,
"loss": 0.1705,
"num_tokens": 850536.0,
"reward": 0.255952388048172,
"reward_std": 0.28989601135253906,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 298.0,
"completions/max_terminated_length": 298.0,
"completions/mean_length": 243.5,
"completions/mean_terminated_length": 278.2857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 271.0,
"epoch": 0.296,
"format_failures": 0.0,
"grad_norm": 16.964588165283203,
"kl": 2.3798545002937317,
"learning_rate": 1e-06,
"loss": 0.0303,
"num_tokens": 861552.0,
"reward": 0.5833333730697632,
"reward_std": 0.4629100263118744,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 545.0,
"completions/max_terminated_length": 545.0,
"completions/mean_length": 225.375,
"completions/mean_terminated_length": 257.57142857142856,
"completions/min_length": 0.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.3,
"format_failures": 0.0,
"grad_norm": 0.23826824128627777,
"kl": 0.033232852816581726,
"learning_rate": 1e-06,
"loss": 0.0132,
"num_tokens": 872312.0,
"reward": 0.20226716995239258,
"reward_std": 0.15315401554107666,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 222.0,
"completions/max_terminated_length": 222.0,
"completions/mean_length": 145.75,
"completions/mean_terminated_length": 166.57142857142858,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.304,
"format_failures": 0.0,
"grad_norm": 1.913487434387207,
"kl": 1.3894951939582825,
"learning_rate": 1e-06,
"loss": -0.0165,
"num_tokens": 879880.0,
"reward": 0.17698413133621216,
"reward_std": 0.1964721530675888,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 264.0,
"completions/max_terminated_length": 264.0,
"completions/mean_length": 155.0,
"completions/mean_terminated_length": 177.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.308,
"format_failures": 0.0,
"grad_norm": 2.5412757396698,
"kl": 1.028398334980011,
"learning_rate": 1e-06,
"loss": 0.0962,
"num_tokens": 887960.0,
"reward": 0.45376986265182495,
"reward_std": 0.3097318112850189,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 536.0,
"completions/max_terminated_length": 536.0,
"completions/mean_length": 286.375,
"completions/mean_terminated_length": 327.2857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 176.0,
"epoch": 0.312,
"format_failures": 0.0,
"grad_norm": 0.6730135679244995,
"kl": 0.0538824163377285,
"learning_rate": 1e-06,
"loss": 0.1157,
"num_tokens": 898928.0,
"reward": 0.20416666567325592,
"reward_std": 0.3781481683254242,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 188.0,
"completions/max_terminated_length": 188.0,
"completions/mean_length": 99.25,
"completions/mean_terminated_length": 158.8,
"completions/min_length": 0.0,
"completions/min_terminated_length": 48.0,
"epoch": 0.316,
"format_failures": 0.0,
"grad_norm": 1.8478459119796753,
"kl": 0.015719112940132618,
"learning_rate": 1e-06,
"loss": -0.134,
"num_tokens": 908336.0,
"reward": 0.75,
"reward_std": 0.4629100561141968,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 290.0,
"completions/max_terminated_length": 290.0,
"completions/mean_length": 218.0,
"completions/mean_terminated_length": 249.14285714285714,
"completions/min_length": 0.0,
"completions/min_terminated_length": 111.0,
"epoch": 0.32,
"format_failures": 0.0,
"grad_norm": 4.647150039672852,
"kl": 1.3871727883815765,
"learning_rate": 1e-06,
"loss": 0.0114,
"num_tokens": 919144.0,
"reward": 0.515625,
"reward_std": 0.5194326043128967,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 61.0,
"completions/max_terminated_length": 61.0,
"completions/mean_length": 44.75,
"completions/mean_terminated_length": 51.142857142857146,
"completions/min_length": 0.0,
"completions/min_terminated_length": 42.0,
"epoch": 0.324,
"format_failures": 0.0,
"grad_norm": 4.4413957595825195,
"kl": 1.4963605403900146,
"learning_rate": 1e-06,
"loss": 0.0199,
"num_tokens": 924120.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 158.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 114.0,
"completions/mean_terminated_length": 130.28571428571428,
"completions/min_length": 0.0,
"completions/min_terminated_length": 70.0,
"epoch": 0.328,
"format_failures": 0.0,
"grad_norm": 0.7050689458847046,
"kl": 0.046199409291148186,
"learning_rate": 1e-06,
"loss": 0.0456,
"num_tokens": 930960.0,
"reward": 0.5011904835700989,
"reward_std": 0.24937564134597778,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 518.0,
"completions/max_terminated_length": 518.0,
"completions/mean_length": 449.875,
"completions/mean_terminated_length": 514.1428571428571,
"completions/min_length": 0.0,
"completions/min_terminated_length": 509.0,
"epoch": 0.332,
"format_failures": 0.0,
"grad_norm": 0.26836591958999634,
"kl": 0.006152217974886298,
"learning_rate": 1e-06,
"loss": -0.0312,
"num_tokens": 948424.0,
"reward": 0.7916666865348816,
"reward_std": 0.39591163396835327,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 246.0,
"completions/max_terminated_length": 246.0,
"completions/mean_length": 138.625,
"completions/mean_terminated_length": 158.42857142857142,
"completions/min_length": 0.0,
"completions/min_terminated_length": 130.0,
"epoch": 0.336,
"format_failures": 0.0,
"grad_norm": 1.0764328241348267,
"kl": 0.07650505751371384,
"learning_rate": 1e-06,
"loss": -0.0964,
"num_tokens": 956768.0,
"reward": 0.3864583373069763,
"reward_std": 0.3207734227180481,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 531.0,
"completions/max_terminated_length": 531.0,
"completions/mean_length": 292.0,
"completions/mean_terminated_length": 333.7142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.34,
"format_failures": 0.0,
"grad_norm": 0.5540055632591248,
"kl": 0.054012734442949295,
"learning_rate": 1e-06,
"loss": -0.1183,
"num_tokens": 966600.0,
"reward": 0.34756946563720703,
"reward_std": 0.300673246383667,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 153.0,
"completions/max_terminated_length": 153.0,
"completions/mean_length": 126.0,
"completions/mean_terminated_length": 144.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.344,
"format_failures": 0.0,
"grad_norm": 2.176490306854248,
"kl": 0.14486993476748466,
"learning_rate": 1e-06,
"loss": 0.044,
"num_tokens": 974040.0,
"reward": 0.6666666269302368,
"reward_std": 0.4714045226573944,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 163.0,
"completions/max_terminated_length": 163.0,
"completions/mean_length": 139.875,
"completions/mean_terminated_length": 159.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 154.0,
"epoch": 0.348,
"format_failures": 0.0,
"grad_norm": 3.048673391342163,
"kl": 0.05823306553065777,
"learning_rate": 1e-06,
"loss": 1.0611,
"num_tokens": 995888.0,
"reward": 0.625,
"reward_std": 0.5175491571426392,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 281.0,
"completions/max_terminated_length": 281.0,
"completions/mean_length": 101.125,
"completions/mean_terminated_length": 134.83333333333334,
"completions/min_length": 0.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.352,
"format_failures": 0.0,
"grad_norm": 1.9394124746322632,
"kl": 0.09709636494517326,
"learning_rate": 1e-06,
"loss": 0.3171,
"num_tokens": 1016272.0,
"reward": 0.47559523582458496,
"reward_std": 0.2696917653083801,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 160.0,
"completions/max_terminated_length": 160.0,
"completions/mean_length": 92.375,
"completions/mean_terminated_length": 105.57142857142857,
"completions/min_length": 0.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.356,
"format_failures": 0.0,
"grad_norm": 1.0850152969360352,
"kl": 0.11065866611897945,
"learning_rate": 1e-06,
"loss": -0.0191,
"num_tokens": 1022584.0,
"reward": 0.027205882593989372,
"reward_std": 0.050595808774232864,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 238.0,
"completions/max_terminated_length": 238.0,
"completions/mean_length": 152.125,
"completions/mean_terminated_length": 173.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.36,
"format_failures": 0.0,
"grad_norm": 0.7975893020629883,
"kl": 0.4505193531513214,
"learning_rate": 1e-06,
"loss": 0.0489,
"num_tokens": 1028024.0,
"reward": 0.4837797284126282,
"reward_std": 0.3459106385707855,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 198.0,
"completions/max_terminated_length": 198.0,
"completions/mean_length": 122.875,
"completions/mean_terminated_length": 196.6,
"completions/min_length": 0.0,
"completions/min_terminated_length": 195.0,
"epoch": 0.364,
"format_failures": 0.0,
"grad_norm": 0.371446430683136,
"kl": 0.017493599094450474,
"learning_rate": 1e-06,
"loss": -0.0009,
"num_tokens": 1039176.0,
"reward": 0.7916666865348816,
"reward_std": 0.39591163396835327,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 368.0,
"completions/max_terminated_length": 368.0,
"completions/mean_length": 228.5,
"completions/mean_terminated_length": 261.14285714285717,
"completions/min_length": 0.0,
"completions/min_terminated_length": 132.0,
"epoch": 0.368,
"format_failures": 0.0,
"grad_norm": 1.6181436777114868,
"kl": 1.322296380996704,
"learning_rate": 1e-06,
"loss": -0.0419,
"num_tokens": 1047784.0,
"reward": 0.2874999940395355,
"reward_std": 0.39957815408706665,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 445.0,
"completions/max_terminated_length": 445.0,
"completions/mean_length": 250.125,
"completions/mean_terminated_length": 285.85714285714283,
"completions/min_length": 0.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.372,
"format_failures": 0.0,
"grad_norm": 0.4590940773487091,
"kl": 0.03011018969118595,
"learning_rate": 1e-06,
"loss": -0.0477,
"num_tokens": 1058760.0,
"reward": 0.38749998807907104,
"reward_std": 0.3058944642543793,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 64.0,
"completions/max_terminated_length": 64.0,
"completions/mean_length": 55.75,
"completions/mean_terminated_length": 63.714285714285715,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.376,
"format_failures": 0.0,
"grad_norm": 3.706254720687866,
"kl": 0.022694013081490993,
"learning_rate": 1e-06,
"loss": 0.4609,
"num_tokens": 1069792.0,
"reward": 0.5052083730697632,
"reward_std": 0.25630685687065125,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 297.0,
"completions/max_terminated_length": 297.0,
"completions/mean_length": 155.75,
"completions/mean_terminated_length": 178.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.38,
"format_failures": 0.0,
"grad_norm": 1.6162223815917969,
"kl": 0.43194980919361115,
"learning_rate": 1e-06,
"loss": -0.0132,
"num_tokens": 1079864.0,
"reward": 0.21741071343421936,
"reward_std": 0.28225868940353394,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 120.125,
"completions/mean_terminated_length": 137.28571428571428,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.384,
"format_failures": 0.0,
"grad_norm": 18.852705001831055,
"kl": 4.019676446914673,
"learning_rate": 1e-06,
"loss": 0.0359,
"num_tokens": 1088416.0,
"reward": 0.90625,
"reward_std": 0.1293872892856598,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 81.0,
"completions/max_terminated_length": 81.0,
"completions/mean_length": 65.125,
"completions/mean_terminated_length": 74.42857142857143,
"completions/min_length": 0.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.388,
"format_failures": 0.0,
"grad_norm": 0.17805831134319305,
"kl": 0.0494217723608017,
"learning_rate": 1e-06,
"loss": 0.0198,
"num_tokens": 1095056.0,
"reward": 0.984375,
"reward_std": 0.04419417306780815,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 63.0,
"completions/max_terminated_length": 63.0,
"completions/mean_length": 34.75,
"completions/mean_terminated_length": 39.714285714285715,
"completions/min_length": 0.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.392,
"format_failures": 0.0,
"grad_norm": 1.5279428958892822,
"kl": 0.29206034541130066,
"learning_rate": 1e-06,
"loss": -0.0386,
"num_tokens": 1100752.0,
"reward": 0.0416666679084301,
"reward_std": 0.1178511381149292,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 351.0,
"completions/max_terminated_length": 351.0,
"completions/mean_length": 249.375,
"completions/mean_terminated_length": 285.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.396,
"format_failures": 0.0,
"grad_norm": 0.56284499168396,
"kl": 0.11262823268771172,
"learning_rate": 1e-06,
"loss": 0.0758,
"num_tokens": 1112056.0,
"reward": 0.5658119916915894,
"reward_std": 0.2206362932920456,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 194.0,
"completions/max_terminated_length": 194.0,
"completions/mean_length": 149.5,
"completions/mean_terminated_length": 170.85714285714286,
"completions/min_length": 0.0,
"completions/min_terminated_length": 52.0,
"epoch": 0.4,
"format_failures": 0.0,
"grad_norm": 2.1969668865203857,
"kl": 0.0690736249089241,
"learning_rate": 1e-06,
"loss": -0.001,
"num_tokens": 1121104.0,
"reward": 0.75,
"reward_std": 0.4629100561141968,
"step": 100
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 1121104,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}