lcb_test_generator_3b_140steps / trainer_state.json
Harryllh's picture
Upload folder using huggingface_hub
b7bdaaa verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.28,
"eval_steps": 500,
"global_step": 140,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 1005.0,
"completions/max_terminated_length": 1005.0,
"completions/mean_length": 442.6666666666667,
"completions/mean_terminated_length": 482.90909090909093,
"completions/min_length": 0.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.002,
"format_failures": 0.0,
"grad_norm": 0.3274489641189575,
"kl": 0.0,
"learning_rate": 0.0,
"loss": 0.048,
"num_tokens": 21804.0,
"reward": 0.26185137033462524,
"reward_std": 0.28920137882232666,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 264.0,
"completions/max_terminated_length": 264.0,
"completions/mean_length": 136.5,
"completions/mean_terminated_length": 148.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.004,
"format_failures": 0.0,
"grad_norm": 1.2693145275115967,
"kl": 0.0,
"learning_rate": 1e-06,
"loss": 0.0962,
"num_tokens": 42324.0,
"reward": 0.38461539149284363,
"reward_std": 0.3770364224910736,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 362.0,
"completions/max_terminated_length": 362.0,
"completions/mean_length": 217.83333333333334,
"completions/mean_terminated_length": 237.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 124.0,
"epoch": 0.006,
"format_failures": 0.0,
"grad_norm": 0.3044165074825287,
"kl": 0.19029825925827026,
"learning_rate": 1e-06,
"loss": 0.0009,
"num_tokens": 58980.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 896.0,
"completions/max_terminated_length": 896.0,
"completions/mean_length": 321.0833333333333,
"completions/mean_terminated_length": 350.27272727272725,
"completions/min_length": 0.0,
"completions/min_terminated_length": 103.0,
"epoch": 0.008,
"format_failures": 1.0,
"grad_norm": 0.3372040390968323,
"kl": 0.029289670288562775,
"learning_rate": 1e-06,
"loss": 0.1107,
"num_tokens": 81756.0,
"reward": 0.23689448833465576,
"reward_std": 0.2267814427614212,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 193.0,
"completions/max_terminated_length": 193.0,
"completions/mean_length": 119.08333333333333,
"completions/mean_terminated_length": 129.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.01,
"format_failures": 0.0,
"grad_norm": 10.779764175415039,
"kl": 3.1303787231445312,
"learning_rate": 1e-06,
"loss": 0.0311,
"num_tokens": 96360.0,
"reward": 0.1666666716337204,
"reward_std": 0.30772873759269714,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 745.0,
"completions/max_terminated_length": 745.0,
"completions/mean_length": 420.6666666666667,
"completions/mean_terminated_length": 458.90909090909093,
"completions/min_length": 0.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.012,
"format_failures": 1.0,
"grad_norm": 0.2519327402114868,
"kl": 0.016291129169985652,
"learning_rate": 1e-06,
"loss": 0.0559,
"num_tokens": 119712.0,
"reward": 0.34878918528556824,
"reward_std": 0.2739146649837494,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 164.0,
"completions/max_terminated_length": 164.0,
"completions/mean_length": 67.33333333333333,
"completions/mean_terminated_length": 73.45454545454545,
"completions/min_length": 0.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.014,
"format_failures": 0.0,
"grad_norm": 2531.101806640625,
"kl": 562.2636108398438,
"learning_rate": 1e-06,
"loss": 5.4405,
"num_tokens": 128772.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 421.0,
"completions/max_terminated_length": 421.0,
"completions/mean_length": 186.41666666666666,
"completions/mean_terminated_length": 203.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 85.0,
"epoch": 0.016,
"format_failures": 0.0,
"grad_norm": 0.7023671865463257,
"kl": 0.0004708967899205163,
"learning_rate": 1e-06,
"loss": -0.1143,
"num_tokens": 164100.0,
"reward": 0.06388889253139496,
"reward_std": 0.1274919956922531,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 484.0,
"completions/max_terminated_length": 484.0,
"completions/mean_length": 253.41666666666666,
"completions/mean_terminated_length": 276.45454545454544,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.018,
"format_failures": 0.0,
"grad_norm": 1.1911135911941528,
"kl": 0.0012580148177221417,
"learning_rate": 1e-06,
"loss": -0.3277,
"num_tokens": 197808.0,
"reward": 0.1118159219622612,
"reward_std": 0.2614404261112213,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 170.0,
"completions/max_terminated_length": 170.0,
"completions/mean_length": 64.83333333333333,
"completions/mean_terminated_length": 70.72727272727273,
"completions/min_length": 0.0,
"completions/min_terminated_length": 35.0,
"epoch": 0.02,
"format_failures": 0.0,
"grad_norm": 1.324984073638916,
"kl": 0.2648707218468189,
"learning_rate": 1e-06,
"loss": -0.0221,
"num_tokens": 207000.0,
"reward": 0.01666666753590107,
"reward_std": 0.057735029608011246,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 183.0,
"completions/max_terminated_length": 183.0,
"completions/mean_length": 126.33333333333333,
"completions/mean_terminated_length": 137.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.022,
"format_failures": 0.0,
"grad_norm": 0.5873882174491882,
"kl": 0.017587594222277403,
"learning_rate": 1e-06,
"loss": 0.0197,
"num_tokens": 221808.0,
"reward": 0.1805555671453476,
"reward_std": 0.3134874999523163,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666666666663,
"completions/max_length": 2049.0,
"completions/max_terminated_length": 2049.0,
"completions/mean_length": 541.25,
"completions/mean_terminated_length": 649.5,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.024,
"format_failures": 0.0,
"grad_norm": 0.48546102643013,
"kl": 0.002345994464121759,
"learning_rate": 1e-06,
"loss": 0.0336,
"num_tokens": 255132.0,
"reward": 0.4682539701461792,
"reward_std": 0.4320843815803528,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 53.0,
"completions/max_terminated_length": 53.0,
"completions/mean_length": 29.666666666666668,
"completions/mean_terminated_length": 32.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.026,
"format_failures": 0.0,
"grad_norm": 0.186175137758255,
"kl": 0.041642000898718834,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 265092.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 708.0,
"completions/max_terminated_length": 708.0,
"completions/mean_length": 381.6666666666667,
"completions/mean_terminated_length": 416.3636363636364,
"completions/min_length": 0.0,
"completions/min_terminated_length": 188.0,
"epoch": 0.028,
"format_failures": 0.0,
"grad_norm": 0.20345070958137512,
"kl": 0.009796573780477047,
"learning_rate": 1e-06,
"loss": 0.0257,
"num_tokens": 294096.0,
"reward": 0.29761505126953125,
"reward_std": 0.16453009843826294,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 1034.0,
"completions/max_terminated_length": 1034.0,
"completions/mean_length": 332.25,
"completions/mean_terminated_length": 362.45454545454544,
"completions/min_length": 0.0,
"completions/min_terminated_length": 125.0,
"epoch": 0.03,
"format_failures": 1.0,
"grad_norm": 0.5157941579818726,
"kl": 0.004433898604474962,
"learning_rate": 1e-06,
"loss": -0.0103,
"num_tokens": 325368.0,
"reward": 0.2917824387550354,
"reward_std": 0.3325340151786804,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 268.0,
"completions/max_terminated_length": 268.0,
"completions/mean_length": 150.16666666666666,
"completions/mean_terminated_length": 163.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 30.0,
"epoch": 0.032,
"format_failures": 0.0,
"grad_norm": 0.05657627806067467,
"kl": 0.0326845021918416,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 341196.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 296.0,
"completions/max_terminated_length": 296.0,
"completions/mean_length": 228.41666666666666,
"completions/mean_terminated_length": 249.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.034,
"format_failures": 0.0,
"grad_norm": 1.8653935194015503,
"kl": 0.8598212422803044,
"learning_rate": 1e-06,
"loss": 0.014,
"num_tokens": 354228.0,
"reward": 0.01666666753590107,
"reward_std": 0.05773502588272095,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 71.0,
"completions/max_terminated_length": 71.0,
"completions/mean_length": 48.333333333333336,
"completions/mean_terminated_length": 52.72727272727273,
"completions/min_length": 0.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.036,
"format_failures": 1.0,
"grad_norm": 0.018069056794047356,
"kl": 0.023271435871720314,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 381468.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 719.0,
"completions/max_terminated_length": 719.0,
"completions/mean_length": 228.91666666666666,
"completions/mean_terminated_length": 249.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 16.0,
"epoch": 0.038,
"format_failures": 0.0,
"grad_norm": 1.073132872581482,
"kl": 0.003063492476940155,
"learning_rate": 1e-06,
"loss": 0.0334,
"num_tokens": 415356.0,
"reward": 0.1666666716337204,
"reward_std": 0.38924944400787354,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 153.0,
"completions/max_terminated_length": 153.0,
"completions/mean_length": 84.58333333333333,
"completions/mean_terminated_length": 92.27272727272727,
"completions/min_length": 0.0,
"completions/min_terminated_length": 66.0,
"epoch": 0.04,
"format_failures": 0.0,
"grad_norm": 1.1736811399459839,
"kl": 0.018741012550890446,
"learning_rate": 1e-06,
"loss": 0.0962,
"num_tokens": 442596.0,
"reward": 0.1041666716337204,
"reward_std": 0.22508415579795837,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 118.0,
"completions/max_terminated_length": 118.0,
"completions/mean_length": 89.58333333333333,
"completions/mean_terminated_length": 97.72727272727273,
"completions/min_length": 0.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.042,
"format_failures": 0.0,
"grad_norm": 0.960914671421051,
"kl": 0.03209133446216583,
"learning_rate": 1e-06,
"loss": -0.0169,
"num_tokens": 453252.0,
"reward": 0.2708333432674408,
"reward_std": 0.4454101026058197,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 249.0,
"completions/max_terminated_length": 249.0,
"completions/mean_length": 124.33333333333333,
"completions/mean_terminated_length": 135.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 74.0,
"epoch": 0.044,
"format_failures": 0.0,
"grad_norm": 1.0618880987167358,
"kl": 0.03219995368272066,
"learning_rate": 1e-06,
"loss": -0.3593,
"num_tokens": 481656.0,
"reward": 0.09444444626569748,
"reward_std": 0.17164288461208344,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 673.0,
"completions/max_terminated_length": 673.0,
"completions/mean_length": 299.5,
"completions/mean_terminated_length": 326.72727272727275,
"completions/min_length": 0.0,
"completions/min_terminated_length": 148.0,
"epoch": 0.046,
"format_failures": 0.0,
"grad_norm": 0.3598278760910034,
"kl": 0.031054741702973843,
"learning_rate": 1e-06,
"loss": 0.0131,
"num_tokens": 505704.0,
"reward": 0.4847402572631836,
"reward_std": 0.25003767013549805,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 497.0,
"completions/max_terminated_length": 497.0,
"completions/mean_length": 297.5,
"completions/mean_terminated_length": 324.54545454545456,
"completions/min_length": 0.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.048,
"format_failures": 0.0,
"grad_norm": 0.27960336208343506,
"kl": 0.04240706283599138,
"learning_rate": 1e-06,
"loss": -0.0398,
"num_tokens": 523500.0,
"reward": 0.2615740895271301,
"reward_std": 0.219794362783432,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 383.0,
"completions/max_terminated_length": 383.0,
"completions/mean_length": 179.16666666666666,
"completions/mean_terminated_length": 195.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 114.0,
"epoch": 0.05,
"format_failures": 0.0,
"grad_norm": 1.2980320453643799,
"kl": 0.0048073166981339455,
"learning_rate": 1e-06,
"loss": -0.3887,
"num_tokens": 555300.0,
"reward": 0.5003399848937988,
"reward_std": 0.39150455594062805,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 699.0,
"completions/max_terminated_length": 699.0,
"completions/mean_length": 315.9166666666667,
"completions/mean_terminated_length": 344.6363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 239.0,
"epoch": 0.052,
"format_failures": 0.0,
"grad_norm": 0.2552706003189087,
"kl": 0.027493927627801895,
"learning_rate": 1e-06,
"loss": 0.0567,
"num_tokens": 576000.0,
"reward": 0.43729767203330994,
"reward_std": 0.18975813686847687,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 140.0,
"completions/max_terminated_length": 140.0,
"completions/mean_length": 72.91666666666667,
"completions/mean_terminated_length": 79.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.054,
"format_failures": 0.0,
"grad_norm": 1.1299240589141846,
"kl": 0.0332061443477869,
"learning_rate": 1e-06,
"loss": -0.057,
"num_tokens": 584712.0,
"reward": 0.33095240592956543,
"reward_std": 0.444376677274704,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 181.0,
"completions/max_terminated_length": 181.0,
"completions/mean_length": 91.16666666666667,
"completions/mean_terminated_length": 99.45454545454545,
"completions/min_length": 0.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.056,
"format_failures": 0.0,
"grad_norm": 0.044371046125888824,
"kl": 0.03765446413308382,
"learning_rate": 1e-06,
"loss": 0.0004,
"num_tokens": 598032.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 522.0,
"completions/max_terminated_length": 522.0,
"completions/mean_length": 304.5,
"completions/mean_terminated_length": 332.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.058,
"format_failures": 0.0,
"grad_norm": 0.5104940533638,
"kl": 0.03451683558523655,
"learning_rate": 1e-06,
"loss": -0.0274,
"num_tokens": 615204.0,
"reward": 0.4068452715873718,
"reward_std": 0.37161099910736084,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 296.0,
"completions/max_terminated_length": 296.0,
"completions/mean_length": 162.91666666666666,
"completions/mean_terminated_length": 177.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.06,
"format_failures": 0.0,
"grad_norm": 1.2335582971572876,
"kl": 0.007039119256660342,
"learning_rate": 1e-06,
"loss": 0.2673,
"num_tokens": 647892.0,
"reward": 0.3291666805744171,
"reward_std": 0.4266456663608551,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 812.0,
"completions/max_terminated_length": 812.0,
"completions/mean_length": 332.5,
"completions/mean_terminated_length": 362.72727272727275,
"completions/min_length": 0.0,
"completions/min_terminated_length": 222.0,
"epoch": 0.062,
"format_failures": 2.0,
"grad_norm": 0.3000166416168213,
"kl": 0.03664882015436888,
"learning_rate": 1e-06,
"loss": 0.0306,
"num_tokens": 670860.0,
"reward": 0.6458902955055237,
"reward_std": 0.26038500666618347,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 335.0,
"completions/max_terminated_length": 335.0,
"completions/mean_length": 218.66666666666666,
"completions/mean_terminated_length": 238.54545454545453,
"completions/min_length": 0.0,
"completions/min_terminated_length": 180.0,
"epoch": 0.064,
"format_failures": 0.0,
"grad_norm": 0.37272748351097107,
"kl": 0.07015270553529263,
"learning_rate": 1e-06,
"loss": 0.0169,
"num_tokens": 682212.0,
"reward": 0.43658646941185,
"reward_std": 0.24143192172050476,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 73.0,
"completions/max_terminated_length": 73.0,
"completions/mean_length": 53.25,
"completions/mean_terminated_length": 58.09090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.066,
"format_failures": 0.0,
"grad_norm": 1.1589769124984741,
"kl": 0.03555137664079666,
"learning_rate": 1e-06,
"loss": -0.0651,
"num_tokens": 692040.0,
"reward": 0.11666666716337204,
"reward_std": 0.301008403301239,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 465.0,
"completions/max_terminated_length": 465.0,
"completions/mean_length": 336.0,
"completions/mean_terminated_length": 366.54545454545456,
"completions/min_length": 0.0,
"completions/min_terminated_length": 292.0,
"epoch": 0.068,
"format_failures": 0.0,
"grad_norm": 0.42152470350265503,
"kl": 0.19683832861483097,
"learning_rate": 1e-06,
"loss": -0.0173,
"num_tokens": 704484.0,
"reward": 0.5136784911155701,
"reward_std": 0.38917282223701477,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 93.0,
"completions/max_terminated_length": 93.0,
"completions/mean_length": 59.166666666666664,
"completions/mean_terminated_length": 64.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 40.0,
"epoch": 0.07,
"format_failures": 0.0,
"grad_norm": 1.729435682296753,
"kl": 0.055947478860616684,
"learning_rate": 1e-06,
"loss": 0.0028,
"num_tokens": 710520.0,
"reward": 0.5611110925674438,
"reward_std": 0.45256468653678894,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 192.0,
"completions/max_terminated_length": 192.0,
"completions/mean_length": 91.91666666666667,
"completions/mean_terminated_length": 100.27272727272727,
"completions/min_length": 0.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.072,
"format_failures": 0.0,
"grad_norm": 0.7297618389129639,
"kl": 0.28226011246442795,
"learning_rate": 1e-06,
"loss": 0.0022,
"num_tokens": 720588.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 255.0,
"completions/max_terminated_length": 255.0,
"completions/mean_length": 184.66666666666666,
"completions/mean_terminated_length": 201.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 152.0,
"epoch": 0.074,
"format_failures": 0.0,
"grad_norm": 0.1786535382270813,
"kl": 0.05143214017152786,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 731112.0,
"reward": 0.5931217074394226,
"reward_std": 0.15197694301605225,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 127.0,
"completions/max_terminated_length": 127.0,
"completions/mean_length": 61.416666666666664,
"completions/mean_terminated_length": 67.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 36.0,
"epoch": 0.076,
"format_failures": 1.0,
"grad_norm": 2.560441732406616,
"kl": 0.061069367453455925,
"learning_rate": 1e-06,
"loss": 0.1107,
"num_tokens": 758340.0,
"reward": 0.0833333358168602,
"reward_std": 0.28867512941360474,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 2050.0,
"completions/max_terminated_length": 2050.0,
"completions/mean_length": 715.0,
"completions/mean_terminated_length": 780.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.078,
"format_failures": 0.0,
"grad_norm": 0.41932860016822815,
"kl": 0.01548363408073783,
"learning_rate": 1e-06,
"loss": 0.0106,
"num_tokens": 790968.0,
"reward": 0.25740742683410645,
"reward_std": 0.32573264837265015,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 1162.0,
"completions/max_terminated_length": 1162.0,
"completions/mean_length": 471.75,
"completions/mean_terminated_length": 514.6363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.08,
"format_failures": 0.0,
"grad_norm": 0.8145480155944824,
"kl": 0.016389482654631138,
"learning_rate": 1e-06,
"loss": 0.154,
"num_tokens": 829104.0,
"reward": 0.43334314227104187,
"reward_std": 0.3763042986392975,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 254.0,
"completions/max_terminated_length": 254.0,
"completions/mean_length": 99.91666666666667,
"completions/mean_terminated_length": 109.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 47.0,
"epoch": 0.082,
"format_failures": 0.0,
"grad_norm": 18.232030868530273,
"kl": 1.717683531343937,
"learning_rate": 1e-06,
"loss": 0.197,
"num_tokens": 850716.0,
"reward": 0.2430555671453476,
"reward_std": 0.4042987823486328,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 156.0,
"completions/max_terminated_length": 156.0,
"completions/mean_length": 77.33333333333333,
"completions/mean_terminated_length": 84.36363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 63.0,
"epoch": 0.084,
"format_failures": 0.0,
"grad_norm": 0.5794758796691895,
"kl": 0.21323725581169128,
"learning_rate": 1e-06,
"loss": -0.0344,
"num_tokens": 859644.0,
"reward": 0.0476190522313118,
"reward_std": 0.1649572253227234,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 186.0,
"completions/max_terminated_length": 186.0,
"completions/mean_length": 136.66666666666666,
"completions/mean_terminated_length": 149.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.086,
"format_failures": 0.0,
"grad_norm": 2.507535934448242,
"kl": 0.2139158956706524,
"learning_rate": 1e-06,
"loss": -0.0282,
"num_tokens": 871596.0,
"reward": 0.3333333432674408,
"reward_std": 0.4923659861087799,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.33333333333333337,
"completions/max_length": 53.0,
"completions/max_terminated_length": 53.0,
"completions/mean_length": 28.25,
"completions/mean_terminated_length": 42.375,
"completions/min_length": 0.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.088,
"format_failures": 0.0,
"grad_norm": 0.33207282423973083,
"kl": 0.035286733880639076,
"learning_rate": 1e-06,
"loss": 0.0008,
"num_tokens": 879828.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 359.0,
"completions/max_terminated_length": 359.0,
"completions/mean_length": 117.83333333333333,
"completions/mean_terminated_length": 128.54545454545453,
"completions/min_length": 0.0,
"completions/min_terminated_length": 68.0,
"epoch": 0.09,
"format_failures": 0.0,
"grad_norm": 0.2761678099632263,
"kl": 0.15724625438451767,
"learning_rate": 1e-06,
"loss": 0.0015,
"num_tokens": 899448.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 179.0,
"completions/max_terminated_length": 179.0,
"completions/mean_length": 105.16666666666667,
"completions/mean_terminated_length": 114.72727272727273,
"completions/min_length": 0.0,
"completions/min_terminated_length": 26.0,
"epoch": 0.092,
"format_failures": 0.0,
"grad_norm": 1.1471128463745117,
"kl": 0.12899010255932808,
"learning_rate": 1e-06,
"loss": 0.0117,
"num_tokens": 914760.0,
"reward": 0.1666666716337204,
"reward_std": 0.30151134729385376,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 345.0,
"completions/max_terminated_length": 345.0,
"completions/mean_length": 233.66666666666666,
"completions/mean_terminated_length": 254.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 153.0,
"epoch": 0.094,
"format_failures": 0.0,
"grad_norm": 0.5467153191566467,
"kl": 0.2796362675726414,
"learning_rate": 1e-06,
"loss": -0.0318,
"num_tokens": 925212.0,
"reward": 0.549458920955658,
"reward_std": 0.3676450848579407,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 264.0,
"completions/max_terminated_length": 264.0,
"completions/mean_length": 166.25,
"completions/mean_terminated_length": 181.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.096,
"format_failures": 0.0,
"grad_norm": 0.78724205493927,
"kl": 0.49516983330249786,
"learning_rate": 1e-06,
"loss": -0.0104,
"num_tokens": 938424.0,
"reward": 0.02083333395421505,
"reward_std": 0.07216878235340118,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 203.0,
"completions/max_terminated_length": 203.0,
"completions/mean_length": 106.08333333333333,
"completions/mean_terminated_length": 115.72727272727273,
"completions/min_length": 0.0,
"completions/min_terminated_length": 29.0,
"epoch": 0.098,
"format_failures": 1.0,
"grad_norm": 1.7356528043746948,
"kl": 0.389555960893631,
"learning_rate": 1e-06,
"loss": -0.0599,
"num_tokens": 950172.0,
"reward": 0.1944444626569748,
"reward_std": 0.38816672563552856,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666666666663,
"completions/max_length": 1127.0,
"completions/max_terminated_length": 1127.0,
"completions/mean_length": 186.58333333333334,
"completions/mean_terminated_length": 223.9,
"completions/min_length": 0.0,
"completions/min_terminated_length": 49.0,
"epoch": 0.1,
"format_failures": 0.0,
"grad_norm": 1.3811311721801758,
"kl": 0.0656690001487732,
"learning_rate": 1e-06,
"loss": 0.949,
"num_tokens": 981816.0,
"reward": 0.5007641911506653,
"reward_std": 0.4272591173648834,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 121.0,
"completions/max_terminated_length": 121.0,
"completions/mean_length": 74.75,
"completions/mean_terminated_length": 81.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 67.0,
"epoch": 0.102,
"format_failures": 0.0,
"grad_norm": 3.630605697631836,
"kl": 0.11415744014084339,
"learning_rate": 1e-06,
"loss": 0.1083,
"num_tokens": 994800.0,
"reward": 0.4722222685813904,
"reward_std": 0.4596514403820038,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 447.0,
"completions/max_terminated_length": 447.0,
"completions/mean_length": 292.9166666666667,
"completions/mean_terminated_length": 319.54545454545456,
"completions/min_length": 0.0,
"completions/min_terminated_length": 230.0,
"epoch": 0.104,
"format_failures": 0.0,
"grad_norm": 0.664616048336029,
"kl": 0.024851050227880478,
"learning_rate": 1e-06,
"loss": -0.0988,
"num_tokens": 1028352.0,
"reward": 0.5121031999588013,
"reward_std": 0.26174625754356384,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 478.0,
"completions/max_terminated_length": 478.0,
"completions/mean_length": 267.4166666666667,
"completions/mean_terminated_length": 291.72727272727275,
"completions/min_length": 0.0,
"completions/min_terminated_length": 158.0,
"epoch": 0.106,
"format_failures": 0.0,
"grad_norm": 0.3362949788570404,
"kl": 0.09099859930574894,
"learning_rate": 1e-06,
"loss": 0.0303,
"num_tokens": 1053264.0,
"reward": 0.0625,
"reward_std": 0.21650634706020355,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 571.0,
"completions/max_terminated_length": 571.0,
"completions/mean_length": 292.0833333333333,
"completions/mean_terminated_length": 318.6363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 190.0,
"epoch": 0.108,
"format_failures": 0.0,
"grad_norm": 0.17621153593063354,
"kl": 0.03119577933102846,
"learning_rate": 1e-06,
"loss": 0.0012,
"num_tokens": 1068108.0,
"reward": 0.4200083613395691,
"reward_std": 0.194437637925148,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 168.0,
"completions/max_terminated_length": 168.0,
"completions/mean_length": 88.75,
"completions/mean_terminated_length": 96.81818181818181,
"completions/min_length": 0.0,
"completions/min_terminated_length": 69.0,
"epoch": 0.11,
"format_failures": 0.0,
"grad_norm": 0.6367191672325134,
"kl": 0.03671593498438597,
"learning_rate": 1e-06,
"loss": 0.0088,
"num_tokens": 1079820.0,
"reward": 0.19027778506278992,
"reward_std": 0.15930061042308807,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 189.0,
"completions/max_terminated_length": 189.0,
"completions/mean_length": 163.58333333333334,
"completions/mean_terminated_length": 178.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 113.0,
"epoch": 0.112,
"format_failures": 0.0,
"grad_norm": 2.1606733798980713,
"kl": 0.20935122203081846,
"learning_rate": 1e-06,
"loss": -0.0277,
"num_tokens": 1091832.0,
"reward": 0.5777778029441833,
"reward_std": 0.4515592157840729,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 456.0,
"completions/max_terminated_length": 456.0,
"completions/mean_length": 288.4166666666667,
"completions/mean_terminated_length": 314.6363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 169.0,
"epoch": 0.114,
"format_failures": 0.0,
"grad_norm": 0.32393601536750793,
"kl": 0.031358057633042336,
"learning_rate": 1e-06,
"loss": -0.044,
"num_tokens": 1105608.0,
"reward": 0.1666666716337204,
"reward_std": 0.24984844028949738,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 96.0,
"completions/max_terminated_length": 96.0,
"completions/mean_length": 65.5,
"completions/mean_terminated_length": 71.45454545454545,
"completions/min_length": 0.0,
"completions/min_terminated_length": 55.0,
"epoch": 0.116,
"format_failures": 0.0,
"grad_norm": 0.021954922005534172,
"kl": 0.018348069861531258,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 1113168.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 533.0,
"completions/max_terminated_length": 533.0,
"completions/mean_length": 224.41666666666666,
"completions/mean_terminated_length": 244.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.118,
"format_failures": 0.0,
"grad_norm": 1.1990734338760376,
"kl": 0.3062889650464058,
"learning_rate": 1e-06,
"loss": 0.0431,
"num_tokens": 1136832.0,
"reward": 0.2395833432674408,
"reward_std": 0.25259074568748474,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 374.0,
"completions/max_terminated_length": 374.0,
"completions/mean_length": 238.0,
"completions/mean_terminated_length": 259.6363636363636,
"completions/min_length": 0.0,
"completions/min_terminated_length": 80.0,
"epoch": 0.12,
"format_failures": 0.0,
"grad_norm": 0.5170612931251526,
"kl": 0.03292474150657654,
"learning_rate": 1e-06,
"loss": 0.0251,
"num_tokens": 1150536.0,
"reward": 0.39345240592956543,
"reward_std": 0.3553503155708313,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 158.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 82.16666666666667,
"completions/mean_terminated_length": 89.63636363636364,
"completions/min_length": 0.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.122,
"format_failures": 0.0,
"grad_norm": 1.1562092304229736,
"kl": 0.023061166517436504,
"learning_rate": 1e-06,
"loss": 0.1452,
"num_tokens": 1158984.0,
"reward": 0.7333333492279053,
"reward_std": 0.3639269173145294,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 553.0,
"completions/max_terminated_length": 553.0,
"completions/mean_length": 296.1666666666667,
"completions/mean_terminated_length": 323.09090909090907,
"completions/min_length": 0.0,
"completions/min_terminated_length": 201.0,
"epoch": 0.124,
"format_failures": 0.0,
"grad_norm": 0.32044336199760437,
"kl": 0.06375124305486679,
"learning_rate": 1e-06,
"loss": 0.0015,
"num_tokens": 1173504.0,
"reward": 0.43736547231674194,
"reward_std": 0.25956276059150696,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 2051.0,
"completions/max_terminated_length": 2051.0,
"completions/mean_length": 586.4166666666666,
"completions/mean_terminated_length": 639.7272727272727,
"completions/min_length": 0.0,
"completions/min_terminated_length": 38.0,
"epoch": 0.126,
"format_failures": 0.0,
"grad_norm": 0.6462875008583069,
"kl": 0.023477558977901936,
"learning_rate": 1e-06,
"loss": 0.0492,
"num_tokens": 1206840.0,
"reward": 0.501884937286377,
"reward_std": 0.5706992149353027,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 270.0,
"completions/max_terminated_length": 270.0,
"completions/mean_length": 150.66666666666666,
"completions/mean_terminated_length": 164.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 97.0,
"epoch": 0.128,
"format_failures": 0.0,
"grad_norm": 0.4827415347099304,
"kl": 0.11513948068022728,
"learning_rate": 1e-06,
"loss": 0.2183,
"num_tokens": 1230888.0,
"reward": 0.3715476393699646,
"reward_std": 0.17215265333652496,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 1340.0,
"completions/max_terminated_length": 1340.0,
"completions/mean_length": 277.5833333333333,
"completions/mean_terminated_length": 302.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.13,
"format_failures": 0.0,
"grad_norm": 0.46889665722846985,
"kl": 0.9275694619864225,
"learning_rate": 1e-06,
"loss": 0.2754,
"num_tokens": 1262100.0,
"reward": 0.3917522430419922,
"reward_std": 0.2266404628753662,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 599.0,
"completions/max_terminated_length": 599.0,
"completions/mean_length": 366.25,
"completions/mean_terminated_length": 399.54545454545456,
"completions/min_length": 0.0,
"completions/min_terminated_length": 212.0,
"epoch": 0.132,
"format_failures": 1.0,
"grad_norm": 0.30657899379730225,
"kl": 0.16883518174290657,
"learning_rate": 1e-06,
"loss": 0.0155,
"num_tokens": 1278012.0,
"reward": 0.34761905670166016,
"reward_std": 0.2757572531700134,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666666666663,
"completions/max_length": 559.0,
"completions/max_terminated_length": 559.0,
"completions/mean_length": 300.9166666666667,
"completions/mean_terminated_length": 361.1,
"completions/min_length": 0.0,
"completions/min_terminated_length": 224.0,
"epoch": 0.134,
"format_failures": 0.0,
"grad_norm": 0.6152874231338501,
"kl": 0.10999106336385012,
"learning_rate": 1e-06,
"loss": 0.3303,
"num_tokens": 1308996.0,
"reward": 0.32609128952026367,
"reward_std": 0.23752012848854065,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 227.0,
"completions/max_terminated_length": 227.0,
"completions/mean_length": 137.5,
"completions/mean_terminated_length": 150.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 59.0,
"epoch": 0.136,
"format_failures": 0.0,
"grad_norm": 1.7395364046096802,
"kl": 0.7087040841579437,
"learning_rate": 1e-06,
"loss": -0.0121,
"num_tokens": 1321020.0,
"reward": 0.20873016119003296,
"reward_std": 0.34043052792549133,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 297.0,
"completions/max_terminated_length": 297.0,
"completions/mean_length": 129.83333333333334,
"completions/mean_terminated_length": 141.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 28.0,
"epoch": 0.138,
"format_failures": 0.0,
"grad_norm": 0.902642548084259,
"kl": 0.7902000248432159,
"learning_rate": 1e-06,
"loss": 0.0035,
"num_tokens": 1332492.0,
"reward": 0.0877976268529892,
"reward_std": 0.20928393304347992,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 1172.0,
"completions/max_terminated_length": 1172.0,
"completions/mean_length": 333.1666666666667,
"completions/mean_terminated_length": 444.22222222222223,
"completions/min_length": 0.0,
"completions/min_terminated_length": 133.0,
"epoch": 0.14,
"format_failures": 0.0,
"grad_norm": 0.22367094457149506,
"kl": 0.03544241935014725,
"learning_rate": 1e-06,
"loss": 0.0442,
"num_tokens": 1363812.0,
"reward": 0.22601282596588135,
"reward_std": 0.1535530686378479,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 547.0,
"completions/max_terminated_length": 547.0,
"completions/mean_length": 368.5833333333333,
"completions/mean_terminated_length": 402.09090909090907,
"completions/min_length": 0.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.142,
"format_failures": 0.0,
"grad_norm": 0.25884878635406494,
"kl": 0.0446395231410861,
"learning_rate": 1e-06,
"loss": 0.0091,
"num_tokens": 1396788.0,
"reward": 0.6545634865760803,
"reward_std": 0.2292691022157669,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 228.0,
"completions/max_terminated_length": 228.0,
"completions/mean_length": 127.75,
"completions/mean_terminated_length": 139.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.144,
"format_failures": 0.0,
"grad_norm": 2.139310121536255,
"kl": 0.2615228593349457,
"learning_rate": 1e-06,
"loss": 0.0935,
"num_tokens": 1411512.0,
"reward": 0.625,
"reward_std": 0.4826536476612091,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 644.0,
"completions/max_terminated_length": 644.0,
"completions/mean_length": 321.1666666666667,
"completions/mean_terminated_length": 350.3636363636364,
"completions/min_length": 0.0,
"completions/min_terminated_length": 194.0,
"epoch": 0.146,
"format_failures": 0.0,
"grad_norm": 0.7009347081184387,
"kl": 0.13678913563489914,
"learning_rate": 1e-06,
"loss": 0.0771,
"num_tokens": 1436532.0,
"reward": 0.3439815044403076,
"reward_std": 0.27971503138542175,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 280.0,
"completions/max_terminated_length": 280.0,
"completions/mean_length": 253.08333333333334,
"completions/mean_terminated_length": 276.09090909090907,
"completions/min_length": 0.0,
"completions/min_terminated_length": 271.0,
"epoch": 0.148,
"format_failures": 0.0,
"grad_norm": 1.2899372577667236,
"kl": 0.10085960477590561,
"learning_rate": 1e-06,
"loss": 0.3862,
"num_tokens": 1471704.0,
"reward": 0.7222222685813904,
"reward_std": 0.4457052946090698,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 308.0,
"completions/max_terminated_length": 308.0,
"completions/mean_length": 196.5,
"completions/mean_terminated_length": 214.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.15,
"format_failures": 0.0,
"grad_norm": 0.4177331328392029,
"kl": 0.026733385398983955,
"learning_rate": 1e-06,
"loss": 0.0579,
"num_tokens": 1485468.0,
"reward": 0.2735119163990021,
"reward_std": 0.30911651253700256,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 368.0,
"completions/max_terminated_length": 368.0,
"completions/mean_length": 200.41666666666666,
"completions/mean_terminated_length": 218.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 62.0,
"epoch": 0.152,
"format_failures": 0.0,
"grad_norm": 0.8074631094932556,
"kl": 0.45791861414909363,
"learning_rate": 1e-06,
"loss": -0.0476,
"num_tokens": 1500636.0,
"reward": 0.17129629850387573,
"reward_std": 0.19502559304237366,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 220.0,
"completions/max_terminated_length": 220.0,
"completions/mean_length": 144.08333333333334,
"completions/mean_terminated_length": 157.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.154,
"format_failures": 0.0,
"grad_norm": 1.8004605770111084,
"kl": 0.32159996032714844,
"learning_rate": 1e-06,
"loss": -0.0603,
"num_tokens": 1512264.0,
"reward": 0.5055555701255798,
"reward_std": 0.29963788390159607,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 564.0,
"completions/max_terminated_length": 564.0,
"completions/mean_length": 312.1666666666667,
"completions/mean_terminated_length": 340.54545454545456,
"completions/min_length": 0.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.156,
"format_failures": 0.0,
"grad_norm": 0.3055727481842041,
"kl": 0.03414521459490061,
"learning_rate": 1e-06,
"loss": -0.0067,
"num_tokens": 1526292.0,
"reward": 0.5897321701049805,
"reward_std": 0.2986750900745392,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 192.0,
"completions/max_terminated_length": 192.0,
"completions/mean_length": 167.5,
"completions/mean_terminated_length": 182.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.158,
"format_failures": 0.0,
"grad_norm": 2.3401753902435303,
"kl": 0.03888106718659401,
"learning_rate": 1e-06,
"loss": -0.0218,
"num_tokens": 1540416.0,
"reward": 0.6666666865348816,
"reward_std": 0.4923659861087799,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666666666663,
"completions/max_length": 291.0,
"completions/max_terminated_length": 291.0,
"completions/mean_length": 210.91666666666666,
"completions/mean_terminated_length": 253.1,
"completions/min_length": 0.0,
"completions/min_terminated_length": 137.0,
"epoch": 0.16,
"format_failures": 0.0,
"grad_norm": 28.73111343383789,
"kl": 15.663371562957764,
"learning_rate": 1e-06,
"loss": 0.0445,
"num_tokens": 1553580.0,
"reward": 0.4305555820465088,
"reward_std": 0.4738534092903137,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 60.0,
"completions/max_terminated_length": 60.0,
"completions/mean_length": 43.166666666666664,
"completions/mean_terminated_length": 47.09090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 32.0,
"epoch": 0.162,
"format_failures": 0.0,
"grad_norm": 13.234149932861328,
"kl": 2.6492202281951904,
"learning_rate": 1e-06,
"loss": -0.0385,
"num_tokens": 1560816.0,
"reward": 0.27916666865348816,
"reward_std": 0.42504456639289856,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 331.0,
"completions/max_terminated_length": 331.0,
"completions/mean_length": 189.66666666666666,
"completions/mean_terminated_length": 206.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.164,
"format_failures": 0.0,
"grad_norm": 1.0555896759033203,
"kl": 0.060676803812384605,
"learning_rate": 1e-06,
"loss": -0.0432,
"num_tokens": 1573524.0,
"reward": 0.39722225069999695,
"reward_std": 0.2684729993343353,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 684.0,
"completions/max_terminated_length": 684.0,
"completions/mean_length": 482.1666666666667,
"completions/mean_terminated_length": 526.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 479.0,
"epoch": 0.166,
"format_failures": 0.0,
"grad_norm": 0.27017322182655334,
"kl": 0.013310576789081097,
"learning_rate": 1e-06,
"loss": -0.0023,
"num_tokens": 1595796.0,
"reward": 0.8000000715255737,
"reward_std": 0.39080336689949036,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 257.0,
"completions/max_terminated_length": 257.0,
"completions/mean_length": 144.91666666666666,
"completions/mean_terminated_length": 158.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 83.0,
"epoch": 0.168,
"format_failures": 0.0,
"grad_norm": 1.0021555423736572,
"kl": 0.2212899848818779,
"learning_rate": 1e-06,
"loss": 0.0304,
"num_tokens": 1606284.0,
"reward": 0.2957010865211487,
"reward_std": 0.2737172842025757,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 2050.0,
"completions/max_terminated_length": 2050.0,
"completions/mean_length": 510.0833333333333,
"completions/mean_terminated_length": 556.4545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.17,
"format_failures": 0.0,
"grad_norm": 0.3675689399242401,
"kl": 0.2206931747496128,
"learning_rate": 1e-06,
"loss": 0.1278,
"num_tokens": 1639152.0,
"reward": 0.43888890743255615,
"reward_std": 0.2596941888332367,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 280.0,
"completions/max_terminated_length": 280.0,
"completions/mean_length": 152.25,
"completions/mean_terminated_length": 166.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 117.0,
"epoch": 0.172,
"format_failures": 0.0,
"grad_norm": 2.8949317932128906,
"kl": 1.413679599761963,
"learning_rate": 1e-06,
"loss": 0.0356,
"num_tokens": 1652364.0,
"reward": 0.4761905074119568,
"reward_std": 0.5035434365272522,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 245.0,
"completions/max_terminated_length": 245.0,
"completions/mean_length": 152.91666666666666,
"completions/mean_terminated_length": 166.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.174,
"format_failures": 0.0,
"grad_norm": 1.7609695196151733,
"kl": 0.07055489160120487,
"learning_rate": 1e-06,
"loss": 0.3366,
"num_tokens": 1685136.0,
"reward": 0.33750003576278687,
"reward_std": 0.43647608160972595,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 180.0,
"completions/max_terminated_length": 180.0,
"completions/mean_length": 135.25,
"completions/mean_terminated_length": 147.54545454545453,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.176,
"format_failures": 0.0,
"grad_norm": 0.6215497255325317,
"kl": 0.08650689758360386,
"learning_rate": 1e-06,
"loss": 0.0112,
"num_tokens": 1693764.0,
"reward": 0.5745911598205566,
"reward_std": 0.1768045872449875,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 140.0,
"completions/max_terminated_length": 140.0,
"completions/mean_length": 73.25,
"completions/mean_terminated_length": 79.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 61.0,
"epoch": 0.178,
"format_failures": 1.0,
"grad_norm": 0.8421996235847473,
"kl": 0.016213122755289078,
"learning_rate": 1e-06,
"loss": 0.0149,
"num_tokens": 1707588.0,
"reward": 0.06666667014360428,
"reward_std": 0.1775250881910324,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 277.0,
"completions/max_terminated_length": 277.0,
"completions/mean_length": 178.16666666666666,
"completions/mean_terminated_length": 194.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.18,
"format_failures": 0.0,
"grad_norm": 0.4202212691307068,
"kl": 0.3119240030646324,
"learning_rate": 1e-06,
"loss": 0.0093,
"num_tokens": 1716792.0,
"reward": 0.6381944417953491,
"reward_std": 0.22775352001190186,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 220.0,
"completions/max_terminated_length": 220.0,
"completions/mean_length": 165.58333333333334,
"completions/mean_terminated_length": 180.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 56.0,
"epoch": 0.182,
"format_failures": 0.0,
"grad_norm": 3.5526509284973145,
"kl": 0.04295740742236376,
"learning_rate": 1e-06,
"loss": -0.007,
"num_tokens": 1735188.0,
"reward": 0.6666666865348816,
"reward_std": 0.4923659861087799,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 554.0,
"completions/max_terminated_length": 554.0,
"completions/mean_length": 296.3333333333333,
"completions/mean_terminated_length": 323.27272727272725,
"completions/min_length": 0.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.184,
"format_failures": 0.0,
"grad_norm": 0.7098760008811951,
"kl": 0.14585042744874954,
"learning_rate": 1e-06,
"loss": -0.052,
"num_tokens": 1748808.0,
"reward": 0.4570105969905853,
"reward_std": 0.29787296056747437,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 442.0,
"completions/max_terminated_length": 442.0,
"completions/mean_length": 325.1666666666667,
"completions/mean_terminated_length": 354.72727272727275,
"completions/min_length": 0.0,
"completions/min_terminated_length": 233.0,
"epoch": 0.186,
"format_failures": 0.0,
"grad_norm": 4.00807523727417,
"kl": 2.2327868938446045,
"learning_rate": 1e-06,
"loss": 0.0328,
"num_tokens": 1763196.0,
"reward": 0.37762749195098877,
"reward_std": 0.2510078251361847,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 146.0,
"completions/max_terminated_length": 146.0,
"completions/mean_length": 78.66666666666667,
"completions/mean_terminated_length": 85.81818181818181,
"completions/min_length": 0.0,
"completions/min_terminated_length": 60.0,
"epoch": 0.188,
"format_failures": 0.0,
"grad_norm": 4.166850566864014,
"kl": 0.4828091114759445,
"learning_rate": 1e-06,
"loss": -0.0043,
"num_tokens": 1775700.0,
"reward": 0.41428571939468384,
"reward_std": 0.20157082378864288,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 288.0,
"completions/max_terminated_length": 288.0,
"completions/mean_length": 163.0,
"completions/mean_terminated_length": 177.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 86.0,
"epoch": 0.19,
"format_failures": 0.0,
"grad_norm": 2.0013251304626465,
"kl": 0.3356290655210614,
"learning_rate": 1e-06,
"loss": -0.0532,
"num_tokens": 1790064.0,
"reward": 0.4275793731212616,
"reward_std": 0.3848039209842682,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 144.0,
"completions/max_terminated_length": 144.0,
"completions/mean_length": 128.33333333333334,
"completions/mean_terminated_length": 140.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 134.0,
"epoch": 0.192,
"format_failures": 0.0,
"grad_norm": 6.922305107116699,
"kl": 3.5449295742437243,
"learning_rate": 1e-06,
"loss": 0.0385,
"num_tokens": 1803036.0,
"reward": 0.6979166865348816,
"reward_std": 0.31738603115081787,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 221.0,
"completions/max_terminated_length": 221.0,
"completions/mean_length": 94.33333333333333,
"completions/mean_terminated_length": 102.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 72.0,
"epoch": 0.194,
"format_failures": 0.0,
"grad_norm": 1.4514728784561157,
"kl": 0.1412234902381897,
"learning_rate": 1e-06,
"loss": 0.3157,
"num_tokens": 1816092.0,
"reward": 0.8380953073501587,
"reward_std": 0.30834609270095825,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 82.0,
"completions/max_terminated_length": 82.0,
"completions/mean_length": 43.5,
"completions/mean_terminated_length": 47.45454545454545,
"completions/min_length": 0.0,
"completions/min_terminated_length": 31.0,
"epoch": 0.196,
"format_failures": 0.0,
"grad_norm": 2.004136085510254,
"kl": 0.6110408902168274,
"learning_rate": 1e-06,
"loss": 0.0095,
"num_tokens": 1827024.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 373.0,
"completions/max_terminated_length": 373.0,
"completions/mean_length": 212.08333333333334,
"completions/mean_terminated_length": 231.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.198,
"format_failures": 0.0,
"grad_norm": 0.8370314240455627,
"kl": 0.09233395755290985,
"learning_rate": 1e-06,
"loss": 0.1438,
"num_tokens": 1860576.0,
"reward": 0.2782828211784363,
"reward_std": 0.2644941210746765,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 225.0,
"completions/max_terminated_length": 225.0,
"completions/mean_length": 163.25,
"completions/mean_terminated_length": 178.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 101.0,
"epoch": 0.2,
"format_failures": 0.0,
"grad_norm": 1.565374732017517,
"kl": 0.391565203666687,
"learning_rate": 1e-06,
"loss": -0.0497,
"num_tokens": 1872996.0,
"reward": 0.5944445133209229,
"reward_std": 0.47775429487228394,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 411.0,
"completions/max_terminated_length": 411.0,
"completions/mean_length": 150.16666666666666,
"completions/mean_terminated_length": 163.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 94.0,
"epoch": 0.202,
"format_failures": 0.0,
"grad_norm": 1.6569881439208984,
"kl": 0.24375841114670038,
"learning_rate": 1e-06,
"loss": 0.0387,
"num_tokens": 1892856.0,
"reward": 0.3499999940395355,
"reward_std": 0.36666667461395264,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 150.0,
"completions/max_terminated_length": 150.0,
"completions/mean_length": 107.66666666666667,
"completions/mean_terminated_length": 117.45454545454545,
"completions/min_length": 0.0,
"completions/min_terminated_length": 93.0,
"epoch": 0.204,
"format_failures": 0.0,
"grad_norm": 0.9490823745727539,
"kl": 0.010788497282192111,
"learning_rate": 1e-06,
"loss": 0.0193,
"num_tokens": 1903992.0,
"reward": 0.7714947462081909,
"reward_std": 0.2890874743461609,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 110.0,
"completions/max_terminated_length": 110.0,
"completions/mean_length": 66.0,
"completions/mean_terminated_length": 72.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.206,
"format_failures": 0.0,
"grad_norm": 1.482935905456543,
"kl": 0.03114949818700552,
"learning_rate": 1e-06,
"loss": -0.0754,
"num_tokens": 1913640.0,
"reward": 0.3333333432674408,
"reward_std": 0.32566946744918823,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 379.0,
"completions/max_terminated_length": 379.0,
"completions/mean_length": 260.5833333333333,
"completions/mean_terminated_length": 284.27272727272725,
"completions/min_length": 0.0,
"completions/min_terminated_length": 197.0,
"epoch": 0.208,
"format_failures": 0.0,
"grad_norm": 0.4501963257789612,
"kl": 0.011977697955444455,
"learning_rate": 1e-06,
"loss": -0.0496,
"num_tokens": 1932468.0,
"reward": 0.37487921118736267,
"reward_std": 0.29262858629226685,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.16666666666666663,
"completions/max_length": 143.0,
"completions/max_terminated_length": 143.0,
"completions/mean_length": 113.91666666666667,
"completions/mean_terminated_length": 136.7,
"completions/min_length": 0.0,
"completions/min_terminated_length": 120.0,
"epoch": 0.21,
"format_failures": 0.0,
"grad_norm": 3.2958946228027344,
"kl": 0.024902154691517353,
"learning_rate": 1e-06,
"loss": 0.0181,
"num_tokens": 1942992.0,
"reward": 0.5,
"reward_std": 0.5222329497337341,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 714.0,
"completions/max_terminated_length": 714.0,
"completions/mean_length": 166.0,
"completions/mean_terminated_length": 181.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 57.0,
"epoch": 0.212,
"format_failures": 0.0,
"grad_norm": 1.3716078996658325,
"kl": 1.098541870713234,
"learning_rate": 1e-06,
"loss": 0.0299,
"num_tokens": 1964208.0,
"reward": 0.07500000298023224,
"reward_std": 0.17645499110221863,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 321.0,
"completions/max_terminated_length": 321.0,
"completions/mean_length": 171.58333333333334,
"completions/mean_terminated_length": 187.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 100.0,
"epoch": 0.214,
"format_failures": 2.0,
"grad_norm": 0.27850034832954407,
"kl": 0.020487794652581215,
"learning_rate": 1e-06,
"loss": 0.0329,
"num_tokens": 1974972.0,
"reward": 0.4126984477043152,
"reward_std": 0.18834668397903442,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 54.0,
"completions/max_terminated_length": 54.0,
"completions/mean_length": 45.416666666666664,
"completions/mean_terminated_length": 49.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.216,
"format_failures": 0.0,
"grad_norm": 2.118313789367676,
"kl": 0.03025034721940756,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 1981716.0,
"reward": 0.8333333730697632,
"reward_std": 0.38924944400787354,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 233.0,
"completions/max_terminated_length": 233.0,
"completions/mean_length": 117.5,
"completions/mean_terminated_length": 128.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 88.0,
"epoch": 0.218,
"format_failures": 0.0,
"grad_norm": 1.9193243980407715,
"kl": 0.04295819811522961,
"learning_rate": 1e-06,
"loss": 0.009,
"num_tokens": 1992420.0,
"reward": 0.701388955116272,
"reward_std": 0.38302528858184814,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 131.0,
"completions/max_terminated_length": 131.0,
"completions/mean_length": 108.66666666666667,
"completions/mean_terminated_length": 118.54545454545455,
"completions/min_length": 0.0,
"completions/min_terminated_length": 92.0,
"epoch": 0.22,
"format_failures": 0.0,
"grad_norm": 4.0581183433532715,
"kl": 0.34252697695046663,
"learning_rate": 1e-06,
"loss": -0.014,
"num_tokens": 2004288.0,
"reward": 0.479166716337204,
"reward_std": 0.30592837929725647,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 192.0,
"completions/max_terminated_length": 192.0,
"completions/mean_length": 129.0,
"completions/mean_terminated_length": 140.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 112.0,
"epoch": 0.222,
"format_failures": 0.0,
"grad_norm": 2.901212692260742,
"kl": 0.451558455824852,
"learning_rate": 1e-06,
"loss": 0.0047,
"num_tokens": 2021400.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 174.0,
"completions/max_terminated_length": 174.0,
"completions/mean_length": 147.08333333333334,
"completions/mean_terminated_length": 160.45454545454547,
"completions/min_length": 0.0,
"completions/min_terminated_length": 65.0,
"epoch": 0.224,
"format_failures": 0.0,
"grad_norm": 3.0557456016540527,
"kl": 0.1749698342755437,
"learning_rate": 1e-06,
"loss": 0.0461,
"num_tokens": 2033580.0,
"reward": 0.7708333730697632,
"reward_std": 0.32784304022789,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 274.0,
"completions/max_terminated_length": 274.0,
"completions/mean_length": 81.75,
"completions/mean_terminated_length": 89.18181818181819,
"completions/min_length": 0.0,
"completions/min_terminated_length": 41.0,
"epoch": 0.226,
"format_failures": 0.0,
"grad_norm": 2.929105281829834,
"kl": 1.0704956352710724,
"learning_rate": 1e-06,
"loss": -0.1432,
"num_tokens": 2065740.0,
"reward": 0.6625000238418579,
"reward_std": 0.3711928129196167,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 796.0,
"completions/max_terminated_length": 796.0,
"completions/mean_length": 420.5,
"completions/mean_terminated_length": 458.72727272727275,
"completions/min_length": 0.0,
"completions/min_terminated_length": 171.0,
"epoch": 0.228,
"format_failures": 0.0,
"grad_norm": 0.966941237449646,
"kl": 0.012734876945614815,
"learning_rate": 1e-06,
"loss": -0.0432,
"num_tokens": 2101236.0,
"reward": 0.6500000357627869,
"reward_std": 0.40886637568473816,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 302.0,
"completions/max_terminated_length": 302.0,
"completions/mean_length": 263.75,
"completions/mean_terminated_length": 287.72727272727275,
"completions/min_length": 0.0,
"completions/min_terminated_length": 280.0,
"epoch": 0.23,
"format_failures": 0.0,
"grad_norm": 7.276376247406006,
"kl": 2.2721076011657715,
"learning_rate": 1e-06,
"loss": 0.0151,
"num_tokens": 2114484.0,
"reward": 0.7777778506278992,
"reward_std": 0.3576955795288086,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 188.0,
"completions/max_terminated_length": 188.0,
"completions/mean_length": 167.41666666666666,
"completions/mean_terminated_length": 182.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.232,
"format_failures": 0.0,
"grad_norm": 0.6819717884063721,
"kl": 0.020047412253916264,
"learning_rate": 1e-06,
"loss": 0.0179,
"num_tokens": 2125992.0,
"reward": 0.8819445371627808,
"reward_std": 0.2524084150791168,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 394.0,
"completions/max_terminated_length": 394.0,
"completions/mean_length": 211.33333333333334,
"completions/mean_terminated_length": 230.54545454545453,
"completions/min_length": 0.0,
"completions/min_terminated_length": 147.0,
"epoch": 0.234,
"format_failures": 0.0,
"grad_norm": 0.19310350716114044,
"kl": 0.019224281422793865,
"learning_rate": 1e-06,
"loss": 0.012,
"num_tokens": 2137692.0,
"reward": 0.585936427116394,
"reward_std": 0.09784586727619171,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 205.0,
"completions/max_terminated_length": 205.0,
"completions/mean_length": 142.16666666666666,
"completions/mean_terminated_length": 155.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 110.0,
"epoch": 0.236,
"format_failures": 0.0,
"grad_norm": 2.085691213607788,
"kl": 0.09273007325828075,
"learning_rate": 1e-06,
"loss": 0.0139,
"num_tokens": 2148816.0,
"reward": 0.319444477558136,
"reward_std": 0.2289450317621231,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 583.0,
"completions/max_terminated_length": 583.0,
"completions/mean_length": 317.0833333333333,
"completions/mean_terminated_length": 345.90909090909093,
"completions/min_length": 0.0,
"completions/min_terminated_length": 140.0,
"epoch": 0.238,
"format_failures": 0.0,
"grad_norm": 0.37083595991134644,
"kl": 0.0630851686000824,
"learning_rate": 1e-06,
"loss": 0.0918,
"num_tokens": 2168256.0,
"reward": 0.37870368361473083,
"reward_std": 0.2895275950431824,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 306.0,
"completions/max_terminated_length": 306.0,
"completions/mean_length": 126.66666666666667,
"completions/mean_terminated_length": 138.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.24,
"format_failures": 0.0,
"grad_norm": 6.606923580169678,
"kl": 3.8295647501945496,
"learning_rate": 1e-06,
"loss": 0.1365,
"num_tokens": 2183124.0,
"reward": 0.4027777910232544,
"reward_std": 0.3723955750465393,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5833333333333333,
"completions/max_length": 77.0,
"completions/max_terminated_length": 77.0,
"completions/mean_length": 32.083333333333336,
"completions/mean_terminated_length": 77.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 77.0,
"epoch": 0.242,
"format_failures": 0.0,
"grad_norm": 0.08047831058502197,
"kl": 0.013985397294163704,
"learning_rate": 1e-06,
"loss": 0.0003,
"num_tokens": 2190396.0,
"reward": 1.0,
"reward_std": 0.0,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 595.0,
"completions/max_terminated_length": 595.0,
"completions/mean_length": 431.0833333333333,
"completions/mean_terminated_length": 470.27272727272725,
"completions/min_length": 0.0,
"completions/min_terminated_length": 109.0,
"epoch": 0.244,
"format_failures": 0.0,
"grad_norm": 0.019394446164369583,
"kl": 0.01961024198681116,
"learning_rate": 1e-06,
"loss": 0.0001,
"num_tokens": 2218320.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 454.0,
"completions/max_terminated_length": 454.0,
"completions/mean_length": 284.9166666666667,
"completions/mean_terminated_length": 310.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 118.0,
"epoch": 0.246,
"format_failures": 0.0,
"grad_norm": 1.5184653997421265,
"kl": 1.0404187738895416,
"learning_rate": 1e-06,
"loss": -0.0335,
"num_tokens": 2231256.0,
"reward": 0.4014219641685486,
"reward_std": 0.31073111295700073,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 141.0,
"completions/max_terminated_length": 141.0,
"completions/mean_length": 64.75,
"completions/mean_terminated_length": 70.63636363636364,
"completions/min_length": 0.0,
"completions/min_terminated_length": 34.0,
"epoch": 0.248,
"format_failures": 0.0,
"grad_norm": 1.6326740980148315,
"kl": 0.3745545968413353,
"learning_rate": 1e-06,
"loss": 0.0517,
"num_tokens": 2240424.0,
"reward": 0.8037037253379822,
"reward_std": 0.3365945816040039,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6666666666666667,
"completions/max_length": 117.0,
"completions/max_terminated_length": 117.0,
"completions/mean_length": 37.75,
"completions/mean_terminated_length": 113.25,
"completions/min_length": 0.0,
"completions/min_terminated_length": 102.0,
"epoch": 0.25,
"format_failures": 0.0,
"grad_norm": 10.052517890930176,
"kl": 1.53599963337183,
"learning_rate": 1e-06,
"loss": -0.0049,
"num_tokens": 2249424.0,
"reward": 0.9166666865348816,
"reward_std": 0.28867512941360474,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 235.0,
"completions/max_terminated_length": 235.0,
"completions/mean_length": 199.5,
"completions/mean_terminated_length": 217.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 160.0,
"epoch": 0.252,
"format_failures": 0.0,
"grad_norm": 1.1388990879058838,
"kl": 0.24531831266358495,
"learning_rate": 1e-06,
"loss": 0.0013,
"num_tokens": 2263584.0,
"reward": 0.0,
"reward_std": 0.0,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 142.0,
"completions/max_terminated_length": 142.0,
"completions/mean_length": 125.0,
"completions/mean_terminated_length": 136.36363636363637,
"completions/min_length": 0.0,
"completions/min_terminated_length": 123.0,
"epoch": 0.254,
"format_failures": 0.0,
"grad_norm": 2.392914056777954,
"kl": 0.9988721050322056,
"learning_rate": 1e-06,
"loss": -0.0025,
"num_tokens": 2276520.0,
"reward": 0.7291666865348816,
"reward_std": 0.3608439266681671,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 199.0,
"completions/max_terminated_length": 199.0,
"completions/mean_length": 134.08333333333334,
"completions/mean_terminated_length": 146.27272727272728,
"completions/min_length": 0.0,
"completions/min_terminated_length": 106.0,
"epoch": 0.256,
"format_failures": 0.0,
"grad_norm": 0.5191885828971863,
"kl": 0.20999768376350403,
"learning_rate": 1e-06,
"loss": 0.0146,
"num_tokens": 2286408.0,
"reward": 0.717815101146698,
"reward_std": 0.14373189210891724,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 229.0,
"completions/max_terminated_length": 229.0,
"completions/mean_length": 137.75,
"completions/mean_terminated_length": 150.27272727272728,
"completions/min_length": 0.0,
"completions/min_terminated_length": 98.0,
"epoch": 0.258,
"format_failures": 0.0,
"grad_norm": 1.204528570175171,
"kl": 0.08800000417977571,
"learning_rate": 1e-06,
"loss": 0.0511,
"num_tokens": 2296044.0,
"reward": 0.5675595998764038,
"reward_std": 0.2289842963218689,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 198.0,
"completions/max_terminated_length": 198.0,
"completions/mean_length": 124.58333333333333,
"completions/mean_terminated_length": 135.9090909090909,
"completions/min_length": 0.0,
"completions/min_terminated_length": 54.0,
"epoch": 0.26,
"format_failures": 0.0,
"grad_norm": 0.44312867522239685,
"kl": 0.07202759943902493,
"learning_rate": 1e-06,
"loss": 0.0475,
"num_tokens": 2305644.0,
"reward": 0.5101972222328186,
"reward_std": 0.19489067792892456,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 328.0,
"completions/max_terminated_length": 328.0,
"completions/mean_length": 281.1666666666667,
"completions/mean_terminated_length": 306.72727272727275,
"completions/min_length": 0.0,
"completions/min_terminated_length": 253.0,
"epoch": 0.262,
"format_failures": 1.0,
"grad_norm": 1.5526983737945557,
"kl": 0.06795010529458523,
"learning_rate": 1e-06,
"loss": -0.0019,
"num_tokens": 2319192.0,
"reward": 0.75,
"reward_std": 0.3217690885066986,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 183.0,
"completions/max_terminated_length": 183.0,
"completions/mean_length": 162.83333333333334,
"completions/mean_terminated_length": 177.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 175.0,
"epoch": 0.264,
"format_failures": 0.0,
"grad_norm": 2.740288257598877,
"kl": 0.7462278339080513,
"learning_rate": 1e-06,
"loss": 0.0045,
"num_tokens": 2329488.0,
"reward": 0.9791666865348816,
"reward_std": 0.07216878235340118,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 532.0,
"completions/max_terminated_length": 532.0,
"completions/mean_length": 315.5,
"completions/mean_terminated_length": 344.1818181818182,
"completions/min_length": 0.0,
"completions/min_terminated_length": 233.0,
"epoch": 0.266,
"format_failures": 0.0,
"grad_norm": 0.11069951951503754,
"kl": 0.01982728624716401,
"learning_rate": 1e-06,
"loss": -0.034,
"num_tokens": 2358276.0,
"reward": 0.5852844715118408,
"reward_std": 0.12080158293247223,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 269.0,
"completions/max_terminated_length": 269.0,
"completions/mean_length": 161.16666666666666,
"completions/mean_terminated_length": 175.8181818181818,
"completions/min_length": 0.0,
"completions/min_terminated_length": 104.0,
"epoch": 0.268,
"format_failures": 0.0,
"grad_norm": 0.8276861906051636,
"kl": 0.09472572058439255,
"learning_rate": 1e-06,
"loss": 0.0149,
"num_tokens": 2368980.0,
"reward": 0.6518849730491638,
"reward_std": 0.2886110842227936,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 325.0,
"completions/max_terminated_length": 325.0,
"completions/mean_length": 227.08333333333334,
"completions/mean_terminated_length": 247.72727272727272,
"completions/min_length": 0.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.27,
"format_failures": 0.0,
"grad_norm": 0.5550012588500977,
"kl": 0.02074157353490591,
"learning_rate": 1e-06,
"loss": -0.0841,
"num_tokens": 2379828.0,
"reward": 0.6243386268615723,
"reward_std": 0.3905191719532013,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 232.0,
"completions/max_terminated_length": 232.0,
"completions/mean_length": 210.0,
"completions/mean_terminated_length": 229.0909090909091,
"completions/min_length": 0.0,
"completions/min_terminated_length": 228.0,
"epoch": 0.272,
"format_failures": 0.0,
"grad_norm": 1.019722580909729,
"kl": 0.13905800506472588,
"learning_rate": 1e-06,
"loss": 0.0123,
"num_tokens": 2394360.0,
"reward": 0.949999988079071,
"reward_std": 0.17320507764816284,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 348.0,
"completions/max_terminated_length": 348.0,
"completions/mean_length": 215.0,
"completions/mean_terminated_length": 234.54545454545453,
"completions/min_length": 0.0,
"completions/min_terminated_length": 145.0,
"epoch": 0.274,
"format_failures": 0.0,
"grad_norm": 0.32402342557907104,
"kl": 0.014864406548440456,
"learning_rate": 1e-06,
"loss": -0.0012,
"num_tokens": 2406096.0,
"reward": 0.6149470806121826,
"reward_std": 0.19829140603542328,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 323.0,
"completions/max_terminated_length": 323.0,
"completions/mean_length": 136.58333333333334,
"completions/mean_terminated_length": 149.0,
"completions/min_length": 0.0,
"completions/min_terminated_length": 58.0,
"epoch": 0.276,
"format_failures": 0.0,
"grad_norm": 1.005679965019226,
"kl": 0.023909798823297024,
"learning_rate": 1e-06,
"loss": -0.0608,
"num_tokens": 2423568.0,
"reward": 0.5231481790542603,
"reward_std": 0.3425479829311371,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08333333333333337,
"completions/max_length": 241.0,
"completions/max_terminated_length": 241.0,
"completions/mean_length": 165.58333333333334,
"completions/mean_terminated_length": 180.63636363636363,
"completions/min_length": 0.0,
"completions/min_terminated_length": 75.0,
"epoch": 0.278,
"format_failures": 0.0,
"grad_norm": 3.9986395835876465,
"kl": 2.975656658411026,
"learning_rate": 1e-06,
"loss": -0.0003,
"num_tokens": 2437320.0,
"reward": 0.7277778387069702,
"reward_std": 0.4172621965408325,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.33333333333333337,
"completions/max_length": 55.0,
"completions/max_terminated_length": 55.0,
"completions/mean_length": 36.5,
"completions/mean_terminated_length": 54.75,
"completions/min_length": 0.0,
"completions/min_terminated_length": 53.0,
"epoch": 0.28,
"format_failures": 0.0,
"grad_norm": 0.04945458099246025,
"kl": 0.008955058641731739,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 2449116.0,
"reward": 1.0,
"reward_std": 0.0,
"step": 140
}
],
"logging_steps": 1,
"max_steps": 1000,
"num_input_tokens_seen": 2449116,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}