Qwen3-1.7B-FC / trainer_state.json
nguyenhuy's picture
Upload Qwen3-1.7B-FC model (RLVR fine-tuned for function calling)
02cd7b9 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.11048059056897504,
"eval_steps": 500,
"global_step": 1100,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 459.2,
"completions/max_terminated_length": 272.7,
"completions/mean_length": 76.24375,
"completions/mean_terminated_length": 64.11458358764648,
"completions/min_length": 16.8,
"completions/min_terminated_length": 16.8,
"epoch": 0.0010043690051725004,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 1.1999999999999998e-08,
"loss": 0.0208,
"num_tokens": 108131.0,
"reward": 1.2312812566757203,
"reward_std": 0.05931956073036417,
"rewards/combined_reward/mean": 1.2312812566757203,
"rewards/combined_reward/std": 0.4361365109682083,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 330.9,
"completions/max_terminated_length": 147.6,
"completions/mean_length": 75.425,
"completions/mean_terminated_length": 61.425418090820315,
"completions/min_length": 13.0,
"completions/min_terminated_length": 13.0,
"epoch": 0.002008738010345001,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 2.5333333333333335e-08,
"loss": 0.0279,
"num_tokens": 233579.0,
"reward": 1.3428645849227905,
"reward_std": 0.029872814007103444,
"rewards/combined_reward/mean": 1.3428645849227905,
"rewards/combined_reward/std": 0.3860916443169117,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 110.6,
"completions/max_terminated_length": 110.6,
"completions/mean_length": 51.04375,
"completions/mean_terminated_length": 51.04375,
"completions/min_length": 16.7,
"completions/min_terminated_length": 16.7,
"epoch": 0.003013107015517501,
"frac_reward_zero_std": 0.875,
"grad_norm": 3.3646392822265625,
"learning_rate": 3.866666666666666e-08,
"loss": -0.0132,
"num_tokens": 352258.0,
"reward": 1.323312509059906,
"reward_std": 0.05337500050663948,
"rewards/combined_reward/mean": 1.323312509059906,
"rewards/combined_reward/std": 0.39539981335401536,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 307.1,
"completions/max_terminated_length": 211.8,
"completions/mean_length": 95.83125,
"completions/mean_terminated_length": 60.24375,
"completions/min_length": 11.5,
"completions/min_terminated_length": 11.5,
"epoch": 0.004017476020690002,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 5.2e-08,
"loss": 0.0143,
"num_tokens": 485155.0,
"reward": 1.2628658890724183,
"reward_std": 0.03280075653456151,
"rewards/combined_reward/mean": 1.2628658890724183,
"rewards/combined_reward/std": 0.4110621690750122,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 140.5,
"completions/max_terminated_length": 140.5,
"completions/mean_length": 61.7875,
"completions/mean_terminated_length": 61.7875,
"completions/min_length": 23.7,
"completions/min_terminated_length": 23.7,
"epoch": 0.005021845025862502,
"frac_reward_zero_std": 0.95,
"grad_norm": 1.8525996208190918,
"learning_rate": 6.533333333333332e-08,
"loss": 0.0147,
"num_tokens": 607629.0,
"reward": 1.3795833349227906,
"reward_std": 0.00583496168255806,
"rewards/combined_reward/mean": 1.3795833349227906,
"rewards/combined_reward/std": 0.30837071537971494,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 338.5,
"completions/max_terminated_length": 238.4,
"completions/mean_length": 102.60625,
"completions/mean_terminated_length": 91.32791748046876,
"completions/min_length": 21.6,
"completions/min_terminated_length": 21.6,
"epoch": 0.006026214031035002,
"frac_reward_zero_std": 0.9,
"grad_norm": 2.992983818054199,
"learning_rate": 7.866666666666666e-08,
"loss": 0.0045,
"num_tokens": 728802.0,
"reward": 1.3164896011352538,
"reward_std": 0.02619450243655592,
"rewards/combined_reward/mean": 1.3164896011352538,
"rewards/combined_reward/std": 0.3474510669708252,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 128.0,
"completions/max_terminated_length": 128.0,
"completions/mean_length": 62.81875,
"completions/mean_terminated_length": 61.769583511352536,
"completions/min_length": 20.2,
"completions/min_terminated_length": 20.2,
"epoch": 0.007030583036207502,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 9.2e-08,
"loss": 0.0098,
"num_tokens": 836341.0,
"reward": 1.355798614025116,
"reward_std": 0.004375000763684511,
"rewards/combined_reward/mean": 1.355798614025116,
"rewards/combined_reward/std": 0.29267608374357224,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 110.2,
"completions/max_terminated_length": 110.2,
"completions/mean_length": 54.3375,
"completions/mean_terminated_length": 54.3375,
"completions/min_length": 24.0,
"completions/min_terminated_length": 24.0,
"epoch": 0.008034952041380003,
"frac_reward_zero_std": 0.925,
"grad_norm": 13.820528984069824,
"learning_rate": 1.0533333333333332e-07,
"loss": 0.0119,
"num_tokens": 945703.0,
"reward": 1.4564843893051147,
"reward_std": 0.003906251955777406,
"rewards/combined_reward/mean": 1.4564843893051147,
"rewards/combined_reward/std": 0.1776508768554777,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 201.9,
"completions/max_terminated_length": 201.9,
"completions/mean_length": 70.6125,
"completions/mean_terminated_length": 70.6125,
"completions/min_length": 21.5,
"completions/min_terminated_length": 21.5,
"epoch": 0.009039321046552503,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.1866666666666667e-07,
"loss": 0.0195,
"num_tokens": 1062961.0,
"reward": 1.3238854348659514,
"reward_std": 0.005562501423992216,
"rewards/combined_reward/mean": 1.3238854348659514,
"rewards/combined_reward/std": 0.22054901346564293,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 113.7,
"completions/max_terminated_length": 113.7,
"completions/mean_length": 60.275,
"completions/mean_terminated_length": 60.275,
"completions/min_length": 24.1,
"completions/min_terminated_length": 24.1,
"epoch": 0.010043690051725004,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 1.32e-07,
"loss": 0.0058,
"num_tokens": 1175365.0,
"reward": 1.4070937514305115,
"reward_std": 0.034517763555049895,
"rewards/combined_reward/mean": 1.4070937514305115,
"rewards/combined_reward/std": 0.26661672741174697,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 158.5,
"completions/max_terminated_length": 158.5,
"completions/mean_length": 65.46875,
"completions/mean_terminated_length": 64.41750030517578,
"completions/min_length": 19.1,
"completions/min_terminated_length": 19.1,
"epoch": 0.011048059056897505,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.4533333333333334e-07,
"loss": 0.0019,
"num_tokens": 1288772.0,
"reward": 1.2793750286102294,
"reward_std": 0.0024999996647238733,
"rewards/combined_reward/mean": 1.2793750286102294,
"rewards/combined_reward/std": 0.31086390763521193,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 322.2,
"completions/max_terminated_length": 134.1,
"completions/mean_length": 77.01875,
"completions/mean_terminated_length": 64.77458343505859,
"completions/min_length": 20.7,
"completions/min_terminated_length": 20.7,
"epoch": 0.012052428062070004,
"frac_reward_zero_std": 0.9,
"grad_norm": 8.276171684265137,
"learning_rate": 1.5866666666666666e-07,
"loss": 0.0134,
"num_tokens": 1403035.0,
"reward": 1.3504362106323242,
"reward_std": 0.030459362699184568,
"rewards/combined_reward/mean": 1.3504362106323242,
"rewards/combined_reward/std": 0.309928272664547,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 102.1,
"completions/max_terminated_length": 102.1,
"completions/mean_length": 61.0625,
"completions/mean_terminated_length": 61.0625,
"completions/min_length": 31.5,
"completions/min_terminated_length": 31.5,
"epoch": 0.013056797067242505,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.7199999999999998e-07,
"loss": -0.0027,
"num_tokens": 1524697.0,
"reward": 1.361527794599533,
"reward_std": 0.008749999664723873,
"rewards/combined_reward/mean": 1.361527794599533,
"rewards/combined_reward/std": 0.2736371263861656,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 123.4,
"completions/max_terminated_length": 123.4,
"completions/mean_length": 58.05,
"completions/mean_terminated_length": 58.05,
"completions/min_length": 16.8,
"completions/min_terminated_length": 16.8,
"epoch": 0.014061166072415004,
"frac_reward_zero_std": 0.95,
"grad_norm": 3.9411776065826416,
"learning_rate": 1.8533333333333333e-07,
"loss": 0.0062,
"num_tokens": 1622389.0,
"reward": 1.3123229265213012,
"reward_std": 0.03212499991059303,
"rewards/combined_reward/mean": 1.3123229265213012,
"rewards/combined_reward/std": 0.35334871551021935,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 333.1,
"completions/max_terminated_length": 135.6,
"completions/mean_length": 111.125,
"completions/mean_terminated_length": 61.191666793823245,
"completions/min_length": 21.8,
"completions/min_terminated_length": 21.8,
"epoch": 0.015065535077587506,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9866666666666665e-07,
"loss": 0.0039,
"num_tokens": 1734901.0,
"reward": 1.2678720355033875,
"reward_std": 0.0006250014062970877,
"rewards/combined_reward/mean": 1.2678720355033875,
"rewards/combined_reward/std": 0.2531693406403065,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 85.7,
"completions/max_terminated_length": 85.7,
"completions/mean_length": 48.81875,
"completions/mean_terminated_length": 48.81875,
"completions/min_length": 17.9,
"completions/min_terminated_length": 17.9,
"epoch": 0.016069904082760007,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9999507890797406e-07,
"loss": 0.0046,
"num_tokens": 1847536.0,
"reward": 1.345395851135254,
"reward_std": 0.0016666671261191368,
"rewards/combined_reward/mean": 1.345395851135254,
"rewards/combined_reward/std": 0.29257251909002663,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 467.7,
"completions/max_terminated_length": 277.7,
"completions/mean_length": 144.23125,
"completions/mean_terminated_length": 95.81041717529297,
"completions/min_length": 28.9,
"completions/min_terminated_length": 28.9,
"epoch": 0.017074273087932506,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.9997806834748455e-07,
"loss": -0.0018,
"num_tokens": 1970837.0,
"reward": 1.3027083039283753,
"reward_std": 0.004424501396715641,
"rewards/combined_reward/mean": 1.3027083039283753,
"rewards/combined_reward/std": 0.4294335596263409,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 99.5,
"completions/max_terminated_length": 99.5,
"completions/mean_length": 50.44375,
"completions/mean_terminated_length": 50.44375,
"completions/min_length": 14.6,
"completions/min_terminated_length": 14.6,
"epoch": 0.018078642093105005,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9994890963073946e-07,
"loss": 0.0059,
"num_tokens": 2088820.0,
"reward": 1.2765364408493043,
"reward_std": 0.00015624959487468005,
"rewards/combined_reward/mean": 1.2765364408493043,
"rewards/combined_reward/std": 0.3481216669082642,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 170.0,
"completions/max_terminated_length": 170.0,
"completions/mean_length": 67.0,
"completions/mean_terminated_length": 67.0,
"completions/min_length": 15.8,
"completions/min_terminated_length": 15.8,
"epoch": 0.019083011098277508,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.9990760630076236e-07,
"loss": -0.0197,
"num_tokens": 2217116.0,
"reward": 1.3771250247955322,
"reward_std": 0.001916667865589261,
"rewards/combined_reward/mean": 1.3771250247955322,
"rewards/combined_reward/std": 0.29997652024030685,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 99.2,
"completions/max_terminated_length": 99.2,
"completions/mean_length": 41.91875,
"completions/mean_terminated_length": 41.91875,
"completions/min_length": 12.8,
"completions/min_terminated_length": 12.8,
"epoch": 0.020087380103450007,
"frac_reward_zero_std": 0.95,
"grad_norm": 3.989150047302246,
"learning_rate": 1.99854163376247e-07,
"loss": 0.0011,
"num_tokens": 2329863.0,
"reward": 1.1117187559604644,
"reward_std": 0.02916821506805718,
"rewards/combined_reward/mean": 1.1117187559604644,
"rewards/combined_reward/std": 0.37413454949855807,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 408.0,
"completions/max_terminated_length": 220.7,
"completions/mean_length": 133.575,
"completions/mean_terminated_length": 84.2875,
"completions/min_length": 25.0,
"completions/min_terminated_length": 25.0,
"epoch": 0.021091749108622507,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 1.9978858735094754e-07,
"loss": 0.0285,
"num_tokens": 2457743.0,
"reward": 1.3693958520889282,
"reward_std": 0.004563984216656536,
"rewards/combined_reward/mean": 1.3693958520889282,
"rewards/combined_reward/std": 0.33579447590745987,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 115.4,
"completions/max_terminated_length": 115.4,
"completions/mean_length": 60.24375,
"completions/mean_terminated_length": 60.24375,
"completions/min_length": 20.8,
"completions/min_terminated_length": 20.8,
"epoch": 0.02209611811379501,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.9971088619288948e-07,
"loss": 0.0,
"num_tokens": 2581282.0,
"reward": 1.284375011920929,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.284375011920929,
"rewards/combined_reward/std": 0.3291483834385872,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 110.9,
"completions/max_terminated_length": 110.9,
"completions/mean_length": 52.08125,
"completions/mean_terminated_length": 51.73625030517578,
"completions/min_length": 15.5,
"completions/min_terminated_length": 15.5,
"epoch": 0.02310048711896751,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.996210693434016e-07,
"loss": 0.0,
"num_tokens": 2716695.0,
"reward": 1.3078229188919068,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.3078229188919068,
"rewards/combined_reward/std": 0.3146174341440201,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 316.8,
"completions/max_terminated_length": 296.7,
"completions/mean_length": 106.325,
"completions/mean_terminated_length": 71.55961608886719,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.024104856124140008,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.9951914771596858e-07,
"loss": 0.0,
"num_tokens": 2820347.0,
"reward": 1.2994583308696748,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.2994583308696748,
"rewards/combined_reward/std": 0.35011555850505827,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0125,
"completions/max_length": 105.0,
"completions/max_terminated_length": 105.0,
"completions/mean_length": 58.80625,
"completions/mean_terminated_length": 57.67589340209961,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.02510922512931251,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.9940513369490513e-07,
"loss": 0.0119,
"num_tokens": 2937640.0,
"reward": 1.2942708253860473,
"reward_std": 0.0020473659737035633,
"rewards/combined_reward/mean": 1.2942708253860473,
"rewards/combined_reward/std": 0.34473495446145536,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 136.2,
"completions/max_terminated_length": 136.2,
"completions/mean_length": 68.56875,
"completions/mean_terminated_length": 68.56875,
"completions/min_length": 27.3,
"completions/min_terminated_length": 27.3,
"epoch": 0.02611359413448501,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.9927904113385096e-07,
"loss": 0.0134,
"num_tokens": 3051799.0,
"reward": 1.3380468726158141,
"reward_std": 0.00270459558814764,
"rewards/combined_reward/mean": 1.3380468726158141,
"rewards/combined_reward/std": 0.28382683396339414,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 172.2,
"completions/max_terminated_length": 172.2,
"completions/mean_length": 72.875,
"completions/mean_terminated_length": 72.875,
"completions/min_length": 26.4,
"completions/min_terminated_length": 26.4,
"epoch": 0.02711796313965751,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.9914088535408765e-07,
"loss": -0.0019,
"num_tokens": 3164803.0,
"reward": 1.4464478969573975,
"reward_std": 0.0021736113354563712,
"rewards/combined_reward/mean": 1.4464478969573975,
"rewards/combined_reward/std": 0.19929498732089995,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 140.5,
"completions/max_terminated_length": 140.5,
"completions/mean_length": 59.38125,
"completions/mean_terminated_length": 59.38125,
"completions/min_length": 15.9,
"completions/min_terminated_length": 15.9,
"epoch": 0.02812233214483001,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9899068314267685e-07,
"loss": 0.001,
"num_tokens": 3280220.0,
"reward": 1.3454687356948853,
"reward_std": 0.004999999329447747,
"rewards/combined_reward/mean": 1.3454687356948853,
"rewards/combined_reward/std": 0.31286893486976625,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 94.7,
"completions/max_terminated_length": 94.7,
"completions/mean_length": 55.0,
"completions/mean_terminated_length": 55.0,
"completions/min_length": 22.0,
"completions/min_terminated_length": 22.0,
"epoch": 0.029126701150002512,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.9882845275042067e-07,
"loss": 0.0065,
"num_tokens": 3385228.0,
"reward": 1.4142057299613953,
"reward_std": 0.00044270951766520736,
"rewards/combined_reward/mean": 1.4142057299613953,
"rewards/combined_reward/std": 0.20944447480142117,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 173.3,
"completions/max_terminated_length": 173.3,
"completions/mean_length": 76.13125,
"completions/mean_terminated_length": 76.13125,
"completions/min_length": 23.4,
"completions/min_terminated_length": 23.4,
"epoch": 0.03013107015517501,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9865421388964382e-07,
"loss": -0.0017,
"num_tokens": 3496189.0,
"reward": 1.3910624980926514,
"reward_std": 0.0021650632843375206,
"rewards/combined_reward/mean": 1.3910624980926514,
"rewards/combined_reward/std": 0.28597628474235537,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 315.4,
"completions/max_terminated_length": 315.4,
"completions/mean_length": 99.93125,
"completions/mean_terminated_length": 99.93125,
"completions/min_length": 18.4,
"completions/min_terminated_length": 18.4,
"epoch": 0.03113543916034751,
"frac_reward_zero_std": 0.9,
"grad_norm": 3.8047702312469482,
"learning_rate": 1.9846798773179865e-07,
"loss": 0.0118,
"num_tokens": 3602282.0,
"reward": 1.2963680744171142,
"reward_std": 0.01609460562467575,
"rewards/combined_reward/mean": 1.2963680744171142,
"rewards/combined_reward/std": 0.3926819786429405,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 103.8,
"completions/max_terminated_length": 103.8,
"completions/mean_length": 52.2875,
"completions/mean_terminated_length": 52.2875,
"completions/min_length": 20.6,
"completions/min_terminated_length": 20.6,
"epoch": 0.03213980816552001,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9826979690489249e-07,
"loss": 0.0014,
"num_tokens": 3717904.0,
"reward": 1.403697907924652,
"reward_std": 0.0003125001909211278,
"rewards/combined_reward/mean": 1.403697907924652,
"rewards/combined_reward/std": 0.24410614371299744,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 79.8,
"completions/max_terminated_length": 79.8,
"completions/mean_length": 44.49375,
"completions/mean_terminated_length": 44.49375,
"completions/min_length": 16.1,
"completions/min_terminated_length": 16.1,
"epoch": 0.03314417717069251,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 1.9805966549073822e-07,
"loss": 0.0057,
"num_tokens": 3825867.0,
"reward": 1.3135937452316284,
"reward_std": 0.007812501117587089,
"rewards/combined_reward/mean": 1.3135937452316284,
"rewards/combined_reward/std": 0.3756252348423004,
"step": 330
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 117.9,
"completions/max_terminated_length": 117.9,
"completions/mean_length": 54.15,
"completions/mean_terminated_length": 54.15,
"completions/min_length": 15.8,
"completions/min_terminated_length": 15.8,
"epoch": 0.03414854617586501,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9783761902202812e-07,
"loss": 0.0067,
"num_tokens": 3942087.0,
"reward": 1.290208351612091,
"reward_std": 0.0010206203907728196,
"rewards/combined_reward/mean": 1.290208351612091,
"rewards/combined_reward/std": 0.27491325289011004,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 89.2,
"completions/max_terminated_length": 89.2,
"completions/mean_length": 45.46875,
"completions/mean_terminated_length": 45.46875,
"completions/min_length": 12.9,
"completions/min_terminated_length": 12.9,
"epoch": 0.03515291518103751,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.9760368447923143e-07,
"loss": 0.0,
"num_tokens": 4077218.0,
"reward": 1.271875011920929,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.271875011920929,
"rewards/combined_reward/std": 0.3903637401759624,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 86.1,
"completions/max_terminated_length": 86.1,
"completions/mean_length": 47.9125,
"completions/mean_terminated_length": 47.9125,
"completions/min_length": 19.1,
"completions/min_terminated_length": 19.1,
"epoch": 0.03615728418621001,
"frac_reward_zero_std": 0.925,
"grad_norm": 5.847682952880859,
"learning_rate": 1.9735789028731602e-07,
"loss": -0.0023,
"num_tokens": 4189144.0,
"reward": 1.3238541960716248,
"reward_std": 0.03020833432674408,
"rewards/combined_reward/mean": 1.3238541960716248,
"rewards/combined_reward/std": 0.32445888966321945,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 103.3,
"completions/max_terminated_length": 103.3,
"completions/mean_length": 55.5,
"completions/mean_terminated_length": 55.5,
"completions/min_length": 20.6,
"completions/min_terminated_length": 20.6,
"epoch": 0.03716165319138252,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.9710026631229448e-07,
"loss": 0.0001,
"num_tokens": 4294100.0,
"reward": 1.3909027934074403,
"reward_std": 0.00034722290001809597,
"rewards/combined_reward/mean": 1.3909027934074403,
"rewards/combined_reward/std": 0.2816110193729401,
"step": 370
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 119.4,
"completions/max_terminated_length": 119.4,
"completions/mean_length": 57.65625,
"completions/mean_terminated_length": 57.65625,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.038166022196555016,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9683084385759522e-07,
"loss": -0.0002,
"num_tokens": 4400477.0,
"reward": 1.333958351612091,
"reward_std": 0.0012500007636845113,
"rewards/combined_reward/mean": 1.333958351612091,
"rewards/combined_reward/std": 0.2801030218601227,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 112.6,
"completions/max_terminated_length": 112.6,
"completions/mean_length": 55.225,
"completions/mean_terminated_length": 55.225,
"completions/min_length": 18.2,
"completions/min_terminated_length": 18.2,
"epoch": 0.039170391201727515,
"frac_reward_zero_std": 0.925,
"grad_norm": 4.480510234832764,
"learning_rate": 1.9654965566025878e-07,
"loss": 0.006,
"num_tokens": 4516865.0,
"reward": 1.370369803905487,
"reward_std": 0.002187502384185791,
"rewards/combined_reward/mean": 1.370369803905487,
"rewards/combined_reward/std": 0.27093904092907906,
"step": 390
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 139.2,
"completions/max_terminated_length": 139.2,
"completions/mean_length": 55.54375,
"completions/mean_terminated_length": 55.54375,
"completions/min_length": 12.2,
"completions/min_terminated_length": 12.2,
"epoch": 0.040174760206900015,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.9625673588696007e-07,
"loss": 0.0,
"num_tokens": 4634776.0,
"reward": 1.2619999647140503,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.2619999647140503,
"rewards/combined_reward/std": 0.3673270642757416,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 106.5,
"completions/max_terminated_length": 106.5,
"completions/mean_length": 52.2875,
"completions/mean_terminated_length": 52.2875,
"completions/min_length": 13.1,
"completions/min_terminated_length": 13.1,
"epoch": 0.041179129212072514,
"frac_reward_zero_std": 0.95,
"grad_norm": 5.624104022979736,
"learning_rate": 1.959521201298568e-07,
"loss": 0.0061,
"num_tokens": 4766894.0,
"reward": 1.3308506846427917,
"reward_std": 0.003342500701546669,
"rewards/combined_reward/mean": 1.3308506846427917,
"rewards/combined_reward/std": 0.37019643262028695,
"step": 410
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 144.9,
"completions/max_terminated_length": 144.9,
"completions/mean_length": 63.63125,
"completions/mean_terminated_length": 63.63125,
"completions/min_length": 18.3,
"completions/min_terminated_length": 18.3,
"epoch": 0.042183498217245013,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.956358454022648e-07,
"loss": -0.0011,
"num_tokens": 4887883.0,
"reward": 1.3249478936195374,
"reward_std": 0.016550703253597022,
"rewards/combined_reward/mean": 1.3249478936195374,
"rewards/combined_reward/std": 0.31248683035373687,
"step": 420
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 70.8,
"completions/max_terminated_length": 70.8,
"completions/mean_length": 40.03125,
"completions/mean_terminated_length": 40.03125,
"completions/min_length": 21.5,
"completions/min_terminated_length": 21.5,
"epoch": 0.04318786722241751,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9530795013416043e-07,
"loss": -0.0062,
"num_tokens": 5017432.0,
"reward": 1.2040624856948852,
"reward_std": 0.003125,
"rewards/combined_reward/mean": 1.2040624856948852,
"rewards/combined_reward/std": 0.28724531903862954,
"step": 430
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0125,
"completions/max_length": 95.9,
"completions/max_terminated_length": 95.9,
"completions/mean_length": 47.64375,
"completions/mean_terminated_length": 46.64416732788086,
"completions/min_length": 14.4,
"completions/min_terminated_length": 14.4,
"epoch": 0.04419223622759002,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.9496847416751122e-07,
"loss": -0.0055,
"num_tokens": 5127539.0,
"reward": 1.3247395992279052,
"reward_std": 0.005520834401249885,
"rewards/combined_reward/mean": 1.3247395992279052,
"rewards/combined_reward/std": 0.353334778547287,
"step": 440
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 101.6,
"completions/max_terminated_length": 101.6,
"completions/mean_length": 53.95625,
"completions/mean_terminated_length": 53.95625,
"completions/min_length": 21.6,
"completions/min_terminated_length": 21.6,
"epoch": 0.04519660523276252,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9461745875143477e-07,
"loss": -0.0013,
"num_tokens": 5239592.0,
"reward": 1.2362499833106995,
"reward_std": 0.0016666660085320473,
"rewards/combined_reward/mean": 1.2362499833106995,
"rewards/combined_reward/std": 0.33721971064805983,
"step": 450
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 156.9,
"completions/max_terminated_length": 156.9,
"completions/mean_length": 73.56875,
"completions/mean_terminated_length": 73.56875,
"completions/min_length": 16.6,
"completions/min_terminated_length": 16.6,
"epoch": 0.04620097423793502,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.942549465371863e-07,
"loss": -0.0051,
"num_tokens": 5360759.0,
"reward": 1.364300584793091,
"reward_std": 0.0033333331346511843,
"rewards/combined_reward/mean": 1.364300584793091,
"rewards/combined_reward/std": 0.29198225438594816,
"step": 460
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 89.4,
"completions/max_terminated_length": 89.4,
"completions/mean_length": 49.9,
"completions/mean_terminated_length": 49.9,
"completions/min_length": 14.5,
"completions/min_terminated_length": 14.5,
"epoch": 0.04720534324310752,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.938809815729766e-07,
"loss": 0.0,
"num_tokens": 5489735.0,
"reward": 1.2914583563804627,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.2914583563804627,
"rewards/combined_reward/std": 0.32128691375255586,
"step": 470
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 116.8,
"completions/max_terminated_length": 116.8,
"completions/mean_length": 54.26875,
"completions/mean_terminated_length": 54.26875,
"completions/min_length": 16.8,
"completions/min_terminated_length": 16.8,
"epoch": 0.048209712248280016,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.9349560929861957e-07,
"loss": 0.0036,
"num_tokens": 5618126.0,
"reward": 1.2964062452316285,
"reward_std": 0.0034375011920928953,
"rewards/combined_reward/mean": 1.2964062452316285,
"rewards/combined_reward/std": 0.3410232897847891,
"step": 480
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 138.5,
"completions/max_terminated_length": 138.5,
"completions/mean_length": 63.425,
"completions/mean_terminated_length": 63.425,
"completions/min_length": 17.2,
"completions/min_terminated_length": 17.2,
"epoch": 0.049214081253452516,
"frac_reward_zero_std": 0.975,
"grad_norm": 5.859716892242432,
"learning_rate": 1.9309887654001093e-07,
"loss": -0.0122,
"num_tokens": 5732858.0,
"reward": 1.3710416555404663,
"reward_std": 0.005000000074505806,
"rewards/combined_reward/mean": 1.3710416555404663,
"rewards/combined_reward/std": 0.2569635409861803,
"step": 490
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 71.1,
"completions/max_terminated_length": 71.1,
"completions/mean_length": 37.5125,
"completions/mean_terminated_length": 37.5125,
"completions/min_length": 15.6,
"completions/min_terminated_length": 15.6,
"epoch": 0.05021845025862502,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.9269083150343857e-07,
"loss": 0.0,
"num_tokens": 5827508.0,
"reward": 1.2737499952316285,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.2737499952316285,
"rewards/combined_reward/std": 0.36351585388183594,
"step": 500
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 92.3,
"completions/max_terminated_length": 92.3,
"completions/mean_length": 49.31875,
"completions/mean_terminated_length": 49.31875,
"completions/min_length": 16.5,
"completions/min_terminated_length": 16.5,
"epoch": 0.05122281926379752,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.9227152376972505e-07,
"loss": 0.0,
"num_tokens": 5940043.0,
"reward": 1.3223958492279053,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.3223958492279053,
"rewards/combined_reward/std": 0.32680114805698396,
"step": 510
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 112.1,
"completions/max_terminated_length": 112.1,
"completions/mean_length": 60.84375,
"completions/mean_terminated_length": 60.84375,
"completions/min_length": 22.7,
"completions/min_terminated_length": 22.7,
"epoch": 0.05222718826897002,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.91841004288203e-07,
"loss": 0.0,
"num_tokens": 6061038.0,
"reward": 1.3749479293823241,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.3749479293823241,
"rewards/combined_reward/std": 0.2760587348602712,
"step": 520
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 383.1,
"completions/max_terminated_length": 211.9,
"completions/mean_length": 101.45,
"completions/mean_terminated_length": 89.37000045776367,
"completions/min_length": 29.4,
"completions/min_terminated_length": 29.4,
"epoch": 0.05323155727414252,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 1.913993253705246e-07,
"loss": 0.0182,
"num_tokens": 6172502.0,
"reward": 1.3482013940811157,
"reward_std": 0.004686582600697875,
"rewards/combined_reward/mean": 1.3482013940811157,
"rewards/combined_reward/std": 0.26615125834941866,
"step": 530
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 116.0,
"completions/max_terminated_length": 116.0,
"completions/mean_length": 61.33125,
"completions/mean_terminated_length": 61.33125,
"completions/min_length": 26.7,
"completions/min_terminated_length": 26.7,
"epoch": 0.05423592627931502,
"frac_reward_zero_std": 0.95,
"grad_norm": 6.519238471984863,
"learning_rate": 1.9094654068430515e-07,
"loss": -0.014,
"num_tokens": 6279539.0,
"reward": 1.456402564048767,
"reward_std": 0.0006212619598954916,
"rewards/combined_reward/mean": 1.456402564048767,
"rewards/combined_reward/std": 0.17502975650131702,
"step": 540
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 319.8,
"completions/max_terminated_length": 302.8,
"completions/mean_length": 102.7,
"completions/mean_terminated_length": 92.22833557128907,
"completions/min_length": 14.5,
"completions/min_terminated_length": 14.5,
"epoch": 0.05524029528448752,
"frac_reward_zero_std": 0.9,
"grad_norm": 6.630038738250732,
"learning_rate": 1.9048270524660196e-07,
"loss": 0.0001,
"num_tokens": 6401355.0,
"reward": 1.2464791774749755,
"reward_std": 0.016750000603497028,
"rewards/combined_reward/mean": 1.2464791774749755,
"rewards/combined_reward/std": 0.43877428472042085,
"step": 550
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 108.4,
"completions/max_terminated_length": 108.4,
"completions/mean_length": 57.21875,
"completions/mean_terminated_length": 57.21875,
"completions/min_length": 22.1,
"completions/min_terminated_length": 22.1,
"epoch": 0.05624466428966002,
"frac_reward_zero_std": 0.95,
"grad_norm": 4.464468955993652,
"learning_rate": 1.9000787541722936e-07,
"loss": -0.0008,
"num_tokens": 6512806.0,
"reward": 1.3637500047683715,
"reward_std": 0.0056250004563480616,
"rewards/combined_reward/mean": 1.3637500047683715,
"rewards/combined_reward/std": 0.25516389338299633,
"step": 560
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 296.2,
"completions/max_terminated_length": 201.7,
"completions/mean_length": 86.75625,
"completions/mean_terminated_length": 75.22125091552735,
"completions/min_length": 24.9,
"completions/min_terminated_length": 24.9,
"epoch": 0.057249033294832524,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.8952210889191065e-07,
"loss": -0.0016,
"num_tokens": 6619515.0,
"reward": 1.3538541674613953,
"reward_std": 0.009270833618938924,
"rewards/combined_reward/mean": 1.3538541674613953,
"rewards/combined_reward/std": 0.35525577939115466,
"step": 570
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 104.1,
"completions/max_terminated_length": 104.1,
"completions/mean_length": 48.9,
"completions/mean_terminated_length": 48.9,
"completions/min_length": 16.2,
"completions/min_terminated_length": 16.2,
"epoch": 0.058253402300005024,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.890254646952674e-07,
"loss": 0.0,
"num_tokens": 6728163.0,
"reward": 1.2268749833106996,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.2268749833106996,
"rewards/combined_reward/std": 0.33372554890811446,
"step": 580
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 115.7,
"completions/max_terminated_length": 115.7,
"completions/mean_length": 61.34375,
"completions/mean_terminated_length": 61.34375,
"completions/min_length": 20.3,
"completions/min_terminated_length": 20.3,
"epoch": 0.05925777130517752,
"frac_reward_zero_std": 0.95,
"grad_norm": 3.076678991317749,
"learning_rate": 1.885180031736477e-07,
"loss": -0.0013,
"num_tokens": 6845358.0,
"reward": 1.3715885639190675,
"reward_std": 0.0037068985402584076,
"rewards/combined_reward/mean": 1.3715885639190675,
"rewards/combined_reward/std": 0.3188589945435524,
"step": 590
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 122.6,
"completions/max_terminated_length": 122.6,
"completions/mean_length": 55.81875,
"completions/mean_terminated_length": 55.81875,
"completions/min_length": 14.8,
"completions/min_terminated_length": 14.8,
"epoch": 0.06026214031035002,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.879997859877932e-07,
"loss": 0.0032,
"num_tokens": 6971649.0,
"reward": 1.280833327770233,
"reward_std": 0.0006132050417363644,
"rewards/combined_reward/mean": 1.280833327770233,
"rewards/combined_reward/std": 0.338599956035614,
"step": 600
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 387.6,
"completions/max_terminated_length": 192.1,
"completions/mean_length": 122.46875,
"completions/mean_terminated_length": 72.42708358764648,
"completions/min_length": 23.3,
"completions/min_terminated_length": 23.3,
"epoch": 0.06126650931552252,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.8747087610534734e-07,
"loss": 0.019,
"num_tokens": 7087600.0,
"reward": 1.338072907924652,
"reward_std": 0.013132144883275031,
"rewards/combined_reward/mean": 1.338072907924652,
"rewards/combined_reward/std": 0.30777021273970606,
"step": 610
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 112.7,
"completions/max_terminated_length": 112.7,
"completions/mean_length": 58.44375,
"completions/mean_terminated_length": 58.44375,
"completions/min_length": 23.9,
"completions/min_terminated_length": 23.9,
"epoch": 0.06227087832069502,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.8693133779320382e-07,
"loss": -0.0031,
"num_tokens": 7191467.0,
"reward": 1.3348880290985108,
"reward_std": 0.007124999910593033,
"rewards/combined_reward/mean": 1.3348880290985108,
"rewards/combined_reward/std": 0.2751554258167744,
"step": 620
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.8,
"completions/max_terminated_length": 194.8,
"completions/mean_length": 84.76875,
"completions/mean_terminated_length": 84.76875,
"completions/min_length": 21.4,
"completions/min_terminated_length": 21.4,
"epoch": 0.06327524732586752,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.8638123660969793e-07,
"loss": -0.0084,
"num_tokens": 7304146.0,
"reward": 1.3757467865943909,
"reward_std": 0.0030034731142222883,
"rewards/combined_reward/mean": 1.3757467865943909,
"rewards/combined_reward/std": 0.28882216811180117,
"step": 630
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 101.9,
"completions/max_terminated_length": 101.9,
"completions/mean_length": 56.925,
"completions/mean_terminated_length": 56.925,
"completions/min_length": 21.0,
"completions/min_terminated_length": 21.0,
"epoch": 0.06427961633104003,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.858206393966405e-07,
"loss": 0.0,
"num_tokens": 7415006.0,
"reward": 1.3215104341506958,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.3215104341506958,
"rewards/combined_reward/std": 0.33309968262910844,
"step": 640
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 106.5,
"completions/max_terminated_length": 106.5,
"completions/mean_length": 58.26875,
"completions/mean_terminated_length": 58.26875,
"completions/min_length": 14.2,
"completions/min_terminated_length": 14.2,
"epoch": 0.06528398533621252,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.8524961427119615e-07,
"loss": -0.009,
"num_tokens": 7546381.0,
"reward": 1.3129427313804627,
"reward_std": 0.002951054647564888,
"rewards/combined_reward/mean": 1.3129427313804627,
"rewards/combined_reward/std": 0.3575292468070984,
"step": 650
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 84.0,
"completions/max_terminated_length": 84.0,
"completions/mean_length": 46.75625,
"completions/mean_terminated_length": 46.75625,
"completions/min_length": 15.4,
"completions/min_terminated_length": 15.4,
"epoch": 0.06628835434138503,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.846682306176065e-07,
"loss": 0.0,
"num_tokens": 7668158.0,
"reward": 1.3184374928474427,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.3184374928474427,
"rewards/combined_reward/std": 0.35122168958187105,
"step": 660
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 101.7,
"completions/max_terminated_length": 101.7,
"completions/mean_length": 56.3375,
"completions/mean_terminated_length": 56.3375,
"completions/min_length": 17.9,
"completions/min_terminated_length": 17.9,
"epoch": 0.06729272334655753,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.8407655907875938e-07,
"loss": 0.0006,
"num_tokens": 7794644.0,
"reward": 1.331454861164093,
"reward_std": 0.007124999910593033,
"rewards/combined_reward/mean": 1.331454861164093,
"rewards/combined_reward/std": 0.3434182394295931,
"step": 670
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 135.7,
"completions/max_terminated_length": 135.7,
"completions/mean_length": 68.90625,
"completions/mean_terminated_length": 68.90625,
"completions/min_length": 16.1,
"completions/min_terminated_length": 16.1,
"epoch": 0.06829709235173002,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 1.8347467154760515e-07,
"loss": 0.0079,
"num_tokens": 7913933.0,
"reward": 1.3356944441795349,
"reward_std": 0.0053335148841142654,
"rewards/combined_reward/mean": 1.3356944441795349,
"rewards/combined_reward/std": 0.3590264985337853,
"step": 680
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 265.9,
"completions/max_terminated_length": 265.9,
"completions/mean_length": 91.5,
"completions/mean_terminated_length": 91.5,
"completions/min_length": 24.8,
"completions/min_terminated_length": 24.8,
"epoch": 0.06930146135690253,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.8286264115842114e-07,
"loss": 0.0017,
"num_tokens": 8033153.0,
"reward": 1.3431249916553498,
"reward_std": 0.0044791650027036665,
"rewards/combined_reward/mean": 1.3431249916553498,
"rewards/combined_reward/std": 0.3242304854094982,
"step": 690
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 73.2,
"completions/max_terminated_length": 73.2,
"completions/mean_length": 39.55625,
"completions/mean_terminated_length": 39.55625,
"completions/min_length": 17.7,
"completions/min_terminated_length": 17.7,
"epoch": 0.07030583036207502,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.8224054227792522e-07,
"loss": -0.003,
"num_tokens": 8147198.0,
"reward": 1.3440885424613953,
"reward_std": 0.0002604176523163915,
"rewards/combined_reward/mean": 1.3440885424613953,
"rewards/combined_reward/std": 0.3006736177019775,
"step": 700
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 123.6,
"completions/max_terminated_length": 123.6,
"completions/mean_length": 67.76875,
"completions/mean_terminated_length": 67.76875,
"completions/min_length": 25.8,
"completions/min_terminated_length": 25.8,
"epoch": 0.07131019936724753,
"frac_reward_zero_std": 0.975,
"grad_norm": 1.436936616897583,
"learning_rate": 1.816084504962396e-07,
"loss": 0.0009,
"num_tokens": 8248985.0,
"reward": 1.459496557712555,
"reward_std": 0.002500000596046448,
"rewards/combined_reward/mean": 1.459496557712555,
"rewards/combined_reward/std": 0.15663873171433806,
"step": 710
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 172.1,
"completions/max_terminated_length": 172.1,
"completions/mean_length": 76.96875,
"completions/mean_terminated_length": 76.96875,
"completions/min_length": 24.5,
"completions/min_terminated_length": 24.5,
"epoch": 0.07231456837242002,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.8096644261770608e-07,
"loss": 0.0179,
"num_tokens": 8373128.0,
"reward": 1.3943750143051148,
"reward_std": 0.005624998733401299,
"rewards/combined_reward/mean": 1.3943750143051148,
"rewards/combined_reward/std": 0.24296645894646646,
"step": 720
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 120.0,
"completions/max_terminated_length": 120.0,
"completions/mean_length": 60.15625,
"completions/mean_terminated_length": 60.15625,
"completions/min_length": 18.8,
"completions/min_terminated_length": 18.8,
"epoch": 0.07331893737759253,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.8031459665155363e-07,
"loss": -0.001,
"num_tokens": 8487649.0,
"reward": 1.4223046898841858,
"reward_std": 0.0001302093267440796,
"rewards/combined_reward/mean": 1.4223046898841858,
"rewards/combined_reward/std": 0.2848698660731316,
"step": 730
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 77.8,
"completions/max_terminated_length": 77.8,
"completions/mean_length": 45.84375,
"completions/mean_terminated_length": 45.84375,
"completions/min_length": 18.4,
"completions/min_terminated_length": 18.4,
"epoch": 0.07432330638276503,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.796529918024196e-07,
"loss": 0.0,
"num_tokens": 8603284.0,
"reward": 1.37947918176651,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.37947918176651,
"rewards/combined_reward/std": 0.27231944501399996,
"step": 740
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 204.9,
"completions/max_terminated_length": 204.9,
"completions/mean_length": 76.9375,
"completions/mean_terminated_length": 76.9375,
"completions/min_length": 18.3,
"completions/min_terminated_length": 18.3,
"epoch": 0.07532767538793753,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.7898170846072592e-07,
"loss": 0.0009,
"num_tokens": 8718758.0,
"reward": 1.32010418176651,
"reward_std": 0.002500000596046448,
"rewards/combined_reward/mean": 1.32010418176651,
"rewards/combined_reward/std": 0.34439257588237526,
"step": 750
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 148.0,
"completions/max_terminated_length": 148.0,
"completions/mean_length": 64.11875,
"completions/mean_terminated_length": 64.11875,
"completions/min_length": 16.4,
"completions/min_terminated_length": 16.4,
"epoch": 0.07633204439311003,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 1.783008281929106e-07,
"loss": -0.0051,
"num_tokens": 8833993.0,
"reward": 1.3178860425949097,
"reward_std": 0.016688717156648637,
"rewards/combined_reward/mean": 1.3178860425949097,
"rewards/combined_reward/std": 0.3388564258813858,
"step": 760
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 122.4,
"completions/max_terminated_length": 122.4,
"completions/mean_length": 62.99375,
"completions/mean_terminated_length": 62.99375,
"completions/min_length": 21.2,
"completions/min_terminated_length": 21.2,
"epoch": 0.07733641339828252,
"frac_reward_zero_std": 0.975,
"grad_norm": 1.1234172582626343,
"learning_rate": 1.7761043373151713e-07,
"loss": -0.0046,
"num_tokens": 8950896.0,
"reward": 1.3376388788223266,
"reward_std": 0.00034722290001809597,
"rewards/combined_reward/mean": 1.3376388788223266,
"rewards/combined_reward/std": 0.34661323949694633,
"step": 770
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 111.0,
"completions/max_terminated_length": 111.0,
"completions/mean_length": 56.3,
"completions/mean_terminated_length": 56.3,
"completions/min_length": 20.3,
"completions/min_terminated_length": 20.3,
"epoch": 0.07834078240345503,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.7691060896514168e-07,
"loss": -0.0003,
"num_tokens": 9071600.0,
"reward": 1.3996267199516297,
"reward_std": 0.002080751396715641,
"rewards/combined_reward/mean": 1.3996267199516297,
"rewards/combined_reward/std": 0.26108508543111386,
"step": 780
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 79.4,
"completions/max_terminated_length": 79.4,
"completions/mean_length": 45.76875,
"completions/mean_terminated_length": 45.76875,
"completions/min_length": 20.0,
"completions/min_terminated_length": 20.0,
"epoch": 0.07934515140862752,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.7620143892823975e-07,
"loss": -0.0062,
"num_tokens": 9174599.0,
"reward": 1.378697919845581,
"reward_std": 0.0003125001909211278,
"rewards/combined_reward/mean": 1.378697919845581,
"rewards/combined_reward/std": 0.2739857309497893,
"step": 790
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 98.3,
"completions/max_terminated_length": 98.3,
"completions/mean_length": 50.98125,
"completions/mean_terminated_length": 50.98125,
"completions/min_length": 19.2,
"completions/min_terminated_length": 19.2,
"epoch": 0.08034952041380003,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.7548300979079413e-07,
"loss": -0.0008,
"num_tokens": 9284796.0,
"reward": 1.368190097808838,
"reward_std": 0.004609373956918716,
"rewards/combined_reward/mean": 1.368190097808838,
"rewards/combined_reward/std": 0.25843119765631856,
"step": 800
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 80.0,
"completions/max_terminated_length": 80.0,
"completions/mean_length": 41.175,
"completions/mean_terminated_length": 41.175,
"completions/min_length": 12.8,
"completions/min_terminated_length": 12.8,
"epoch": 0.08135388941897254,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.7475540884784422e-07,
"loss": 0.0,
"num_tokens": 9398356.0,
"reward": 1.2378819465637207,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.2378819465637207,
"rewards/combined_reward/std": 0.3914600659161806,
"step": 810
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 96.3,
"completions/max_terminated_length": 96.3,
"completions/mean_length": 54.50625,
"completions/mean_terminated_length": 54.50625,
"completions/min_length": 19.6,
"completions/min_terminated_length": 19.6,
"epoch": 0.08235825842414503,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.7401872450887915e-07,
"loss": -0.0007,
"num_tokens": 9497821.0,
"reward": 1.3947187542915345,
"reward_std": 0.0015624999767169356,
"rewards/combined_reward/mean": 1.3947187542915345,
"rewards/combined_reward/std": 0.2990885377395898,
"step": 820
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 96.7,
"completions/max_terminated_length": 96.7,
"completions/mean_length": 49.1875,
"completions/mean_terminated_length": 49.1875,
"completions/min_length": 17.9,
"completions/min_terminated_length": 17.9,
"epoch": 0.08336262742931753,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.7327304628709528e-07,
"loss": 0.0,
"num_tokens": 9641355.0,
"reward": 1.3011458396911622,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.3011458396911622,
"rewards/combined_reward/std": 0.2698082665912807,
"step": 830
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 99.3,
"completions/max_terminated_length": 99.3,
"completions/mean_length": 54.9125,
"completions/mean_terminated_length": 54.9125,
"completions/min_length": 21.7,
"completions/min_terminated_length": 21.7,
"epoch": 0.08436699643449003,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.7251846478851951e-07,
"loss": 0.0083,
"num_tokens": 9759969.0,
"reward": 1.2925694584846497,
"reward_std": 0.0019245008006691933,
"rewards/combined_reward/mean": 1.2925694584846497,
"rewards/combined_reward/std": 0.26882885694503783,
"step": 840
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 168.1,
"completions/max_terminated_length": 168.1,
"completions/mean_length": 66.68125,
"completions/mean_terminated_length": 66.68125,
"completions/min_length": 19.3,
"completions/min_terminated_length": 19.3,
"epoch": 0.08537136543966253,
"frac_reward_zero_std": 0.925,
"grad_norm": 6.147635459899902,
"learning_rate": 1.7175507170100008e-07,
"loss": -0.0077,
"num_tokens": 9881310.0,
"reward": 1.2720364809036255,
"reward_std": 0.011238560592755676,
"rewards/combined_reward/mean": 1.2720364809036255,
"rewards/combined_reward/std": 0.31835093796253205,
"step": 850
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 91.0,
"completions/max_terminated_length": 91.0,
"completions/mean_length": 47.25,
"completions/mean_terminated_length": 47.25,
"completions/min_length": 23.2,
"completions/min_terminated_length": 23.2,
"epoch": 0.08637573444483503,
"frac_reward_zero_std": 0.95,
"grad_norm": 1.287226676940918,
"learning_rate": 1.7098295978306552e-07,
"loss": -0.012,
"num_tokens": 9981046.0,
"reward": 1.322606337070465,
"reward_std": 0.0022470591589808463,
"rewards/combined_reward/mean": 1.322606337070465,
"rewards/combined_reward/std": 0.3106359137222171,
"step": 860
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 91.9,
"completions/max_terminated_length": 91.9,
"completions/mean_length": 46.50625,
"completions/mean_terminated_length": 46.50625,
"completions/min_length": 13.9,
"completions/min_terminated_length": 13.9,
"epoch": 0.08738010345000753,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.7020222285265395e-07,
"loss": 0.0,
"num_tokens": 10089371.0,
"reward": 1.2643750071525575,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.2643750071525575,
"rewards/combined_reward/std": 0.4044176399707794,
"step": 870
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 217.1,
"completions/max_terminated_length": 217.1,
"completions/mean_length": 70.81875,
"completions/mean_terminated_length": 70.81875,
"completions/min_length": 18.9,
"completions/min_terminated_length": 18.9,
"epoch": 0.08838447245518004,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.6941295577571328e-07,
"loss": 0.0079,
"num_tokens": 10197254.0,
"reward": 1.309374988079071,
"reward_std": 0.002500000596046448,
"rewards/combined_reward/mean": 1.309374988079071,
"rewards/combined_reward/std": 0.325995758920908,
"step": 880
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 94.7,
"completions/max_terminated_length": 94.7,
"completions/mean_length": 53.04375,
"completions/mean_terminated_length": 53.04375,
"completions/min_length": 22.5,
"completions/min_terminated_length": 22.5,
"epoch": 0.08938884146035253,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.686152544546743e-07,
"loss": 0.0008,
"num_tokens": 10316525.0,
"reward": 1.3464062690734864,
"reward_std": 0.00416666641831398,
"rewards/combined_reward/mean": 1.3464062690734864,
"rewards/combined_reward/std": 0.2880703628063202,
"step": 890
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 97.0,
"completions/max_terminated_length": 97.0,
"completions/mean_length": 50.95625,
"completions/mean_terminated_length": 50.95625,
"completions/min_length": 16.8,
"completions/min_terminated_length": 16.8,
"epoch": 0.09039321046552504,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.6780921581679763e-07,
"loss": 0.0021,
"num_tokens": 10435242.0,
"reward": 1.2726041793823242,
"reward_std": 0.009523502597585321,
"rewards/combined_reward/mean": 1.2726041793823242,
"rewards/combined_reward/std": 0.33535852897912266,
"step": 900
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 104.1,
"completions/max_terminated_length": 104.1,
"completions/mean_length": 57.20625,
"completions/mean_terminated_length": 57.20625,
"completions/min_length": 19.4,
"completions/min_terminated_length": 19.4,
"epoch": 0.09139757947069753,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.6699493780239649e-07,
"loss": 0.0,
"num_tokens": 10548043.0,
"reward": 1.3535937666893005,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.3535937666893005,
"rewards/combined_reward/std": 0.33704030215740205,
"step": 910
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 107.5,
"completions/max_terminated_length": 107.5,
"completions/mean_length": 52.25,
"completions/mean_terminated_length": 52.25,
"completions/min_length": 16.2,
"completions/min_terminated_length": 16.2,
"epoch": 0.09240194847587004,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.6617251935293588e-07,
"loss": -0.0028,
"num_tokens": 10675027.0,
"reward": 1.3419270992279053,
"reward_std": 0.0015625,
"rewards/combined_reward/mean": 1.3419270992279053,
"rewards/combined_reward/std": 0.32070667631924155,
"step": 920
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 104.1,
"completions/max_terminated_length": 104.1,
"completions/mean_length": 58.05625,
"completions/mean_terminated_length": 58.05625,
"completions/min_length": 25.7,
"completions/min_terminated_length": 25.7,
"epoch": 0.09340631748104254,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.6534206039901054e-07,
"loss": 0.0,
"num_tokens": 10805048.0,
"reward": 1.4538020730018615,
"reward_std": 0.0005208343267440796,
"rewards/combined_reward/mean": 1.4538020730018615,
"rewards/combined_reward/std": 0.17151957787573338,
"step": 930
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 79.8,
"completions/max_terminated_length": 79.8,
"completions/mean_length": 39.75,
"completions/mean_terminated_length": 39.75,
"completions/min_length": 12.6,
"completions/min_terminated_length": 12.6,
"epoch": 0.09441068648621503,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.6450366184820256e-07,
"loss": 0.0,
"num_tokens": 10906272.0,
"reward": 1.258458322286606,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.258458322286606,
"rewards/combined_reward/std": 0.3260463088750839,
"step": 940
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 118.2,
"completions/max_terminated_length": 118.2,
"completions/mean_length": 61.65625,
"completions/mean_terminated_length": 61.65625,
"completions/min_length": 22.5,
"completions/min_terminated_length": 22.5,
"epoch": 0.09541505549138754,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.6365742557282017e-07,
"loss": 0.0091,
"num_tokens": 11023301.0,
"reward": 1.3930208325386046,
"reward_std": 0.0050495008006691934,
"rewards/combined_reward/mean": 1.3930208325386046,
"rewards/combined_reward/std": 0.30010328590869906,
"step": 950
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 105.4,
"completions/max_terminated_length": 105.4,
"completions/mean_length": 55.79375,
"completions/mean_terminated_length": 55.79375,
"completions/min_length": 23.8,
"completions/min_terminated_length": 23.8,
"epoch": 0.09641942449656003,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.6280345439751956e-07,
"loss": 0.0044,
"num_tokens": 11148588.0,
"reward": 1.3295885443687439,
"reward_std": 0.024523502215743065,
"rewards/combined_reward/mean": 1.3295885443687439,
"rewards/combined_reward/std": 0.2928910902235657,
"step": 960
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 121.7,
"completions/max_terminated_length": 121.7,
"completions/mean_length": 57.56875,
"completions/mean_terminated_length": 57.56875,
"completions/min_length": 14.2,
"completions/min_terminated_length": 14.2,
"epoch": 0.09742379350173254,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.6194185208681082e-07,
"loss": -0.0043,
"num_tokens": 11268271.0,
"reward": 1.2413020730018616,
"reward_std": 0.005312500335276127,
"rewards/combined_reward/mean": 1.2413020730018616,
"rewards/combined_reward/std": 0.3525692358613014,
"step": 970
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 117.1,
"completions/max_terminated_length": 117.1,
"completions/mean_length": 57.45625,
"completions/mean_terminated_length": 57.45625,
"completions/min_length": 19.1,
"completions/min_terminated_length": 19.1,
"epoch": 0.09842816250690503,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.610727233324495e-07,
"loss": 0.0,
"num_tokens": 11388376.0,
"reward": 1.2743749976158143,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.2743749976158143,
"rewards/combined_reward/std": 0.2959941983222961,
"step": 980
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 99.0,
"completions/max_terminated_length": 99.0,
"completions/mean_length": 51.1875,
"completions/mean_terminated_length": 51.1875,
"completions/min_length": 15.9,
"completions/min_terminated_length": 15.9,
"epoch": 0.09943253151207754,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.6019617374071597e-07,
"loss": 0.0001,
"num_tokens": 11503346.0,
"reward": 1.3223437547683716,
"reward_std": 0.0028867511078715324,
"rewards/combined_reward/mean": 1.3223437547683716,
"rewards/combined_reward/std": 0.37292833551764487,
"step": 990
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 146.8,
"completions/max_terminated_length": 146.8,
"completions/mean_length": 64.61875,
"completions/mean_terminated_length": 64.61875,
"completions/min_length": 23.9,
"completions/min_terminated_length": 23.9,
"epoch": 0.10043690051725004,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 1.5931230981958326e-07,
"loss": 0.0,
"num_tokens": 11600585.0,
"reward": 1.3246874928474426,
"reward_std": 0.0,
"rewards/combined_reward/mean": 1.3246874928474426,
"rewards/combined_reward/std": 0.23927139891311527,
"step": 1000
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 118.5,
"completions/max_terminated_length": 118.5,
"completions/mean_length": 65.5,
"completions/mean_terminated_length": 65.5,
"completions/min_length": 19.9,
"completions/min_terminated_length": 19.9,
"epoch": 0.10144126952242254,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.5842123896577543e-07,
"loss": -0.0036,
"num_tokens": 11737513.0,
"reward": 1.4228541851043701,
"reward_std": 0.001154701132327318,
"rewards/combined_reward/mean": 1.4228541851043701,
"rewards/combined_reward/std": 0.25313766626641154,
"step": 1010
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 311.5,
"completions/max_terminated_length": 224.3,
"completions/mean_length": 90.28125,
"completions/mean_terminated_length": 54.49903869628906,
"completions/min_length": 12.0,
"completions/min_terminated_length": 12.0,
"epoch": 0.10244563852759504,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.5752306945171818e-07,
"loss": -0.0115,
"num_tokens": 11875626.0,
"reward": 1.2103593707084657,
"reward_std": 0.004468750953674316,
"rewards/combined_reward/mean": 1.2103593707084657,
"rewards/combined_reward/std": 0.40379793345928194,
"step": 1020
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 128.9,
"completions/max_terminated_length": 128.9,
"completions/mean_length": 59.56875,
"completions/mean_terminated_length": 59.56875,
"completions/min_length": 15.4,
"completions/min_terminated_length": 15.4,
"epoch": 0.10345000753276754,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.5661791041238254e-07,
"loss": 0.0054,
"num_tokens": 11995581.0,
"reward": 1.3099791407585144,
"reward_std": 0.00020833313465118408,
"rewards/combined_reward/mean": 1.3099791407585144,
"rewards/combined_reward/std": 0.33452749061398207,
"step": 1030
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 343.2,
"completions/max_terminated_length": 228.1,
"completions/mean_length": 114.825,
"completions/mean_terminated_length": 78.1860580444336,
"completions/min_length": 25.7,
"completions/min_terminated_length": 25.7,
"epoch": 0.10445437653794004,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.5570587183202433e-07,
"loss": -0.0099,
"num_tokens": 12114797.0,
"reward": 1.2818815290927887,
"reward_std": 0.0018619796261191367,
"rewards/combined_reward/mean": 1.2818815290927887,
"rewards/combined_reward/std": 0.31765228807926177,
"step": 1040
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 113.8,
"completions/max_terminated_length": 113.8,
"completions/mean_length": 55.68125,
"completions/mean_terminated_length": 55.68125,
"completions/min_length": 16.7,
"completions/min_terminated_length": 16.7,
"epoch": 0.10545874554311253,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.5478706453082016e-07,
"loss": -0.0016,
"num_tokens": 12246978.0,
"reward": 1.3307923913002013,
"reward_std": 0.0002604176523163915,
"rewards/combined_reward/mean": 1.3307923913002013,
"rewards/combined_reward/std": 0.3518651008605957,
"step": 1050
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 144.1,
"completions/max_terminated_length": 144.1,
"completions/mean_length": 69.0,
"completions/mean_terminated_length": 69.0,
"completions/min_length": 17.6,
"completions/min_terminated_length": 17.6,
"epoch": 0.10646311454828504,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.5386160015140167e-07,
"loss": 0.0061,
"num_tokens": 12363690.0,
"reward": 1.3816666841506957,
"reward_std": 0.00692450013011694,
"rewards/combined_reward/mean": 1.3816666841506957,
"rewards/combined_reward/std": 0.2784981057047844,
"step": 1060
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 94.3,
"completions/max_terminated_length": 94.3,
"completions/mean_length": 49.63125,
"completions/mean_terminated_length": 49.63125,
"completions/min_length": 13.9,
"completions/min_terminated_length": 13.9,
"epoch": 0.10746748355345755,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.5292959114529024e-07,
"loss": 0.0011,
"num_tokens": 12481815.0,
"reward": 1.3338541746139527,
"reward_std": 0.002886752039194107,
"rewards/combined_reward/mean": 1.3338541746139527,
"rewards/combined_reward/std": 0.3240374196320772,
"step": 1070
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 97.0,
"completions/max_terminated_length": 97.0,
"completions/mean_length": 49.3375,
"completions/mean_terminated_length": 49.3375,
"completions/min_length": 19.6,
"completions/min_terminated_length": 19.6,
"epoch": 0.10847185255863004,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.5199115075923323e-07,
"loss": -0.0008,
"num_tokens": 12604637.0,
"reward": 1.2796875,
"reward_std": 0.0003608435858041048,
"rewards/combined_reward/mean": 1.2796875,
"rewards/combined_reward/std": 0.3038814663887024,
"step": 1080
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 313.5,
"completions/max_terminated_length": 115.4,
"completions/mean_length": 112.05,
"completions/mean_terminated_length": 61.88333358764648,
"completions/min_length": 20.3,
"completions/min_terminated_length": 20.3,
"epoch": 0.10947622156380254,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.5104639302144326e-07,
"loss": 0.0052,
"num_tokens": 12735697.0,
"reward": 1.342291682958603,
"reward_std": 0.0007216888945549727,
"rewards/combined_reward/mean": 1.342291682958603,
"rewards/combined_reward/std": 0.31657470017671585,
"step": 1090
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 127.9,
"completions/max_terminated_length": 127.9,
"completions/mean_length": 61.70625,
"completions/mean_terminated_length": 61.70625,
"completions/min_length": 17.8,
"completions/min_terminated_length": 17.8,
"epoch": 0.11048059056897504,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.5009543272774323e-07,
"loss": 0.0029,
"num_tokens": 12842590.0,
"reward": 1.3991406440734864,
"reward_std": 0.000572918844409287,
"rewards/combined_reward/mean": 1.3991406440734864,
"rewards/combined_reward/std": 0.27981497598811983,
"step": 1100
}
],
"logging_steps": 10,
"max_steps": 3000,
"num_input_tokens_seen": 12842590,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}