Qwen3-1.7B-FC / trainer_state.json

Upload Qwen3-1.7B-FC model (RLVR fine-tuned for function calling)

02cd7b9 verified 19 days ago

100 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.11048059056897504,
	"eval_steps": 500,
	"global_step": 1100,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 459.2,
	"completions/max_terminated_length": 272.7,
	"completions/mean_length": 76.24375,
	"completions/mean_terminated_length": 64.11458358764648,
	"completions/min_length": 16.8,
	"completions/min_terminated_length": 16.8,
	"epoch": 0.0010043690051725004,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 1.1999999999999998e-08,
	"loss": 0.0208,
	"num_tokens": 108131.0,
	"reward": 1.2312812566757203,
	"reward_std": 0.05931956073036417,
	"rewards/combined_reward/mean": 1.2312812566757203,
	"rewards/combined_reward/std": 0.4361365109682083,
	"step": 10
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 330.9,
	"completions/max_terminated_length": 147.6,
	"completions/mean_length": 75.425,
	"completions/mean_terminated_length": 61.425418090820315,
	"completions/min_length": 13.0,
	"completions/min_terminated_length": 13.0,
	"epoch": 0.002008738010345001,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 2.5333333333333335e-08,
	"loss": 0.0279,
	"num_tokens": 233579.0,
	"reward": 1.3428645849227905,
	"reward_std": 0.029872814007103444,
	"rewards/combined_reward/mean": 1.3428645849227905,
	"rewards/combined_reward/std": 0.3860916443169117,
	"step": 20
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 110.6,
	"completions/max_terminated_length": 110.6,
	"completions/mean_length": 51.04375,
	"completions/mean_terminated_length": 51.04375,
	"completions/min_length": 16.7,
	"completions/min_terminated_length": 16.7,
	"epoch": 0.003013107015517501,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 3.3646392822265625,
	"learning_rate": 3.866666666666666e-08,
	"loss": -0.0132,
	"num_tokens": 352258.0,
	"reward": 1.323312509059906,
	"reward_std": 0.05337500050663948,
	"rewards/combined_reward/mean": 1.323312509059906,
	"rewards/combined_reward/std": 0.39539981335401536,
	"step": 30
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 307.1,
	"completions/max_terminated_length": 211.8,
	"completions/mean_length": 95.83125,
	"completions/mean_terminated_length": 60.24375,
	"completions/min_length": 11.5,
	"completions/min_terminated_length": 11.5,
	"epoch": 0.004017476020690002,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 5.2e-08,
	"loss": 0.0143,
	"num_tokens": 485155.0,
	"reward": 1.2628658890724183,
	"reward_std": 0.03280075653456151,
	"rewards/combined_reward/mean": 1.2628658890724183,
	"rewards/combined_reward/std": 0.4110621690750122,
	"step": 40
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 140.5,
	"completions/max_terminated_length": 140.5,
	"completions/mean_length": 61.7875,
	"completions/mean_terminated_length": 61.7875,
	"completions/min_length": 23.7,
	"completions/min_terminated_length": 23.7,
	"epoch": 0.005021845025862502,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 1.8525996208190918,
	"learning_rate": 6.533333333333332e-08,
	"loss": 0.0147,
	"num_tokens": 607629.0,
	"reward": 1.3795833349227906,
	"reward_std": 0.00583496168255806,
	"rewards/combined_reward/mean": 1.3795833349227906,
	"rewards/combined_reward/std": 0.30837071537971494,
	"step": 50
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 338.5,
	"completions/max_terminated_length": 238.4,
	"completions/mean_length": 102.60625,
	"completions/mean_terminated_length": 91.32791748046876,
	"completions/min_length": 21.6,
	"completions/min_terminated_length": 21.6,
	"epoch": 0.006026214031035002,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 2.992983818054199,
	"learning_rate": 7.866666666666666e-08,
	"loss": 0.0045,
	"num_tokens": 728802.0,
	"reward": 1.3164896011352538,
	"reward_std": 0.02619450243655592,
	"rewards/combined_reward/mean": 1.3164896011352538,
	"rewards/combined_reward/std": 0.3474510669708252,
	"step": 60
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 128.0,
	"completions/max_terminated_length": 128.0,
	"completions/mean_length": 62.81875,
	"completions/mean_terminated_length": 61.769583511352536,
	"completions/min_length": 20.2,
	"completions/min_terminated_length": 20.2,
	"epoch": 0.007030583036207502,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 9.2e-08,
	"loss": 0.0098,
	"num_tokens": 836341.0,
	"reward": 1.355798614025116,
	"reward_std": 0.004375000763684511,
	"rewards/combined_reward/mean": 1.355798614025116,
	"rewards/combined_reward/std": 0.29267608374357224,
	"step": 70
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 110.2,
	"completions/max_terminated_length": 110.2,
	"completions/mean_length": 54.3375,
	"completions/mean_terminated_length": 54.3375,
	"completions/min_length": 24.0,
	"completions/min_terminated_length": 24.0,
	"epoch": 0.008034952041380003,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 13.820528984069824,
	"learning_rate": 1.0533333333333332e-07,
	"loss": 0.0119,
	"num_tokens": 945703.0,
	"reward": 1.4564843893051147,
	"reward_std": 0.003906251955777406,
	"rewards/combined_reward/mean": 1.4564843893051147,
	"rewards/combined_reward/std": 0.1776508768554777,
	"step": 80
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 201.9,
	"completions/max_terminated_length": 201.9,
	"completions/mean_length": 70.6125,
	"completions/mean_terminated_length": 70.6125,
	"completions/min_length": 21.5,
	"completions/min_terminated_length": 21.5,
	"epoch": 0.009039321046552503,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.1866666666666667e-07,
	"loss": 0.0195,
	"num_tokens": 1062961.0,
	"reward": 1.3238854348659514,
	"reward_std": 0.005562501423992216,
	"rewards/combined_reward/mean": 1.3238854348659514,
	"rewards/combined_reward/std": 0.22054901346564293,
	"step": 90
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 113.7,
	"completions/max_terminated_length": 113.7,
	"completions/mean_length": 60.275,
	"completions/mean_terminated_length": 60.275,
	"completions/min_length": 24.1,
	"completions/min_terminated_length": 24.1,
	"epoch": 0.010043690051725004,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 1.32e-07,
	"loss": 0.0058,
	"num_tokens": 1175365.0,
	"reward": 1.4070937514305115,
	"reward_std": 0.034517763555049895,
	"rewards/combined_reward/mean": 1.4070937514305115,
	"rewards/combined_reward/std": 0.26661672741174697,
	"step": 100
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 158.5,
	"completions/max_terminated_length": 158.5,
	"completions/mean_length": 65.46875,
	"completions/mean_terminated_length": 64.41750030517578,
	"completions/min_length": 19.1,
	"completions/min_terminated_length": 19.1,
	"epoch": 0.011048059056897505,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.4533333333333334e-07,
	"loss": 0.0019,
	"num_tokens": 1288772.0,
	"reward": 1.2793750286102294,
	"reward_std": 0.0024999996647238733,
	"rewards/combined_reward/mean": 1.2793750286102294,
	"rewards/combined_reward/std": 0.31086390763521193,
	"step": 110
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 322.2,
	"completions/max_terminated_length": 134.1,
	"completions/mean_length": 77.01875,
	"completions/mean_terminated_length": 64.77458343505859,
	"completions/min_length": 20.7,
	"completions/min_terminated_length": 20.7,
	"epoch": 0.012052428062070004,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 8.276171684265137,
	"learning_rate": 1.5866666666666666e-07,
	"loss": 0.0134,
	"num_tokens": 1403035.0,
	"reward": 1.3504362106323242,
	"reward_std": 0.030459362699184568,
	"rewards/combined_reward/mean": 1.3504362106323242,
	"rewards/combined_reward/std": 0.309928272664547,
	"step": 120
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 102.1,
	"completions/max_terminated_length": 102.1,
	"completions/mean_length": 61.0625,
	"completions/mean_terminated_length": 61.0625,
	"completions/min_length": 31.5,
	"completions/min_terminated_length": 31.5,
	"epoch": 0.013056797067242505,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.7199999999999998e-07,
	"loss": -0.0027,
	"num_tokens": 1524697.0,
	"reward": 1.361527794599533,
	"reward_std": 0.008749999664723873,
	"rewards/combined_reward/mean": 1.361527794599533,
	"rewards/combined_reward/std": 0.2736371263861656,
	"step": 130
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 123.4,
	"completions/max_terminated_length": 123.4,
	"completions/mean_length": 58.05,
	"completions/mean_terminated_length": 58.05,
	"completions/min_length": 16.8,
	"completions/min_terminated_length": 16.8,
	"epoch": 0.014061166072415004,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 3.9411776065826416,
	"learning_rate": 1.8533333333333333e-07,
	"loss": 0.0062,
	"num_tokens": 1622389.0,
	"reward": 1.3123229265213012,
	"reward_std": 0.03212499991059303,
	"rewards/combined_reward/mean": 1.3123229265213012,
	"rewards/combined_reward/std": 0.35334871551021935,
	"step": 140
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 333.1,
	"completions/max_terminated_length": 135.6,
	"completions/mean_length": 111.125,
	"completions/mean_terminated_length": 61.191666793823245,
	"completions/min_length": 21.8,
	"completions/min_terminated_length": 21.8,
	"epoch": 0.015065535077587506,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9866666666666665e-07,
	"loss": 0.0039,
	"num_tokens": 1734901.0,
	"reward": 1.2678720355033875,
	"reward_std": 0.0006250014062970877,
	"rewards/combined_reward/mean": 1.2678720355033875,
	"rewards/combined_reward/std": 0.2531693406403065,
	"step": 150
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 85.7,
	"completions/max_terminated_length": 85.7,
	"completions/mean_length": 48.81875,
	"completions/mean_terminated_length": 48.81875,
	"completions/min_length": 17.9,
	"completions/min_terminated_length": 17.9,
	"epoch": 0.016069904082760007,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9999507890797406e-07,
	"loss": 0.0046,
	"num_tokens": 1847536.0,
	"reward": 1.345395851135254,
	"reward_std": 0.0016666671261191368,
	"rewards/combined_reward/mean": 1.345395851135254,
	"rewards/combined_reward/std": 0.29257251909002663,
	"step": 160
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 467.7,
	"completions/max_terminated_length": 277.7,
	"completions/mean_length": 144.23125,
	"completions/mean_terminated_length": 95.81041717529297,
	"completions/min_length": 28.9,
	"completions/min_terminated_length": 28.9,
	"epoch": 0.017074273087932506,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.9997806834748455e-07,
	"loss": -0.0018,
	"num_tokens": 1970837.0,
	"reward": 1.3027083039283753,
	"reward_std": 0.004424501396715641,
	"rewards/combined_reward/mean": 1.3027083039283753,
	"rewards/combined_reward/std": 0.4294335596263409,
	"step": 170
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 99.5,
	"completions/max_terminated_length": 99.5,
	"completions/mean_length": 50.44375,
	"completions/mean_terminated_length": 50.44375,
	"completions/min_length": 14.6,
	"completions/min_terminated_length": 14.6,
	"epoch": 0.018078642093105005,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9994890963073946e-07,
	"loss": 0.0059,
	"num_tokens": 2088820.0,
	"reward": 1.2765364408493043,
	"reward_std": 0.00015624959487468005,
	"rewards/combined_reward/mean": 1.2765364408493043,
	"rewards/combined_reward/std": 0.3481216669082642,
	"step": 180
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 170.0,
	"completions/max_terminated_length": 170.0,
	"completions/mean_length": 67.0,
	"completions/mean_terminated_length": 67.0,
	"completions/min_length": 15.8,
	"completions/min_terminated_length": 15.8,
	"epoch": 0.019083011098277508,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.9990760630076236e-07,
	"loss": -0.0197,
	"num_tokens": 2217116.0,
	"reward": 1.3771250247955322,
	"reward_std": 0.001916667865589261,
	"rewards/combined_reward/mean": 1.3771250247955322,
	"rewards/combined_reward/std": 0.29997652024030685,
	"step": 190
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 99.2,
	"completions/max_terminated_length": 99.2,
	"completions/mean_length": 41.91875,
	"completions/mean_terminated_length": 41.91875,
	"completions/min_length": 12.8,
	"completions/min_terminated_length": 12.8,
	"epoch": 0.020087380103450007,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 3.989150047302246,
	"learning_rate": 1.99854163376247e-07,
	"loss": 0.0011,
	"num_tokens": 2329863.0,
	"reward": 1.1117187559604644,
	"reward_std": 0.02916821506805718,
	"rewards/combined_reward/mean": 1.1117187559604644,
	"rewards/combined_reward/std": 0.37413454949855807,
	"step": 200
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 408.0,
	"completions/max_terminated_length": 220.7,
	"completions/mean_length": 133.575,
	"completions/mean_terminated_length": 84.2875,
	"completions/min_length": 25.0,
	"completions/min_terminated_length": 25.0,
	"epoch": 0.021091749108622507,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 1.9978858735094754e-07,
	"loss": 0.0285,
	"num_tokens": 2457743.0,
	"reward": 1.3693958520889282,
	"reward_std": 0.004563984216656536,
	"rewards/combined_reward/mean": 1.3693958520889282,
	"rewards/combined_reward/std": 0.33579447590745987,
	"step": 210
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 115.4,
	"completions/max_terminated_length": 115.4,
	"completions/mean_length": 60.24375,
	"completions/mean_terminated_length": 60.24375,
	"completions/min_length": 20.8,
	"completions/min_terminated_length": 20.8,
	"epoch": 0.02209611811379501,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.9971088619288948e-07,
	"loss": 0.0,
	"num_tokens": 2581282.0,
	"reward": 1.284375011920929,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.284375011920929,
	"rewards/combined_reward/std": 0.3291483834385872,
	"step": 220
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 110.9,
	"completions/max_terminated_length": 110.9,
	"completions/mean_length": 52.08125,
	"completions/mean_terminated_length": 51.73625030517578,
	"completions/min_length": 15.5,
	"completions/min_terminated_length": 15.5,
	"epoch": 0.02310048711896751,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.996210693434016e-07,
	"loss": 0.0,
	"num_tokens": 2716695.0,
	"reward": 1.3078229188919068,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.3078229188919068,
	"rewards/combined_reward/std": 0.3146174341440201,
	"step": 230
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 316.8,
	"completions/max_terminated_length": 296.7,
	"completions/mean_length": 106.325,
	"completions/mean_terminated_length": 71.55961608886719,
	"completions/min_length": 20.0,
	"completions/min_terminated_length": 20.0,
	"epoch": 0.024104856124140008,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.9951914771596858e-07,
	"loss": 0.0,
	"num_tokens": 2820347.0,
	"reward": 1.2994583308696748,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.2994583308696748,
	"rewards/combined_reward/std": 0.35011555850505827,
	"step": 240
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0125,
	"completions/max_length": 105.0,
	"completions/max_terminated_length": 105.0,
	"completions/mean_length": 58.80625,
	"completions/mean_terminated_length": 57.67589340209961,
	"completions/min_length": 19.0,
	"completions/min_terminated_length": 19.0,
	"epoch": 0.02510922512931251,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.9940513369490513e-07,
	"loss": 0.0119,
	"num_tokens": 2937640.0,
	"reward": 1.2942708253860473,
	"reward_std": 0.0020473659737035633,
	"rewards/combined_reward/mean": 1.2942708253860473,
	"rewards/combined_reward/std": 0.34473495446145536,
	"step": 250
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 136.2,
	"completions/max_terminated_length": 136.2,
	"completions/mean_length": 68.56875,
	"completions/mean_terminated_length": 68.56875,
	"completions/min_length": 27.3,
	"completions/min_terminated_length": 27.3,
	"epoch": 0.02611359413448501,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.9927904113385096e-07,
	"loss": 0.0134,
	"num_tokens": 3051799.0,
	"reward": 1.3380468726158141,
	"reward_std": 0.00270459558814764,
	"rewards/combined_reward/mean": 1.3380468726158141,
	"rewards/combined_reward/std": 0.28382683396339414,
	"step": 260
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 172.2,
	"completions/max_terminated_length": 172.2,
	"completions/mean_length": 72.875,
	"completions/mean_terminated_length": 72.875,
	"completions/min_length": 26.4,
	"completions/min_terminated_length": 26.4,
	"epoch": 0.02711796313965751,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.9914088535408765e-07,
	"loss": -0.0019,
	"num_tokens": 3164803.0,
	"reward": 1.4464478969573975,
	"reward_std": 0.0021736113354563712,
	"rewards/combined_reward/mean": 1.4464478969573975,
	"rewards/combined_reward/std": 0.19929498732089995,
	"step": 270
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 140.5,
	"completions/max_terminated_length": 140.5,
	"completions/mean_length": 59.38125,
	"completions/mean_terminated_length": 59.38125,
	"completions/min_length": 15.9,
	"completions/min_terminated_length": 15.9,
	"epoch": 0.02812233214483001,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9899068314267685e-07,
	"loss": 0.001,
	"num_tokens": 3280220.0,
	"reward": 1.3454687356948853,
	"reward_std": 0.004999999329447747,
	"rewards/combined_reward/mean": 1.3454687356948853,
	"rewards/combined_reward/std": 0.31286893486976625,
	"step": 280
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 94.7,
	"completions/max_terminated_length": 94.7,
	"completions/mean_length": 55.0,
	"completions/mean_terminated_length": 55.0,
	"completions/min_length": 22.0,
	"completions/min_terminated_length": 22.0,
	"epoch": 0.029126701150002512,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.9882845275042067e-07,
	"loss": 0.0065,
	"num_tokens": 3385228.0,
	"reward": 1.4142057299613953,
	"reward_std": 0.00044270951766520736,
	"rewards/combined_reward/mean": 1.4142057299613953,
	"rewards/combined_reward/std": 0.20944447480142117,
	"step": 290
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 173.3,
	"completions/max_terminated_length": 173.3,
	"completions/mean_length": 76.13125,
	"completions/mean_terminated_length": 76.13125,
	"completions/min_length": 23.4,
	"completions/min_terminated_length": 23.4,
	"epoch": 0.03013107015517501,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9865421388964382e-07,
	"loss": -0.0017,
	"num_tokens": 3496189.0,
	"reward": 1.3910624980926514,
	"reward_std": 0.0021650632843375206,
	"rewards/combined_reward/mean": 1.3910624980926514,
	"rewards/combined_reward/std": 0.28597628474235537,
	"step": 300
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 315.4,
	"completions/max_terminated_length": 315.4,
	"completions/mean_length": 99.93125,
	"completions/mean_terminated_length": 99.93125,
	"completions/min_length": 18.4,
	"completions/min_terminated_length": 18.4,
	"epoch": 0.03113543916034751,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 3.8047702312469482,
	"learning_rate": 1.9846798773179865e-07,
	"loss": 0.0118,
	"num_tokens": 3602282.0,
	"reward": 1.2963680744171142,
	"reward_std": 0.01609460562467575,
	"rewards/combined_reward/mean": 1.2963680744171142,
	"rewards/combined_reward/std": 0.3926819786429405,
	"step": 310
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 103.8,
	"completions/max_terminated_length": 103.8,
	"completions/mean_length": 52.2875,
	"completions/mean_terminated_length": 52.2875,
	"completions/min_length": 20.6,
	"completions/min_terminated_length": 20.6,
	"epoch": 0.03213980816552001,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9826979690489249e-07,
	"loss": 0.0014,
	"num_tokens": 3717904.0,
	"reward": 1.403697907924652,
	"reward_std": 0.0003125001909211278,
	"rewards/combined_reward/mean": 1.403697907924652,
	"rewards/combined_reward/std": 0.24410614371299744,
	"step": 320
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 79.8,
	"completions/max_terminated_length": 79.8,
	"completions/mean_length": 44.49375,
	"completions/mean_terminated_length": 44.49375,
	"completions/min_length": 16.1,
	"completions/min_terminated_length": 16.1,
	"epoch": 0.03314417717069251,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 1.9805966549073822e-07,
	"loss": 0.0057,
	"num_tokens": 3825867.0,
	"reward": 1.3135937452316284,
	"reward_std": 0.007812501117587089,
	"rewards/combined_reward/mean": 1.3135937452316284,
	"rewards/combined_reward/std": 0.3756252348423004,
	"step": 330
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 117.9,
	"completions/max_terminated_length": 117.9,
	"completions/mean_length": 54.15,
	"completions/mean_terminated_length": 54.15,
	"completions/min_length": 15.8,
	"completions/min_terminated_length": 15.8,
	"epoch": 0.03414854617586501,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9783761902202812e-07,
	"loss": 0.0067,
	"num_tokens": 3942087.0,
	"reward": 1.290208351612091,
	"reward_std": 0.0010206203907728196,
	"rewards/combined_reward/mean": 1.290208351612091,
	"rewards/combined_reward/std": 0.27491325289011004,
	"step": 340
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 89.2,
	"completions/max_terminated_length": 89.2,
	"completions/mean_length": 45.46875,
	"completions/mean_terminated_length": 45.46875,
	"completions/min_length": 12.9,
	"completions/min_terminated_length": 12.9,
	"epoch": 0.03515291518103751,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.9760368447923143e-07,
	"loss": 0.0,
	"num_tokens": 4077218.0,
	"reward": 1.271875011920929,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.271875011920929,
	"rewards/combined_reward/std": 0.3903637401759624,
	"step": 350
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 86.1,
	"completions/max_terminated_length": 86.1,
	"completions/mean_length": 47.9125,
	"completions/mean_terminated_length": 47.9125,
	"completions/min_length": 19.1,
	"completions/min_terminated_length": 19.1,
	"epoch": 0.03615728418621001,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 5.847682952880859,
	"learning_rate": 1.9735789028731602e-07,
	"loss": -0.0023,
	"num_tokens": 4189144.0,
	"reward": 1.3238541960716248,
	"reward_std": 0.03020833432674408,
	"rewards/combined_reward/mean": 1.3238541960716248,
	"rewards/combined_reward/std": 0.32445888966321945,
	"step": 360
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 103.3,
	"completions/max_terminated_length": 103.3,
	"completions/mean_length": 55.5,
	"completions/mean_terminated_length": 55.5,
	"completions/min_length": 20.6,
	"completions/min_terminated_length": 20.6,
	"epoch": 0.03716165319138252,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.9710026631229448e-07,
	"loss": 0.0001,
	"num_tokens": 4294100.0,
	"reward": 1.3909027934074403,
	"reward_std": 0.00034722290001809597,
	"rewards/combined_reward/mean": 1.3909027934074403,
	"rewards/combined_reward/std": 0.2816110193729401,
	"step": 370
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 119.4,
	"completions/max_terminated_length": 119.4,
	"completions/mean_length": 57.65625,
	"completions/mean_terminated_length": 57.65625,
	"completions/min_length": 19.0,
	"completions/min_terminated_length": 19.0,
	"epoch": 0.038166022196555016,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9683084385759522e-07,
	"loss": -0.0002,
	"num_tokens": 4400477.0,
	"reward": 1.333958351612091,
	"reward_std": 0.0012500007636845113,
	"rewards/combined_reward/mean": 1.333958351612091,
	"rewards/combined_reward/std": 0.2801030218601227,
	"step": 380
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 112.6,
	"completions/max_terminated_length": 112.6,
	"completions/mean_length": 55.225,
	"completions/mean_terminated_length": 55.225,
	"completions/min_length": 18.2,
	"completions/min_terminated_length": 18.2,
	"epoch": 0.039170391201727515,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 4.480510234832764,
	"learning_rate": 1.9654965566025878e-07,
	"loss": 0.006,
	"num_tokens": 4516865.0,
	"reward": 1.370369803905487,
	"reward_std": 0.002187502384185791,
	"rewards/combined_reward/mean": 1.370369803905487,
	"rewards/combined_reward/std": 0.27093904092907906,
	"step": 390
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 139.2,
	"completions/max_terminated_length": 139.2,
	"completions/mean_length": 55.54375,
	"completions/mean_terminated_length": 55.54375,
	"completions/min_length": 12.2,
	"completions/min_terminated_length": 12.2,
	"epoch": 0.040174760206900015,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.9625673588696007e-07,
	"loss": 0.0,
	"num_tokens": 4634776.0,
	"reward": 1.2619999647140503,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.2619999647140503,
	"rewards/combined_reward/std": 0.3673270642757416,
	"step": 400
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 106.5,
	"completions/max_terminated_length": 106.5,
	"completions/mean_length": 52.2875,
	"completions/mean_terminated_length": 52.2875,
	"completions/min_length": 13.1,
	"completions/min_terminated_length": 13.1,
	"epoch": 0.041179129212072514,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 5.624104022979736,
	"learning_rate": 1.959521201298568e-07,
	"loss": 0.0061,
	"num_tokens": 4766894.0,
	"reward": 1.3308506846427917,
	"reward_std": 0.003342500701546669,
	"rewards/combined_reward/mean": 1.3308506846427917,
	"rewards/combined_reward/std": 0.37019643262028695,
	"step": 410
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 144.9,
	"completions/max_terminated_length": 144.9,
	"completions/mean_length": 63.63125,
	"completions/mean_terminated_length": 63.63125,
	"completions/min_length": 18.3,
	"completions/min_terminated_length": 18.3,
	"epoch": 0.042183498217245013,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.956358454022648e-07,
	"loss": -0.0011,
	"num_tokens": 4887883.0,
	"reward": 1.3249478936195374,
	"reward_std": 0.016550703253597022,
	"rewards/combined_reward/mean": 1.3249478936195374,
	"rewards/combined_reward/std": 0.31248683035373687,
	"step": 420
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 70.8,
	"completions/max_terminated_length": 70.8,
	"completions/mean_length": 40.03125,
	"completions/mean_terminated_length": 40.03125,
	"completions/min_length": 21.5,
	"completions/min_terminated_length": 21.5,
	"epoch": 0.04318786722241751,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9530795013416043e-07,
	"loss": -0.0062,
	"num_tokens": 5017432.0,
	"reward": 1.2040624856948852,
	"reward_std": 0.003125,
	"rewards/combined_reward/mean": 1.2040624856948852,
	"rewards/combined_reward/std": 0.28724531903862954,
	"step": 430
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0125,
	"completions/max_length": 95.9,
	"completions/max_terminated_length": 95.9,
	"completions/mean_length": 47.64375,
	"completions/mean_terminated_length": 46.64416732788086,
	"completions/min_length": 14.4,
	"completions/min_terminated_length": 14.4,
	"epoch": 0.04419223622759002,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.9496847416751122e-07,
	"loss": -0.0055,
	"num_tokens": 5127539.0,
	"reward": 1.3247395992279052,
	"reward_std": 0.005520834401249885,
	"rewards/combined_reward/mean": 1.3247395992279052,
	"rewards/combined_reward/std": 0.353334778547287,
	"step": 440
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 101.6,
	"completions/max_terminated_length": 101.6,
	"completions/mean_length": 53.95625,
	"completions/mean_terminated_length": 53.95625,
	"completions/min_length": 21.6,
	"completions/min_terminated_length": 21.6,
	"epoch": 0.04519660523276252,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9461745875143477e-07,
	"loss": -0.0013,
	"num_tokens": 5239592.0,
	"reward": 1.2362499833106995,
	"reward_std": 0.0016666660085320473,
	"rewards/combined_reward/mean": 1.2362499833106995,
	"rewards/combined_reward/std": 0.33721971064805983,
	"step": 450
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 156.9,
	"completions/max_terminated_length": 156.9,
	"completions/mean_length": 73.56875,
	"completions/mean_terminated_length": 73.56875,
	"completions/min_length": 16.6,
	"completions/min_terminated_length": 16.6,
	"epoch": 0.04620097423793502,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.942549465371863e-07,
	"loss": -0.0051,
	"num_tokens": 5360759.0,
	"reward": 1.364300584793091,
	"reward_std": 0.0033333331346511843,
	"rewards/combined_reward/mean": 1.364300584793091,
	"rewards/combined_reward/std": 0.29198225438594816,
	"step": 460
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 89.4,
	"completions/max_terminated_length": 89.4,
	"completions/mean_length": 49.9,
	"completions/mean_terminated_length": 49.9,
	"completions/min_length": 14.5,
	"completions/min_terminated_length": 14.5,
	"epoch": 0.04720534324310752,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.938809815729766e-07,
	"loss": 0.0,
	"num_tokens": 5489735.0,
	"reward": 1.2914583563804627,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.2914583563804627,
	"rewards/combined_reward/std": 0.32128691375255586,
	"step": 470
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 116.8,
	"completions/max_terminated_length": 116.8,
	"completions/mean_length": 54.26875,
	"completions/mean_terminated_length": 54.26875,
	"completions/min_length": 16.8,
	"completions/min_terminated_length": 16.8,
	"epoch": 0.048209712248280016,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.9349560929861957e-07,
	"loss": 0.0036,
	"num_tokens": 5618126.0,
	"reward": 1.2964062452316285,
	"reward_std": 0.0034375011920928953,
	"rewards/combined_reward/mean": 1.2964062452316285,
	"rewards/combined_reward/std": 0.3410232897847891,
	"step": 480
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 138.5,
	"completions/max_terminated_length": 138.5,
	"completions/mean_length": 63.425,
	"completions/mean_terminated_length": 63.425,
	"completions/min_length": 17.2,
	"completions/min_terminated_length": 17.2,
	"epoch": 0.049214081253452516,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 5.859716892242432,
	"learning_rate": 1.9309887654001093e-07,
	"loss": -0.0122,
	"num_tokens": 5732858.0,
	"reward": 1.3710416555404663,
	"reward_std": 0.005000000074505806,
	"rewards/combined_reward/mean": 1.3710416555404663,
	"rewards/combined_reward/std": 0.2569635409861803,
	"step": 490
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 71.1,
	"completions/max_terminated_length": 71.1,
	"completions/mean_length": 37.5125,
	"completions/mean_terminated_length": 37.5125,
	"completions/min_length": 15.6,
	"completions/min_terminated_length": 15.6,
	"epoch": 0.05021845025862502,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.9269083150343857e-07,
	"loss": 0.0,
	"num_tokens": 5827508.0,
	"reward": 1.2737499952316285,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.2737499952316285,
	"rewards/combined_reward/std": 0.36351585388183594,
	"step": 500
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 92.3,
	"completions/max_terminated_length": 92.3,
	"completions/mean_length": 49.31875,
	"completions/mean_terminated_length": 49.31875,
	"completions/min_length": 16.5,
	"completions/min_terminated_length": 16.5,
	"epoch": 0.05122281926379752,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.9227152376972505e-07,
	"loss": 0.0,
	"num_tokens": 5940043.0,
	"reward": 1.3223958492279053,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.3223958492279053,
	"rewards/combined_reward/std": 0.32680114805698396,
	"step": 510
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 112.1,
	"completions/max_terminated_length": 112.1,
	"completions/mean_length": 60.84375,
	"completions/mean_terminated_length": 60.84375,
	"completions/min_length": 22.7,
	"completions/min_terminated_length": 22.7,
	"epoch": 0.05222718826897002,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.91841004288203e-07,
	"loss": 0.0,
	"num_tokens": 6061038.0,
	"reward": 1.3749479293823241,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.3749479293823241,
	"rewards/combined_reward/std": 0.2760587348602712,
	"step": 520
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 383.1,
	"completions/max_terminated_length": 211.9,
	"completions/mean_length": 101.45,
	"completions/mean_terminated_length": 89.37000045776367,
	"completions/min_length": 29.4,
	"completions/min_terminated_length": 29.4,
	"epoch": 0.05323155727414252,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 1.913993253705246e-07,
	"loss": 0.0182,
	"num_tokens": 6172502.0,
	"reward": 1.3482013940811157,
	"reward_std": 0.004686582600697875,
	"rewards/combined_reward/mean": 1.3482013940811157,
	"rewards/combined_reward/std": 0.26615125834941866,
	"step": 530
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 116.0,
	"completions/max_terminated_length": 116.0,
	"completions/mean_length": 61.33125,
	"completions/mean_terminated_length": 61.33125,
	"completions/min_length": 26.7,
	"completions/min_terminated_length": 26.7,
	"epoch": 0.05423592627931502,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 6.519238471984863,
	"learning_rate": 1.9094654068430515e-07,
	"loss": -0.014,
	"num_tokens": 6279539.0,
	"reward": 1.456402564048767,
	"reward_std": 0.0006212619598954916,
	"rewards/combined_reward/mean": 1.456402564048767,
	"rewards/combined_reward/std": 0.17502975650131702,
	"step": 540
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 319.8,
	"completions/max_terminated_length": 302.8,
	"completions/mean_length": 102.7,
	"completions/mean_terminated_length": 92.22833557128907,
	"completions/min_length": 14.5,
	"completions/min_terminated_length": 14.5,
	"epoch": 0.05524029528448752,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 6.630038738250732,
	"learning_rate": 1.9048270524660196e-07,
	"loss": 0.0001,
	"num_tokens": 6401355.0,
	"reward": 1.2464791774749755,
	"reward_std": 0.016750000603497028,
	"rewards/combined_reward/mean": 1.2464791774749755,
	"rewards/combined_reward/std": 0.43877428472042085,
	"step": 550
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 108.4,
	"completions/max_terminated_length": 108.4,
	"completions/mean_length": 57.21875,
	"completions/mean_terminated_length": 57.21875,
	"completions/min_length": 22.1,
	"completions/min_terminated_length": 22.1,
	"epoch": 0.05624466428966002,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 4.464468955993652,
	"learning_rate": 1.9000787541722936e-07,
	"loss": -0.0008,
	"num_tokens": 6512806.0,
	"reward": 1.3637500047683715,
	"reward_std": 0.0056250004563480616,
	"rewards/combined_reward/mean": 1.3637500047683715,
	"rewards/combined_reward/std": 0.25516389338299633,
	"step": 560
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 296.2,
	"completions/max_terminated_length": 201.7,
	"completions/mean_length": 86.75625,
	"completions/mean_terminated_length": 75.22125091552735,
	"completions/min_length": 24.9,
	"completions/min_terminated_length": 24.9,
	"epoch": 0.057249033294832524,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.8952210889191065e-07,
	"loss": -0.0016,
	"num_tokens": 6619515.0,
	"reward": 1.3538541674613953,
	"reward_std": 0.009270833618938924,
	"rewards/combined_reward/mean": 1.3538541674613953,
	"rewards/combined_reward/std": 0.35525577939115466,
	"step": 570
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 104.1,
	"completions/max_terminated_length": 104.1,
	"completions/mean_length": 48.9,
	"completions/mean_terminated_length": 48.9,
	"completions/min_length": 16.2,
	"completions/min_terminated_length": 16.2,
	"epoch": 0.058253402300005024,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.890254646952674e-07,
	"loss": 0.0,
	"num_tokens": 6728163.0,
	"reward": 1.2268749833106996,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.2268749833106996,
	"rewards/combined_reward/std": 0.33372554890811446,
	"step": 580
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 115.7,
	"completions/max_terminated_length": 115.7,
	"completions/mean_length": 61.34375,
	"completions/mean_terminated_length": 61.34375,
	"completions/min_length": 20.3,
	"completions/min_terminated_length": 20.3,
	"epoch": 0.05925777130517752,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 3.076678991317749,
	"learning_rate": 1.885180031736477e-07,
	"loss": -0.0013,
	"num_tokens": 6845358.0,
	"reward": 1.3715885639190675,
	"reward_std": 0.0037068985402584076,
	"rewards/combined_reward/mean": 1.3715885639190675,
	"rewards/combined_reward/std": 0.3188589945435524,
	"step": 590
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 122.6,
	"completions/max_terminated_length": 122.6,
	"completions/mean_length": 55.81875,
	"completions/mean_terminated_length": 55.81875,
	"completions/min_length": 14.8,
	"completions/min_terminated_length": 14.8,
	"epoch": 0.06026214031035002,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.879997859877932e-07,
	"loss": 0.0032,
	"num_tokens": 6971649.0,
	"reward": 1.280833327770233,
	"reward_std": 0.0006132050417363644,
	"rewards/combined_reward/mean": 1.280833327770233,
	"rewards/combined_reward/std": 0.338599956035614,
	"step": 600
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 387.6,
	"completions/max_terminated_length": 192.1,
	"completions/mean_length": 122.46875,
	"completions/mean_terminated_length": 72.42708358764648,
	"completions/min_length": 23.3,
	"completions/min_terminated_length": 23.3,
	"epoch": 0.06126650931552252,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.8747087610534734e-07,
	"loss": 0.019,
	"num_tokens": 7087600.0,
	"reward": 1.338072907924652,
	"reward_std": 0.013132144883275031,
	"rewards/combined_reward/mean": 1.338072907924652,
	"rewards/combined_reward/std": 0.30777021273970606,
	"step": 610
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 112.7,
	"completions/max_terminated_length": 112.7,
	"completions/mean_length": 58.44375,
	"completions/mean_terminated_length": 58.44375,
	"completions/min_length": 23.9,
	"completions/min_terminated_length": 23.9,
	"epoch": 0.06227087832069502,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.8693133779320382e-07,
	"loss": -0.0031,
	"num_tokens": 7191467.0,
	"reward": 1.3348880290985108,
	"reward_std": 0.007124999910593033,
	"rewards/combined_reward/mean": 1.3348880290985108,
	"rewards/combined_reward/std": 0.2751554258167744,
	"step": 620
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 194.8,
	"completions/max_terminated_length": 194.8,
	"completions/mean_length": 84.76875,
	"completions/mean_terminated_length": 84.76875,
	"completions/min_length": 21.4,
	"completions/min_terminated_length": 21.4,
	"epoch": 0.06327524732586752,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.8638123660969793e-07,
	"loss": -0.0084,
	"num_tokens": 7304146.0,
	"reward": 1.3757467865943909,
	"reward_std": 0.0030034731142222883,
	"rewards/combined_reward/mean": 1.3757467865943909,
	"rewards/combined_reward/std": 0.28882216811180117,
	"step": 630
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 101.9,
	"completions/max_terminated_length": 101.9,
	"completions/mean_length": 56.925,
	"completions/mean_terminated_length": 56.925,
	"completions/min_length": 21.0,
	"completions/min_terminated_length": 21.0,
	"epoch": 0.06427961633104003,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.858206393966405e-07,
	"loss": 0.0,
	"num_tokens": 7415006.0,
	"reward": 1.3215104341506958,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.3215104341506958,
	"rewards/combined_reward/std": 0.33309968262910844,
	"step": 640
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 106.5,
	"completions/max_terminated_length": 106.5,
	"completions/mean_length": 58.26875,
	"completions/mean_terminated_length": 58.26875,
	"completions/min_length": 14.2,
	"completions/min_terminated_length": 14.2,
	"epoch": 0.06528398533621252,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.8524961427119615e-07,
	"loss": -0.009,
	"num_tokens": 7546381.0,
	"reward": 1.3129427313804627,
	"reward_std": 0.002951054647564888,
	"rewards/combined_reward/mean": 1.3129427313804627,
	"rewards/combined_reward/std": 0.3575292468070984,
	"step": 650
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 84.0,
	"completions/max_terminated_length": 84.0,
	"completions/mean_length": 46.75625,
	"completions/mean_terminated_length": 46.75625,
	"completions/min_length": 15.4,
	"completions/min_terminated_length": 15.4,
	"epoch": 0.06628835434138503,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.846682306176065e-07,
	"loss": 0.0,
	"num_tokens": 7668158.0,
	"reward": 1.3184374928474427,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.3184374928474427,
	"rewards/combined_reward/std": 0.35122168958187105,
	"step": 660
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 101.7,
	"completions/max_terminated_length": 101.7,
	"completions/mean_length": 56.3375,
	"completions/mean_terminated_length": 56.3375,
	"completions/min_length": 17.9,
	"completions/min_terminated_length": 17.9,
	"epoch": 0.06729272334655753,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.8407655907875938e-07,
	"loss": 0.0006,
	"num_tokens": 7794644.0,
	"reward": 1.331454861164093,
	"reward_std": 0.007124999910593033,
	"rewards/combined_reward/mean": 1.331454861164093,
	"rewards/combined_reward/std": 0.3434182394295931,
	"step": 670
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 135.7,
	"completions/max_terminated_length": 135.7,
	"completions/mean_length": 68.90625,
	"completions/mean_terminated_length": 68.90625,
	"completions/min_length": 16.1,
	"completions/min_terminated_length": 16.1,
	"epoch": 0.06829709235173002,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 1.8347467154760515e-07,
	"loss": 0.0079,
	"num_tokens": 7913933.0,
	"reward": 1.3356944441795349,
	"reward_std": 0.0053335148841142654,
	"rewards/combined_reward/mean": 1.3356944441795349,
	"rewards/combined_reward/std": 0.3590264985337853,
	"step": 680
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 265.9,
	"completions/max_terminated_length": 265.9,
	"completions/mean_length": 91.5,
	"completions/mean_terminated_length": 91.5,
	"completions/min_length": 24.8,
	"completions/min_terminated_length": 24.8,
	"epoch": 0.06930146135690253,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.8286264115842114e-07,
	"loss": 0.0017,
	"num_tokens": 8033153.0,
	"reward": 1.3431249916553498,
	"reward_std": 0.0044791650027036665,
	"rewards/combined_reward/mean": 1.3431249916553498,
	"rewards/combined_reward/std": 0.3242304854094982,
	"step": 690
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 73.2,
	"completions/max_terminated_length": 73.2,
	"completions/mean_length": 39.55625,
	"completions/mean_terminated_length": 39.55625,
	"completions/min_length": 17.7,
	"completions/min_terminated_length": 17.7,
	"epoch": 0.07030583036207502,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.8224054227792522e-07,
	"loss": -0.003,
	"num_tokens": 8147198.0,
	"reward": 1.3440885424613953,
	"reward_std": 0.0002604176523163915,
	"rewards/combined_reward/mean": 1.3440885424613953,
	"rewards/combined_reward/std": 0.3006736177019775,
	"step": 700
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 123.6,
	"completions/max_terminated_length": 123.6,
	"completions/mean_length": 67.76875,
	"completions/mean_terminated_length": 67.76875,
	"completions/min_length": 25.8,
	"completions/min_terminated_length": 25.8,
	"epoch": 0.07131019936724753,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 1.436936616897583,
	"learning_rate": 1.816084504962396e-07,
	"loss": 0.0009,
	"num_tokens": 8248985.0,
	"reward": 1.459496557712555,
	"reward_std": 0.002500000596046448,
	"rewards/combined_reward/mean": 1.459496557712555,
	"rewards/combined_reward/std": 0.15663873171433806,
	"step": 710
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 172.1,
	"completions/max_terminated_length": 172.1,
	"completions/mean_length": 76.96875,
	"completions/mean_terminated_length": 76.96875,
	"completions/min_length": 24.5,
	"completions/min_terminated_length": 24.5,
	"epoch": 0.07231456837242002,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.8096644261770608e-07,
	"loss": 0.0179,
	"num_tokens": 8373128.0,
	"reward": 1.3943750143051148,
	"reward_std": 0.005624998733401299,
	"rewards/combined_reward/mean": 1.3943750143051148,
	"rewards/combined_reward/std": 0.24296645894646646,
	"step": 720
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 120.0,
	"completions/max_terminated_length": 120.0,
	"completions/mean_length": 60.15625,
	"completions/mean_terminated_length": 60.15625,
	"completions/min_length": 18.8,
	"completions/min_terminated_length": 18.8,
	"epoch": 0.07331893737759253,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.8031459665155363e-07,
	"loss": -0.001,
	"num_tokens": 8487649.0,
	"reward": 1.4223046898841858,
	"reward_std": 0.0001302093267440796,
	"rewards/combined_reward/mean": 1.4223046898841858,
	"rewards/combined_reward/std": 0.2848698660731316,
	"step": 730
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 77.8,
	"completions/max_terminated_length": 77.8,
	"completions/mean_length": 45.84375,
	"completions/mean_terminated_length": 45.84375,
	"completions/min_length": 18.4,
	"completions/min_terminated_length": 18.4,
	"epoch": 0.07432330638276503,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.796529918024196e-07,
	"loss": 0.0,
	"num_tokens": 8603284.0,
	"reward": 1.37947918176651,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.37947918176651,
	"rewards/combined_reward/std": 0.27231944501399996,
	"step": 740
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 204.9,
	"completions/max_terminated_length": 204.9,
	"completions/mean_length": 76.9375,
	"completions/mean_terminated_length": 76.9375,
	"completions/min_length": 18.3,
	"completions/min_terminated_length": 18.3,
	"epoch": 0.07532767538793753,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.7898170846072592e-07,
	"loss": 0.0009,
	"num_tokens": 8718758.0,
	"reward": 1.32010418176651,
	"reward_std": 0.002500000596046448,
	"rewards/combined_reward/mean": 1.32010418176651,
	"rewards/combined_reward/std": 0.34439257588237526,
	"step": 750
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 148.0,
	"completions/max_terminated_length": 148.0,
	"completions/mean_length": 64.11875,
	"completions/mean_terminated_length": 64.11875,
	"completions/min_length": 16.4,
	"completions/min_terminated_length": 16.4,
	"epoch": 0.07633204439311003,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 1.783008281929106e-07,
	"loss": -0.0051,
	"num_tokens": 8833993.0,
	"reward": 1.3178860425949097,
	"reward_std": 0.016688717156648637,
	"rewards/combined_reward/mean": 1.3178860425949097,
	"rewards/combined_reward/std": 0.3388564258813858,
	"step": 760
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 122.4,
	"completions/max_terminated_length": 122.4,
	"completions/mean_length": 62.99375,
	"completions/mean_terminated_length": 62.99375,
	"completions/min_length": 21.2,
	"completions/min_terminated_length": 21.2,
	"epoch": 0.07733641339828252,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 1.1234172582626343,
	"learning_rate": 1.7761043373151713e-07,
	"loss": -0.0046,
	"num_tokens": 8950896.0,
	"reward": 1.3376388788223266,
	"reward_std": 0.00034722290001809597,
	"rewards/combined_reward/mean": 1.3376388788223266,
	"rewards/combined_reward/std": 0.34661323949694633,
	"step": 770
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 111.0,
	"completions/max_terminated_length": 111.0,
	"completions/mean_length": 56.3,
	"completions/mean_terminated_length": 56.3,
	"completions/min_length": 20.3,
	"completions/min_terminated_length": 20.3,
	"epoch": 0.07834078240345503,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.7691060896514168e-07,
	"loss": -0.0003,
	"num_tokens": 9071600.0,
	"reward": 1.3996267199516297,
	"reward_std": 0.002080751396715641,
	"rewards/combined_reward/mean": 1.3996267199516297,
	"rewards/combined_reward/std": 0.26108508543111386,
	"step": 780
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 79.4,
	"completions/max_terminated_length": 79.4,
	"completions/mean_length": 45.76875,
	"completions/mean_terminated_length": 45.76875,
	"completions/min_length": 20.0,
	"completions/min_terminated_length": 20.0,
	"epoch": 0.07934515140862752,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.7620143892823975e-07,
	"loss": -0.0062,
	"num_tokens": 9174599.0,
	"reward": 1.378697919845581,
	"reward_std": 0.0003125001909211278,
	"rewards/combined_reward/mean": 1.378697919845581,
	"rewards/combined_reward/std": 0.2739857309497893,
	"step": 790
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 98.3,
	"completions/max_terminated_length": 98.3,
	"completions/mean_length": 50.98125,
	"completions/mean_terminated_length": 50.98125,
	"completions/min_length": 19.2,
	"completions/min_terminated_length": 19.2,
	"epoch": 0.08034952041380003,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.7548300979079413e-07,
	"loss": -0.0008,
	"num_tokens": 9284796.0,
	"reward": 1.368190097808838,
	"reward_std": 0.004609373956918716,
	"rewards/combined_reward/mean": 1.368190097808838,
	"rewards/combined_reward/std": 0.25843119765631856,
	"step": 800
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 80.0,
	"completions/max_terminated_length": 80.0,
	"completions/mean_length": 41.175,
	"completions/mean_terminated_length": 41.175,
	"completions/min_length": 12.8,
	"completions/min_terminated_length": 12.8,
	"epoch": 0.08135388941897254,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.7475540884784422e-07,
	"loss": 0.0,
	"num_tokens": 9398356.0,
	"reward": 1.2378819465637207,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.2378819465637207,
	"rewards/combined_reward/std": 0.3914600659161806,
	"step": 810
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 96.3,
	"completions/max_terminated_length": 96.3,
	"completions/mean_length": 54.50625,
	"completions/mean_terminated_length": 54.50625,
	"completions/min_length": 19.6,
	"completions/min_terminated_length": 19.6,
	"epoch": 0.08235825842414503,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.7401872450887915e-07,
	"loss": -0.0007,
	"num_tokens": 9497821.0,
	"reward": 1.3947187542915345,
	"reward_std": 0.0015624999767169356,
	"rewards/combined_reward/mean": 1.3947187542915345,
	"rewards/combined_reward/std": 0.2990885377395898,
	"step": 820
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 96.7,
	"completions/max_terminated_length": 96.7,
	"completions/mean_length": 49.1875,
	"completions/mean_terminated_length": 49.1875,
	"completions/min_length": 17.9,
	"completions/min_terminated_length": 17.9,
	"epoch": 0.08336262742931753,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.7327304628709528e-07,
	"loss": 0.0,
	"num_tokens": 9641355.0,
	"reward": 1.3011458396911622,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.3011458396911622,
	"rewards/combined_reward/std": 0.2698082665912807,
	"step": 830
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 99.3,
	"completions/max_terminated_length": 99.3,
	"completions/mean_length": 54.9125,
	"completions/mean_terminated_length": 54.9125,
	"completions/min_length": 21.7,
	"completions/min_terminated_length": 21.7,
	"epoch": 0.08436699643449003,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.7251846478851951e-07,
	"loss": 0.0083,
	"num_tokens": 9759969.0,
	"reward": 1.2925694584846497,
	"reward_std": 0.0019245008006691933,
	"rewards/combined_reward/mean": 1.2925694584846497,
	"rewards/combined_reward/std": 0.26882885694503783,
	"step": 840
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 168.1,
	"completions/max_terminated_length": 168.1,
	"completions/mean_length": 66.68125,
	"completions/mean_terminated_length": 66.68125,
	"completions/min_length": 19.3,
	"completions/min_terminated_length": 19.3,
	"epoch": 0.08537136543966253,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 6.147635459899902,
	"learning_rate": 1.7175507170100008e-07,
	"loss": -0.0077,
	"num_tokens": 9881310.0,
	"reward": 1.2720364809036255,
	"reward_std": 0.011238560592755676,
	"rewards/combined_reward/mean": 1.2720364809036255,
	"rewards/combined_reward/std": 0.31835093796253205,
	"step": 850
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 91.0,
	"completions/max_terminated_length": 91.0,
	"completions/mean_length": 47.25,
	"completions/mean_terminated_length": 47.25,
	"completions/min_length": 23.2,
	"completions/min_terminated_length": 23.2,
	"epoch": 0.08637573444483503,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 1.287226676940918,
	"learning_rate": 1.7098295978306552e-07,
	"loss": -0.012,
	"num_tokens": 9981046.0,
	"reward": 1.322606337070465,
	"reward_std": 0.0022470591589808463,
	"rewards/combined_reward/mean": 1.322606337070465,
	"rewards/combined_reward/std": 0.3106359137222171,
	"step": 860
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 91.9,
	"completions/max_terminated_length": 91.9,
	"completions/mean_length": 46.50625,
	"completions/mean_terminated_length": 46.50625,
	"completions/min_length": 13.9,
	"completions/min_terminated_length": 13.9,
	"epoch": 0.08738010345000753,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.7020222285265395e-07,
	"loss": 0.0,
	"num_tokens": 10089371.0,
	"reward": 1.2643750071525575,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.2643750071525575,
	"rewards/combined_reward/std": 0.4044176399707794,
	"step": 870
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 217.1,
	"completions/max_terminated_length": 217.1,
	"completions/mean_length": 70.81875,
	"completions/mean_terminated_length": 70.81875,
	"completions/min_length": 18.9,
	"completions/min_terminated_length": 18.9,
	"epoch": 0.08838447245518004,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.6941295577571328e-07,
	"loss": 0.0079,
	"num_tokens": 10197254.0,
	"reward": 1.309374988079071,
	"reward_std": 0.002500000596046448,
	"rewards/combined_reward/mean": 1.309374988079071,
	"rewards/combined_reward/std": 0.325995758920908,
	"step": 880
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 94.7,
	"completions/max_terminated_length": 94.7,
	"completions/mean_length": 53.04375,
	"completions/mean_terminated_length": 53.04375,
	"completions/min_length": 22.5,
	"completions/min_terminated_length": 22.5,
	"epoch": 0.08938884146035253,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.686152544546743e-07,
	"loss": 0.0008,
	"num_tokens": 10316525.0,
	"reward": 1.3464062690734864,
	"reward_std": 0.00416666641831398,
	"rewards/combined_reward/mean": 1.3464062690734864,
	"rewards/combined_reward/std": 0.2880703628063202,
	"step": 890
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 97.0,
	"completions/max_terminated_length": 97.0,
	"completions/mean_length": 50.95625,
	"completions/mean_terminated_length": 50.95625,
	"completions/min_length": 16.8,
	"completions/min_terminated_length": 16.8,
	"epoch": 0.09039321046552504,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.6780921581679763e-07,
	"loss": 0.0021,
	"num_tokens": 10435242.0,
	"reward": 1.2726041793823242,
	"reward_std": 0.009523502597585321,
	"rewards/combined_reward/mean": 1.2726041793823242,
	"rewards/combined_reward/std": 0.33535852897912266,
	"step": 900
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 104.1,
	"completions/max_terminated_length": 104.1,
	"completions/mean_length": 57.20625,
	"completions/mean_terminated_length": 57.20625,
	"completions/min_length": 19.4,
	"completions/min_terminated_length": 19.4,
	"epoch": 0.09139757947069753,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.6699493780239649e-07,
	"loss": 0.0,
	"num_tokens": 10548043.0,
	"reward": 1.3535937666893005,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.3535937666893005,
	"rewards/combined_reward/std": 0.33704030215740205,
	"step": 910
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 107.5,
	"completions/max_terminated_length": 107.5,
	"completions/mean_length": 52.25,
	"completions/mean_terminated_length": 52.25,
	"completions/min_length": 16.2,
	"completions/min_terminated_length": 16.2,
	"epoch": 0.09240194847587004,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.6617251935293588e-07,
	"loss": -0.0028,
	"num_tokens": 10675027.0,
	"reward": 1.3419270992279053,
	"reward_std": 0.0015625,
	"rewards/combined_reward/mean": 1.3419270992279053,
	"rewards/combined_reward/std": 0.32070667631924155,
	"step": 920
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 104.1,
	"completions/max_terminated_length": 104.1,
	"completions/mean_length": 58.05625,
	"completions/mean_terminated_length": 58.05625,
	"completions/min_length": 25.7,
	"completions/min_terminated_length": 25.7,
	"epoch": 0.09340631748104254,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.6534206039901054e-07,
	"loss": 0.0,
	"num_tokens": 10805048.0,
	"reward": 1.4538020730018615,
	"reward_std": 0.0005208343267440796,
	"rewards/combined_reward/mean": 1.4538020730018615,
	"rewards/combined_reward/std": 0.17151957787573338,
	"step": 930
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 79.8,
	"completions/max_terminated_length": 79.8,
	"completions/mean_length": 39.75,
	"completions/mean_terminated_length": 39.75,
	"completions/min_length": 12.6,
	"completions/min_terminated_length": 12.6,
	"epoch": 0.09441068648621503,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.6450366184820256e-07,
	"loss": 0.0,
	"num_tokens": 10906272.0,
	"reward": 1.258458322286606,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.258458322286606,
	"rewards/combined_reward/std": 0.3260463088750839,
	"step": 940
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 118.2,
	"completions/max_terminated_length": 118.2,
	"completions/mean_length": 61.65625,
	"completions/mean_terminated_length": 61.65625,
	"completions/min_length": 22.5,
	"completions/min_terminated_length": 22.5,
	"epoch": 0.09541505549138754,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.6365742557282017e-07,
	"loss": 0.0091,
	"num_tokens": 11023301.0,
	"reward": 1.3930208325386046,
	"reward_std": 0.0050495008006691934,
	"rewards/combined_reward/mean": 1.3930208325386046,
	"rewards/combined_reward/std": 0.30010328590869906,
	"step": 950
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 105.4,
	"completions/max_terminated_length": 105.4,
	"completions/mean_length": 55.79375,
	"completions/mean_terminated_length": 55.79375,
	"completions/min_length": 23.8,
	"completions/min_terminated_length": 23.8,
	"epoch": 0.09641942449656003,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.6280345439751956e-07,
	"loss": 0.0044,
	"num_tokens": 11148588.0,
	"reward": 1.3295885443687439,
	"reward_std": 0.024523502215743065,
	"rewards/combined_reward/mean": 1.3295885443687439,
	"rewards/combined_reward/std": 0.2928910902235657,
	"step": 960
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 121.7,
	"completions/max_terminated_length": 121.7,
	"completions/mean_length": 57.56875,
	"completions/mean_terminated_length": 57.56875,
	"completions/min_length": 14.2,
	"completions/min_terminated_length": 14.2,
	"epoch": 0.09742379350173254,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.6194185208681082e-07,
	"loss": -0.0043,
	"num_tokens": 11268271.0,
	"reward": 1.2413020730018616,
	"reward_std": 0.005312500335276127,
	"rewards/combined_reward/mean": 1.2413020730018616,
	"rewards/combined_reward/std": 0.3525692358613014,
	"step": 970
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 117.1,
	"completions/max_terminated_length": 117.1,
	"completions/mean_length": 57.45625,
	"completions/mean_terminated_length": 57.45625,
	"completions/min_length": 19.1,
	"completions/min_terminated_length": 19.1,
	"epoch": 0.09842816250690503,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.610727233324495e-07,
	"loss": 0.0,
	"num_tokens": 11388376.0,
	"reward": 1.2743749976158143,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.2743749976158143,
	"rewards/combined_reward/std": 0.2959941983222961,
	"step": 980
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 99.0,
	"completions/max_terminated_length": 99.0,
	"completions/mean_length": 51.1875,
	"completions/mean_terminated_length": 51.1875,
	"completions/min_length": 15.9,
	"completions/min_terminated_length": 15.9,
	"epoch": 0.09943253151207754,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.6019617374071597e-07,
	"loss": 0.0001,
	"num_tokens": 11503346.0,
	"reward": 1.3223437547683716,
	"reward_std": 0.0028867511078715324,
	"rewards/combined_reward/mean": 1.3223437547683716,
	"rewards/combined_reward/std": 0.37292833551764487,
	"step": 990
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 146.8,
	"completions/max_terminated_length": 146.8,
	"completions/mean_length": 64.61875,
	"completions/mean_terminated_length": 64.61875,
	"completions/min_length": 23.9,
	"completions/min_terminated_length": 23.9,
	"epoch": 0.10043690051725004,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 1.5931230981958326e-07,
	"loss": 0.0,
	"num_tokens": 11600585.0,
	"reward": 1.3246874928474426,
	"reward_std": 0.0,
	"rewards/combined_reward/mean": 1.3246874928474426,
	"rewards/combined_reward/std": 0.23927139891311527,
	"step": 1000
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 118.5,
	"completions/max_terminated_length": 118.5,
	"completions/mean_length": 65.5,
	"completions/mean_terminated_length": 65.5,
	"completions/min_length": 19.9,
	"completions/min_terminated_length": 19.9,
	"epoch": 0.10144126952242254,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.5842123896577543e-07,
	"loss": -0.0036,
	"num_tokens": 11737513.0,
	"reward": 1.4228541851043701,
	"reward_std": 0.001154701132327318,
	"rewards/combined_reward/mean": 1.4228541851043701,
	"rewards/combined_reward/std": 0.25313766626641154,
	"step": 1010
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 311.5,
	"completions/max_terminated_length": 224.3,
	"completions/mean_length": 90.28125,
	"completions/mean_terminated_length": 54.49903869628906,
	"completions/min_length": 12.0,
	"completions/min_terminated_length": 12.0,
	"epoch": 0.10244563852759504,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.5752306945171818e-07,
	"loss": -0.0115,
	"num_tokens": 11875626.0,
	"reward": 1.2103593707084657,
	"reward_std": 0.004468750953674316,
	"rewards/combined_reward/mean": 1.2103593707084657,
	"rewards/combined_reward/std": 0.40379793345928194,
	"step": 1020
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 128.9,
	"completions/max_terminated_length": 128.9,
	"completions/mean_length": 59.56875,
	"completions/mean_terminated_length": 59.56875,
	"completions/min_length": 15.4,
	"completions/min_terminated_length": 15.4,
	"epoch": 0.10345000753276754,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.5661791041238254e-07,
	"loss": 0.0054,
	"num_tokens": 11995581.0,
	"reward": 1.3099791407585144,
	"reward_std": 0.00020833313465118408,
	"rewards/combined_reward/mean": 1.3099791407585144,
	"rewards/combined_reward/std": 0.33452749061398207,
	"step": 1030
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 343.2,
	"completions/max_terminated_length": 228.1,
	"completions/mean_length": 114.825,
	"completions/mean_terminated_length": 78.1860580444336,
	"completions/min_length": 25.7,
	"completions/min_terminated_length": 25.7,
	"epoch": 0.10445437653794004,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.5570587183202433e-07,
	"loss": -0.0099,
	"num_tokens": 12114797.0,
	"reward": 1.2818815290927887,
	"reward_std": 0.0018619796261191367,
	"rewards/combined_reward/mean": 1.2818815290927887,
	"rewards/combined_reward/std": 0.31765228807926177,
	"step": 1040
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 113.8,
	"completions/max_terminated_length": 113.8,
	"completions/mean_length": 55.68125,
	"completions/mean_terminated_length": 55.68125,
	"completions/min_length": 16.7,
	"completions/min_terminated_length": 16.7,
	"epoch": 0.10545874554311253,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.5478706453082016e-07,
	"loss": -0.0016,
	"num_tokens": 12246978.0,
	"reward": 1.3307923913002013,
	"reward_std": 0.0002604176523163915,
	"rewards/combined_reward/mean": 1.3307923913002013,
	"rewards/combined_reward/std": 0.3518651008605957,
	"step": 1050
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 144.1,
	"completions/max_terminated_length": 144.1,
	"completions/mean_length": 69.0,
	"completions/mean_terminated_length": 69.0,
	"completions/min_length": 17.6,
	"completions/min_terminated_length": 17.6,
	"epoch": 0.10646311454828504,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.5386160015140167e-07,
	"loss": 0.0061,
	"num_tokens": 12363690.0,
	"reward": 1.3816666841506957,
	"reward_std": 0.00692450013011694,
	"rewards/combined_reward/mean": 1.3816666841506957,
	"rewards/combined_reward/std": 0.2784981057047844,
	"step": 1060
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 94.3,
	"completions/max_terminated_length": 94.3,
	"completions/mean_length": 49.63125,
	"completions/mean_terminated_length": 49.63125,
	"completions/min_length": 13.9,
	"completions/min_terminated_length": 13.9,
	"epoch": 0.10746748355345755,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.5292959114529024e-07,
	"loss": 0.0011,
	"num_tokens": 12481815.0,
	"reward": 1.3338541746139527,
	"reward_std": 0.002886752039194107,
	"rewards/combined_reward/mean": 1.3338541746139527,
	"rewards/combined_reward/std": 0.3240374196320772,
	"step": 1070
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 97.0,
	"completions/max_terminated_length": 97.0,
	"completions/mean_length": 49.3375,
	"completions/mean_terminated_length": 49.3375,
	"completions/min_length": 19.6,
	"completions/min_terminated_length": 19.6,
	"epoch": 0.10847185255863004,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.5199115075923323e-07,
	"loss": -0.0008,
	"num_tokens": 12604637.0,
	"reward": 1.2796875,
	"reward_std": 0.0003608435858041048,
	"rewards/combined_reward/mean": 1.2796875,
	"rewards/combined_reward/std": 0.3038814663887024,
	"step": 1080
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 313.5,
	"completions/max_terminated_length": 115.4,
	"completions/mean_length": 112.05,
	"completions/mean_terminated_length": 61.88333358764648,
	"completions/min_length": 20.3,
	"completions/min_terminated_length": 20.3,
	"epoch": 0.10947622156380254,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.5104639302144326e-07,
	"loss": 0.0052,
	"num_tokens": 12735697.0,
	"reward": 1.342291682958603,
	"reward_std": 0.0007216888945549727,
	"rewards/combined_reward/mean": 1.342291682958603,
	"rewards/combined_reward/std": 0.31657470017671585,
	"step": 1090
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 127.9,
	"completions/max_terminated_length": 127.9,
	"completions/mean_length": 61.70625,
	"completions/mean_terminated_length": 61.70625,
	"completions/min_length": 17.8,
	"completions/min_terminated_length": 17.8,
	"epoch": 0.11048059056897504,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.5009543272774323e-07,
	"loss": 0.0029,
	"num_tokens": 12842590.0,
	"reward": 1.3991406440734864,
	"reward_std": 0.000572918844409287,
	"rewards/combined_reward/mean": 1.3991406440734864,
	"rewards/combined_reward/std": 0.27981497598811983,
	"step": 1100
	}
	],
	"logging_steps": 10,
	"max_steps": 3000,
	"num_input_tokens_seen": 12842590,
	"num_train_epochs": 1,
	"save_steps": 100,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 4,
	"trial_name": null,
	"trial_params": null
	}