Upload folder using huggingface_hub

140d572 verified about 2 months ago

54.3 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 0.2,
	"eval_steps": 500,
	"global_step": 50,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 237.0,
	"completions/max_terminated_length": 237.0,
	"completions/mean_length": 114.4375,
	"completions/mean_terminated_length": 114.4375,
	"completions/min_length": 68.0,
	"completions/min_terminated_length": 68.0,
	"entropy": 0.23012111708521843,
	"epoch": 0.004,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.32706609795270336,
	"learning_rate": 1e-06,
	"loss": 0.0,
	"num_tokens": 21278.0,
	"reward": 1.8497917652130127,
	"reward_std": 0.09231126308441162,
	"rewards/accuracy_reward_func/mean": 0.871666669845581,
	"rewards/accuracy_reward_func/std": 0.21418261528015137,
	"rewards/format_reward_func/mean": 0.9781249761581421,
	"rewards/format_reward_func/std": 0.1237436830997467,
	"step": 1,
	"step_time": 23.142175153829157
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 175.0,
	"completions/max_terminated_length": 175.0,
	"completions/mean_length": 95.9375,
	"completions/mean_terminated_length": 95.9375,
	"completions/min_length": 68.0,
	"completions/min_terminated_length": 68.0,
	"entropy": 0.2443241998553276,
	"epoch": 0.008,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.33467634062811474,
	"learning_rate": 9.8e-07,
	"loss": 0.0,
	"num_tokens": 43776.0,
	"reward": 1.8129092454910278,
	"reward_std": 0.02420501410961151,
	"rewards/accuracy_reward_func/mean": 0.8129092454910278,
	"rewards/accuracy_reward_func/std": 0.21289166808128357,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 2,
	"step_time": 9.088656539097428
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 145.0,
	"completions/max_terminated_length": 145.0,
	"completions/mean_length": 97.03125,
	"completions/mean_terminated_length": 97.03125,
	"completions/min_length": 67.0,
	"completions/min_terminated_length": 67.0,
	"entropy": 0.19135062769055367,
	"epoch": 0.012,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.4170485616641671,
	"learning_rate": 9.6e-07,
	"loss": -0.0,
	"num_tokens": 70409.0,
	"reward": 1.906822919845581,
	"reward_std": 0.0494791716337204,
	"rewards/accuracy_reward_func/mean": 0.906822919845581,
	"rewards/accuracy_reward_func/std": 0.13648581504821777,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 3,
	"step_time": 8.136402582749724
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 137.0,
	"completions/max_terminated_length": 137.0,
	"completions/mean_length": 99.125,
	"completions/mean_terminated_length": 99.125,
	"completions/min_length": 76.0,
	"completions/min_terminated_length": 76.0,
	"entropy": 0.25429725274443626,
	"epoch": 0.016,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.7524404289855315,
	"learning_rate": 9.399999999999999e-07,
	"loss": 0.0,
	"num_tokens": 94161.0,
	"reward": 1.7960565090179443,
	"reward_std": 0.13303174078464508,
	"rewards/accuracy_reward_func/mean": 0.8085565567016602,
	"rewards/accuracy_reward_func/std": 0.24971628189086914,
	"rewards/format_reward_func/mean": 0.987500011920929,
	"rewards/format_reward_func/std": 0.0707106739282608,
	"step": 4,
	"step_time": 7.919396638870239
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 258.0,
	"completions/max_terminated_length": 258.0,
	"completions/mean_length": 127.75,
	"completions/mean_terminated_length": 127.75,
	"completions/min_length": 78.0,
	"completions/min_terminated_length": 78.0,
	"entropy": 0.24686714261770248,
	"epoch": 0.02,
	"frac_reward_zero_std": 0.375,
	"grad_norm": 0.5703983537961373,
	"learning_rate": 9.2e-07,
	"loss": -0.0,
	"num_tokens": 115581.0,
	"reward": 1.9024033546447754,
	"reward_std": 0.06382934749126434,
	"rewards/accuracy_reward_func/mean": 0.9024032950401306,
	"rewards/accuracy_reward_func/std": 0.14118432998657227,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 5,
	"step_time": 11.992262025363743
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 145.0,
	"completions/max_terminated_length": 145.0,
	"completions/mean_length": 92.34375,
	"completions/mean_terminated_length": 92.34375,
	"completions/min_length": 63.0,
	"completions/min_terminated_length": 63.0,
	"entropy": 0.22791285440325737,
	"epoch": 0.024,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.2647928147370579,
	"learning_rate": 9e-07,
	"loss": 0.0,
	"num_tokens": 140948.0,
	"reward": 1.7881250381469727,
	"reward_std": 0.016249999403953552,
	"rewards/accuracy_reward_func/mean": 0.7881250381469727,
	"rewards/accuracy_reward_func/std": 0.23106878995895386,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 6,
	"step_time": 8.167283555492759
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 158.0,
	"completions/max_terminated_length": 158.0,
	"completions/mean_length": 101.59375,
	"completions/mean_terminated_length": 101.59375,
	"completions/min_length": 61.0,
	"completions/min_terminated_length": 61.0,
	"entropy": 0.2525057829916477,
	"epoch": 0.028,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.28702743314019674,
	"learning_rate": 8.799999999999999e-07,
	"loss": 0.0,
	"num_tokens": 167411.0,
	"reward": 1.9117188453674316,
	"reward_std": 0.04828793182969093,
	"rewards/accuracy_reward_func/mean": 0.9117187261581421,
	"rewards/accuracy_reward_func/std": 0.12177487462759018,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 7,
	"step_time": 8.586366776376963
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 198.0,
	"completions/max_terminated_length": 198.0,
	"completions/mean_length": 101.90625,
	"completions/mean_terminated_length": 101.90625,
	"completions/min_length": 72.0,
	"completions/min_terminated_length": 72.0,
	"entropy": 0.251990407705307,
	"epoch": 0.032,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.6162980049635086,
	"learning_rate": 8.599999999999999e-07,
	"loss": -0.0,
	"num_tokens": 192304.0,
	"reward": 1.9317708015441895,
	"reward_std": 0.0726683959364891,
	"rewards/accuracy_reward_func/mean": 0.9317708611488342,
	"rewards/accuracy_reward_func/std": 0.12373802810907364,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 8,
	"step_time": 10.18786786403507
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 171.0,
	"completions/max_terminated_length": 171.0,
	"completions/mean_length": 89.53125,
	"completions/mean_terminated_length": 89.53125,
	"completions/min_length": 53.0,
	"completions/min_terminated_length": 53.0,
	"entropy": 0.2736722156405449,
	"epoch": 0.036,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.7002448094598603,
	"learning_rate": 8.399999999999999e-07,
	"loss": -0.0,
	"num_tokens": 217177.0,
	"reward": 1.8406250476837158,
	"reward_std": 0.09107423573732376,
	"rewards/accuracy_reward_func/mean": 0.840624988079071,
	"rewards/accuracy_reward_func/std": 0.3231492042541504,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 9,
	"step_time": 9.146976439282298
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 134.0,
	"completions/max_terminated_length": 134.0,
	"completions/mean_length": 88.40625,
	"completions/mean_terminated_length": 88.40625,
	"completions/min_length": 59.0,
	"completions/min_terminated_length": 59.0,
	"entropy": 0.2195826843380928,
	"epoch": 0.04,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.4544179080346874,
	"learning_rate": 8.199999999999999e-07,
	"loss": -0.0,
	"num_tokens": 241142.0,
	"reward": 1.9390909671783447,
	"reward_std": 0.014433760195970535,
	"rewards/accuracy_reward_func/mean": 0.9390908479690552,
	"rewards/accuracy_reward_func/std": 0.0804174616932869,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 10,
	"step_time": 7.882300075143576
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 111.0,
	"completions/max_terminated_length": 111.0,
	"completions/mean_length": 87.5625,
	"completions/mean_terminated_length": 87.5625,
	"completions/min_length": 59.0,
	"completions/min_terminated_length": 59.0,
	"entropy": 0.19999410584568977,
	"epoch": 0.044,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.31705242310143567,
	"learning_rate": 8e-07,
	"loss": 0.0,
	"num_tokens": 268372.0,
	"reward": 1.909999966621399,
	"reward_std": 0.0329379141330719,
	"rewards/accuracy_reward_func/mean": 0.9099999666213989,
	"rewards/accuracy_reward_func/std": 0.1774914711713791,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 11,
	"step_time": 7.217764110304415
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 192.0,
	"completions/max_terminated_length": 192.0,
	"completions/mean_length": 103.25,
	"completions/mean_terminated_length": 103.25,
	"completions/min_length": 60.0,
	"completions/min_terminated_length": 60.0,
	"entropy": 0.2381710633635521,
	"epoch": 0.048,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.29876243636744126,
	"learning_rate": 7.799999999999999e-07,
	"loss": -0.0,
	"num_tokens": 286592.0,
	"reward": 1.8604166507720947,
	"reward_std": 0.11249998956918716,
	"rewards/accuracy_reward_func/mean": 0.8604166507720947,
	"rewards/accuracy_reward_func/std": 0.31079012155532837,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 12,
	"step_time": 9.55866174865514
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 168.0,
	"completions/max_terminated_length": 168.0,
	"completions/mean_length": 87.6875,
	"completions/mean_terminated_length": 87.6875,
	"completions/min_length": 53.0,
	"completions/min_terminated_length": 53.0,
	"entropy": 0.21462798118591309,
	"epoch": 0.052,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.7167465527886859,
	"learning_rate": 7.599999999999999e-07,
	"loss": -0.0,
	"num_tokens": 306702.0,
	"reward": 1.9366666078567505,
	"reward_std": 0.02041665092110634,
	"rewards/accuracy_reward_func/mean": 0.9366666674613953,
	"rewards/accuracy_reward_func/std": 0.11999402940273285,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 13,
	"step_time": 8.825894831679761
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 168.0,
	"completions/max_terminated_length": 168.0,
	"completions/mean_length": 119.375,
	"completions/mean_terminated_length": 119.375,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"entropy": 0.2256241999566555,
	"epoch": 0.056,
	"frac_reward_zero_std": 0.375,
	"grad_norm": 0.5531863595246166,
	"learning_rate": 7.4e-07,
	"loss": 0.0,
	"num_tokens": 333758.0,
	"reward": 1.8471875190734863,
	"reward_std": 0.14163094758987427,
	"rewards/accuracy_reward_func/mean": 0.8471875190734863,
	"rewards/accuracy_reward_func/std": 0.26768702268600464,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 14,
	"step_time": 8.871076120994985
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 151.0,
	"completions/max_terminated_length": 151.0,
	"completions/mean_length": 99.125,
	"completions/mean_terminated_length": 99.125,
	"completions/min_length": 64.0,
	"completions/min_terminated_length": 64.0,
	"entropy": 0.21050135791301727,
	"epoch": 0.06,
	"frac_reward_zero_std": 0.375,
	"grad_norm": 0.5955394753926603,
	"learning_rate": 7.2e-07,
	"loss": -0.0,
	"num_tokens": 363766.0,
	"reward": 1.8220758438110352,
	"reward_std": 0.18934205174446106,
	"rewards/accuracy_reward_func/mean": 0.8533259034156799,
	"rewards/accuracy_reward_func/std": 0.2272060364484787,
	"rewards/format_reward_func/mean": 0.96875,
	"rewards/format_reward_func/std": 0.1767766922712326,
	"step": 15,
	"step_time": 8.423650750890374
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 158.0,
	"completions/max_terminated_length": 158.0,
	"completions/mean_length": 94.875,
	"completions/mean_terminated_length": 94.875,
	"completions/min_length": 75.0,
	"completions/min_terminated_length": 75.0,
	"entropy": 0.23235702514648438,
	"epoch": 0.064,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.31149648184979223,
	"learning_rate": 7e-07,
	"loss": -0.0,
	"num_tokens": 383022.0,
	"reward": 1.8937499523162842,
	"reward_std": 0.11737333238124847,
	"rewards/accuracy_reward_func/mean": 0.9156249761581421,
	"rewards/accuracy_reward_func/std": 0.19610625505447388,
	"rewards/format_reward_func/mean": 0.9781249761581421,
	"rewards/format_reward_func/std": 0.1237436830997467,
	"step": 16,
	"step_time": 8.814181880094111
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 216.0,
	"completions/max_terminated_length": 216.0,
	"completions/mean_length": 116.03125,
	"completions/mean_terminated_length": 116.03125,
	"completions/min_length": 77.0,
	"completions/min_terminated_length": 77.0,
	"entropy": 0.21332164481282234,
	"epoch": 0.068,
	"frac_reward_zero_std": 0.375,
	"grad_norm": 0.6314309586182766,
	"learning_rate": 6.800000000000001e-07,
	"loss": -0.0,
	"num_tokens": 409507.0,
	"reward": 1.7729910612106323,
	"reward_std": 0.08083245158195496,
	"rewards/accuracy_reward_func/mean": 0.7729910612106323,
	"rewards/accuracy_reward_func/std": 0.2013828307390213,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 17,
	"step_time": 10.377578075043857
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 194.0,
	"completions/max_terminated_length": 194.0,
	"completions/mean_length": 124.0,
	"completions/mean_terminated_length": 124.0,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"entropy": 0.3000790849328041,
	"epoch": 0.072,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.43916067550283777,
	"learning_rate": 6.6e-07,
	"loss": -0.0,
	"num_tokens": 429839.0,
	"reward": 1.7817708253860474,
	"reward_std": 0.0970831960439682,
	"rewards/accuracy_reward_func/mean": 0.7817708253860474,
	"rewards/accuracy_reward_func/std": 0.20934097468852997,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 18,
	"step_time": 9.632790027186275
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 180.0,
	"completions/max_terminated_length": 180.0,
	"completions/mean_length": 113.8125,
	"completions/mean_terminated_length": 113.8125,
	"completions/min_length": 69.0,
	"completions/min_terminated_length": 69.0,
	"entropy": 0.24520108476281166,
	"epoch": 0.076,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.8224032747868548,
	"learning_rate": 6.4e-07,
	"loss": -0.0,
	"num_tokens": 455401.0,
	"reward": 1.8831250667572021,
	"reward_std": 0.0729166716337204,
	"rewards/accuracy_reward_func/mean": 0.8831250071525574,
	"rewards/accuracy_reward_func/std": 0.1986341029405594,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 19,
	"step_time": 9.248267728835344
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 161.0,
	"completions/max_terminated_length": 161.0,
	"completions/mean_length": 94.28125,
	"completions/mean_terminated_length": 94.28125,
	"completions/min_length": 68.0,
	"completions/min_terminated_length": 68.0,
	"entropy": 0.2287127859890461,
	"epoch": 0.08,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.7435614457550869,
	"learning_rate": 6.2e-07,
	"loss": 0.0,
	"num_tokens": 482190.0,
	"reward": 1.8910417556762695,
	"reward_std": 0.08611349761486053,
	"rewards/accuracy_reward_func/mean": 0.8910416960716248,
	"rewards/accuracy_reward_func/std": 0.18090461194515228,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 20,
	"step_time": 8.669513036496937
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 137.0,
	"completions/max_terminated_length": 137.0,
	"completions/mean_length": 98.0,
	"completions/mean_terminated_length": 98.0,
	"completions/min_length": 71.0,
	"completions/min_terminated_length": 71.0,
	"entropy": 0.23622526600956917,
	"epoch": 0.084,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.3037737034258383,
	"learning_rate": 6e-07,
	"loss": 0.0,
	"num_tokens": 504026.0,
	"reward": 1.7918750047683716,
	"reward_std": 0.10315428674221039,
	"rewards/accuracy_reward_func/mean": 0.7918750047683716,
	"rewards/accuracy_reward_func/std": 0.3606663942337036,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 21,
	"step_time": 8.023445818573236
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 133.0,
	"completions/max_terminated_length": 133.0,
	"completions/mean_length": 94.4375,
	"completions/mean_terminated_length": 94.4375,
	"completions/min_length": 67.0,
	"completions/min_terminated_length": 67.0,
	"entropy": 0.21428008005023003,
	"epoch": 0.088,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.29203554195255926,
	"learning_rate": 5.8e-07,
	"loss": 0.0,
	"num_tokens": 530080.0,
	"reward": 1.9406249523162842,
	"reward_std": 0.045683760195970535,
	"rewards/accuracy_reward_func/mean": 0.940625011920929,
	"rewards/accuracy_reward_func/std": 0.10506334155797958,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 22,
	"step_time": 7.778694893233478
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 232.0,
	"completions/max_terminated_length": 232.0,
	"completions/mean_length": 110.40625,
	"completions/mean_terminated_length": 110.40625,
	"completions/min_length": 68.0,
	"completions/min_terminated_length": 68.0,
	"entropy": 0.23138786852359772,
	"epoch": 0.092,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.19686742995266426,
	"learning_rate": 5.6e-07,
	"loss": -0.0,
	"num_tokens": 558297.0,
	"reward": 1.8413751125335693,
	"reward_std": 0.0037499964237213135,
	"rewards/accuracy_reward_func/mean": 0.8413749933242798,
	"rewards/accuracy_reward_func/std": 0.21956580877304077,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 23,
	"step_time": 10.919259454123676
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 148.0,
	"completions/max_terminated_length": 148.0,
	"completions/mean_length": 102.59375,
	"completions/mean_terminated_length": 102.59375,
	"completions/min_length": 66.0,
	"completions/min_terminated_length": 66.0,
	"entropy": 0.18613235652446747,
	"epoch": 0.096,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.20640152621893892,
	"learning_rate": 5.4e-07,
	"loss": -0.0,
	"num_tokens": 584700.0,
	"reward": 1.8937499523162842,
	"reward_std": 0.020833328366279602,
	"rewards/accuracy_reward_func/mean": 0.893750011920929,
	"rewards/accuracy_reward_func/std": 0.1515599936246872,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 24,
	"step_time": 8.58028247859329
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 125.0,
	"completions/max_terminated_length": 125.0,
	"completions/mean_length": 97.625,
	"completions/mean_terminated_length": 97.625,
	"completions/min_length": 65.0,
	"completions/min_terminated_length": 65.0,
	"entropy": 0.1680564060807228,
	"epoch": 0.1,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.22607092328648917,
	"learning_rate": 5.2e-07,
	"loss": -0.0,
	"num_tokens": 605736.0,
	"reward": 1.9614583253860474,
	"reward_std": 0.04327813535928726,
	"rewards/accuracy_reward_func/mean": 0.9614583253860474,
	"rewards/accuracy_reward_func/std": 0.09804884344339371,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 25,
	"step_time": 7.494132779538631
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 218.0,
	"completions/max_terminated_length": 218.0,
	"completions/mean_length": 110.84375,
	"completions/mean_terminated_length": 110.84375,
	"completions/min_length": 75.0,
	"completions/min_terminated_length": 75.0,
	"entropy": 0.2523631304502487,
	"epoch": 0.104,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.6963240544036335,
	"learning_rate": 5e-07,
	"loss": -0.0,
	"num_tokens": 629803.0,
	"reward": 1.722395896911621,
	"reward_std": 0.03437499701976776,
	"rewards/accuracy_reward_func/mean": 0.7223958373069763,
	"rewards/accuracy_reward_func/std": 0.2913203537464142,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 26,
	"step_time": 10.387551098130643
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 206.0,
	"completions/max_terminated_length": 206.0,
	"completions/mean_length": 101.65625,
	"completions/mean_terminated_length": 101.65625,
	"completions/min_length": 66.0,
	"completions/min_terminated_length": 66.0,
	"entropy": 0.28546470403671265,
	"epoch": 0.108,
	"frac_reward_zero_std": 0.25,
	"grad_norm": 0.8320895685645606,
	"learning_rate": 4.8e-07,
	"loss": -0.0,
	"num_tokens": 655600.0,
	"reward": 1.6748958826065063,
	"reward_std": 0.21159148216247559,
	"rewards/accuracy_reward_func/mean": 0.7405208349227905,
	"rewards/accuracy_reward_func/std": 0.3287121653556824,
	"rewards/format_reward_func/mean": 0.934374988079071,
	"rewards/format_reward_func/std": 0.20730119943618774,
	"step": 27,
	"step_time": 9.990737781859934
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 321.0,
	"completions/max_terminated_length": 321.0,
	"completions/mean_length": 121.78125,
	"completions/mean_terminated_length": 121.78125,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"entropy": 0.20365699753165245,
	"epoch": 0.112,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.3040693070538053,
	"learning_rate": 4.6e-07,
	"loss": -0.0,
	"num_tokens": 681285.0,
	"reward": 1.8820312023162842,
	"reward_std": 0.04736516997218132,
	"rewards/accuracy_reward_func/mean": 0.882031261920929,
	"rewards/accuracy_reward_func/std": 0.16079869866371155,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 28,
	"step_time": 13.582923103123903
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 175.0,
	"completions/max_terminated_length": 175.0,
	"completions/mean_length": 114.3125,
	"completions/mean_terminated_length": 114.3125,
	"completions/min_length": 65.0,
	"completions/min_terminated_length": 65.0,
	"entropy": 0.22691119089722633,
	"epoch": 0.116,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.29754584523046024,
	"learning_rate": 4.3999999999999997e-07,
	"loss": 0.0,
	"num_tokens": 700215.0,
	"reward": 1.941562533378601,
	"reward_std": 0.03998880088329315,
	"rewards/accuracy_reward_func/mean": 0.9415625333786011,
	"rewards/accuracy_reward_func/std": 0.11359171569347382,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 29,
	"step_time": 9.000959642231464
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 144.0,
	"completions/max_terminated_length": 144.0,
	"completions/mean_length": 98.8125,
	"completions/mean_terminated_length": 98.8125,
	"completions/min_length": 76.0,
	"completions/min_terminated_length": 76.0,
	"entropy": 0.19718682020902634,
	"epoch": 0.12,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.28356832567704315,
	"learning_rate": 4.1999999999999995e-07,
	"loss": 0.0,
	"num_tokens": 719033.0,
	"reward": 1.8567261695861816,
	"reward_std": 0.03630475699901581,
	"rewards/accuracy_reward_func/mean": 0.8567261695861816,
	"rewards/accuracy_reward_func/std": 0.244332417845726,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 30,
	"step_time": 8.117130983620882
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 300.0,
	"completions/max_terminated_length": 300.0,
	"completions/mean_length": 107.0625,
	"completions/mean_terminated_length": 107.0625,
	"completions/min_length": 67.0,
	"completions/min_terminated_length": 67.0,
	"entropy": 0.23164699599146843,
	"epoch": 0.124,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.3147460209715464,
	"learning_rate": 4e-07,
	"loss": 0.0,
	"num_tokens": 742811.0,
	"reward": 1.8263542652130127,
	"reward_std": 0.07890324294567108,
	"rewards/accuracy_reward_func/mean": 0.8263541460037231,
	"rewards/accuracy_reward_func/std": 0.2404511272907257,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 31,
	"step_time": 13.022676510736346
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 145.0,
	"completions/max_terminated_length": 145.0,
	"completions/mean_length": 104.28125,
	"completions/mean_terminated_length": 104.28125,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"entropy": 0.2124277576804161,
	"epoch": 0.128,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.14656459241245287,
	"learning_rate": 3.7999999999999996e-07,
	"loss": 0.0,
	"num_tokens": 767276.0,
	"reward": 1.933750033378601,
	"reward_std": 0.004330122843384743,
	"rewards/accuracy_reward_func/mean": 0.9337500333786011,
	"rewards/accuracy_reward_func/std": 0.16721147298812866,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 32,
	"step_time": 8.787895078770816
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 163.0,
	"completions/max_terminated_length": 163.0,
	"completions/mean_length": 99.6875,
	"completions/mean_terminated_length": 99.6875,
	"completions/min_length": 66.0,
	"completions/min_terminated_length": 66.0,
	"entropy": 0.18238281086087227,
	"epoch": 0.132,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.17864514961842184,
	"learning_rate": 3.6e-07,
	"loss": 0.0,
	"num_tokens": 795462.0,
	"reward": 1.957291603088379,
	"reward_std": 0.0024056239053606987,
	"rewards/accuracy_reward_func/mean": 0.9572916626930237,
	"rewards/accuracy_reward_func/std": 0.0789080411195755,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 33,
	"step_time": 8.68917733244598
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 162.0,
	"completions/max_terminated_length": 162.0,
	"completions/mean_length": 105.21875,
	"completions/mean_terminated_length": 105.21875,
	"completions/min_length": 75.0,
	"completions/min_terminated_length": 75.0,
	"entropy": 0.20627178624272346,
	"epoch": 0.136,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.3530815395243793,
	"learning_rate": 3.4000000000000003e-07,
	"loss": 0.0,
	"num_tokens": 820193.0,
	"reward": 1.7807291746139526,
	"reward_std": 0.06852563470602036,
	"rewards/accuracy_reward_func/mean": 0.7807291746139526,
	"rewards/accuracy_reward_func/std": 0.34384265542030334,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 34,
	"step_time": 8.745650510303676
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 165.0,
	"completions/max_terminated_length": 165.0,
	"completions/mean_length": 112.15625,
	"completions/mean_terminated_length": 112.15625,
	"completions/min_length": 81.0,
	"completions/min_terminated_length": 81.0,
	"entropy": 0.260306891053915,
	"epoch": 0.14,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.5856316529561636,
	"learning_rate": 3.2e-07,
	"loss": -0.0,
	"num_tokens": 844158.0,
	"reward": 1.7372127771377563,
	"reward_std": 0.0817936509847641,
	"rewards/accuracy_reward_func/mean": 0.7372127771377563,
	"rewards/accuracy_reward_func/std": 0.2763761281967163,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 35,
	"step_time": 8.764568363316357
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 246.0,
	"completions/max_terminated_length": 246.0,
	"completions/mean_length": 120.65625,
	"completions/mean_terminated_length": 120.65625,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"entropy": 0.2644369825720787,
	"epoch": 0.144,
	"frac_reward_zero_std": 0.375,
	"grad_norm": 0.3867881480590169,
	"learning_rate": 3e-07,
	"loss": -0.0,
	"num_tokens": 870035.0,
	"reward": 1.723668098449707,
	"reward_std": 0.1543687880039215,
	"rewards/accuracy_reward_func/mean": 0.7455431222915649,
	"rewards/accuracy_reward_func/std": 0.3101375699043274,
	"rewards/format_reward_func/mean": 0.9781249761581421,
	"rewards/format_reward_func/std": 0.1237436830997467,
	"step": 36,
	"step_time": 11.264538847841322
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 162.0,
	"completions/max_terminated_length": 162.0,
	"completions/mean_length": 100.28125,
	"completions/mean_terminated_length": 100.28125,
	"completions/min_length": 72.0,
	"completions/min_terminated_length": 72.0,
	"entropy": 0.23801737278699875,
	"epoch": 0.148,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.4666251511091792,
	"learning_rate": 2.8e-07,
	"loss": -0.0,
	"num_tokens": 893872.0,
	"reward": 1.8420684337615967,
	"reward_std": 0.1202840656042099,
	"rewards/accuracy_reward_func/mean": 0.8639434576034546,
	"rewards/accuracy_reward_func/std": 0.228593647480011,
	"rewards/format_reward_func/mean": 0.9781249761581421,
	"rewards/format_reward_func/std": 0.1237436830997467,
	"step": 37,
	"step_time": 8.666291879490018
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 198.0,
	"completions/max_terminated_length": 198.0,
	"completions/mean_length": 110.9375,
	"completions/mean_terminated_length": 110.9375,
	"completions/min_length": 73.0,
	"completions/min_terminated_length": 73.0,
	"entropy": 0.23708894476294518,
	"epoch": 0.152,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.3003587942152678,
	"learning_rate": 2.6e-07,
	"loss": 0.0,
	"num_tokens": 919606.0,
	"reward": 1.9385044574737549,
	"reward_std": 0.02471514418721199,
	"rewards/accuracy_reward_func/mean": 0.9385044574737549,
	"rewards/accuracy_reward_func/std": 0.0629124566912651,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 38,
	"step_time": 9.764362094923854
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 146.0,
	"completions/max_terminated_length": 146.0,
	"completions/mean_length": 96.75,
	"completions/mean_terminated_length": 96.75,
	"completions/min_length": 68.0,
	"completions/min_terminated_length": 68.0,
	"entropy": 0.23468047007918358,
	"epoch": 0.156,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.37048827303687887,
	"learning_rate": 2.4e-07,
	"loss": -0.0,
	"num_tokens": 945630.0,
	"reward": 1.731874942779541,
	"reward_std": 0.1671428233385086,
	"rewards/accuracy_reward_func/mean": 0.7756249904632568,
	"rewards/accuracy_reward_func/std": 0.3532584309577942,
	"rewards/format_reward_func/mean": 0.956250011920929,
	"rewards/format_reward_func/std": 0.1721542775630951,
	"step": 39,
	"step_time": 9.183467078953981
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 173.0,
	"completions/max_terminated_length": 173.0,
	"completions/mean_length": 109.0,
	"completions/mean_terminated_length": 109.0,
	"completions/min_length": 75.0,
	"completions/min_terminated_length": 75.0,
	"entropy": 0.23336144164204597,
	"epoch": 0.16,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.287196945879942,
	"learning_rate": 2.1999999999999998e-07,
	"loss": 0.0,
	"num_tokens": 967418.0,
	"reward": 1.9270832538604736,
	"reward_std": 0.012028136290609837,
	"rewards/accuracy_reward_func/mean": 0.9270833134651184,
	"rewards/accuracy_reward_func/std": 0.1690024584531784,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 40,
	"step_time": 9.006164254620671
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 143.0,
	"completions/max_terminated_length": 143.0,
	"completions/mean_length": 99.28125,
	"completions/mean_terminated_length": 99.28125,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"entropy": 0.22068660333752632,
	"epoch": 0.164,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.26699909188358745,
	"learning_rate": 2e-07,
	"loss": 0.0,
	"num_tokens": 994215.0,
	"reward": 1.9018750190734863,
	"reward_std": 0.06372595578432083,
	"rewards/accuracy_reward_func/mean": 0.9018750190734863,
	"rewards/accuracy_reward_func/std": 0.15860149264335632,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 41,
	"step_time": 8.13273252826184
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 149.0,
	"completions/max_terminated_length": 149.0,
	"completions/mean_length": 99.625,
	"completions/mean_terminated_length": 99.625,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"entropy": 0.23406217247247696,
	"epoch": 0.168,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.3533172869156224,
	"learning_rate": 1.8e-07,
	"loss": -0.0,
	"num_tokens": 1019951.0,
	"reward": 1.9318749904632568,
	"reward_std": 0.04643829166889191,
	"rewards/accuracy_reward_func/mean": 0.9318749904632568,
	"rewards/accuracy_reward_func/std": 0.10333744436502457,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 42,
	"step_time": 8.258449734188616
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 118.0,
	"completions/max_terminated_length": 118.0,
	"completions/mean_length": 92.75,
	"completions/mean_terminated_length": 92.75,
	"completions/min_length": 75.0,
	"completions/min_terminated_length": 75.0,
	"entropy": 0.22229528427124023,
	"epoch": 0.172,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.25935443501058275,
	"learning_rate": 1.6e-07,
	"loss": 0.0,
	"num_tokens": 1047835.0,
	"reward": 1.9731919765472412,
	"reward_std": 0.022991076111793518,
	"rewards/accuracy_reward_func/mean": 0.9731919765472412,
	"rewards/accuracy_reward_func/std": 0.05601184815168381,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 43,
	"step_time": 7.3312763730064034
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 132.0,
	"completions/max_terminated_length": 132.0,
	"completions/mean_length": 98.15625,
	"completions/mean_terminated_length": 98.15625,
	"completions/min_length": 70.0,
	"completions/min_terminated_length": 70.0,
	"entropy": 0.2684129625558853,
	"epoch": 0.176,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.5054404086636916,
	"learning_rate": 1.4e-07,
	"loss": -0.0,
	"num_tokens": 1070652.0,
	"reward": 1.816145896911621,
	"reward_std": 0.046046242117881775,
	"rewards/accuracy_reward_func/mean": 0.8161457777023315,
	"rewards/accuracy_reward_func/std": 0.21198345720767975,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 44,
	"step_time": 8.090661917813122
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 189.0,
	"completions/max_terminated_length": 189.0,
	"completions/mean_length": 119.96875,
	"completions/mean_terminated_length": 119.96875,
	"completions/min_length": 63.0,
	"completions/min_terminated_length": 63.0,
	"entropy": 0.20931534841656685,
	"epoch": 0.18,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.4438102774930686,
	"learning_rate": 1.2e-07,
	"loss": 0.0,
	"num_tokens": 1094159.0,
	"reward": 1.8079866170883179,
	"reward_std": 0.07453451305627823,
	"rewards/accuracy_reward_func/mean": 0.8079866170883179,
	"rewards/accuracy_reward_func/std": 0.1761476993560791,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 45,
	"step_time": 9.527460671961308
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 215.0,
	"completions/max_terminated_length": 215.0,
	"completions/mean_length": 107.15625,
	"completions/mean_terminated_length": 107.15625,
	"completions/min_length": 75.0,
	"completions/min_terminated_length": 75.0,
	"entropy": 0.25717881694436073,
	"epoch": 0.184,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.21670389917367197,
	"learning_rate": 1e-07,
	"loss": 0.0,
	"num_tokens": 1122608.0,
	"reward": 1.808333396911621,
	"reward_std": 0.03125,
	"rewards/accuracy_reward_func/mean": 0.8083333373069763,
	"rewards/accuracy_reward_func/std": 0.21151866018772125,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 46,
	"step_time": 10.36410805862397
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 126.0,
	"completions/max_terminated_length": 126.0,
	"completions/mean_length": 90.375,
	"completions/mean_terminated_length": 90.375,
	"completions/min_length": 62.0,
	"completions/min_terminated_length": 62.0,
	"entropy": 0.21086286380887032,
	"epoch": 0.188,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.4243741948522665,
	"learning_rate": 8e-08,
	"loss": 0.0,
	"num_tokens": 1142524.0,
	"reward": 1.8813542127609253,
	"reward_std": 0.03895833343267441,
	"rewards/accuracy_reward_func/mean": 0.8813541531562805,
	"rewards/accuracy_reward_func/std": 0.13226144015789032,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 47,
	"step_time": 8.480774418450892
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 151.0,
	"completions/max_terminated_length": 151.0,
	"completions/mean_length": 98.96875,
	"completions/mean_terminated_length": 98.96875,
	"completions/min_length": 64.0,
	"completions/min_terminated_length": 64.0,
	"entropy": 0.25533241406083107,
	"epoch": 0.192,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.363734391468509,
	"learning_rate": 6e-08,
	"loss": 0.0,
	"num_tokens": 1167279.0,
	"reward": 1.8920758962631226,
	"reward_std": 0.043149448931217194,
	"rewards/accuracy_reward_func/mean": 0.8920758962631226,
	"rewards/accuracy_reward_func/std": 0.18521229922771454,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 48,
	"step_time": 8.397036101669073
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 158.0,
	"completions/max_terminated_length": 158.0,
	"completions/mean_length": 93.71875,
	"completions/mean_terminated_length": 93.71875,
	"completions/min_length": 55.0,
	"completions/min_terminated_length": 55.0,
	"entropy": 0.2753983736038208,
	"epoch": 0.196,
	"frac_reward_zero_std": 0.625,
	"grad_norm": 0.37001348543936974,
	"learning_rate": 4e-08,
	"loss": 0.0,
	"num_tokens": 1189970.0,
	"reward": 1.7348958253860474,
	"reward_std": 0.0466608926653862,
	"rewards/accuracy_reward_func/mean": 0.7348958253860474,
	"rewards/accuracy_reward_func/std": 0.23915956914424896,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 49,
	"step_time": 9.01807147078216
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 685.0,
	"completions/max_terminated_length": 685.0,
	"completions/mean_length": 130.3125,
	"completions/mean_terminated_length": 130.3125,
	"completions/min_length": 65.0,
	"completions/min_terminated_length": 65.0,
	"entropy": 0.271317683160305,
	"epoch": 0.2,
	"frac_reward_zero_std": 0.5,
	"grad_norm": 0.3729527358522691,
	"learning_rate": 2e-08,
	"loss": 0.0,
	"num_tokens": 1214600.0,
	"reward": 1.7553727626800537,
	"reward_std": 0.13293951749801636,
	"rewards/accuracy_reward_func/mean": 0.7553727626800537,
	"rewards/accuracy_reward_func/std": 0.3456610441207886,
	"rewards/format_reward_func/mean": 1.0,
	"rewards/format_reward_func/std": 0.0,
	"step": 50,
	"step_time": 25.12439160142094
	}
	],
	"logging_steps": 1.0,
	"max_steps": 50,
	"num_input_tokens_seen": 1214600,
	"num_train_epochs": 1,
	"save_steps": 500,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 2,
	"trial_name": null,
	"trial_params": null
	}