ToolPRM-GRPO-synthesis / trainer_state.json

Upload folder using huggingface_hub

a2cdc79 verified 4 months ago

91.2 kB

	{
	"best_global_step": null,
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 3.0,
	"eval_steps": 500,
	"global_step": 921,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.021875,
	"completions/max_length": 2397.7,
	"completions/max_terminated_length": 2153.3,
	"completions/mean_length": 724.20625,
	"completions/mean_terminated_length": 655.9658874511719,
	"completions/min_length": 158.7,
	"completions/min_terminated_length": 158.7,
	"entropy": 0.170973788946867,
	"epoch": 0.03257328990228013,
	"frac_reward_zero_std": 0.725,
	"grad_norm": 1.03125,
	"learning_rate": 9.90228013029316e-06,
	"loss": 0.0209,
	"num_tokens": 901722.0,
	"reward": 0.671875,
	"reward_std": 0.12951098531484603,
	"rewards/qwen_accuracy_reward/mean": 0.671875,
	"rewards/qwen_accuracy_reward/std": 0.32610869109630586,
	"step": 10,
	"step_time": 75.02407562928275
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 2212.1,
	"completions/max_terminated_length": 2131.1,
	"completions/mean_length": 648.159375,
	"completions/mean_terminated_length": 619.5659057617188,
	"completions/min_length": 142.9,
	"completions/min_terminated_length": 142.9,
	"entropy": 0.16581312268972398,
	"epoch": 0.06514657980456026,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.0,
	"learning_rate": 9.79370249728556e-06,
	"loss": 0.0216,
	"num_tokens": 1827701.0,
	"reward": 0.834375,
	"reward_std": 0.09659009724855423,
	"rewards/qwen_accuracy_reward/mean": 0.834375,
	"rewards/qwen_accuracy_reward/std": 0.24483564049005507,
	"step": 20,
	"step_time": 66.24043246284127
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 2206.0,
	"completions/max_terminated_length": 2090.8,
	"completions/mean_length": 623.284375,
	"completions/mean_terminated_length": 558.5000427246093,
	"completions/min_length": 171.1,
	"completions/min_terminated_length": 171.1,
	"entropy": 0.1667719691991806,
	"epoch": 0.09771986970684039,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 0.7890625,
	"learning_rate": 9.68512486427796e-06,
	"loss": 0.0347,
	"num_tokens": 2654656.0,
	"reward": 0.83125,
	"reward_std": 0.09974638372659683,
	"rewards/qwen_accuracy_reward/mean": 0.83125,
	"rewards/qwen_accuracy_reward/std": 0.31891718655824663,
	"step": 30,
	"step_time": 67.77192380828782
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1337.0,
	"completions/max_terminated_length": 1337.0,
	"completions/mean_length": 469.159375,
	"completions/mean_terminated_length": 469.159375,
	"completions/min_length": 165.4,
	"completions/min_terminated_length": 165.4,
	"entropy": 0.15202879384160042,
	"epoch": 0.13029315960912052,
	"frac_reward_zero_std": 0.8,
	"grad_norm": 1.875,
	"learning_rate": 9.576547231270358e-06,
	"loss": 0.0142,
	"num_tokens": 3441667.0,
	"reward": 0.8125,
	"reward_std": 0.09931695759296418,
	"rewards/qwen_accuracy_reward/mean": 0.8125,
	"rewards/qwen_accuracy_reward/std": 0.3193816542625427,
	"step": 40,
	"step_time": 41.496644421108066
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 1534.5,
	"completions/max_terminated_length": 1359.1,
	"completions/mean_length": 505.4125,
	"completions/mean_terminated_length": 475.48418579101565,
	"completions/min_length": 166.3,
	"completions/min_terminated_length": 166.3,
	"entropy": 0.1465001180768013,
	"epoch": 0.16286644951140064,
	"frac_reward_zero_std": 0.825,
	"grad_norm": 1.1640625,
	"learning_rate": 9.467969598262759e-06,
	"loss": 0.0048,
	"num_tokens": 4235663.0,
	"reward": 0.875,
	"reward_std": 0.07596379667520523,
	"rewards/qwen_accuracy_reward/mean": 0.875,
	"rewards/qwen_accuracy_reward/std": 0.20005422383546828,
	"step": 50,
	"step_time": 47.53361711697653
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1282.1,
	"completions/max_terminated_length": 1282.1,
	"completions/mean_length": 475.225,
	"completions/mean_terminated_length": 475.225,
	"completions/min_length": 189.6,
	"completions/min_terminated_length": 189.6,
	"entropy": 0.15337296426296235,
	"epoch": 0.19543973941368079,
	"frac_reward_zero_std": 0.75,
	"grad_norm": 1.640625,
	"learning_rate": 9.359391965255158e-06,
	"loss": 0.0229,
	"num_tokens": 4959391.0,
	"reward": 0.85,
	"reward_std": 0.11794019415974617,
	"rewards/qwen_accuracy_reward/mean": 0.85,
	"rewards/qwen_accuracy_reward/std": 0.25456976890563965,
	"step": 60,
	"step_time": 36.53024397492409
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1927.4,
	"completions/max_terminated_length": 1895.7,
	"completions/mean_length": 612.075,
	"completions/mean_terminated_length": 590.4368774414063,
	"completions/min_length": 192.2,
	"completions/min_terminated_length": 192.2,
	"entropy": 0.16074557453393937,
	"epoch": 0.2280130293159609,
	"frac_reward_zero_std": 0.825,
	"grad_norm": 1.4453125,
	"learning_rate": 9.250814332247557e-06,
	"loss": 0.0266,
	"num_tokens": 5749223.0,
	"reward": 0.803125,
	"reward_std": 0.08783914744853974,
	"rewards/qwen_accuracy_reward/mean": 0.803125,
	"rewards/qwen_accuracy_reward/std": 0.2966747134923935,
	"step": 70,
	"step_time": 57.99369401996955
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 2189.3,
	"completions/max_terminated_length": 2009.3,
	"completions/mean_length": 690.7125,
	"completions/mean_terminated_length": 586.1856994628906,
	"completions/min_length": 188.9,
	"completions/min_terminated_length": 188.9,
	"entropy": 0.15213419646024703,
	"epoch": 0.26058631921824105,
	"frac_reward_zero_std": 0.8,
	"grad_norm": 0.0,
	"learning_rate": 9.142236699239957e-06,
	"loss": 0.0282,
	"num_tokens": 6565907.0,
	"reward": 0.878125,
	"reward_std": 0.08617057129740716,
	"rewards/qwen_accuracy_reward/mean": 0.878125,
	"rewards/qwen_accuracy_reward/std": 0.27245663553476335,
	"step": 80,
	"step_time": 65.59074299260973
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1602.7,
	"completions/max_terminated_length": 1467.6,
	"completions/mean_length": 467.484375,
	"completions/mean_terminated_length": 456.31884765625,
	"completions/min_length": 179.3,
	"completions/min_terminated_length": 179.3,
	"entropy": 0.1368262179195881,
	"epoch": 0.2931596091205212,
	"frac_reward_zero_std": 0.85,
	"grad_norm": 0.0,
	"learning_rate": 9.033659066232356e-06,
	"loss": 0.004,
	"num_tokens": 7334102.0,
	"reward": 0.88125,
	"reward_std": 0.06123279631137848,
	"rewards/qwen_accuracy_reward/mean": 0.88125,
	"rewards/qwen_accuracy_reward/std": 0.25089033097028735,
	"step": 90,
	"step_time": 48.28751948485151
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 2426.9,
	"completions/max_terminated_length": 2137.6,
	"completions/mean_length": 604.2625,
	"completions/mean_terminated_length": 548.6393463134766,
	"completions/min_length": 179.7,
	"completions/min_terminated_length": 179.7,
	"entropy": 0.15090147852897645,
	"epoch": 0.3257328990228013,
	"frac_reward_zero_std": 0.825,
	"grad_norm": 1.625,
	"learning_rate": 8.925081433224755e-06,
	"loss": 0.0856,
	"num_tokens": 8127226.0,
	"reward": 0.84375,
	"reward_std": 0.07459585815668106,
	"rewards/qwen_accuracy_reward/mean": 0.84375,
	"rewards/qwen_accuracy_reward/std": 0.29288421422243116,
	"step": 100,
	"step_time": 70.59187124017626
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 2326.5,
	"completions/max_terminated_length": 2284.1,
	"completions/mean_length": 609.134375,
	"completions/mean_terminated_length": 541.8694732666015,
	"completions/min_length": 179.7,
	"completions/min_terminated_length": 179.7,
	"entropy": 0.17526374608278275,
	"epoch": 0.3583061889250814,
	"frac_reward_zero_std": 0.775,
	"grad_norm": 1.2109375,
	"learning_rate": 8.816503800217156e-06,
	"loss": -0.0395,
	"num_tokens": 8923405.0,
	"reward": 0.84375,
	"reward_std": 0.10457713454961777,
	"rewards/qwen_accuracy_reward/mean": 0.84375,
	"rewards/qwen_accuracy_reward/std": 0.2856591001152992,
	"step": 110,
	"step_time": 64.31981013910845
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.03125,
	"completions/max_length": 2472.2,
	"completions/max_terminated_length": 2434.0,
	"completions/mean_length": 614.309375,
	"completions/mean_terminated_length": 508.3423645019531,
	"completions/min_length": 206.9,
	"completions/min_terminated_length": 206.9,
	"entropy": 0.14617881700396537,
	"epoch": 0.39087947882736157,
	"frac_reward_zero_std": 0.85,
	"grad_norm": 1.34375,
	"learning_rate": 8.707926167209557e-06,
	"loss": -0.0058,
	"num_tokens": 9650464.0,
	"reward": 0.934375,
	"reward_std": 0.06943454667925834,
	"rewards/qwen_accuracy_reward/mean": 0.934375,
	"rewards/qwen_accuracy_reward/std": 0.1803007885813713,
	"step": 120,
	"step_time": 71.93199644116685
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 1687.5,
	"completions/max_terminated_length": 1578.9,
	"completions/mean_length": 506.765625,
	"completions/mean_terminated_length": 475.7398681640625,
	"completions/min_length": 197.5,
	"completions/min_terminated_length": 197.5,
	"entropy": 0.14087174832820892,
	"epoch": 0.4234527687296417,
	"frac_reward_zero_std": 0.825,
	"grad_norm": 0.83984375,
	"learning_rate": 8.599348534201956e-06,
	"loss": 0.0126,
	"num_tokens": 10481205.0,
	"reward": 0.91875,
	"reward_std": 0.071863903850317,
	"rewards/qwen_accuracy_reward/mean": 0.91875,
	"rewards/qwen_accuracy_reward/std": 0.16812221705913544,
	"step": 130,
	"step_time": 51.10968422973529
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 2098.5,
	"completions/max_terminated_length": 1797.3,
	"completions/mean_length": 617.415625,
	"completions/mean_terminated_length": 556.195068359375,
	"completions/min_length": 178.4,
	"completions/min_terminated_length": 178.4,
	"entropy": 0.15323501601815223,
	"epoch": 0.4560260586319218,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 8.490770901194355e-06,
	"loss": 0.0577,
	"num_tokens": 11246506.0,
	"reward": 0.884375,
	"reward_std": 0.06102004498243332,
	"rewards/qwen_accuracy_reward/mean": 0.884375,
	"rewards/qwen_accuracy_reward/std": 0.18740518838167192,
	"step": 140,
	"step_time": 61.85610852092505
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 2084.5,
	"completions/max_terminated_length": 1994.2,
	"completions/mean_length": 541.646875,
	"completions/mean_terminated_length": 509.9564208984375,
	"completions/min_length": 190.2,
	"completions/min_terminated_length": 190.2,
	"entropy": 0.13484818413853644,
	"epoch": 0.48859934853420195,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 8.382193268186755e-06,
	"loss": 0.0238,
	"num_tokens": 12068721.0,
	"reward": 0.93125,
	"reward_std": 0.03335031494498253,
	"rewards/qwen_accuracy_reward/mean": 0.93125,
	"rewards/qwen_accuracy_reward/std": 0.147479148209095,
	"step": 150,
	"step_time": 60.58088333830237
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 2113.6,
	"completions/max_terminated_length": 1865.1,
	"completions/mean_length": 617.86875,
	"completions/mean_terminated_length": 564.8990783691406,
	"completions/min_length": 176.8,
	"completions/min_terminated_length": 176.8,
	"entropy": 0.14884034767746926,
	"epoch": 0.5211726384364821,
	"frac_reward_zero_std": 0.825,
	"grad_norm": 1.15625,
	"learning_rate": 8.273615635179154e-06,
	"loss": 0.0349,
	"num_tokens": 12860375.0,
	"reward": 0.90625,
	"reward_std": 0.07280554771423339,
	"rewards/qwen_accuracy_reward/mean": 0.90625,
	"rewards/qwen_accuracy_reward/std": 0.19390300512313843,
	"step": 160,
	"step_time": 61.31780819287523
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 2070.4,
	"completions/max_terminated_length": 1955.4,
	"completions/mean_length": 525.28125,
	"completions/mean_terminated_length": 470.9878173828125,
	"completions/min_length": 199.4,
	"completions/min_terminated_length": 199.4,
	"entropy": 0.1467311643064022,
	"epoch": 0.5537459283387622,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 8.165038002171553e-06,
	"loss": 0.0292,
	"num_tokens": 13584777.0,
	"reward": 0.884375,
	"reward_std": 0.05376190170645714,
	"rewards/qwen_accuracy_reward/mean": 0.884375,
	"rewards/qwen_accuracy_reward/std": 0.2119799315929413,
	"step": 170,
	"step_time": 61.066531581245364
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1569.3,
	"completions/max_terminated_length": 1385.7,
	"completions/mean_length": 477.046875,
	"completions/mean_terminated_length": 466.2573547363281,
	"completions/min_length": 198.5,
	"completions/min_terminated_length": 198.5,
	"entropy": 0.14774601608514787,
	"epoch": 0.5863192182410424,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 1.0546875,
	"learning_rate": 8.056460369163954e-06,
	"loss": 0.06,
	"num_tokens": 14346864.0,
	"reward": 0.9375,
	"reward_std": 0.02925042062997818,
	"rewards/qwen_accuracy_reward/mean": 0.9375,
	"rewards/qwen_accuracy_reward/std": 0.13194561302661895,
	"step": 180,
	"step_time": 47.719357285648584
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 2356.5,
	"completions/max_terminated_length": 1938.6,
	"completions/mean_length": 530.79375,
	"completions/mean_terminated_length": 508.41269836425784,
	"completions/min_length": 179.6,
	"completions/min_terminated_length": 179.6,
	"entropy": 0.15401604473590852,
	"epoch": 0.6188925081433225,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 7.947882736156353e-06,
	"loss": 0.073,
	"num_tokens": 15077166.0,
	"reward": 0.928125,
	"reward_std": 0.03808925524353981,
	"rewards/qwen_accuracy_reward/mean": 0.928125,
	"rewards/qwen_accuracy_reward/std": 0.16100659370422363,
	"step": 190,
	"step_time": 68.03254930684344
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 2111.2,
	"completions/max_terminated_length": 2089.6,
	"completions/mean_length": 532.41875,
	"completions/mean_terminated_length": 511.9100036621094,
	"completions/min_length": 199.4,
	"completions/min_terminated_length": 199.4,
	"entropy": 0.14272007048130037,
	"epoch": 0.6514657980456026,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 7.839305103148752e-06,
	"loss": 0.038,
	"num_tokens": 15699220.0,
	"reward": 0.928125,
	"reward_std": 0.0245114803314209,
	"rewards/qwen_accuracy_reward/mean": 0.928125,
	"rewards/qwen_accuracy_reward/std": 0.11228372007608414,
	"step": 200,
	"step_time": 61.566719483956696
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1495.5,
	"completions/max_terminated_length": 1268.8,
	"completions/mean_length": 497.496875,
	"completions/mean_terminated_length": 442.867822265625,
	"completions/min_length": 186.1,
	"completions/min_terminated_length": 186.1,
	"entropy": 0.13395386636257173,
	"epoch": 0.6840390879478827,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 7.730727470141152e-06,
	"loss": 0.0028,
	"num_tokens": 16564171.0,
	"reward": 0.9625,
	"reward_std": 0.02177756354212761,
	"rewards/qwen_accuracy_reward/mean": 0.9625,
	"rewards/qwen_accuracy_reward/std": 0.07889154553413391,
	"step": 210,
	"step_time": 45.37072062129155
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1425.8,
	"completions/max_terminated_length": 1415.7,
	"completions/mean_length": 453.871875,
	"completions/mean_terminated_length": 444.69354248046875,
	"completions/min_length": 166.8,
	"completions/min_terminated_length": 166.8,
	"entropy": 0.13297367617487907,
	"epoch": 0.7166123778501629,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 7.622149837133551e-06,
	"loss": 0.0147,
	"num_tokens": 17330410.0,
	"reward": 0.940625,
	"reward_std": 0.022201896458864213,
	"rewards/qwen_accuracy_reward/mean": 0.940625,
	"rewards/qwen_accuracy_reward/std": 0.11959655284881592,
	"step": 220,
	"step_time": 44.74475174760446
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 1833.4,
	"completions/max_terminated_length": 1752.4,
	"completions/mean_length": 501.321875,
	"completions/mean_terminated_length": 469.6424560546875,
	"completions/min_length": 171.5,
	"completions/min_terminated_length": 171.5,
	"entropy": 0.14848560467362404,
	"epoch": 0.749185667752443,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 7.51357220412595e-06,
	"loss": 0.022,
	"num_tokens": 18088737.0,
	"reward": 0.928125,
	"reward_std": 0.04397946000099182,
	"rewards/qwen_accuracy_reward/mean": 0.928125,
	"rewards/qwen_accuracy_reward/std": 0.13159393817186354,
	"step": 230,
	"step_time": 54.10395782412961
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1583.8,
	"completions/max_terminated_length": 1528.0,
	"completions/mean_length": 411.75625,
	"completions/mean_terminated_length": 400.55755615234375,
	"completions/min_length": 159.9,
	"completions/min_terminated_length": 159.9,
	"entropy": 0.13070192262530328,
	"epoch": 0.7817589576547231,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 7.40499457111835e-06,
	"loss": 0.0496,
	"num_tokens": 18793955.0,
	"reward": 0.95625,
	"reward_std": 0.03104073107242584,
	"rewards/qwen_accuracy_reward/mean": 0.95625,
	"rewards/qwen_accuracy_reward/std": 0.08069398403167724,
	"step": 240,
	"step_time": 47.35719826500863
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1359.2,
	"completions/max_terminated_length": 1338.3,
	"completions/mean_length": 481.21875,
	"completions/mean_terminated_length": 461.47271118164065,
	"completions/min_length": 200.0,
	"completions/min_terminated_length": 200.0,
	"entropy": 0.14220248386263848,
	"epoch": 0.8143322475570033,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 7.29641693811075e-06,
	"loss": 0.0184,
	"num_tokens": 19454497.0,
	"reward": 0.95625,
	"reward_std": 0.05418623313307762,
	"rewards/qwen_accuracy_reward/mean": 0.95625,
	"rewards/qwen_accuracy_reward/std": 0.11881711781024933,
	"step": 250,
	"step_time": 41.61551207816228
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1620.3,
	"completions/max_terminated_length": 1620.3,
	"completions/mean_length": 448.68125,
	"completions/mean_terminated_length": 448.68125,
	"completions/min_length": 182.0,
	"completions/min_terminated_length": 182.0,
	"entropy": 0.13489690721035003,
	"epoch": 0.8469055374592834,
	"frac_reward_zero_std": 0.85,
	"grad_norm": 1.7421875,
	"learning_rate": 7.187839305103149e-06,
	"loss": 0.0,
	"num_tokens": 20191547.0,
	"reward": 0.925,
	"reward_std": 0.07259083464741707,
	"rewards/qwen_accuracy_reward/mean": 0.925,
	"rewards/qwen_accuracy_reward/std": 0.19271825700998307,
	"step": 260,
	"step_time": 45.474128680489954
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.025,
	"completions/max_length": 1471.7,
	"completions/max_terminated_length": 1424.8,
	"completions/mean_length": 519.99375,
	"completions/mean_terminated_length": 437.50208740234376,
	"completions/min_length": 181.7,
	"completions/min_terminated_length": 181.7,
	"entropy": 0.13938435539603233,
	"epoch": 0.8794788273615635,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.91015625,
	"learning_rate": 7.079261672095549e-06,
	"loss": -0.0167,
	"num_tokens": 21039881.0,
	"reward": 0.93125,
	"reward_std": 0.051027984172105786,
	"rewards/qwen_accuracy_reward/mean": 0.93125,
	"rewards/qwen_accuracy_reward/std": 0.15987386405467988,
	"step": 270,
	"step_time": 45.94509084094316
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1812.2,
	"completions/max_terminated_length": 1642.1,
	"completions/mean_length": 489.865625,
	"completions/mean_terminated_length": 469.1300048828125,
	"completions/min_length": 197.8,
	"completions/min_terminated_length": 197.8,
	"entropy": 0.14942506179213524,
	"epoch": 0.9120521172638436,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 6.9706840390879485e-06,
	"loss": 0.0433,
	"num_tokens": 21713262.0,
	"reward": 0.96875,
	"reward_std": 0.01767766922712326,
	"rewards/qwen_accuracy_reward/mean": 0.96875,
	"rewards/qwen_accuracy_reward/std": 0.06858760267496108,
	"step": 280,
	"step_time": 54.875936476886274
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1379.1,
	"completions/max_terminated_length": 1379.1,
	"completions/mean_length": 441.2125,
	"completions/mean_terminated_length": 441.2125,
	"completions/min_length": 184.5,
	"completions/min_terminated_length": 184.5,
	"entropy": 0.14119350165128708,
	"epoch": 0.9446254071661238,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 6.8621064060803475e-06,
	"loss": -0.0025,
	"num_tokens": 22443962.0,
	"reward": 0.95625,
	"reward_std": 0.02177756354212761,
	"rewards/qwen_accuracy_reward/mean": 0.95625,
	"rewards/qwen_accuracy_reward/std": 0.09856200665235519,
	"step": 290,
	"step_time": 41.87264884654432
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1268.2,
	"completions/max_terminated_length": 1268.2,
	"completions/mean_length": 434.421875,
	"completions/mean_terminated_length": 434.421875,
	"completions/min_length": 206.3,
	"completions/min_terminated_length": 206.3,
	"entropy": 0.1490817114710808,
	"epoch": 0.9771986970684039,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 1.0546875,
	"learning_rate": 6.753528773072747e-06,
	"loss": 0.0056,
	"num_tokens": 23106601.0,
	"reward": 0.953125,
	"reward_std": 0.03061639815568924,
	"rewards/qwen_accuracy_reward/mean": 0.953125,
	"rewards/qwen_accuracy_reward/std": 0.11623967587947845,
	"step": 300,
	"step_time": 39.61534147607163
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1601.1,
	"completions/max_terminated_length": 1574.2,
	"completions/mean_length": 469.58125,
	"completions/mean_terminated_length": 447.9079223632813,
	"completions/min_length": 181.1,
	"completions/min_terminated_length": 181.1,
	"entropy": 0.13837436586618423,
	"epoch": 1.009771986970684,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 6.644951140065147e-06,
	"loss": 0.029,
	"num_tokens": 23888067.0,
	"reward": 0.94375,
	"reward_std": 0.02925042062997818,
	"rewards/qwen_accuracy_reward/mean": 0.94375,
	"rewards/qwen_accuracy_reward/std": 0.12826661467552186,
	"step": 310,
	"step_time": 47.85272020176053
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1411.9,
	"completions/max_terminated_length": 1406.8,
	"completions/mean_length": 404.721875,
	"completions/mean_terminated_length": 393.82207641601565,
	"completions/min_length": 183.3,
	"completions/min_terminated_length": 183.3,
	"entropy": 0.1374943107366562,
	"epoch": 1.0423452768729642,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 6.536373507057546e-06,
	"loss": 0.0265,
	"num_tokens": 24727370.0,
	"reward": 0.996875,
	"reward_std": 0.00883883461356163,
	"rewards/qwen_accuracy_reward/mean": 0.996875,
	"rewards/qwen_accuracy_reward/std": 0.01767766922712326,
	"step": 320,
	"step_time": 44.45478741144761
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0125,
	"completions/max_length": 1119.1,
	"completions/max_terminated_length": 1095.2,
	"completions/mean_length": 461.95,
	"completions/mean_terminated_length": 424.2075927734375,
	"completions/min_length": 204.7,
	"completions/min_terminated_length": 204.7,
	"entropy": 0.15105342343449593,
	"epoch": 1.0749185667752443,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 6.427795874049946e-06,
	"loss": 0.0436,
	"num_tokens": 25442066.0,
	"reward": 0.978125,
	"reward_std": 0.04218915030360222,
	"rewards/qwen_accuracy_reward/mean": 0.978125,
	"rewards/qwen_accuracy_reward/std": 0.07587221264839172,
	"step": 330,
	"step_time": 35.703045930247754
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1083.0,
	"completions/max_terminated_length": 1083.0,
	"completions/mean_length": 374.3875,
	"completions/mean_terminated_length": 374.3875,
	"completions/min_length": 168.6,
	"completions/min_terminated_length": 168.6,
	"entropy": 0.14668092131614685,
	"epoch": 1.1074918566775245,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 6.319218241042345e-06,
	"loss": 0.012,
	"num_tokens": 26188086.0,
	"reward": 0.946875,
	"reward_std": 0.02651650384068489,
	"rewards/qwen_accuracy_reward/mean": 0.946875,
	"rewards/qwen_accuracy_reward/std": 0.1056659385561943,
	"step": 340,
	"step_time": 34.11438843393698
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0125,
	"completions/max_length": 1756.2,
	"completions/max_terminated_length": 1669.5,
	"completions/mean_length": 522.075,
	"completions/mean_terminated_length": 480.16993408203126,
	"completions/min_length": 191.0,
	"completions/min_terminated_length": 191.0,
	"entropy": 0.16799205988645555,
	"epoch": 1.1400651465798046,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 6.2106406080347455e-06,
	"loss": 0.0095,
	"num_tokens": 26977806.0,
	"reward": 0.896875,
	"reward_std": 0.02651650384068489,
	"rewards/qwen_accuracy_reward/mean": 0.896875,
	"rewards/qwen_accuracy_reward/std": 0.13685612380504608,
	"step": 350,
	"step_time": 53.24436394525692
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1369.8,
	"completions/max_terminated_length": 1369.8,
	"completions/mean_length": 441.03125,
	"completions/mean_terminated_length": 441.03125,
	"completions/min_length": 186.2,
	"completions/min_terminated_length": 186.2,
	"entropy": 0.16459481716156005,
	"epoch": 1.1726384364820848,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 1.546875,
	"learning_rate": 6.102062975027145e-06,
	"loss": -0.0057,
	"num_tokens": 27647848.0,
	"reward": 0.959375,
	"reward_std": 0.03061639815568924,
	"rewards/qwen_accuracy_reward/mean": 0.959375,
	"rewards/qwen_accuracy_reward/std": 0.09297246783971787,
	"step": 360,
	"step_time": 38.45815520407632
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1612.2,
	"completions/max_terminated_length": 1579.0,
	"completions/mean_length": 457.859375,
	"completions/mean_terminated_length": 447.72197265625,
	"completions/min_length": 166.2,
	"completions/min_terminated_length": 166.2,
	"entropy": 0.16912921741604806,
	"epoch": 1.205211726384365,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 5.993485342019545e-06,
	"loss": -0.0288,
	"num_tokens": 28297795.0,
	"reward": 0.925,
	"reward_std": 0.0408231720328331,
	"rewards/qwen_accuracy_reward/mean": 0.925,
	"rewards/qwen_accuracy_reward/std": 0.16792239248752594,
	"step": 370,
	"step_time": 47.60929348124191
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1762.3,
	"completions/max_terminated_length": 1711.7,
	"completions/mean_length": 454.509375,
	"completions/mean_terminated_length": 443.2507049560547,
	"completions/min_length": 187.1,
	"completions/min_terminated_length": 187.1,
	"entropy": 0.16176492720842361,
	"epoch": 1.237785016286645,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 5.884907709011944e-06,
	"loss": 0.0015,
	"num_tokens": 29009246.0,
	"reward": 0.9625,
	"reward_std": 0.013363061845302582,
	"rewards/qwen_accuracy_reward/mean": 0.9625,
	"rewards/qwen_accuracy_reward/std": 0.07759521007537842,
	"step": 380,
	"step_time": 53.365480937343094
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1627.8,
	"completions/max_terminated_length": 1543.4,
	"completions/mean_length": 474.075,
	"completions/mean_terminated_length": 463.8240905761719,
	"completions/min_length": 193.1,
	"completions/min_terminated_length": 193.1,
	"entropy": 0.17872475683689118,
	"epoch": 1.2703583061889252,
	"frac_reward_zero_std": 0.85,
	"grad_norm": 0.0,
	"learning_rate": 5.776330076004344e-06,
	"loss": 0.0233,
	"num_tokens": 29602982.0,
	"reward": 0.86875,
	"reward_std": 0.06123279631137848,
	"rewards/qwen_accuracy_reward/mean": 0.86875,
	"rewards/qwen_accuracy_reward/std": 0.23578283339738845,
	"step": 390,
	"step_time": 49.23055710773915
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1638.8,
	"completions/max_terminated_length": 1631.1,
	"completions/mean_length": 486.021875,
	"completions/mean_terminated_length": 475.7992919921875,
	"completions/min_length": 195.0,
	"completions/min_terminated_length": 195.0,
	"entropy": 0.15955362915992738,
	"epoch": 1.3029315960912053,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 5.667752442996744e-06,
	"loss": 0.0137,
	"num_tokens": 30373933.0,
	"reward": 0.946875,
	"reward_std": 0.00883883461356163,
	"rewards/qwen_accuracy_reward/mean": 0.946875,
	"rewards/qwen_accuracy_reward/std": 0.1056659385561943,
	"step": 400,
	"step_time": 50.148436666186896
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1311.0,
	"completions/max_terminated_length": 1311.0,
	"completions/mean_length": 428.321875,
	"completions/mean_terminated_length": 428.321875,
	"completions/min_length": 169.8,
	"completions/min_terminated_length": 169.8,
	"entropy": 0.1585499659180641,
	"epoch": 1.3355048859934853,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 5.559174809989143e-06,
	"loss": -0.0122,
	"num_tokens": 31204012.0,
	"reward": 0.9625,
	"reward_std": 0.042613483220338824,
	"rewards/qwen_accuracy_reward/mean": 0.9625,
	"rewards/qwen_accuracy_reward/std": 0.09328008741140366,
	"step": 410,
	"step_time": 40.44286519419402
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1511.4,
	"completions/max_terminated_length": 1400.9,
	"completions/mean_length": 477.434375,
	"completions/mean_terminated_length": 455.53375244140625,
	"completions/min_length": 214.7,
	"completions/min_terminated_length": 214.7,
	"entropy": 0.17368159890174867,
	"epoch": 1.3680781758957654,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 5.4505971769815425e-06,
	"loss": 0.0329,
	"num_tokens": 31967207.0,
	"reward": 0.99375,
	"reward_std": 0.011572751402854919,
	"rewards/qwen_accuracy_reward/mean": 0.99375,
	"rewards/qwen_accuracy_reward/std": 0.024593468010425567,
	"step": 420,
	"step_time": 47.28061485029757
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1253.4,
	"completions/max_terminated_length": 1253.4,
	"completions/mean_length": 427.68125,
	"completions/mean_terminated_length": 427.68125,
	"completions/min_length": 184.1,
	"completions/min_terminated_length": 184.1,
	"entropy": 0.16302806735038758,
	"epoch": 1.4006514657980456,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 5.342019543973942e-06,
	"loss": 0.0154,
	"num_tokens": 32547929.0,
	"reward": 0.9875,
	"reward_std": 0.02177756354212761,
	"rewards/qwen_accuracy_reward/mean": 0.9875,
	"rewards/qwen_accuracy_reward/std": 0.04729212671518326,
	"step": 430,
	"step_time": 39.09927195487544
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1847.1,
	"completions/max_terminated_length": 1769.7,
	"completions/mean_length": 478.76875,
	"completions/mean_terminated_length": 468.09132690429686,
	"completions/min_length": 192.7,
	"completions/min_terminated_length": 192.7,
	"entropy": 0.16723438948392869,
	"epoch": 1.4332247557003257,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 5.233441910966341e-06,
	"loss": 0.0242,
	"num_tokens": 33257383.0,
	"reward": 0.94375,
	"reward_std": 0.02925042062997818,
	"rewards/qwen_accuracy_reward/mean": 0.94375,
	"rewards/qwen_accuracy_reward/std": 0.12826661467552186,
	"step": 440,
	"step_time": 54.6911054097116
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1209.8,
	"completions/max_terminated_length": 1209.8,
	"completions/mean_length": 372.3625,
	"completions/mean_terminated_length": 372.3625,
	"completions/min_length": 177.4,
	"completions/min_terminated_length": 177.4,
	"entropy": 0.15510803908109666,
	"epoch": 1.4657980456026058,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 5.124864277958741e-06,
	"loss": 0.0094,
	"num_tokens": 33972827.0,
	"reward": 0.978125,
	"reward_std": 0.036084231734275815,
	"rewards/qwen_accuracy_reward/mean": 0.978125,
	"rewards/qwen_accuracy_reward/std": 0.07880139350891113,
	"step": 450,
	"step_time": 38.53117633331567
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1607.4,
	"completions/max_terminated_length": 1607.4,
	"completions/mean_length": 427.8875,
	"completions/mean_terminated_length": 427.8875,
	"completions/min_length": 173.4,
	"completions/min_terminated_length": 173.4,
	"entropy": 0.15332257747650146,
	"epoch": 1.498371335504886,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 5.016286644951141e-06,
	"loss": 0.0191,
	"num_tokens": 34752895.0,
	"reward": 0.95,
	"reward_std": 0.03535533845424652,
	"rewards/qwen_accuracy_reward/mean": 0.95,
	"rewards/qwen_accuracy_reward/std": 0.10367314666509628,
	"step": 460,
	"step_time": 46.25087994951755
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1709.7,
	"completions/max_terminated_length": 1703.5,
	"completions/mean_length": 474.934375,
	"completions/mean_terminated_length": 464.940625,
	"completions/min_length": 197.1,
	"completions/min_terminated_length": 197.1,
	"entropy": 0.15956022590398788,
	"epoch": 1.5309446254071661,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 4.90770901194354e-06,
	"loss": 0.0197,
	"num_tokens": 35566530.0,
	"reward": 0.95625,
	"reward_std": 0.03335031494498253,
	"rewards/qwen_accuracy_reward/mean": 0.95625,
	"rewards/qwen_accuracy_reward/std": 0.11587972939014435,
	"step": 470,
	"step_time": 52.431317151151596
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.021875,
	"completions/max_length": 1856.4,
	"completions/max_terminated_length": 1411.0,
	"completions/mean_length": 556.703125,
	"completions/mean_terminated_length": 482.36143188476564,
	"completions/min_length": 201.5,
	"completions/min_terminated_length": 201.5,
	"entropy": 0.1640054076910019,
	"epoch": 1.5635179153094463,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 4.79913137893594e-06,
	"loss": 0.0122,
	"num_tokens": 36332819.0,
	"reward": 0.90625,
	"reward_std": 0.011572751402854919,
	"rewards/qwen_accuracy_reward/mean": 0.90625,
	"rewards/qwen_accuracy_reward/std": 0.17163818180561066,
	"step": 480,
	"step_time": 55.61120590567589
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1141.9,
	"completions/max_terminated_length": 1141.9,
	"completions/mean_length": 371.696875,
	"completions/mean_terminated_length": 371.696875,
	"completions/min_length": 172.6,
	"completions/min_terminated_length": 172.6,
	"entropy": 0.13739149868488312,
	"epoch": 1.5960912052117264,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 4.690553745928339e-06,
	"loss": 0.0168,
	"num_tokens": 37149242.0,
	"reward": 0.984375,
	"reward_std": 0.022201896458864213,
	"rewards/qwen_accuracy_reward/mean": 0.984375,
	"rewards/qwen_accuracy_reward/std": 0.05127874463796615,
	"step": 490,
	"step_time": 37.01674561398104
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 1862.1,
	"completions/max_terminated_length": 1621.2,
	"completions/mean_length": 530.93125,
	"completions/mean_terminated_length": 478.1709289550781,
	"completions/min_length": 202.8,
	"completions/min_terminated_length": 202.8,
	"entropy": 0.16189506649971008,
	"epoch": 1.6286644951140063,
	"frac_reward_zero_std": 0.85,
	"grad_norm": 0.8828125,
	"learning_rate": 4.5819761129207385e-06,
	"loss": 0.0709,
	"num_tokens": 37935508.0,
	"reward": 0.946875,
	"reward_std": 0.06396671310067177,
	"rewards/qwen_accuracy_reward/mean": 0.946875,
	"rewards/qwen_accuracy_reward/std": 0.1611790642142296,
	"step": 500,
	"step_time": 57.11876249546185
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1254.1,
	"completions/max_terminated_length": 1167.1,
	"completions/mean_length": 411.725,
	"completions/mean_terminated_length": 400.9833679199219,
	"completions/min_length": 165.3,
	"completions/min_terminated_length": 165.3,
	"entropy": 0.14960483461618423,
	"epoch": 1.6612377850162865,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 4.473398479913138e-06,
	"loss": 0.0285,
	"num_tokens": 38658348.0,
	"reward": 0.99375,
	"reward_std": 0.01767766922712326,
	"rewards/qwen_accuracy_reward/mean": 0.99375,
	"rewards/qwen_accuracy_reward/std": 0.03535533845424652,
	"step": 510,
	"step_time": 37.5699773571454
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 1664.6,
	"completions/max_terminated_length": 1541.8,
	"completions/mean_length": 507.371875,
	"completions/mean_terminated_length": 477.1740295410156,
	"completions/min_length": 199.8,
	"completions/min_terminated_length": 199.8,
	"entropy": 0.16616563498973846,
	"epoch": 1.6938110749185666,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 4.364820846905538e-06,
	"loss": 0.0223,
	"num_tokens": 39415795.0,
	"reward": 0.978125,
	"reward_std": 0.036084231734275815,
	"rewards/qwen_accuracy_reward/mean": 0.978125,
	"rewards/qwen_accuracy_reward/std": 0.061483670771121976,
	"step": 520,
	"step_time": 49.47790257129818
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1499.9,
	"completions/max_terminated_length": 1499.9,
	"completions/mean_length": 381.7875,
	"completions/mean_terminated_length": 381.7875,
	"completions/min_length": 178.3,
	"completions/min_terminated_length": 178.3,
	"entropy": 0.1455918937921524,
	"epoch": 1.7263843648208468,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 4.256243213897938e-06,
	"loss": 0.0,
	"num_tokens": 40196743.0,
	"reward": 0.975,
	"reward_std": 0.0,
	"rewards/qwen_accuracy_reward/mean": 0.975,
	"rewards/qwen_accuracy_reward/std": 0.04399413466453552,
	"step": 530,
	"step_time": 45.42760537136346
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 1877.8,
	"completions/max_terminated_length": 1709.0,
	"completions/mean_length": 515.225,
	"completions/mean_terminated_length": 482.8147033691406,
	"completions/min_length": 195.3,
	"completions/min_terminated_length": 195.3,
	"entropy": 0.16437555029988288,
	"epoch": 1.758957654723127,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 4.147665580890337e-06,
	"loss": 0.0303,
	"num_tokens": 41000151.0,
	"reward": 0.975,
	"reward_std": 0.04261348247528076,
	"rewards/qwen_accuracy_reward/mean": 0.975,
	"rewards/qwen_accuracy_reward/std": 0.09354988187551498,
	"step": 540,
	"step_time": 57.456473257485776
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1165.0,
	"completions/max_terminated_length": 1165.0,
	"completions/mean_length": 423.43125,
	"completions/mean_terminated_length": 423.43125,
	"completions/min_length": 190.2,
	"completions/min_terminated_length": 190.2,
	"entropy": 0.16483787596225738,
	"epoch": 1.791530944625407,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 4.039087947882737e-06,
	"loss": -0.0021,
	"num_tokens": 41720497.0,
	"reward": 0.9125,
	"reward_std": 0.013363061845302582,
	"rewards/qwen_accuracy_reward/mean": 0.9125,
	"rewards/qwen_accuracy_reward/std": 0.12839525938034058,
	"step": 550,
	"step_time": 36.695044124592094
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1311.7,
	"completions/max_terminated_length": 1305.7,
	"completions/mean_length": 479.98125,
	"completions/mean_terminated_length": 470.1879028320312,
	"completions/min_length": 199.5,
	"completions/min_terminated_length": 199.5,
	"entropy": 0.1649734303355217,
	"epoch": 1.8241042345276872,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 3.9305103148751365e-06,
	"loss": 0.0047,
	"num_tokens": 42491363.0,
	"reward": 0.9875,
	"reward_std": 0.02925042062997818,
	"rewards/qwen_accuracy_reward/mean": 0.9875,
	"rewards/qwen_accuracy_reward/std": 0.059948806464672086,
	"step": 560,
	"step_time": 41.12004605270922
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.015625,
	"completions/max_length": 2218.3,
	"completions/max_terminated_length": 1861.4,
	"completions/mean_length": 548.38125,
	"completions/mean_terminated_length": 494.34682006835936,
	"completions/min_length": 183.9,
	"completions/min_terminated_length": 183.9,
	"entropy": 0.17041560113430024,
	"epoch": 1.8566775244299674,
	"frac_reward_zero_std": 0.85,
	"grad_norm": 1.40625,
	"learning_rate": 3.8219326818675354e-06,
	"loss": 0.0604,
	"num_tokens": 43141245.0,
	"reward": 0.91875,
	"reward_std": 0.06260073557496071,
	"rewards/qwen_accuracy_reward/mean": 0.91875,
	"rewards/qwen_accuracy_reward/std": 0.19519128501415253,
	"step": 570,
	"step_time": 65.7936801508069
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1056.9,
	"completions/max_terminated_length": 1056.9,
	"completions/mean_length": 378.003125,
	"completions/mean_terminated_length": 378.003125,
	"completions/min_length": 188.2,
	"completions/min_terminated_length": 188.2,
	"entropy": 0.15374673902988434,
	"epoch": 1.8892508143322475,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 3.7133550488599353e-06,
	"loss": 0.0,
	"num_tokens": 43920014.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/qwen_accuracy_reward/mean": 1.0,
	"rewards/qwen_accuracy_reward/std": 0.0,
	"step": 580,
	"step_time": 34.265030450746416
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 1124.7,
	"completions/max_terminated_length": 1011.9,
	"completions/mean_length": 395.740625,
	"completions/mean_terminated_length": 361.8580810546875,
	"completions/min_length": 183.3,
	"completions/min_terminated_length": 183.3,
	"entropy": 0.15494132190942764,
	"epoch": 1.9218241042345277,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 3.6047774158523346e-06,
	"loss": 0.0321,
	"num_tokens": 44712795.0,
	"reward": 0.9875,
	"reward_std": 0.02177756354212761,
	"rewards/qwen_accuracy_reward/mean": 0.9875,
	"rewards/qwen_accuracy_reward/std": 0.04729212671518326,
	"step": 590,
	"step_time": 37.20974960550666
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1213.0,
	"completions/max_terminated_length": 1213.0,
	"completions/mean_length": 404.8125,
	"completions/mean_terminated_length": 404.8125,
	"completions/min_length": 192.2,
	"completions/min_terminated_length": 192.2,
	"entropy": 0.13962563052773475,
	"epoch": 1.9543973941368078,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 3.496199782844734e-06,
	"loss": 0.0076,
	"num_tokens": 45496631.0,
	"reward": 0.9125,
	"reward_std": 0.013363061845302582,
	"rewards/qwen_accuracy_reward/mean": 0.9125,
	"rewards/qwen_accuracy_reward/std": 0.16558347940444945,
	"step": 600,
	"step_time": 38.92747511789203
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 1518.7,
	"completions/max_terminated_length": 916.1,
	"completions/mean_length": 421.90625,
	"completions/mean_terminated_length": 352.30673828125,
	"completions/min_length": 176.6,
	"completions/min_terminated_length": 176.6,
	"entropy": 0.15325831845402718,
	"epoch": 1.986970684039088,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 0.0,
	"learning_rate": 3.387622149837134e-06,
	"loss": 0.1029,
	"num_tokens": 46185953.0,
	"reward": 0.91875,
	"reward_std": 0.04355512708425522,
	"rewards/qwen_accuracy_reward/mean": 0.91875,
	"rewards/qwen_accuracy_reward/std": 0.1526600480079651,
	"step": 610,
	"step_time": 45.04236122053116
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1122.6,
	"completions/max_terminated_length": 1122.6,
	"completions/mean_length": 409.753125,
	"completions/mean_terminated_length": 409.753125,
	"completions/min_length": 187.6,
	"completions/min_terminated_length": 187.6,
	"entropy": 0.15086480602622032,
	"epoch": 2.019543973941368,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 3.2790445168295332e-06,
	"loss": -0.0012,
	"num_tokens": 46967130.0,
	"reward": 0.978125,
	"reward_std": 0.00883883461356163,
	"rewards/qwen_accuracy_reward/mean": 0.978125,
	"rewards/qwen_accuracy_reward/std": 0.0420013427734375,
	"step": 620,
	"step_time": 36.174442971032114
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1220.2,
	"completions/max_terminated_length": 1154.4,
	"completions/mean_length": 433.240625,
	"completions/mean_terminated_length": 415.08563232421875,
	"completions/min_length": 179.6,
	"completions/min_terminated_length": 179.6,
	"entropy": 0.1521160587668419,
	"epoch": 2.0521172638436482,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 3.1704668838219326e-06,
	"loss": 0.02,
	"num_tokens": 47658855.0,
	"reward": 0.9625,
	"reward_std": 0.05828612819314003,
	"rewards/qwen_accuracy_reward/mean": 0.9625,
	"rewards/qwen_accuracy_reward/std": 0.11276241540908813,
	"step": 630,
	"step_time": 39.28426020843908
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 904.7,
	"completions/max_terminated_length": 904.7,
	"completions/mean_length": 346.578125,
	"completions/mean_terminated_length": 346.578125,
	"completions/min_length": 186.1,
	"completions/min_terminated_length": 186.1,
	"entropy": 0.14097955524921418,
	"epoch": 2.0846905537459284,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 3.061889250814333e-06,
	"loss": 0.0,
	"num_tokens": 48413328.0,
	"reward": 0.975,
	"reward_std": 0.0,
	"rewards/qwen_accuracy_reward/mean": 0.975,
	"rewards/qwen_accuracy_reward/std": 0.04399413466453552,
	"step": 640,
	"step_time": 31.522073939908296
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1237.6,
	"completions/max_terminated_length": 1237.6,
	"completions/mean_length": 428.15625,
	"completions/mean_terminated_length": 428.15625,
	"completions/min_length": 184.4,
	"completions/min_terminated_length": 184.4,
	"entropy": 0.15489777624607087,
	"epoch": 2.1172638436482085,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 2.9533116178067322e-06,
	"loss": 0.0,
	"num_tokens": 49250122.0,
	"reward": 0.95,
	"reward_std": 0.0,
	"rewards/qwen_accuracy_reward/mean": 0.95,
	"rewards/qwen_accuracy_reward/std": 0.08798826932907104,
	"step": 650,
	"step_time": 38.97984252097085
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1062.7,
	"completions/max_terminated_length": 1062.7,
	"completions/mean_length": 399.328125,
	"completions/mean_terminated_length": 399.328125,
	"completions/min_length": 181.3,
	"completions/min_terminated_length": 181.3,
	"entropy": 0.14898339360952378,
	"epoch": 2.1498371335504887,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 2.8447339847991316e-06,
	"loss": 0.0039,
	"num_tokens": 50033963.0,
	"reward": 0.915625,
	"reward_std": 0.03377464786171913,
	"rewards/qwen_accuracy_reward/mean": 0.915625,
	"rewards/qwen_accuracy_reward/std": 0.14567448943853378,
	"step": 660,
	"step_time": 33.721394206117836
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1256.5,
	"completions/max_terminated_length": 1189.6,
	"completions/mean_length": 410.984375,
	"completions/mean_terminated_length": 399.96754150390626,
	"completions/min_length": 195.5,
	"completions/min_terminated_length": 195.5,
	"entropy": 0.15526492446660994,
	"epoch": 2.182410423452769,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 2.7361563517915314e-06,
	"loss": 0.0314,
	"num_tokens": 50775022.0,
	"reward": 0.946875,
	"reward_std": 0.00883883461356163,
	"rewards/qwen_accuracy_reward/mean": 0.946875,
	"rewards/qwen_accuracy_reward/std": 0.08967447578907013,
	"step": 670,
	"step_time": 39.73401907449588
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1545.4,
	"completions/max_terminated_length": 1413.5,
	"completions/mean_length": 461.90625,
	"completions/mean_terminated_length": 439.9891723632812,
	"completions/min_length": 191.5,
	"completions/min_terminated_length": 191.5,
	"entropy": 0.15921913534402848,
	"epoch": 2.214983713355049,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 2.627578718783931e-06,
	"loss": 0.0268,
	"num_tokens": 51536384.0,
	"reward": 0.96875,
	"reward_std": 0.03471825420856476,
	"rewards/qwen_accuracy_reward/mean": 0.96875,
	"rewards/qwen_accuracy_reward/std": 0.08884271383285522,
	"step": 680,
	"step_time": 47.763417969178406
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 1956.4,
	"completions/max_terminated_length": 1649.7,
	"completions/mean_length": 517.278125,
	"completions/mean_terminated_length": 450.4589599609375,
	"completions/min_length": 182.3,
	"completions/min_terminated_length": 182.3,
	"entropy": 0.16205079928040506,
	"epoch": 2.247557003257329,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 1.46875,
	"learning_rate": 2.5190010857763302e-06,
	"loss": 0.0888,
	"num_tokens": 52323265.0,
	"reward": 0.971875,
	"reward_std": 0.05145231708884239,
	"rewards/qwen_accuracy_reward/mean": 0.971875,
	"rewards/qwen_accuracy_reward/std": 0.1004656806588173,
	"step": 690,
	"step_time": 58.18257061317563
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1702.6,
	"completions/max_terminated_length": 1669.6,
	"completions/mean_length": 443.4125,
	"completions/mean_terminated_length": 432.5013061523438,
	"completions/min_length": 183.3,
	"completions/min_terminated_length": 183.3,
	"entropy": 0.15109438076615334,
	"epoch": 2.2801302931596092,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 2.4104234527687296e-06,
	"loss": 0.0435,
	"num_tokens": 53081109.0,
	"reward": 0.9625,
	"reward_std": 0.02925042062997818,
	"rewards/qwen_accuracy_reward/mean": 0.9625,
	"rewards/qwen_accuracy_reward/std": 0.10394294112920761,
	"step": 700,
	"step_time": 51.96710612634197
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1106.4,
	"completions/max_terminated_length": 1106.4,
	"completions/mean_length": 390.940625,
	"completions/mean_terminated_length": 390.940625,
	"completions/min_length": 186.6,
	"completions/min_terminated_length": 186.6,
	"entropy": 0.15517303124070167,
	"epoch": 2.3127035830618894,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 2.3018458197611294e-06,
	"loss": 0.0292,
	"num_tokens": 53864306.0,
	"reward": 0.934375,
	"reward_std": 0.04419417306780815,
	"rewards/qwen_accuracy_reward/mean": 0.934375,
	"rewards/qwen_accuracy_reward/std": 0.11064954251050949,
	"step": 710,
	"step_time": 32.47686854107305
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 923.7,
	"completions/max_terminated_length": 923.7,
	"completions/mean_length": 367.85625,
	"completions/mean_terminated_length": 367.85625,
	"completions/min_length": 179.6,
	"completions/min_terminated_length": 179.6,
	"entropy": 0.15150292664766313,
	"epoch": 2.3452768729641695,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 2.193268186753529e-06,
	"loss": 0.0006,
	"num_tokens": 54577764.0,
	"reward": 0.996875,
	"reward_std": 0.00883883461356163,
	"rewards/qwen_accuracy_reward/mean": 0.996875,
	"rewards/qwen_accuracy_reward/std": 0.01767766922712326,
	"step": 720,
	"step_time": 30.78780706692487
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 1826.7,
	"completions/max_terminated_length": 1750.5,
	"completions/mean_length": 492.021875,
	"completions/mean_terminated_length": 460.0340515136719,
	"completions/min_length": 178.1,
	"completions/min_terminated_length": 178.1,
	"entropy": 0.15865328460931777,
	"epoch": 2.3778501628664497,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 2.0846905537459286e-06,
	"loss": 0.0263,
	"num_tokens": 55272515.0,
	"reward": 0.98125,
	"reward_std": 0.03104073032736778,
	"rewards/qwen_accuracy_reward/mean": 0.98125,
	"rewards/qwen_accuracy_reward/std": 0.05456787198781967,
	"step": 730,
	"step_time": 55.55191990900785
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1124.4,
	"completions/max_terminated_length": 1124.4,
	"completions/mean_length": 408.946875,
	"completions/mean_terminated_length": 408.946875,
	"completions/min_length": 170.6,
	"completions/min_terminated_length": 170.6,
	"entropy": 0.16333993151783943,
	"epoch": 2.41042345276873,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 1.976112920738328e-06,
	"loss": 0.0104,
	"num_tokens": 55921930.0,
	"reward": 0.965625,
	"reward_std": 0.01293872892856598,
	"rewards/qwen_accuracy_reward/mean": 0.965625,
	"rewards/qwen_accuracy_reward/std": 0.07360859215259552,
	"step": 740,
	"step_time": 34.814000389166175
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1399.8,
	"completions/max_terminated_length": 1388.3,
	"completions/mean_length": 431.521875,
	"completions/mean_terminated_length": 410.98146362304686,
	"completions/min_length": 164.8,
	"completions/min_terminated_length": 164.8,
	"entropy": 0.14200911596417426,
	"epoch": 2.44299674267101,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.8675352877307276e-06,
	"loss": 0.019,
	"num_tokens": 56772993.0,
	"reward": 0.98125,
	"reward_std": 0.02177756354212761,
	"rewards/qwen_accuracy_reward/mean": 0.98125,
	"rewards/qwen_accuracy_reward/std": 0.05456787198781967,
	"step": 750,
	"step_time": 43.50874480362982
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1684.0,
	"completions/max_terminated_length": 1554.1,
	"completions/mean_length": 515.384375,
	"completions/mean_terminated_length": 494.017919921875,
	"completions/min_length": 194.0,
	"completions/min_terminated_length": 194.0,
	"entropy": 0.17334669530391694,
	"epoch": 2.47557003257329,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 1.328125,
	"learning_rate": 1.7589576547231272e-06,
	"loss": 0.0074,
	"num_tokens": 57501516.0,
	"reward": 0.959375,
	"reward_std": 0.057342519611120225,
	"rewards/qwen_accuracy_reward/mean": 0.959375,
	"rewards/qwen_accuracy_reward/std": 0.09656921476125717,
	"step": 760,
	"step_time": 50.1735579572618
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.003125,
	"completions/max_length": 1758.1,
	"completions/max_terminated_length": 1663.3,
	"completions/mean_length": 470.63125,
	"completions/mean_terminated_length": 460.1435485839844,
	"completions/min_length": 185.0,
	"completions/min_terminated_length": 185.0,
	"entropy": 0.16403108537197114,
	"epoch": 2.5081433224755703,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 1.2421875,
	"learning_rate": 1.6503800217155266e-06,
	"loss": 0.0135,
	"num_tokens": 58186406.0,
	"reward": 0.99375,
	"reward_std": 0.01767766922712326,
	"rewards/qwen_accuracy_reward/mean": 0.99375,
	"rewards/qwen_accuracy_reward/std": 0.03535533845424652,
	"step": 770,
	"step_time": 51.758546930458394
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1432.5,
	"completions/max_terminated_length": 1432.5,
	"completions/mean_length": 448.15,
	"completions/mean_terminated_length": 448.15,
	"completions/min_length": 169.4,
	"completions/min_terminated_length": 169.4,
	"entropy": 0.1642938271164894,
	"epoch": 2.5407166123778504,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 1.5418023887079264e-06,
	"loss": 0.0038,
	"num_tokens": 58895790.0,
	"reward": 0.978125,
	"reward_std": 0.036084231734275815,
	"rewards/qwen_accuracy_reward/mean": 0.978125,
	"rewards/qwen_accuracy_reward/std": 0.06321553289890289,
	"step": 780,
	"step_time": 43.58511639842764
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 2056.4,
	"completions/max_terminated_length": 2051.1,
	"completions/mean_length": 555.934375,
	"completions/mean_terminated_length": 527.9370727539062,
	"completions/min_length": 191.2,
	"completions/min_terminated_length": 191.2,
	"entropy": 0.16091172024607658,
	"epoch": 2.5732899022801305,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 0.0,
	"learning_rate": 1.433224755700326e-06,
	"loss": -0.0108,
	"num_tokens": 59586505.0,
	"reward": 0.89375,
	"reward_std": 0.055127878487110135,
	"rewards/qwen_accuracy_reward/mean": 0.89375,
	"rewards/qwen_accuracy_reward/std": 0.2057945430278778,
	"step": 790,
	"step_time": 58.492029800172894
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 697.5,
	"completions/max_terminated_length": 697.5,
	"completions/mean_length": 322.95,
	"completions/mean_terminated_length": 322.95,
	"completions/min_length": 165.9,
	"completions/min_terminated_length": 165.9,
	"entropy": 0.14762159138917924,
	"epoch": 2.6058631921824107,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 1.3246471226927254e-06,
	"loss": -0.0081,
	"num_tokens": 60234105.0,
	"reward": 0.925,
	"reward_std": 0.01767766922712326,
	"rewards/qwen_accuracy_reward/mean": 0.925,
	"rewards/qwen_accuracy_reward/std": 0.1476672813296318,
	"step": 800,
	"step_time": 23.70481554856524
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 2111.5,
	"completions/max_terminated_length": 1792.0,
	"completions/mean_length": 557.365625,
	"completions/mean_terminated_length": 523.718637084961,
	"completions/min_length": 207.9,
	"completions/min_terminated_length": 207.9,
	"entropy": 0.181430846452713,
	"epoch": 2.6384364820846904,
	"frac_reward_zero_std": 0.85,
	"grad_norm": 0.0,
	"learning_rate": 1.216069489685125e-06,
	"loss": 0.09,
	"num_tokens": 60907670.0,
	"reward": 0.9125,
	"reward_std": 0.06670062988996506,
	"rewards/qwen_accuracy_reward/mean": 0.9125,
	"rewards/qwen_accuracy_reward/std": 0.20204702019691467,
	"step": 810,
	"step_time": 57.087904060911384
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1913.1,
	"completions/max_terminated_length": 1700.8,
	"completions/mean_length": 471.440625,
	"completions/mean_terminated_length": 449.9090637207031,
	"completions/min_length": 188.8,
	"completions/min_terminated_length": 188.8,
	"entropy": 0.16233009248971939,
	"epoch": 2.6710097719869705,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 1.3046875,
	"learning_rate": 1.1074918566775244e-06,
	"loss": 0.0312,
	"num_tokens": 61661099.0,
	"reward": 0.984375,
	"reward_std": 0.03061639815568924,
	"rewards/qwen_accuracy_reward/mean": 0.984375,
	"rewards/qwen_accuracy_reward/std": 0.06496979594230652,
	"step": 820,
	"step_time": 56.08578431969509
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1505.7,
	"completions/max_terminated_length": 1505.7,
	"completions/mean_length": 430.6125,
	"completions/mean_terminated_length": 430.6125,
	"completions/min_length": 189.9,
	"completions/min_terminated_length": 189.9,
	"entropy": 0.16054447889328002,
	"epoch": 2.7035830618892507,
	"frac_reward_zero_std": 0.975,
	"grad_norm": 0.0,
	"learning_rate": 9.989142236699242e-07,
	"loss": -0.0073,
	"num_tokens": 62382591.0,
	"reward": 0.99375,
	"reward_std": 0.011572751402854919,
	"rewards/qwen_accuracy_reward/mean": 0.99375,
	"rewards/qwen_accuracy_reward/std": 0.024593468010425567,
	"step": 830,
	"step_time": 44.93511769743636
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1307.6,
	"completions/max_terminated_length": 1207.0,
	"completions/mean_length": 426.103125,
	"completions/mean_terminated_length": 403.6597930908203,
	"completions/min_length": 215.9,
	"completions/min_terminated_length": 215.9,
	"entropy": 0.16358136087656022,
	"epoch": 2.736156351791531,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 8.903365906623236e-07,
	"loss": 0.0463,
	"num_tokens": 63060464.0,
	"reward": 0.9875,
	"reward_std": 0.02177756354212761,
	"rewards/qwen_accuracy_reward/mean": 0.9875,
	"rewards/qwen_accuracy_reward/std": 0.04729212671518326,
	"step": 840,
	"step_time": 40.77616196591407
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 691.6,
	"completions/max_terminated_length": 691.6,
	"completions/mean_length": 307.434375,
	"completions/mean_terminated_length": 307.434375,
	"completions/min_length": 162.8,
	"completions/min_terminated_length": 162.8,
	"entropy": 0.13495604172348977,
	"epoch": 2.768729641693811,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 7.817589576547231e-07,
	"loss": 0.0,
	"num_tokens": 63820595.0,
	"reward": 0.975,
	"reward_std": 0.0,
	"rewards/qwen_accuracy_reward/mean": 0.975,
	"rewards/qwen_accuracy_reward/std": 0.04399413466453552,
	"step": 850,
	"step_time": 24.526741536986084
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1168.6,
	"completions/max_terminated_length": 1168.6,
	"completions/mean_length": 406.71875,
	"completions/mean_terminated_length": 406.71875,
	"completions/min_length": 198.2,
	"completions/min_terminated_length": 198.2,
	"entropy": 0.1559869095683098,
	"epoch": 2.801302931596091,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 1.71875,
	"learning_rate": 6.731813246471228e-07,
	"loss": 0.0029,
	"num_tokens": 64580849.0,
	"reward": 0.953125,
	"reward_std": 0.02041158601641655,
	"rewards/qwen_accuracy_reward/mean": 0.953125,
	"rewards/qwen_accuracy_reward/std": 0.10132758170366288,
	"step": 860,
	"step_time": 35.53059697123244
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.01875,
	"completions/max_length": 1577.3,
	"completions/max_terminated_length": 1516.9,
	"completions/mean_length": 498.10625,
	"completions/mean_terminated_length": 433.7035827636719,
	"completions/min_length": 175.3,
	"completions/min_terminated_length": 175.3,
	"entropy": 0.1583547368645668,
	"epoch": 2.8338762214983713,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.0,
	"learning_rate": 5.646036916395223e-07,
	"loss": 0.0275,
	"num_tokens": 65364547.0,
	"reward": 0.965625,
	"reward_std": 0.03787454217672348,
	"rewards/qwen_accuracy_reward/mean": 0.965625,
	"rewards/qwen_accuracy_reward/std": 0.09508474618196487,
	"step": 870,
	"step_time": 47.49830629490316
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.00625,
	"completions/max_length": 1447.2,
	"completions/max_terminated_length": 1369.3,
	"completions/mean_length": 457.55625,
	"completions/mean_terminated_length": 435.249169921875,
	"completions/min_length": 181.7,
	"completions/min_terminated_length": 181.7,
	"entropy": 0.16112774163484572,
	"epoch": 2.8664495114006514,
	"frac_reward_zero_std": 0.875,
	"grad_norm": 1.265625,
	"learning_rate": 4.5602605863192187e-07,
	"loss": 0.0422,
	"num_tokens": 66148253.0,
	"reward": 0.978125,
	"reward_std": 0.04966200664639473,
	"rewards/qwen_accuracy_reward/mean": 0.978125,
	"rewards/qwen_accuracy_reward/std": 0.10221994370222091,
	"step": 880,
	"step_time": 43.340578782279046
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1843.3,
	"completions/max_terminated_length": 1843.3,
	"completions/mean_length": 497.509375,
	"completions/mean_terminated_length": 497.509375,
	"completions/min_length": 191.2,
	"completions/min_terminated_length": 191.2,
	"entropy": 0.17072843462228776,
	"epoch": 2.8990228013029316,
	"frac_reward_zero_std": 0.95,
	"grad_norm": 0.0,
	"learning_rate": 3.474484256243214e-07,
	"loss": 0.0145,
	"num_tokens": 66879112.0,
	"reward": 0.965625,
	"reward_std": 0.02041158601641655,
	"rewards/qwen_accuracy_reward/mean": 0.965625,
	"rewards/qwen_accuracy_reward/std": 0.08626527190208436,
	"step": 890,
	"step_time": 53.65781031670049
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.009375,
	"completions/max_length": 1572.3,
	"completions/max_terminated_length": 1323.3,
	"completions/mean_length": 452.584375,
	"completions/mean_terminated_length": 420.08506469726564,
	"completions/min_length": 194.6,
	"completions/min_terminated_length": 194.6,
	"entropy": 0.1676468499004841,
	"epoch": 2.9315960912052117,
	"frac_reward_zero_std": 0.925,
	"grad_norm": 0.828125,
	"learning_rate": 2.3887079261672097e-07,
	"loss": 0.0376,
	"num_tokens": 67548691.0,
	"reward": 0.915625,
	"reward_std": 0.036084231734275815,
	"rewards/qwen_accuracy_reward/mean": 0.915625,
	"rewards/qwen_accuracy_reward/std": 0.15649925023317338,
	"step": 900,
	"step_time": 47.34689696319401
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 1042.7,
	"completions/max_terminated_length": 1042.7,
	"completions/mean_length": 393.828125,
	"completions/mean_terminated_length": 393.828125,
	"completions/min_length": 186.9,
	"completions/min_terminated_length": 186.9,
	"entropy": 0.15125710666179656,
	"epoch": 2.964169381107492,
	"frac_reward_zero_std": 0.9,
	"grad_norm": 1.7109375,
	"learning_rate": 1.3029315960912054e-07,
	"loss": 0.0054,
	"num_tokens": 68345092.0,
	"reward": 0.959375,
	"reward_std": 0.04218914955854416,
	"rewards/qwen_accuracy_reward/mean": 0.959375,
	"rewards/qwen_accuracy_reward/std": 0.11388693749904633,
	"step": 910,
	"step_time": 32.79988148277626
	},
	{
	"clip_ratio/high_max": 0.0,
	"clip_ratio/high_mean": 0.0,
	"clip_ratio/low_mean": 0.0,
	"clip_ratio/low_min": 0.0,
	"clip_ratio/region_mean": 0.0,
	"completions/clipped_ratio": 0.0,
	"completions/max_length": 708.5,
	"completions/max_terminated_length": 708.5,
	"completions/mean_length": 309.83125,
	"completions/mean_terminated_length": 309.83125,
	"completions/min_length": 173.2,
	"completions/min_terminated_length": 173.2,
	"entropy": 0.13616923689842225,
	"epoch": 2.996742671009772,
	"frac_reward_zero_std": 1.0,
	"grad_norm": 0.0,
	"learning_rate": 2.171552660152009e-08,
	"loss": 0.0,
	"num_tokens": 69106806.0,
	"reward": 1.0,
	"reward_std": 0.0,
	"rewards/qwen_accuracy_reward/mean": 1.0,
	"rewards/qwen_accuracy_reward/std": 0.0,
	"step": 920,
	"step_time": 25.47872376209125
	}
	],
	"logging_steps": 10,
	"max_steps": 921,
	"num_input_tokens_seen": 69178036,
	"num_train_epochs": 3,
	"save_steps": 50,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": true
	},
	"attributes": {}
	}
	},
	"total_flos": 0.0,
	"train_batch_size": 4,
	"trial_name": null,
	"trial_params": null
	}