ToolPRM-GRPO-synthesis / trainer_state.json
wjldw's picture
Upload folder using huggingface_hub
a2cdc79 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 921,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021875,
"completions/max_length": 2397.7,
"completions/max_terminated_length": 2153.3,
"completions/mean_length": 724.20625,
"completions/mean_terminated_length": 655.9658874511719,
"completions/min_length": 158.7,
"completions/min_terminated_length": 158.7,
"entropy": 0.170973788946867,
"epoch": 0.03257328990228013,
"frac_reward_zero_std": 0.725,
"grad_norm": 1.03125,
"learning_rate": 9.90228013029316e-06,
"loss": 0.0209,
"num_tokens": 901722.0,
"reward": 0.671875,
"reward_std": 0.12951098531484603,
"rewards/qwen_accuracy_reward/mean": 0.671875,
"rewards/qwen_accuracy_reward/std": 0.32610869109630586,
"step": 10,
"step_time": 75.02407562928275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 2212.1,
"completions/max_terminated_length": 2131.1,
"completions/mean_length": 648.159375,
"completions/mean_terminated_length": 619.5659057617188,
"completions/min_length": 142.9,
"completions/min_terminated_length": 142.9,
"entropy": 0.16581312268972398,
"epoch": 0.06514657980456026,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.0,
"learning_rate": 9.79370249728556e-06,
"loss": 0.0216,
"num_tokens": 1827701.0,
"reward": 0.834375,
"reward_std": 0.09659009724855423,
"rewards/qwen_accuracy_reward/mean": 0.834375,
"rewards/qwen_accuracy_reward/std": 0.24483564049005507,
"step": 20,
"step_time": 66.24043246284127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 2206.0,
"completions/max_terminated_length": 2090.8,
"completions/mean_length": 623.284375,
"completions/mean_terminated_length": 558.5000427246093,
"completions/min_length": 171.1,
"completions/min_terminated_length": 171.1,
"entropy": 0.1667719691991806,
"epoch": 0.09771986970684039,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7890625,
"learning_rate": 9.68512486427796e-06,
"loss": 0.0347,
"num_tokens": 2654656.0,
"reward": 0.83125,
"reward_std": 0.09974638372659683,
"rewards/qwen_accuracy_reward/mean": 0.83125,
"rewards/qwen_accuracy_reward/std": 0.31891718655824663,
"step": 30,
"step_time": 67.77192380828782
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1337.0,
"completions/max_terminated_length": 1337.0,
"completions/mean_length": 469.159375,
"completions/mean_terminated_length": 469.159375,
"completions/min_length": 165.4,
"completions/min_terminated_length": 165.4,
"entropy": 0.15202879384160042,
"epoch": 0.13029315960912052,
"frac_reward_zero_std": 0.8,
"grad_norm": 1.875,
"learning_rate": 9.576547231270358e-06,
"loss": 0.0142,
"num_tokens": 3441667.0,
"reward": 0.8125,
"reward_std": 0.09931695759296418,
"rewards/qwen_accuracy_reward/mean": 0.8125,
"rewards/qwen_accuracy_reward/std": 0.3193816542625427,
"step": 40,
"step_time": 41.496644421108066
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 1534.5,
"completions/max_terminated_length": 1359.1,
"completions/mean_length": 505.4125,
"completions/mean_terminated_length": 475.48418579101565,
"completions/min_length": 166.3,
"completions/min_terminated_length": 166.3,
"entropy": 0.1465001180768013,
"epoch": 0.16286644951140064,
"frac_reward_zero_std": 0.825,
"grad_norm": 1.1640625,
"learning_rate": 9.467969598262759e-06,
"loss": 0.0048,
"num_tokens": 4235663.0,
"reward": 0.875,
"reward_std": 0.07596379667520523,
"rewards/qwen_accuracy_reward/mean": 0.875,
"rewards/qwen_accuracy_reward/std": 0.20005422383546828,
"step": 50,
"step_time": 47.53361711697653
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1282.1,
"completions/max_terminated_length": 1282.1,
"completions/mean_length": 475.225,
"completions/mean_terminated_length": 475.225,
"completions/min_length": 189.6,
"completions/min_terminated_length": 189.6,
"entropy": 0.15337296426296235,
"epoch": 0.19543973941368079,
"frac_reward_zero_std": 0.75,
"grad_norm": 1.640625,
"learning_rate": 9.359391965255158e-06,
"loss": 0.0229,
"num_tokens": 4959391.0,
"reward": 0.85,
"reward_std": 0.11794019415974617,
"rewards/qwen_accuracy_reward/mean": 0.85,
"rewards/qwen_accuracy_reward/std": 0.25456976890563965,
"step": 60,
"step_time": 36.53024397492409
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1927.4,
"completions/max_terminated_length": 1895.7,
"completions/mean_length": 612.075,
"completions/mean_terminated_length": 590.4368774414063,
"completions/min_length": 192.2,
"completions/min_terminated_length": 192.2,
"entropy": 0.16074557453393937,
"epoch": 0.2280130293159609,
"frac_reward_zero_std": 0.825,
"grad_norm": 1.4453125,
"learning_rate": 9.250814332247557e-06,
"loss": 0.0266,
"num_tokens": 5749223.0,
"reward": 0.803125,
"reward_std": 0.08783914744853974,
"rewards/qwen_accuracy_reward/mean": 0.803125,
"rewards/qwen_accuracy_reward/std": 0.2966747134923935,
"step": 70,
"step_time": 57.99369401996955
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2189.3,
"completions/max_terminated_length": 2009.3,
"completions/mean_length": 690.7125,
"completions/mean_terminated_length": 586.1856994628906,
"completions/min_length": 188.9,
"completions/min_terminated_length": 188.9,
"entropy": 0.15213419646024703,
"epoch": 0.26058631921824105,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0,
"learning_rate": 9.142236699239957e-06,
"loss": 0.0282,
"num_tokens": 6565907.0,
"reward": 0.878125,
"reward_std": 0.08617057129740716,
"rewards/qwen_accuracy_reward/mean": 0.878125,
"rewards/qwen_accuracy_reward/std": 0.27245663553476335,
"step": 80,
"step_time": 65.59074299260973
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1602.7,
"completions/max_terminated_length": 1467.6,
"completions/mean_length": 467.484375,
"completions/mean_terminated_length": 456.31884765625,
"completions/min_length": 179.3,
"completions/min_terminated_length": 179.3,
"entropy": 0.1368262179195881,
"epoch": 0.2931596091205212,
"frac_reward_zero_std": 0.85,
"grad_norm": 0.0,
"learning_rate": 9.033659066232356e-06,
"loss": 0.004,
"num_tokens": 7334102.0,
"reward": 0.88125,
"reward_std": 0.06123279631137848,
"rewards/qwen_accuracy_reward/mean": 0.88125,
"rewards/qwen_accuracy_reward/std": 0.25089033097028735,
"step": 90,
"step_time": 48.28751948485151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2426.9,
"completions/max_terminated_length": 2137.6,
"completions/mean_length": 604.2625,
"completions/mean_terminated_length": 548.6393463134766,
"completions/min_length": 179.7,
"completions/min_terminated_length": 179.7,
"entropy": 0.15090147852897645,
"epoch": 0.3257328990228013,
"frac_reward_zero_std": 0.825,
"grad_norm": 1.625,
"learning_rate": 8.925081433224755e-06,
"loss": 0.0856,
"num_tokens": 8127226.0,
"reward": 0.84375,
"reward_std": 0.07459585815668106,
"rewards/qwen_accuracy_reward/mean": 0.84375,
"rewards/qwen_accuracy_reward/std": 0.29288421422243116,
"step": 100,
"step_time": 70.59187124017626
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 2326.5,
"completions/max_terminated_length": 2284.1,
"completions/mean_length": 609.134375,
"completions/mean_terminated_length": 541.8694732666015,
"completions/min_length": 179.7,
"completions/min_terminated_length": 179.7,
"entropy": 0.17526374608278275,
"epoch": 0.3583061889250814,
"frac_reward_zero_std": 0.775,
"grad_norm": 1.2109375,
"learning_rate": 8.816503800217156e-06,
"loss": -0.0395,
"num_tokens": 8923405.0,
"reward": 0.84375,
"reward_std": 0.10457713454961777,
"rewards/qwen_accuracy_reward/mean": 0.84375,
"rewards/qwen_accuracy_reward/std": 0.2856591001152992,
"step": 110,
"step_time": 64.31981013910845
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2472.2,
"completions/max_terminated_length": 2434.0,
"completions/mean_length": 614.309375,
"completions/mean_terminated_length": 508.3423645019531,
"completions/min_length": 206.9,
"completions/min_terminated_length": 206.9,
"entropy": 0.14617881700396537,
"epoch": 0.39087947882736157,
"frac_reward_zero_std": 0.85,
"grad_norm": 1.34375,
"learning_rate": 8.707926167209557e-06,
"loss": -0.0058,
"num_tokens": 9650464.0,
"reward": 0.934375,
"reward_std": 0.06943454667925834,
"rewards/qwen_accuracy_reward/mean": 0.934375,
"rewards/qwen_accuracy_reward/std": 0.1803007885813713,
"step": 120,
"step_time": 71.93199644116685
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 1687.5,
"completions/max_terminated_length": 1578.9,
"completions/mean_length": 506.765625,
"completions/mean_terminated_length": 475.7398681640625,
"completions/min_length": 197.5,
"completions/min_terminated_length": 197.5,
"entropy": 0.14087174832820892,
"epoch": 0.4234527687296417,
"frac_reward_zero_std": 0.825,
"grad_norm": 0.83984375,
"learning_rate": 8.599348534201956e-06,
"loss": 0.0126,
"num_tokens": 10481205.0,
"reward": 0.91875,
"reward_std": 0.071863903850317,
"rewards/qwen_accuracy_reward/mean": 0.91875,
"rewards/qwen_accuracy_reward/std": 0.16812221705913544,
"step": 130,
"step_time": 51.10968422973529
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 2098.5,
"completions/max_terminated_length": 1797.3,
"completions/mean_length": 617.415625,
"completions/mean_terminated_length": 556.195068359375,
"completions/min_length": 178.4,
"completions/min_terminated_length": 178.4,
"entropy": 0.15323501601815223,
"epoch": 0.4560260586319218,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 8.490770901194355e-06,
"loss": 0.0577,
"num_tokens": 11246506.0,
"reward": 0.884375,
"reward_std": 0.06102004498243332,
"rewards/qwen_accuracy_reward/mean": 0.884375,
"rewards/qwen_accuracy_reward/std": 0.18740518838167192,
"step": 140,
"step_time": 61.85610852092505
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 2084.5,
"completions/max_terminated_length": 1994.2,
"completions/mean_length": 541.646875,
"completions/mean_terminated_length": 509.9564208984375,
"completions/min_length": 190.2,
"completions/min_terminated_length": 190.2,
"entropy": 0.13484818413853644,
"epoch": 0.48859934853420195,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 8.382193268186755e-06,
"loss": 0.0238,
"num_tokens": 12068721.0,
"reward": 0.93125,
"reward_std": 0.03335031494498253,
"rewards/qwen_accuracy_reward/mean": 0.93125,
"rewards/qwen_accuracy_reward/std": 0.147479148209095,
"step": 150,
"step_time": 60.58088333830237
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2113.6,
"completions/max_terminated_length": 1865.1,
"completions/mean_length": 617.86875,
"completions/mean_terminated_length": 564.8990783691406,
"completions/min_length": 176.8,
"completions/min_terminated_length": 176.8,
"entropy": 0.14884034767746926,
"epoch": 0.5211726384364821,
"frac_reward_zero_std": 0.825,
"grad_norm": 1.15625,
"learning_rate": 8.273615635179154e-06,
"loss": 0.0349,
"num_tokens": 12860375.0,
"reward": 0.90625,
"reward_std": 0.07280554771423339,
"rewards/qwen_accuracy_reward/mean": 0.90625,
"rewards/qwen_accuracy_reward/std": 0.19390300512313843,
"step": 160,
"step_time": 61.31780819287523
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2070.4,
"completions/max_terminated_length": 1955.4,
"completions/mean_length": 525.28125,
"completions/mean_terminated_length": 470.9878173828125,
"completions/min_length": 199.4,
"completions/min_terminated_length": 199.4,
"entropy": 0.1467311643064022,
"epoch": 0.5537459283387622,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 8.165038002171553e-06,
"loss": 0.0292,
"num_tokens": 13584777.0,
"reward": 0.884375,
"reward_std": 0.05376190170645714,
"rewards/qwen_accuracy_reward/mean": 0.884375,
"rewards/qwen_accuracy_reward/std": 0.2119799315929413,
"step": 170,
"step_time": 61.066531581245364
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1569.3,
"completions/max_terminated_length": 1385.7,
"completions/mean_length": 477.046875,
"completions/mean_terminated_length": 466.2573547363281,
"completions/min_length": 198.5,
"completions/min_terminated_length": 198.5,
"entropy": 0.14774601608514787,
"epoch": 0.5863192182410424,
"frac_reward_zero_std": 0.925,
"grad_norm": 1.0546875,
"learning_rate": 8.056460369163954e-06,
"loss": 0.06,
"num_tokens": 14346864.0,
"reward": 0.9375,
"reward_std": 0.02925042062997818,
"rewards/qwen_accuracy_reward/mean": 0.9375,
"rewards/qwen_accuracy_reward/std": 0.13194561302661895,
"step": 180,
"step_time": 47.719357285648584
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 2356.5,
"completions/max_terminated_length": 1938.6,
"completions/mean_length": 530.79375,
"completions/mean_terminated_length": 508.41269836425784,
"completions/min_length": 179.6,
"completions/min_terminated_length": 179.6,
"entropy": 0.15401604473590852,
"epoch": 0.6188925081433225,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 7.947882736156353e-06,
"loss": 0.073,
"num_tokens": 15077166.0,
"reward": 0.928125,
"reward_std": 0.03808925524353981,
"rewards/qwen_accuracy_reward/mean": 0.928125,
"rewards/qwen_accuracy_reward/std": 0.16100659370422363,
"step": 190,
"step_time": 68.03254930684344
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 2111.2,
"completions/max_terminated_length": 2089.6,
"completions/mean_length": 532.41875,
"completions/mean_terminated_length": 511.9100036621094,
"completions/min_length": 199.4,
"completions/min_terminated_length": 199.4,
"entropy": 0.14272007048130037,
"epoch": 0.6514657980456026,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 7.839305103148752e-06,
"loss": 0.038,
"num_tokens": 15699220.0,
"reward": 0.928125,
"reward_std": 0.0245114803314209,
"rewards/qwen_accuracy_reward/mean": 0.928125,
"rewards/qwen_accuracy_reward/std": 0.11228372007608414,
"step": 200,
"step_time": 61.566719483956696
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1495.5,
"completions/max_terminated_length": 1268.8,
"completions/mean_length": 497.496875,
"completions/mean_terminated_length": 442.867822265625,
"completions/min_length": 186.1,
"completions/min_terminated_length": 186.1,
"entropy": 0.13395386636257173,
"epoch": 0.6840390879478827,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 7.730727470141152e-06,
"loss": 0.0028,
"num_tokens": 16564171.0,
"reward": 0.9625,
"reward_std": 0.02177756354212761,
"rewards/qwen_accuracy_reward/mean": 0.9625,
"rewards/qwen_accuracy_reward/std": 0.07889154553413391,
"step": 210,
"step_time": 45.37072062129155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1425.8,
"completions/max_terminated_length": 1415.7,
"completions/mean_length": 453.871875,
"completions/mean_terminated_length": 444.69354248046875,
"completions/min_length": 166.8,
"completions/min_terminated_length": 166.8,
"entropy": 0.13297367617487907,
"epoch": 0.7166123778501629,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 7.622149837133551e-06,
"loss": 0.0147,
"num_tokens": 17330410.0,
"reward": 0.940625,
"reward_std": 0.022201896458864213,
"rewards/qwen_accuracy_reward/mean": 0.940625,
"rewards/qwen_accuracy_reward/std": 0.11959655284881592,
"step": 220,
"step_time": 44.74475174760446
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 1833.4,
"completions/max_terminated_length": 1752.4,
"completions/mean_length": 501.321875,
"completions/mean_terminated_length": 469.6424560546875,
"completions/min_length": 171.5,
"completions/min_terminated_length": 171.5,
"entropy": 0.14848560467362404,
"epoch": 0.749185667752443,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 7.51357220412595e-06,
"loss": 0.022,
"num_tokens": 18088737.0,
"reward": 0.928125,
"reward_std": 0.04397946000099182,
"rewards/qwen_accuracy_reward/mean": 0.928125,
"rewards/qwen_accuracy_reward/std": 0.13159393817186354,
"step": 230,
"step_time": 54.10395782412961
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1583.8,
"completions/max_terminated_length": 1528.0,
"completions/mean_length": 411.75625,
"completions/mean_terminated_length": 400.55755615234375,
"completions/min_length": 159.9,
"completions/min_terminated_length": 159.9,
"entropy": 0.13070192262530328,
"epoch": 0.7817589576547231,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 7.40499457111835e-06,
"loss": 0.0496,
"num_tokens": 18793955.0,
"reward": 0.95625,
"reward_std": 0.03104073107242584,
"rewards/qwen_accuracy_reward/mean": 0.95625,
"rewards/qwen_accuracy_reward/std": 0.08069398403167724,
"step": 240,
"step_time": 47.35719826500863
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1359.2,
"completions/max_terminated_length": 1338.3,
"completions/mean_length": 481.21875,
"completions/mean_terminated_length": 461.47271118164065,
"completions/min_length": 200.0,
"completions/min_terminated_length": 200.0,
"entropy": 0.14220248386263848,
"epoch": 0.8143322475570033,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 7.29641693811075e-06,
"loss": 0.0184,
"num_tokens": 19454497.0,
"reward": 0.95625,
"reward_std": 0.05418623313307762,
"rewards/qwen_accuracy_reward/mean": 0.95625,
"rewards/qwen_accuracy_reward/std": 0.11881711781024933,
"step": 250,
"step_time": 41.61551207816228
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1620.3,
"completions/max_terminated_length": 1620.3,
"completions/mean_length": 448.68125,
"completions/mean_terminated_length": 448.68125,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"entropy": 0.13489690721035003,
"epoch": 0.8469055374592834,
"frac_reward_zero_std": 0.85,
"grad_norm": 1.7421875,
"learning_rate": 7.187839305103149e-06,
"loss": 0.0,
"num_tokens": 20191547.0,
"reward": 0.925,
"reward_std": 0.07259083464741707,
"rewards/qwen_accuracy_reward/mean": 0.925,
"rewards/qwen_accuracy_reward/std": 0.19271825700998307,
"step": 260,
"step_time": 45.474128680489954
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.025,
"completions/max_length": 1471.7,
"completions/max_terminated_length": 1424.8,
"completions/mean_length": 519.99375,
"completions/mean_terminated_length": 437.50208740234376,
"completions/min_length": 181.7,
"completions/min_terminated_length": 181.7,
"entropy": 0.13938435539603233,
"epoch": 0.8794788273615635,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.91015625,
"learning_rate": 7.079261672095549e-06,
"loss": -0.0167,
"num_tokens": 21039881.0,
"reward": 0.93125,
"reward_std": 0.051027984172105786,
"rewards/qwen_accuracy_reward/mean": 0.93125,
"rewards/qwen_accuracy_reward/std": 0.15987386405467988,
"step": 270,
"step_time": 45.94509084094316
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1812.2,
"completions/max_terminated_length": 1642.1,
"completions/mean_length": 489.865625,
"completions/mean_terminated_length": 469.1300048828125,
"completions/min_length": 197.8,
"completions/min_terminated_length": 197.8,
"entropy": 0.14942506179213524,
"epoch": 0.9120521172638436,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 6.9706840390879485e-06,
"loss": 0.0433,
"num_tokens": 21713262.0,
"reward": 0.96875,
"reward_std": 0.01767766922712326,
"rewards/qwen_accuracy_reward/mean": 0.96875,
"rewards/qwen_accuracy_reward/std": 0.06858760267496108,
"step": 280,
"step_time": 54.875936476886274
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1379.1,
"completions/max_terminated_length": 1379.1,
"completions/mean_length": 441.2125,
"completions/mean_terminated_length": 441.2125,
"completions/min_length": 184.5,
"completions/min_terminated_length": 184.5,
"entropy": 0.14119350165128708,
"epoch": 0.9446254071661238,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 6.8621064060803475e-06,
"loss": -0.0025,
"num_tokens": 22443962.0,
"reward": 0.95625,
"reward_std": 0.02177756354212761,
"rewards/qwen_accuracy_reward/mean": 0.95625,
"rewards/qwen_accuracy_reward/std": 0.09856200665235519,
"step": 290,
"step_time": 41.87264884654432
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1268.2,
"completions/max_terminated_length": 1268.2,
"completions/mean_length": 434.421875,
"completions/mean_terminated_length": 434.421875,
"completions/min_length": 206.3,
"completions/min_terminated_length": 206.3,
"entropy": 0.1490817114710808,
"epoch": 0.9771986970684039,
"frac_reward_zero_std": 0.925,
"grad_norm": 1.0546875,
"learning_rate": 6.753528773072747e-06,
"loss": 0.0056,
"num_tokens": 23106601.0,
"reward": 0.953125,
"reward_std": 0.03061639815568924,
"rewards/qwen_accuracy_reward/mean": 0.953125,
"rewards/qwen_accuracy_reward/std": 0.11623967587947845,
"step": 300,
"step_time": 39.61534147607163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1601.1,
"completions/max_terminated_length": 1574.2,
"completions/mean_length": 469.58125,
"completions/mean_terminated_length": 447.9079223632813,
"completions/min_length": 181.1,
"completions/min_terminated_length": 181.1,
"entropy": 0.13837436586618423,
"epoch": 1.009771986970684,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 6.644951140065147e-06,
"loss": 0.029,
"num_tokens": 23888067.0,
"reward": 0.94375,
"reward_std": 0.02925042062997818,
"rewards/qwen_accuracy_reward/mean": 0.94375,
"rewards/qwen_accuracy_reward/std": 0.12826661467552186,
"step": 310,
"step_time": 47.85272020176053
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1411.9,
"completions/max_terminated_length": 1406.8,
"completions/mean_length": 404.721875,
"completions/mean_terminated_length": 393.82207641601565,
"completions/min_length": 183.3,
"completions/min_terminated_length": 183.3,
"entropy": 0.1374943107366562,
"epoch": 1.0423452768729642,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 6.536373507057546e-06,
"loss": 0.0265,
"num_tokens": 24727370.0,
"reward": 0.996875,
"reward_std": 0.00883883461356163,
"rewards/qwen_accuracy_reward/mean": 0.996875,
"rewards/qwen_accuracy_reward/std": 0.01767766922712326,
"step": 320,
"step_time": 44.45478741144761
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0125,
"completions/max_length": 1119.1,
"completions/max_terminated_length": 1095.2,
"completions/mean_length": 461.95,
"completions/mean_terminated_length": 424.2075927734375,
"completions/min_length": 204.7,
"completions/min_terminated_length": 204.7,
"entropy": 0.15105342343449593,
"epoch": 1.0749185667752443,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 6.427795874049946e-06,
"loss": 0.0436,
"num_tokens": 25442066.0,
"reward": 0.978125,
"reward_std": 0.04218915030360222,
"rewards/qwen_accuracy_reward/mean": 0.978125,
"rewards/qwen_accuracy_reward/std": 0.07587221264839172,
"step": 330,
"step_time": 35.703045930247754
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1083.0,
"completions/max_terminated_length": 1083.0,
"completions/mean_length": 374.3875,
"completions/mean_terminated_length": 374.3875,
"completions/min_length": 168.6,
"completions/min_terminated_length": 168.6,
"entropy": 0.14668092131614685,
"epoch": 1.1074918566775245,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 6.319218241042345e-06,
"loss": 0.012,
"num_tokens": 26188086.0,
"reward": 0.946875,
"reward_std": 0.02651650384068489,
"rewards/qwen_accuracy_reward/mean": 0.946875,
"rewards/qwen_accuracy_reward/std": 0.1056659385561943,
"step": 340,
"step_time": 34.11438843393698
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0125,
"completions/max_length": 1756.2,
"completions/max_terminated_length": 1669.5,
"completions/mean_length": 522.075,
"completions/mean_terminated_length": 480.16993408203126,
"completions/min_length": 191.0,
"completions/min_terminated_length": 191.0,
"entropy": 0.16799205988645555,
"epoch": 1.1400651465798046,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 6.2106406080347455e-06,
"loss": 0.0095,
"num_tokens": 26977806.0,
"reward": 0.896875,
"reward_std": 0.02651650384068489,
"rewards/qwen_accuracy_reward/mean": 0.896875,
"rewards/qwen_accuracy_reward/std": 0.13685612380504608,
"step": 350,
"step_time": 53.24436394525692
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1369.8,
"completions/max_terminated_length": 1369.8,
"completions/mean_length": 441.03125,
"completions/mean_terminated_length": 441.03125,
"completions/min_length": 186.2,
"completions/min_terminated_length": 186.2,
"entropy": 0.16459481716156005,
"epoch": 1.1726384364820848,
"frac_reward_zero_std": 0.925,
"grad_norm": 1.546875,
"learning_rate": 6.102062975027145e-06,
"loss": -0.0057,
"num_tokens": 27647848.0,
"reward": 0.959375,
"reward_std": 0.03061639815568924,
"rewards/qwen_accuracy_reward/mean": 0.959375,
"rewards/qwen_accuracy_reward/std": 0.09297246783971787,
"step": 360,
"step_time": 38.45815520407632
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1612.2,
"completions/max_terminated_length": 1579.0,
"completions/mean_length": 457.859375,
"completions/mean_terminated_length": 447.72197265625,
"completions/min_length": 166.2,
"completions/min_terminated_length": 166.2,
"entropy": 0.16912921741604806,
"epoch": 1.205211726384365,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 5.993485342019545e-06,
"loss": -0.0288,
"num_tokens": 28297795.0,
"reward": 0.925,
"reward_std": 0.0408231720328331,
"rewards/qwen_accuracy_reward/mean": 0.925,
"rewards/qwen_accuracy_reward/std": 0.16792239248752594,
"step": 370,
"step_time": 47.60929348124191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1762.3,
"completions/max_terminated_length": 1711.7,
"completions/mean_length": 454.509375,
"completions/mean_terminated_length": 443.2507049560547,
"completions/min_length": 187.1,
"completions/min_terminated_length": 187.1,
"entropy": 0.16176492720842361,
"epoch": 1.237785016286645,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 5.884907709011944e-06,
"loss": 0.0015,
"num_tokens": 29009246.0,
"reward": 0.9625,
"reward_std": 0.013363061845302582,
"rewards/qwen_accuracy_reward/mean": 0.9625,
"rewards/qwen_accuracy_reward/std": 0.07759521007537842,
"step": 380,
"step_time": 53.365480937343094
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1627.8,
"completions/max_terminated_length": 1543.4,
"completions/mean_length": 474.075,
"completions/mean_terminated_length": 463.8240905761719,
"completions/min_length": 193.1,
"completions/min_terminated_length": 193.1,
"entropy": 0.17872475683689118,
"epoch": 1.2703583061889252,
"frac_reward_zero_std": 0.85,
"grad_norm": 0.0,
"learning_rate": 5.776330076004344e-06,
"loss": 0.0233,
"num_tokens": 29602982.0,
"reward": 0.86875,
"reward_std": 0.06123279631137848,
"rewards/qwen_accuracy_reward/mean": 0.86875,
"rewards/qwen_accuracy_reward/std": 0.23578283339738845,
"step": 390,
"step_time": 49.23055710773915
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1638.8,
"completions/max_terminated_length": 1631.1,
"completions/mean_length": 486.021875,
"completions/mean_terminated_length": 475.7992919921875,
"completions/min_length": 195.0,
"completions/min_terminated_length": 195.0,
"entropy": 0.15955362915992738,
"epoch": 1.3029315960912053,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 5.667752442996744e-06,
"loss": 0.0137,
"num_tokens": 30373933.0,
"reward": 0.946875,
"reward_std": 0.00883883461356163,
"rewards/qwen_accuracy_reward/mean": 0.946875,
"rewards/qwen_accuracy_reward/std": 0.1056659385561943,
"step": 400,
"step_time": 50.148436666186896
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1311.0,
"completions/max_terminated_length": 1311.0,
"completions/mean_length": 428.321875,
"completions/mean_terminated_length": 428.321875,
"completions/min_length": 169.8,
"completions/min_terminated_length": 169.8,
"entropy": 0.1585499659180641,
"epoch": 1.3355048859934853,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 5.559174809989143e-06,
"loss": -0.0122,
"num_tokens": 31204012.0,
"reward": 0.9625,
"reward_std": 0.042613483220338824,
"rewards/qwen_accuracy_reward/mean": 0.9625,
"rewards/qwen_accuracy_reward/std": 0.09328008741140366,
"step": 410,
"step_time": 40.44286519419402
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1511.4,
"completions/max_terminated_length": 1400.9,
"completions/mean_length": 477.434375,
"completions/mean_terminated_length": 455.53375244140625,
"completions/min_length": 214.7,
"completions/min_terminated_length": 214.7,
"entropy": 0.17368159890174867,
"epoch": 1.3680781758957654,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 5.4505971769815425e-06,
"loss": 0.0329,
"num_tokens": 31967207.0,
"reward": 0.99375,
"reward_std": 0.011572751402854919,
"rewards/qwen_accuracy_reward/mean": 0.99375,
"rewards/qwen_accuracy_reward/std": 0.024593468010425567,
"step": 420,
"step_time": 47.28061485029757
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1253.4,
"completions/max_terminated_length": 1253.4,
"completions/mean_length": 427.68125,
"completions/mean_terminated_length": 427.68125,
"completions/min_length": 184.1,
"completions/min_terminated_length": 184.1,
"entropy": 0.16302806735038758,
"epoch": 1.4006514657980456,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 5.342019543973942e-06,
"loss": 0.0154,
"num_tokens": 32547929.0,
"reward": 0.9875,
"reward_std": 0.02177756354212761,
"rewards/qwen_accuracy_reward/mean": 0.9875,
"rewards/qwen_accuracy_reward/std": 0.04729212671518326,
"step": 430,
"step_time": 39.09927195487544
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1847.1,
"completions/max_terminated_length": 1769.7,
"completions/mean_length": 478.76875,
"completions/mean_terminated_length": 468.09132690429686,
"completions/min_length": 192.7,
"completions/min_terminated_length": 192.7,
"entropy": 0.16723438948392869,
"epoch": 1.4332247557003257,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 5.233441910966341e-06,
"loss": 0.0242,
"num_tokens": 33257383.0,
"reward": 0.94375,
"reward_std": 0.02925042062997818,
"rewards/qwen_accuracy_reward/mean": 0.94375,
"rewards/qwen_accuracy_reward/std": 0.12826661467552186,
"step": 440,
"step_time": 54.6911054097116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1209.8,
"completions/max_terminated_length": 1209.8,
"completions/mean_length": 372.3625,
"completions/mean_terminated_length": 372.3625,
"completions/min_length": 177.4,
"completions/min_terminated_length": 177.4,
"entropy": 0.15510803908109666,
"epoch": 1.4657980456026058,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 5.124864277958741e-06,
"loss": 0.0094,
"num_tokens": 33972827.0,
"reward": 0.978125,
"reward_std": 0.036084231734275815,
"rewards/qwen_accuracy_reward/mean": 0.978125,
"rewards/qwen_accuracy_reward/std": 0.07880139350891113,
"step": 450,
"step_time": 38.53117633331567
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1607.4,
"completions/max_terminated_length": 1607.4,
"completions/mean_length": 427.8875,
"completions/mean_terminated_length": 427.8875,
"completions/min_length": 173.4,
"completions/min_terminated_length": 173.4,
"entropy": 0.15332257747650146,
"epoch": 1.498371335504886,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 5.016286644951141e-06,
"loss": 0.0191,
"num_tokens": 34752895.0,
"reward": 0.95,
"reward_std": 0.03535533845424652,
"rewards/qwen_accuracy_reward/mean": 0.95,
"rewards/qwen_accuracy_reward/std": 0.10367314666509628,
"step": 460,
"step_time": 46.25087994951755
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1709.7,
"completions/max_terminated_length": 1703.5,
"completions/mean_length": 474.934375,
"completions/mean_terminated_length": 464.940625,
"completions/min_length": 197.1,
"completions/min_terminated_length": 197.1,
"entropy": 0.15956022590398788,
"epoch": 1.5309446254071661,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 4.90770901194354e-06,
"loss": 0.0197,
"num_tokens": 35566530.0,
"reward": 0.95625,
"reward_std": 0.03335031494498253,
"rewards/qwen_accuracy_reward/mean": 0.95625,
"rewards/qwen_accuracy_reward/std": 0.11587972939014435,
"step": 470,
"step_time": 52.431317151151596
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.021875,
"completions/max_length": 1856.4,
"completions/max_terminated_length": 1411.0,
"completions/mean_length": 556.703125,
"completions/mean_terminated_length": 482.36143188476564,
"completions/min_length": 201.5,
"completions/min_terminated_length": 201.5,
"entropy": 0.1640054076910019,
"epoch": 1.5635179153094463,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 4.79913137893594e-06,
"loss": 0.0122,
"num_tokens": 36332819.0,
"reward": 0.90625,
"reward_std": 0.011572751402854919,
"rewards/qwen_accuracy_reward/mean": 0.90625,
"rewards/qwen_accuracy_reward/std": 0.17163818180561066,
"step": 480,
"step_time": 55.61120590567589
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1141.9,
"completions/max_terminated_length": 1141.9,
"completions/mean_length": 371.696875,
"completions/mean_terminated_length": 371.696875,
"completions/min_length": 172.6,
"completions/min_terminated_length": 172.6,
"entropy": 0.13739149868488312,
"epoch": 1.5960912052117264,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 4.690553745928339e-06,
"loss": 0.0168,
"num_tokens": 37149242.0,
"reward": 0.984375,
"reward_std": 0.022201896458864213,
"rewards/qwen_accuracy_reward/mean": 0.984375,
"rewards/qwen_accuracy_reward/std": 0.05127874463796615,
"step": 490,
"step_time": 37.01674561398104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 1862.1,
"completions/max_terminated_length": 1621.2,
"completions/mean_length": 530.93125,
"completions/mean_terminated_length": 478.1709289550781,
"completions/min_length": 202.8,
"completions/min_terminated_length": 202.8,
"entropy": 0.16189506649971008,
"epoch": 1.6286644951140063,
"frac_reward_zero_std": 0.85,
"grad_norm": 0.8828125,
"learning_rate": 4.5819761129207385e-06,
"loss": 0.0709,
"num_tokens": 37935508.0,
"reward": 0.946875,
"reward_std": 0.06396671310067177,
"rewards/qwen_accuracy_reward/mean": 0.946875,
"rewards/qwen_accuracy_reward/std": 0.1611790642142296,
"step": 500,
"step_time": 57.11876249546185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1254.1,
"completions/max_terminated_length": 1167.1,
"completions/mean_length": 411.725,
"completions/mean_terminated_length": 400.9833679199219,
"completions/min_length": 165.3,
"completions/min_terminated_length": 165.3,
"entropy": 0.14960483461618423,
"epoch": 1.6612377850162865,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 4.473398479913138e-06,
"loss": 0.0285,
"num_tokens": 38658348.0,
"reward": 0.99375,
"reward_std": 0.01767766922712326,
"rewards/qwen_accuracy_reward/mean": 0.99375,
"rewards/qwen_accuracy_reward/std": 0.03535533845424652,
"step": 510,
"step_time": 37.5699773571454
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 1664.6,
"completions/max_terminated_length": 1541.8,
"completions/mean_length": 507.371875,
"completions/mean_terminated_length": 477.1740295410156,
"completions/min_length": 199.8,
"completions/min_terminated_length": 199.8,
"entropy": 0.16616563498973846,
"epoch": 1.6938110749185666,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 4.364820846905538e-06,
"loss": 0.0223,
"num_tokens": 39415795.0,
"reward": 0.978125,
"reward_std": 0.036084231734275815,
"rewards/qwen_accuracy_reward/mean": 0.978125,
"rewards/qwen_accuracy_reward/std": 0.061483670771121976,
"step": 520,
"step_time": 49.47790257129818
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1499.9,
"completions/max_terminated_length": 1499.9,
"completions/mean_length": 381.7875,
"completions/mean_terminated_length": 381.7875,
"completions/min_length": 178.3,
"completions/min_terminated_length": 178.3,
"entropy": 0.1455918937921524,
"epoch": 1.7263843648208468,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 4.256243213897938e-06,
"loss": 0.0,
"num_tokens": 40196743.0,
"reward": 0.975,
"reward_std": 0.0,
"rewards/qwen_accuracy_reward/mean": 0.975,
"rewards/qwen_accuracy_reward/std": 0.04399413466453552,
"step": 530,
"step_time": 45.42760537136346
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 1877.8,
"completions/max_terminated_length": 1709.0,
"completions/mean_length": 515.225,
"completions/mean_terminated_length": 482.8147033691406,
"completions/min_length": 195.3,
"completions/min_terminated_length": 195.3,
"entropy": 0.16437555029988288,
"epoch": 1.758957654723127,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 4.147665580890337e-06,
"loss": 0.0303,
"num_tokens": 41000151.0,
"reward": 0.975,
"reward_std": 0.04261348247528076,
"rewards/qwen_accuracy_reward/mean": 0.975,
"rewards/qwen_accuracy_reward/std": 0.09354988187551498,
"step": 540,
"step_time": 57.456473257485776
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1165.0,
"completions/max_terminated_length": 1165.0,
"completions/mean_length": 423.43125,
"completions/mean_terminated_length": 423.43125,
"completions/min_length": 190.2,
"completions/min_terminated_length": 190.2,
"entropy": 0.16483787596225738,
"epoch": 1.791530944625407,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 4.039087947882737e-06,
"loss": -0.0021,
"num_tokens": 41720497.0,
"reward": 0.9125,
"reward_std": 0.013363061845302582,
"rewards/qwen_accuracy_reward/mean": 0.9125,
"rewards/qwen_accuracy_reward/std": 0.12839525938034058,
"step": 550,
"step_time": 36.695044124592094
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1311.7,
"completions/max_terminated_length": 1305.7,
"completions/mean_length": 479.98125,
"completions/mean_terminated_length": 470.1879028320312,
"completions/min_length": 199.5,
"completions/min_terminated_length": 199.5,
"entropy": 0.1649734303355217,
"epoch": 1.8241042345276872,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 3.9305103148751365e-06,
"loss": 0.0047,
"num_tokens": 42491363.0,
"reward": 0.9875,
"reward_std": 0.02925042062997818,
"rewards/qwen_accuracy_reward/mean": 0.9875,
"rewards/qwen_accuracy_reward/std": 0.059948806464672086,
"step": 560,
"step_time": 41.12004605270922
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2218.3,
"completions/max_terminated_length": 1861.4,
"completions/mean_length": 548.38125,
"completions/mean_terminated_length": 494.34682006835936,
"completions/min_length": 183.9,
"completions/min_terminated_length": 183.9,
"entropy": 0.17041560113430024,
"epoch": 1.8566775244299674,
"frac_reward_zero_std": 0.85,
"grad_norm": 1.40625,
"learning_rate": 3.8219326818675354e-06,
"loss": 0.0604,
"num_tokens": 43141245.0,
"reward": 0.91875,
"reward_std": 0.06260073557496071,
"rewards/qwen_accuracy_reward/mean": 0.91875,
"rewards/qwen_accuracy_reward/std": 0.19519128501415253,
"step": 570,
"step_time": 65.7936801508069
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1056.9,
"completions/max_terminated_length": 1056.9,
"completions/mean_length": 378.003125,
"completions/mean_terminated_length": 378.003125,
"completions/min_length": 188.2,
"completions/min_terminated_length": 188.2,
"entropy": 0.15374673902988434,
"epoch": 1.8892508143322475,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 3.7133550488599353e-06,
"loss": 0.0,
"num_tokens": 43920014.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/qwen_accuracy_reward/mean": 1.0,
"rewards/qwen_accuracy_reward/std": 0.0,
"step": 580,
"step_time": 34.265030450746416
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 1124.7,
"completions/max_terminated_length": 1011.9,
"completions/mean_length": 395.740625,
"completions/mean_terminated_length": 361.8580810546875,
"completions/min_length": 183.3,
"completions/min_terminated_length": 183.3,
"entropy": 0.15494132190942764,
"epoch": 1.9218241042345277,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 3.6047774158523346e-06,
"loss": 0.0321,
"num_tokens": 44712795.0,
"reward": 0.9875,
"reward_std": 0.02177756354212761,
"rewards/qwen_accuracy_reward/mean": 0.9875,
"rewards/qwen_accuracy_reward/std": 0.04729212671518326,
"step": 590,
"step_time": 37.20974960550666
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1213.0,
"completions/max_terminated_length": 1213.0,
"completions/mean_length": 404.8125,
"completions/mean_terminated_length": 404.8125,
"completions/min_length": 192.2,
"completions/min_terminated_length": 192.2,
"entropy": 0.13962563052773475,
"epoch": 1.9543973941368078,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 3.496199782844734e-06,
"loss": 0.0076,
"num_tokens": 45496631.0,
"reward": 0.9125,
"reward_std": 0.013363061845302582,
"rewards/qwen_accuracy_reward/mean": 0.9125,
"rewards/qwen_accuracy_reward/std": 0.16558347940444945,
"step": 600,
"step_time": 38.92747511789203
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 1518.7,
"completions/max_terminated_length": 916.1,
"completions/mean_length": 421.90625,
"completions/mean_terminated_length": 352.30673828125,
"completions/min_length": 176.6,
"completions/min_terminated_length": 176.6,
"entropy": 0.15325831845402718,
"epoch": 1.986970684039088,
"frac_reward_zero_std": 0.9,
"grad_norm": 0.0,
"learning_rate": 3.387622149837134e-06,
"loss": 0.1029,
"num_tokens": 46185953.0,
"reward": 0.91875,
"reward_std": 0.04355512708425522,
"rewards/qwen_accuracy_reward/mean": 0.91875,
"rewards/qwen_accuracy_reward/std": 0.1526600480079651,
"step": 610,
"step_time": 45.04236122053116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1122.6,
"completions/max_terminated_length": 1122.6,
"completions/mean_length": 409.753125,
"completions/mean_terminated_length": 409.753125,
"completions/min_length": 187.6,
"completions/min_terminated_length": 187.6,
"entropy": 0.15086480602622032,
"epoch": 2.019543973941368,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 3.2790445168295332e-06,
"loss": -0.0012,
"num_tokens": 46967130.0,
"reward": 0.978125,
"reward_std": 0.00883883461356163,
"rewards/qwen_accuracy_reward/mean": 0.978125,
"rewards/qwen_accuracy_reward/std": 0.0420013427734375,
"step": 620,
"step_time": 36.174442971032114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1220.2,
"completions/max_terminated_length": 1154.4,
"completions/mean_length": 433.240625,
"completions/mean_terminated_length": 415.08563232421875,
"completions/min_length": 179.6,
"completions/min_terminated_length": 179.6,
"entropy": 0.1521160587668419,
"epoch": 2.0521172638436482,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 3.1704668838219326e-06,
"loss": 0.02,
"num_tokens": 47658855.0,
"reward": 0.9625,
"reward_std": 0.05828612819314003,
"rewards/qwen_accuracy_reward/mean": 0.9625,
"rewards/qwen_accuracy_reward/std": 0.11276241540908813,
"step": 630,
"step_time": 39.28426020843908
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 904.7,
"completions/max_terminated_length": 904.7,
"completions/mean_length": 346.578125,
"completions/mean_terminated_length": 346.578125,
"completions/min_length": 186.1,
"completions/min_terminated_length": 186.1,
"entropy": 0.14097955524921418,
"epoch": 2.0846905537459284,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 3.061889250814333e-06,
"loss": 0.0,
"num_tokens": 48413328.0,
"reward": 0.975,
"reward_std": 0.0,
"rewards/qwen_accuracy_reward/mean": 0.975,
"rewards/qwen_accuracy_reward/std": 0.04399413466453552,
"step": 640,
"step_time": 31.522073939908296
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1237.6,
"completions/max_terminated_length": 1237.6,
"completions/mean_length": 428.15625,
"completions/mean_terminated_length": 428.15625,
"completions/min_length": 184.4,
"completions/min_terminated_length": 184.4,
"entropy": 0.15489777624607087,
"epoch": 2.1172638436482085,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 2.9533116178067322e-06,
"loss": 0.0,
"num_tokens": 49250122.0,
"reward": 0.95,
"reward_std": 0.0,
"rewards/qwen_accuracy_reward/mean": 0.95,
"rewards/qwen_accuracy_reward/std": 0.08798826932907104,
"step": 650,
"step_time": 38.97984252097085
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1062.7,
"completions/max_terminated_length": 1062.7,
"completions/mean_length": 399.328125,
"completions/mean_terminated_length": 399.328125,
"completions/min_length": 181.3,
"completions/min_terminated_length": 181.3,
"entropy": 0.14898339360952378,
"epoch": 2.1498371335504887,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 2.8447339847991316e-06,
"loss": 0.0039,
"num_tokens": 50033963.0,
"reward": 0.915625,
"reward_std": 0.03377464786171913,
"rewards/qwen_accuracy_reward/mean": 0.915625,
"rewards/qwen_accuracy_reward/std": 0.14567448943853378,
"step": 660,
"step_time": 33.721394206117836
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1256.5,
"completions/max_terminated_length": 1189.6,
"completions/mean_length": 410.984375,
"completions/mean_terminated_length": 399.96754150390626,
"completions/min_length": 195.5,
"completions/min_terminated_length": 195.5,
"entropy": 0.15526492446660994,
"epoch": 2.182410423452769,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 2.7361563517915314e-06,
"loss": 0.0314,
"num_tokens": 50775022.0,
"reward": 0.946875,
"reward_std": 0.00883883461356163,
"rewards/qwen_accuracy_reward/mean": 0.946875,
"rewards/qwen_accuracy_reward/std": 0.08967447578907013,
"step": 670,
"step_time": 39.73401907449588
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1545.4,
"completions/max_terminated_length": 1413.5,
"completions/mean_length": 461.90625,
"completions/mean_terminated_length": 439.9891723632812,
"completions/min_length": 191.5,
"completions/min_terminated_length": 191.5,
"entropy": 0.15921913534402848,
"epoch": 2.214983713355049,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 2.627578718783931e-06,
"loss": 0.0268,
"num_tokens": 51536384.0,
"reward": 0.96875,
"reward_std": 0.03471825420856476,
"rewards/qwen_accuracy_reward/mean": 0.96875,
"rewards/qwen_accuracy_reward/std": 0.08884271383285522,
"step": 680,
"step_time": 47.763417969178406
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 1956.4,
"completions/max_terminated_length": 1649.7,
"completions/mean_length": 517.278125,
"completions/mean_terminated_length": 450.4589599609375,
"completions/min_length": 182.3,
"completions/min_terminated_length": 182.3,
"entropy": 0.16205079928040506,
"epoch": 2.247557003257329,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.46875,
"learning_rate": 2.5190010857763302e-06,
"loss": 0.0888,
"num_tokens": 52323265.0,
"reward": 0.971875,
"reward_std": 0.05145231708884239,
"rewards/qwen_accuracy_reward/mean": 0.971875,
"rewards/qwen_accuracy_reward/std": 0.1004656806588173,
"step": 690,
"step_time": 58.18257061317563
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1702.6,
"completions/max_terminated_length": 1669.6,
"completions/mean_length": 443.4125,
"completions/mean_terminated_length": 432.5013061523438,
"completions/min_length": 183.3,
"completions/min_terminated_length": 183.3,
"entropy": 0.15109438076615334,
"epoch": 2.2801302931596092,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 2.4104234527687296e-06,
"loss": 0.0435,
"num_tokens": 53081109.0,
"reward": 0.9625,
"reward_std": 0.02925042062997818,
"rewards/qwen_accuracy_reward/mean": 0.9625,
"rewards/qwen_accuracy_reward/std": 0.10394294112920761,
"step": 700,
"step_time": 51.96710612634197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1106.4,
"completions/max_terminated_length": 1106.4,
"completions/mean_length": 390.940625,
"completions/mean_terminated_length": 390.940625,
"completions/min_length": 186.6,
"completions/min_terminated_length": 186.6,
"entropy": 0.15517303124070167,
"epoch": 2.3127035830618894,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 2.3018458197611294e-06,
"loss": 0.0292,
"num_tokens": 53864306.0,
"reward": 0.934375,
"reward_std": 0.04419417306780815,
"rewards/qwen_accuracy_reward/mean": 0.934375,
"rewards/qwen_accuracy_reward/std": 0.11064954251050949,
"step": 710,
"step_time": 32.47686854107305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 923.7,
"completions/max_terminated_length": 923.7,
"completions/mean_length": 367.85625,
"completions/mean_terminated_length": 367.85625,
"completions/min_length": 179.6,
"completions/min_terminated_length": 179.6,
"entropy": 0.15150292664766313,
"epoch": 2.3452768729641695,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 2.193268186753529e-06,
"loss": 0.0006,
"num_tokens": 54577764.0,
"reward": 0.996875,
"reward_std": 0.00883883461356163,
"rewards/qwen_accuracy_reward/mean": 0.996875,
"rewards/qwen_accuracy_reward/std": 0.01767766922712326,
"step": 720,
"step_time": 30.78780706692487
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 1826.7,
"completions/max_terminated_length": 1750.5,
"completions/mean_length": 492.021875,
"completions/mean_terminated_length": 460.0340515136719,
"completions/min_length": 178.1,
"completions/min_terminated_length": 178.1,
"entropy": 0.15865328460931777,
"epoch": 2.3778501628664497,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 2.0846905537459286e-06,
"loss": 0.0263,
"num_tokens": 55272515.0,
"reward": 0.98125,
"reward_std": 0.03104073032736778,
"rewards/qwen_accuracy_reward/mean": 0.98125,
"rewards/qwen_accuracy_reward/std": 0.05456787198781967,
"step": 730,
"step_time": 55.55191990900785
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1124.4,
"completions/max_terminated_length": 1124.4,
"completions/mean_length": 408.946875,
"completions/mean_terminated_length": 408.946875,
"completions/min_length": 170.6,
"completions/min_terminated_length": 170.6,
"entropy": 0.16333993151783943,
"epoch": 2.41042345276873,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 1.976112920738328e-06,
"loss": 0.0104,
"num_tokens": 55921930.0,
"reward": 0.965625,
"reward_std": 0.01293872892856598,
"rewards/qwen_accuracy_reward/mean": 0.965625,
"rewards/qwen_accuracy_reward/std": 0.07360859215259552,
"step": 740,
"step_time": 34.814000389166175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1399.8,
"completions/max_terminated_length": 1388.3,
"completions/mean_length": 431.521875,
"completions/mean_terminated_length": 410.98146362304686,
"completions/min_length": 164.8,
"completions/min_terminated_length": 164.8,
"entropy": 0.14200911596417426,
"epoch": 2.44299674267101,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.8675352877307276e-06,
"loss": 0.019,
"num_tokens": 56772993.0,
"reward": 0.98125,
"reward_std": 0.02177756354212761,
"rewards/qwen_accuracy_reward/mean": 0.98125,
"rewards/qwen_accuracy_reward/std": 0.05456787198781967,
"step": 750,
"step_time": 43.50874480362982
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1684.0,
"completions/max_terminated_length": 1554.1,
"completions/mean_length": 515.384375,
"completions/mean_terminated_length": 494.017919921875,
"completions/min_length": 194.0,
"completions/min_terminated_length": 194.0,
"entropy": 0.17334669530391694,
"epoch": 2.47557003257329,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.328125,
"learning_rate": 1.7589576547231272e-06,
"loss": 0.0074,
"num_tokens": 57501516.0,
"reward": 0.959375,
"reward_std": 0.057342519611120225,
"rewards/qwen_accuracy_reward/mean": 0.959375,
"rewards/qwen_accuracy_reward/std": 0.09656921476125717,
"step": 760,
"step_time": 50.1735579572618
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.003125,
"completions/max_length": 1758.1,
"completions/max_terminated_length": 1663.3,
"completions/mean_length": 470.63125,
"completions/mean_terminated_length": 460.1435485839844,
"completions/min_length": 185.0,
"completions/min_terminated_length": 185.0,
"entropy": 0.16403108537197114,
"epoch": 2.5081433224755703,
"frac_reward_zero_std": 0.95,
"grad_norm": 1.2421875,
"learning_rate": 1.6503800217155266e-06,
"loss": 0.0135,
"num_tokens": 58186406.0,
"reward": 0.99375,
"reward_std": 0.01767766922712326,
"rewards/qwen_accuracy_reward/mean": 0.99375,
"rewards/qwen_accuracy_reward/std": 0.03535533845424652,
"step": 770,
"step_time": 51.758546930458394
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1432.5,
"completions/max_terminated_length": 1432.5,
"completions/mean_length": 448.15,
"completions/mean_terminated_length": 448.15,
"completions/min_length": 169.4,
"completions/min_terminated_length": 169.4,
"entropy": 0.1642938271164894,
"epoch": 2.5407166123778504,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 1.5418023887079264e-06,
"loss": 0.0038,
"num_tokens": 58895790.0,
"reward": 0.978125,
"reward_std": 0.036084231734275815,
"rewards/qwen_accuracy_reward/mean": 0.978125,
"rewards/qwen_accuracy_reward/std": 0.06321553289890289,
"step": 780,
"step_time": 43.58511639842764
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 2056.4,
"completions/max_terminated_length": 2051.1,
"completions/mean_length": 555.934375,
"completions/mean_terminated_length": 527.9370727539062,
"completions/min_length": 191.2,
"completions/min_terminated_length": 191.2,
"entropy": 0.16091172024607658,
"epoch": 2.5732899022801305,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.0,
"learning_rate": 1.433224755700326e-06,
"loss": -0.0108,
"num_tokens": 59586505.0,
"reward": 0.89375,
"reward_std": 0.055127878487110135,
"rewards/qwen_accuracy_reward/mean": 0.89375,
"rewards/qwen_accuracy_reward/std": 0.2057945430278778,
"step": 790,
"step_time": 58.492029800172894
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 697.5,
"completions/max_terminated_length": 697.5,
"completions/mean_length": 322.95,
"completions/mean_terminated_length": 322.95,
"completions/min_length": 165.9,
"completions/min_terminated_length": 165.9,
"entropy": 0.14762159138917924,
"epoch": 2.6058631921824107,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 1.3246471226927254e-06,
"loss": -0.0081,
"num_tokens": 60234105.0,
"reward": 0.925,
"reward_std": 0.01767766922712326,
"rewards/qwen_accuracy_reward/mean": 0.925,
"rewards/qwen_accuracy_reward/std": 0.1476672813296318,
"step": 800,
"step_time": 23.70481554856524
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 2111.5,
"completions/max_terminated_length": 1792.0,
"completions/mean_length": 557.365625,
"completions/mean_terminated_length": 523.718637084961,
"completions/min_length": 207.9,
"completions/min_terminated_length": 207.9,
"entropy": 0.181430846452713,
"epoch": 2.6384364820846904,
"frac_reward_zero_std": 0.85,
"grad_norm": 0.0,
"learning_rate": 1.216069489685125e-06,
"loss": 0.09,
"num_tokens": 60907670.0,
"reward": 0.9125,
"reward_std": 0.06670062988996506,
"rewards/qwen_accuracy_reward/mean": 0.9125,
"rewards/qwen_accuracy_reward/std": 0.20204702019691467,
"step": 810,
"step_time": 57.087904060911384
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1913.1,
"completions/max_terminated_length": 1700.8,
"completions/mean_length": 471.440625,
"completions/mean_terminated_length": 449.9090637207031,
"completions/min_length": 188.8,
"completions/min_terminated_length": 188.8,
"entropy": 0.16233009248971939,
"epoch": 2.6710097719869705,
"frac_reward_zero_std": 0.925,
"grad_norm": 1.3046875,
"learning_rate": 1.1074918566775244e-06,
"loss": 0.0312,
"num_tokens": 61661099.0,
"reward": 0.984375,
"reward_std": 0.03061639815568924,
"rewards/qwen_accuracy_reward/mean": 0.984375,
"rewards/qwen_accuracy_reward/std": 0.06496979594230652,
"step": 820,
"step_time": 56.08578431969509
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1505.7,
"completions/max_terminated_length": 1505.7,
"completions/mean_length": 430.6125,
"completions/mean_terminated_length": 430.6125,
"completions/min_length": 189.9,
"completions/min_terminated_length": 189.9,
"entropy": 0.16054447889328002,
"epoch": 2.7035830618892507,
"frac_reward_zero_std": 0.975,
"grad_norm": 0.0,
"learning_rate": 9.989142236699242e-07,
"loss": -0.0073,
"num_tokens": 62382591.0,
"reward": 0.99375,
"reward_std": 0.011572751402854919,
"rewards/qwen_accuracy_reward/mean": 0.99375,
"rewards/qwen_accuracy_reward/std": 0.024593468010425567,
"step": 830,
"step_time": 44.93511769743636
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1307.6,
"completions/max_terminated_length": 1207.0,
"completions/mean_length": 426.103125,
"completions/mean_terminated_length": 403.6597930908203,
"completions/min_length": 215.9,
"completions/min_terminated_length": 215.9,
"entropy": 0.16358136087656022,
"epoch": 2.736156351791531,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 8.903365906623236e-07,
"loss": 0.0463,
"num_tokens": 63060464.0,
"reward": 0.9875,
"reward_std": 0.02177756354212761,
"rewards/qwen_accuracy_reward/mean": 0.9875,
"rewards/qwen_accuracy_reward/std": 0.04729212671518326,
"step": 840,
"step_time": 40.77616196591407
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 691.6,
"completions/max_terminated_length": 691.6,
"completions/mean_length": 307.434375,
"completions/mean_terminated_length": 307.434375,
"completions/min_length": 162.8,
"completions/min_terminated_length": 162.8,
"entropy": 0.13495604172348977,
"epoch": 2.768729641693811,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 7.817589576547231e-07,
"loss": 0.0,
"num_tokens": 63820595.0,
"reward": 0.975,
"reward_std": 0.0,
"rewards/qwen_accuracy_reward/mean": 0.975,
"rewards/qwen_accuracy_reward/std": 0.04399413466453552,
"step": 850,
"step_time": 24.526741536986084
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1168.6,
"completions/max_terminated_length": 1168.6,
"completions/mean_length": 406.71875,
"completions/mean_terminated_length": 406.71875,
"completions/min_length": 198.2,
"completions/min_terminated_length": 198.2,
"entropy": 0.1559869095683098,
"epoch": 2.801302931596091,
"frac_reward_zero_std": 0.95,
"grad_norm": 1.71875,
"learning_rate": 6.731813246471228e-07,
"loss": 0.0029,
"num_tokens": 64580849.0,
"reward": 0.953125,
"reward_std": 0.02041158601641655,
"rewards/qwen_accuracy_reward/mean": 0.953125,
"rewards/qwen_accuracy_reward/std": 0.10132758170366288,
"step": 860,
"step_time": 35.53059697123244
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 1577.3,
"completions/max_terminated_length": 1516.9,
"completions/mean_length": 498.10625,
"completions/mean_terminated_length": 433.7035827636719,
"completions/min_length": 175.3,
"completions/min_terminated_length": 175.3,
"entropy": 0.1583547368645668,
"epoch": 2.8338762214983713,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.0,
"learning_rate": 5.646036916395223e-07,
"loss": 0.0275,
"num_tokens": 65364547.0,
"reward": 0.965625,
"reward_std": 0.03787454217672348,
"rewards/qwen_accuracy_reward/mean": 0.965625,
"rewards/qwen_accuracy_reward/std": 0.09508474618196487,
"step": 870,
"step_time": 47.49830629490316
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.00625,
"completions/max_length": 1447.2,
"completions/max_terminated_length": 1369.3,
"completions/mean_length": 457.55625,
"completions/mean_terminated_length": 435.249169921875,
"completions/min_length": 181.7,
"completions/min_terminated_length": 181.7,
"entropy": 0.16112774163484572,
"epoch": 2.8664495114006514,
"frac_reward_zero_std": 0.875,
"grad_norm": 1.265625,
"learning_rate": 4.5602605863192187e-07,
"loss": 0.0422,
"num_tokens": 66148253.0,
"reward": 0.978125,
"reward_std": 0.04966200664639473,
"rewards/qwen_accuracy_reward/mean": 0.978125,
"rewards/qwen_accuracy_reward/std": 0.10221994370222091,
"step": 880,
"step_time": 43.340578782279046
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1843.3,
"completions/max_terminated_length": 1843.3,
"completions/mean_length": 497.509375,
"completions/mean_terminated_length": 497.509375,
"completions/min_length": 191.2,
"completions/min_terminated_length": 191.2,
"entropy": 0.17072843462228776,
"epoch": 2.8990228013029316,
"frac_reward_zero_std": 0.95,
"grad_norm": 0.0,
"learning_rate": 3.474484256243214e-07,
"loss": 0.0145,
"num_tokens": 66879112.0,
"reward": 0.965625,
"reward_std": 0.02041158601641655,
"rewards/qwen_accuracy_reward/mean": 0.965625,
"rewards/qwen_accuracy_reward/std": 0.08626527190208436,
"step": 890,
"step_time": 53.65781031670049
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.009375,
"completions/max_length": 1572.3,
"completions/max_terminated_length": 1323.3,
"completions/mean_length": 452.584375,
"completions/mean_terminated_length": 420.08506469726564,
"completions/min_length": 194.6,
"completions/min_terminated_length": 194.6,
"entropy": 0.1676468499004841,
"epoch": 2.9315960912052117,
"frac_reward_zero_std": 0.925,
"grad_norm": 0.828125,
"learning_rate": 2.3887079261672097e-07,
"loss": 0.0376,
"num_tokens": 67548691.0,
"reward": 0.915625,
"reward_std": 0.036084231734275815,
"rewards/qwen_accuracy_reward/mean": 0.915625,
"rewards/qwen_accuracy_reward/std": 0.15649925023317338,
"step": 900,
"step_time": 47.34689696319401
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1042.7,
"completions/max_terminated_length": 1042.7,
"completions/mean_length": 393.828125,
"completions/mean_terminated_length": 393.828125,
"completions/min_length": 186.9,
"completions/min_terminated_length": 186.9,
"entropy": 0.15125710666179656,
"epoch": 2.964169381107492,
"frac_reward_zero_std": 0.9,
"grad_norm": 1.7109375,
"learning_rate": 1.3029315960912054e-07,
"loss": 0.0054,
"num_tokens": 68345092.0,
"reward": 0.959375,
"reward_std": 0.04218914955854416,
"rewards/qwen_accuracy_reward/mean": 0.959375,
"rewards/qwen_accuracy_reward/std": 0.11388693749904633,
"step": 910,
"step_time": 32.79988148277626
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 708.5,
"completions/max_terminated_length": 708.5,
"completions/mean_length": 309.83125,
"completions/mean_terminated_length": 309.83125,
"completions/min_length": 173.2,
"completions/min_terminated_length": 173.2,
"entropy": 0.13616923689842225,
"epoch": 2.996742671009772,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0,
"learning_rate": 2.171552660152009e-08,
"loss": 0.0,
"num_tokens": 69106806.0,
"reward": 1.0,
"reward_std": 0.0,
"rewards/qwen_accuracy_reward/mean": 1.0,
"rewards/qwen_accuracy_reward/std": 0.0,
"step": 920,
"step_time": 25.47872376209125
}
],
"logging_steps": 10,
"max_steps": 921,
"num_input_tokens_seen": 69178036,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}