| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.0, |
| "eval_steps": 500, |
| "global_step": 921, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.021875, |
| "completions/max_length": 2397.7, |
| "completions/max_terminated_length": 2153.3, |
| "completions/mean_length": 724.20625, |
| "completions/mean_terminated_length": 655.9658874511719, |
| "completions/min_length": 158.7, |
| "completions/min_terminated_length": 158.7, |
| "entropy": 0.170973788946867, |
| "epoch": 0.03257328990228013, |
| "frac_reward_zero_std": 0.725, |
| "grad_norm": 1.03125, |
| "learning_rate": 9.90228013029316e-06, |
| "loss": 0.0209, |
| "num_tokens": 901722.0, |
| "reward": 0.671875, |
| "reward_std": 0.12951098531484603, |
| "rewards/qwen_accuracy_reward/mean": 0.671875, |
| "rewards/qwen_accuracy_reward/std": 0.32610869109630586, |
| "step": 10, |
| "step_time": 75.02407562928275 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 2212.1, |
| "completions/max_terminated_length": 2131.1, |
| "completions/mean_length": 648.159375, |
| "completions/mean_terminated_length": 619.5659057617188, |
| "completions/min_length": 142.9, |
| "completions/min_terminated_length": 142.9, |
| "entropy": 0.16581312268972398, |
| "epoch": 0.06514657980456026, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.0, |
| "learning_rate": 9.79370249728556e-06, |
| "loss": 0.0216, |
| "num_tokens": 1827701.0, |
| "reward": 0.834375, |
| "reward_std": 0.09659009724855423, |
| "rewards/qwen_accuracy_reward/mean": 0.834375, |
| "rewards/qwen_accuracy_reward/std": 0.24483564049005507, |
| "step": 20, |
| "step_time": 66.24043246284127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01875, |
| "completions/max_length": 2206.0, |
| "completions/max_terminated_length": 2090.8, |
| "completions/mean_length": 623.284375, |
| "completions/mean_terminated_length": 558.5000427246093, |
| "completions/min_length": 171.1, |
| "completions/min_terminated_length": 171.1, |
| "entropy": 0.1667719691991806, |
| "epoch": 0.09771986970684039, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 0.7890625, |
| "learning_rate": 9.68512486427796e-06, |
| "loss": 0.0347, |
| "num_tokens": 2654656.0, |
| "reward": 0.83125, |
| "reward_std": 0.09974638372659683, |
| "rewards/qwen_accuracy_reward/mean": 0.83125, |
| "rewards/qwen_accuracy_reward/std": 0.31891718655824663, |
| "step": 30, |
| "step_time": 67.77192380828782 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1337.0, |
| "completions/max_terminated_length": 1337.0, |
| "completions/mean_length": 469.159375, |
| "completions/mean_terminated_length": 469.159375, |
| "completions/min_length": 165.4, |
| "completions/min_terminated_length": 165.4, |
| "entropy": 0.15202879384160042, |
| "epoch": 0.13029315960912052, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 1.875, |
| "learning_rate": 9.576547231270358e-06, |
| "loss": 0.0142, |
| "num_tokens": 3441667.0, |
| "reward": 0.8125, |
| "reward_std": 0.09931695759296418, |
| "rewards/qwen_accuracy_reward/mean": 0.8125, |
| "rewards/qwen_accuracy_reward/std": 0.3193816542625427, |
| "step": 40, |
| "step_time": 41.496644421108066 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 1534.5, |
| "completions/max_terminated_length": 1359.1, |
| "completions/mean_length": 505.4125, |
| "completions/mean_terminated_length": 475.48418579101565, |
| "completions/min_length": 166.3, |
| "completions/min_terminated_length": 166.3, |
| "entropy": 0.1465001180768013, |
| "epoch": 0.16286644951140064, |
| "frac_reward_zero_std": 0.825, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.467969598262759e-06, |
| "loss": 0.0048, |
| "num_tokens": 4235663.0, |
| "reward": 0.875, |
| "reward_std": 0.07596379667520523, |
| "rewards/qwen_accuracy_reward/mean": 0.875, |
| "rewards/qwen_accuracy_reward/std": 0.20005422383546828, |
| "step": 50, |
| "step_time": 47.53361711697653 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1282.1, |
| "completions/max_terminated_length": 1282.1, |
| "completions/mean_length": 475.225, |
| "completions/mean_terminated_length": 475.225, |
| "completions/min_length": 189.6, |
| "completions/min_terminated_length": 189.6, |
| "entropy": 0.15337296426296235, |
| "epoch": 0.19543973941368079, |
| "frac_reward_zero_std": 0.75, |
| "grad_norm": 1.640625, |
| "learning_rate": 9.359391965255158e-06, |
| "loss": 0.0229, |
| "num_tokens": 4959391.0, |
| "reward": 0.85, |
| "reward_std": 0.11794019415974617, |
| "rewards/qwen_accuracy_reward/mean": 0.85, |
| "rewards/qwen_accuracy_reward/std": 0.25456976890563965, |
| "step": 60, |
| "step_time": 36.53024397492409 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1927.4, |
| "completions/max_terminated_length": 1895.7, |
| "completions/mean_length": 612.075, |
| "completions/mean_terminated_length": 590.4368774414063, |
| "completions/min_length": 192.2, |
| "completions/min_terminated_length": 192.2, |
| "entropy": 0.16074557453393937, |
| "epoch": 0.2280130293159609, |
| "frac_reward_zero_std": 0.825, |
| "grad_norm": 1.4453125, |
| "learning_rate": 9.250814332247557e-06, |
| "loss": 0.0266, |
| "num_tokens": 5749223.0, |
| "reward": 0.803125, |
| "reward_std": 0.08783914744853974, |
| "rewards/qwen_accuracy_reward/mean": 0.803125, |
| "rewards/qwen_accuracy_reward/std": 0.2966747134923935, |
| "step": 70, |
| "step_time": 57.99369401996955 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2189.3, |
| "completions/max_terminated_length": 2009.3, |
| "completions/mean_length": 690.7125, |
| "completions/mean_terminated_length": 586.1856994628906, |
| "completions/min_length": 188.9, |
| "completions/min_terminated_length": 188.9, |
| "entropy": 0.15213419646024703, |
| "epoch": 0.26058631921824105, |
| "frac_reward_zero_std": 0.8, |
| "grad_norm": 0.0, |
| "learning_rate": 9.142236699239957e-06, |
| "loss": 0.0282, |
| "num_tokens": 6565907.0, |
| "reward": 0.878125, |
| "reward_std": 0.08617057129740716, |
| "rewards/qwen_accuracy_reward/mean": 0.878125, |
| "rewards/qwen_accuracy_reward/std": 0.27245663553476335, |
| "step": 80, |
| "step_time": 65.59074299260973 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1602.7, |
| "completions/max_terminated_length": 1467.6, |
| "completions/mean_length": 467.484375, |
| "completions/mean_terminated_length": 456.31884765625, |
| "completions/min_length": 179.3, |
| "completions/min_terminated_length": 179.3, |
| "entropy": 0.1368262179195881, |
| "epoch": 0.2931596091205212, |
| "frac_reward_zero_std": 0.85, |
| "grad_norm": 0.0, |
| "learning_rate": 9.033659066232356e-06, |
| "loss": 0.004, |
| "num_tokens": 7334102.0, |
| "reward": 0.88125, |
| "reward_std": 0.06123279631137848, |
| "rewards/qwen_accuracy_reward/mean": 0.88125, |
| "rewards/qwen_accuracy_reward/std": 0.25089033097028735, |
| "step": 90, |
| "step_time": 48.28751948485151 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2426.9, |
| "completions/max_terminated_length": 2137.6, |
| "completions/mean_length": 604.2625, |
| "completions/mean_terminated_length": 548.6393463134766, |
| "completions/min_length": 179.7, |
| "completions/min_terminated_length": 179.7, |
| "entropy": 0.15090147852897645, |
| "epoch": 0.3257328990228013, |
| "frac_reward_zero_std": 0.825, |
| "grad_norm": 1.625, |
| "learning_rate": 8.925081433224755e-06, |
| "loss": 0.0856, |
| "num_tokens": 8127226.0, |
| "reward": 0.84375, |
| "reward_std": 0.07459585815668106, |
| "rewards/qwen_accuracy_reward/mean": 0.84375, |
| "rewards/qwen_accuracy_reward/std": 0.29288421422243116, |
| "step": 100, |
| "step_time": 70.59187124017626 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01875, |
| "completions/max_length": 2326.5, |
| "completions/max_terminated_length": 2284.1, |
| "completions/mean_length": 609.134375, |
| "completions/mean_terminated_length": 541.8694732666015, |
| "completions/min_length": 179.7, |
| "completions/min_terminated_length": 179.7, |
| "entropy": 0.17526374608278275, |
| "epoch": 0.3583061889250814, |
| "frac_reward_zero_std": 0.775, |
| "grad_norm": 1.2109375, |
| "learning_rate": 8.816503800217156e-06, |
| "loss": -0.0395, |
| "num_tokens": 8923405.0, |
| "reward": 0.84375, |
| "reward_std": 0.10457713454961777, |
| "rewards/qwen_accuracy_reward/mean": 0.84375, |
| "rewards/qwen_accuracy_reward/std": 0.2856591001152992, |
| "step": 110, |
| "step_time": 64.31981013910845 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2472.2, |
| "completions/max_terminated_length": 2434.0, |
| "completions/mean_length": 614.309375, |
| "completions/mean_terminated_length": 508.3423645019531, |
| "completions/min_length": 206.9, |
| "completions/min_terminated_length": 206.9, |
| "entropy": 0.14617881700396537, |
| "epoch": 0.39087947882736157, |
| "frac_reward_zero_std": 0.85, |
| "grad_norm": 1.34375, |
| "learning_rate": 8.707926167209557e-06, |
| "loss": -0.0058, |
| "num_tokens": 9650464.0, |
| "reward": 0.934375, |
| "reward_std": 0.06943454667925834, |
| "rewards/qwen_accuracy_reward/mean": 0.934375, |
| "rewards/qwen_accuracy_reward/std": 0.1803007885813713, |
| "step": 120, |
| "step_time": 71.93199644116685 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 1687.5, |
| "completions/max_terminated_length": 1578.9, |
| "completions/mean_length": 506.765625, |
| "completions/mean_terminated_length": 475.7398681640625, |
| "completions/min_length": 197.5, |
| "completions/min_terminated_length": 197.5, |
| "entropy": 0.14087174832820892, |
| "epoch": 0.4234527687296417, |
| "frac_reward_zero_std": 0.825, |
| "grad_norm": 0.83984375, |
| "learning_rate": 8.599348534201956e-06, |
| "loss": 0.0126, |
| "num_tokens": 10481205.0, |
| "reward": 0.91875, |
| "reward_std": 0.071863903850317, |
| "rewards/qwen_accuracy_reward/mean": 0.91875, |
| "rewards/qwen_accuracy_reward/std": 0.16812221705913544, |
| "step": 130, |
| "step_time": 51.10968422973529 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01875, |
| "completions/max_length": 2098.5, |
| "completions/max_terminated_length": 1797.3, |
| "completions/mean_length": 617.415625, |
| "completions/mean_terminated_length": 556.195068359375, |
| "completions/min_length": 178.4, |
| "completions/min_terminated_length": 178.4, |
| "entropy": 0.15323501601815223, |
| "epoch": 0.4560260586319218, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.0, |
| "learning_rate": 8.490770901194355e-06, |
| "loss": 0.0577, |
| "num_tokens": 11246506.0, |
| "reward": 0.884375, |
| "reward_std": 0.06102004498243332, |
| "rewards/qwen_accuracy_reward/mean": 0.884375, |
| "rewards/qwen_accuracy_reward/std": 0.18740518838167192, |
| "step": 140, |
| "step_time": 61.85610852092505 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 2084.5, |
| "completions/max_terminated_length": 1994.2, |
| "completions/mean_length": 541.646875, |
| "completions/mean_terminated_length": 509.9564208984375, |
| "completions/min_length": 190.2, |
| "completions/min_terminated_length": 190.2, |
| "entropy": 0.13484818413853644, |
| "epoch": 0.48859934853420195, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 8.382193268186755e-06, |
| "loss": 0.0238, |
| "num_tokens": 12068721.0, |
| "reward": 0.93125, |
| "reward_std": 0.03335031494498253, |
| "rewards/qwen_accuracy_reward/mean": 0.93125, |
| "rewards/qwen_accuracy_reward/std": 0.147479148209095, |
| "step": 150, |
| "step_time": 60.58088333830237 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2113.6, |
| "completions/max_terminated_length": 1865.1, |
| "completions/mean_length": 617.86875, |
| "completions/mean_terminated_length": 564.8990783691406, |
| "completions/min_length": 176.8, |
| "completions/min_terminated_length": 176.8, |
| "entropy": 0.14884034767746926, |
| "epoch": 0.5211726384364821, |
| "frac_reward_zero_std": 0.825, |
| "grad_norm": 1.15625, |
| "learning_rate": 8.273615635179154e-06, |
| "loss": 0.0349, |
| "num_tokens": 12860375.0, |
| "reward": 0.90625, |
| "reward_std": 0.07280554771423339, |
| "rewards/qwen_accuracy_reward/mean": 0.90625, |
| "rewards/qwen_accuracy_reward/std": 0.19390300512313843, |
| "step": 160, |
| "step_time": 61.31780819287523 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2070.4, |
| "completions/max_terminated_length": 1955.4, |
| "completions/mean_length": 525.28125, |
| "completions/mean_terminated_length": 470.9878173828125, |
| "completions/min_length": 199.4, |
| "completions/min_terminated_length": 199.4, |
| "entropy": 0.1467311643064022, |
| "epoch": 0.5537459283387622, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.0, |
| "learning_rate": 8.165038002171553e-06, |
| "loss": 0.0292, |
| "num_tokens": 13584777.0, |
| "reward": 0.884375, |
| "reward_std": 0.05376190170645714, |
| "rewards/qwen_accuracy_reward/mean": 0.884375, |
| "rewards/qwen_accuracy_reward/std": 0.2119799315929413, |
| "step": 170, |
| "step_time": 61.066531581245364 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1569.3, |
| "completions/max_terminated_length": 1385.7, |
| "completions/mean_length": 477.046875, |
| "completions/mean_terminated_length": 466.2573547363281, |
| "completions/min_length": 198.5, |
| "completions/min_terminated_length": 198.5, |
| "entropy": 0.14774601608514787, |
| "epoch": 0.5863192182410424, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 1.0546875, |
| "learning_rate": 8.056460369163954e-06, |
| "loss": 0.06, |
| "num_tokens": 14346864.0, |
| "reward": 0.9375, |
| "reward_std": 0.02925042062997818, |
| "rewards/qwen_accuracy_reward/mean": 0.9375, |
| "rewards/qwen_accuracy_reward/std": 0.13194561302661895, |
| "step": 180, |
| "step_time": 47.719357285648584 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 2356.5, |
| "completions/max_terminated_length": 1938.6, |
| "completions/mean_length": 530.79375, |
| "completions/mean_terminated_length": 508.41269836425784, |
| "completions/min_length": 179.6, |
| "completions/min_terminated_length": 179.6, |
| "entropy": 0.15401604473590852, |
| "epoch": 0.6188925081433225, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 7.947882736156353e-06, |
| "loss": 0.073, |
| "num_tokens": 15077166.0, |
| "reward": 0.928125, |
| "reward_std": 0.03808925524353981, |
| "rewards/qwen_accuracy_reward/mean": 0.928125, |
| "rewards/qwen_accuracy_reward/std": 0.16100659370422363, |
| "step": 190, |
| "step_time": 68.03254930684344 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 2111.2, |
| "completions/max_terminated_length": 2089.6, |
| "completions/mean_length": 532.41875, |
| "completions/mean_terminated_length": 511.9100036621094, |
| "completions/min_length": 199.4, |
| "completions/min_terminated_length": 199.4, |
| "entropy": 0.14272007048130037, |
| "epoch": 0.6514657980456026, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 7.839305103148752e-06, |
| "loss": 0.038, |
| "num_tokens": 15699220.0, |
| "reward": 0.928125, |
| "reward_std": 0.0245114803314209, |
| "rewards/qwen_accuracy_reward/mean": 0.928125, |
| "rewards/qwen_accuracy_reward/std": 0.11228372007608414, |
| "step": 200, |
| "step_time": 61.566719483956696 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 1495.5, |
| "completions/max_terminated_length": 1268.8, |
| "completions/mean_length": 497.496875, |
| "completions/mean_terminated_length": 442.867822265625, |
| "completions/min_length": 186.1, |
| "completions/min_terminated_length": 186.1, |
| "entropy": 0.13395386636257173, |
| "epoch": 0.6840390879478827, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 7.730727470141152e-06, |
| "loss": 0.0028, |
| "num_tokens": 16564171.0, |
| "reward": 0.9625, |
| "reward_std": 0.02177756354212761, |
| "rewards/qwen_accuracy_reward/mean": 0.9625, |
| "rewards/qwen_accuracy_reward/std": 0.07889154553413391, |
| "step": 210, |
| "step_time": 45.37072062129155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1425.8, |
| "completions/max_terminated_length": 1415.7, |
| "completions/mean_length": 453.871875, |
| "completions/mean_terminated_length": 444.69354248046875, |
| "completions/min_length": 166.8, |
| "completions/min_terminated_length": 166.8, |
| "entropy": 0.13297367617487907, |
| "epoch": 0.7166123778501629, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 7.622149837133551e-06, |
| "loss": 0.0147, |
| "num_tokens": 17330410.0, |
| "reward": 0.940625, |
| "reward_std": 0.022201896458864213, |
| "rewards/qwen_accuracy_reward/mean": 0.940625, |
| "rewards/qwen_accuracy_reward/std": 0.11959655284881592, |
| "step": 220, |
| "step_time": 44.74475174760446 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 1833.4, |
| "completions/max_terminated_length": 1752.4, |
| "completions/mean_length": 501.321875, |
| "completions/mean_terminated_length": 469.6424560546875, |
| "completions/min_length": 171.5, |
| "completions/min_terminated_length": 171.5, |
| "entropy": 0.14848560467362404, |
| "epoch": 0.749185667752443, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 7.51357220412595e-06, |
| "loss": 0.022, |
| "num_tokens": 18088737.0, |
| "reward": 0.928125, |
| "reward_std": 0.04397946000099182, |
| "rewards/qwen_accuracy_reward/mean": 0.928125, |
| "rewards/qwen_accuracy_reward/std": 0.13159393817186354, |
| "step": 230, |
| "step_time": 54.10395782412961 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1583.8, |
| "completions/max_terminated_length": 1528.0, |
| "completions/mean_length": 411.75625, |
| "completions/mean_terminated_length": 400.55755615234375, |
| "completions/min_length": 159.9, |
| "completions/min_terminated_length": 159.9, |
| "entropy": 0.13070192262530328, |
| "epoch": 0.7817589576547231, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 7.40499457111835e-06, |
| "loss": 0.0496, |
| "num_tokens": 18793955.0, |
| "reward": 0.95625, |
| "reward_std": 0.03104073107242584, |
| "rewards/qwen_accuracy_reward/mean": 0.95625, |
| "rewards/qwen_accuracy_reward/std": 0.08069398403167724, |
| "step": 240, |
| "step_time": 47.35719826500863 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1359.2, |
| "completions/max_terminated_length": 1338.3, |
| "completions/mean_length": 481.21875, |
| "completions/mean_terminated_length": 461.47271118164065, |
| "completions/min_length": 200.0, |
| "completions/min_terminated_length": 200.0, |
| "entropy": 0.14220248386263848, |
| "epoch": 0.8143322475570033, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.0, |
| "learning_rate": 7.29641693811075e-06, |
| "loss": 0.0184, |
| "num_tokens": 19454497.0, |
| "reward": 0.95625, |
| "reward_std": 0.05418623313307762, |
| "rewards/qwen_accuracy_reward/mean": 0.95625, |
| "rewards/qwen_accuracy_reward/std": 0.11881711781024933, |
| "step": 250, |
| "step_time": 41.61551207816228 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1620.3, |
| "completions/max_terminated_length": 1620.3, |
| "completions/mean_length": 448.68125, |
| "completions/mean_terminated_length": 448.68125, |
| "completions/min_length": 182.0, |
| "completions/min_terminated_length": 182.0, |
| "entropy": 0.13489690721035003, |
| "epoch": 0.8469055374592834, |
| "frac_reward_zero_std": 0.85, |
| "grad_norm": 1.7421875, |
| "learning_rate": 7.187839305103149e-06, |
| "loss": 0.0, |
| "num_tokens": 20191547.0, |
| "reward": 0.925, |
| "reward_std": 0.07259083464741707, |
| "rewards/qwen_accuracy_reward/mean": 0.925, |
| "rewards/qwen_accuracy_reward/std": 0.19271825700998307, |
| "step": 260, |
| "step_time": 45.474128680489954 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.025, |
| "completions/max_length": 1471.7, |
| "completions/max_terminated_length": 1424.8, |
| "completions/mean_length": 519.99375, |
| "completions/mean_terminated_length": 437.50208740234376, |
| "completions/min_length": 181.7, |
| "completions/min_terminated_length": 181.7, |
| "entropy": 0.13938435539603233, |
| "epoch": 0.8794788273615635, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.91015625, |
| "learning_rate": 7.079261672095549e-06, |
| "loss": -0.0167, |
| "num_tokens": 21039881.0, |
| "reward": 0.93125, |
| "reward_std": 0.051027984172105786, |
| "rewards/qwen_accuracy_reward/mean": 0.93125, |
| "rewards/qwen_accuracy_reward/std": 0.15987386405467988, |
| "step": 270, |
| "step_time": 45.94509084094316 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1812.2, |
| "completions/max_terminated_length": 1642.1, |
| "completions/mean_length": 489.865625, |
| "completions/mean_terminated_length": 469.1300048828125, |
| "completions/min_length": 197.8, |
| "completions/min_terminated_length": 197.8, |
| "entropy": 0.14942506179213524, |
| "epoch": 0.9120521172638436, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 6.9706840390879485e-06, |
| "loss": 0.0433, |
| "num_tokens": 21713262.0, |
| "reward": 0.96875, |
| "reward_std": 0.01767766922712326, |
| "rewards/qwen_accuracy_reward/mean": 0.96875, |
| "rewards/qwen_accuracy_reward/std": 0.06858760267496108, |
| "step": 280, |
| "step_time": 54.875936476886274 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1379.1, |
| "completions/max_terminated_length": 1379.1, |
| "completions/mean_length": 441.2125, |
| "completions/mean_terminated_length": 441.2125, |
| "completions/min_length": 184.5, |
| "completions/min_terminated_length": 184.5, |
| "entropy": 0.14119350165128708, |
| "epoch": 0.9446254071661238, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 6.8621064060803475e-06, |
| "loss": -0.0025, |
| "num_tokens": 22443962.0, |
| "reward": 0.95625, |
| "reward_std": 0.02177756354212761, |
| "rewards/qwen_accuracy_reward/mean": 0.95625, |
| "rewards/qwen_accuracy_reward/std": 0.09856200665235519, |
| "step": 290, |
| "step_time": 41.87264884654432 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1268.2, |
| "completions/max_terminated_length": 1268.2, |
| "completions/mean_length": 434.421875, |
| "completions/mean_terminated_length": 434.421875, |
| "completions/min_length": 206.3, |
| "completions/min_terminated_length": 206.3, |
| "entropy": 0.1490817114710808, |
| "epoch": 0.9771986970684039, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 1.0546875, |
| "learning_rate": 6.753528773072747e-06, |
| "loss": 0.0056, |
| "num_tokens": 23106601.0, |
| "reward": 0.953125, |
| "reward_std": 0.03061639815568924, |
| "rewards/qwen_accuracy_reward/mean": 0.953125, |
| "rewards/qwen_accuracy_reward/std": 0.11623967587947845, |
| "step": 300, |
| "step_time": 39.61534147607163 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1601.1, |
| "completions/max_terminated_length": 1574.2, |
| "completions/mean_length": 469.58125, |
| "completions/mean_terminated_length": 447.9079223632813, |
| "completions/min_length": 181.1, |
| "completions/min_terminated_length": 181.1, |
| "entropy": 0.13837436586618423, |
| "epoch": 1.009771986970684, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 6.644951140065147e-06, |
| "loss": 0.029, |
| "num_tokens": 23888067.0, |
| "reward": 0.94375, |
| "reward_std": 0.02925042062997818, |
| "rewards/qwen_accuracy_reward/mean": 0.94375, |
| "rewards/qwen_accuracy_reward/std": 0.12826661467552186, |
| "step": 310, |
| "step_time": 47.85272020176053 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1411.9, |
| "completions/max_terminated_length": 1406.8, |
| "completions/mean_length": 404.721875, |
| "completions/mean_terminated_length": 393.82207641601565, |
| "completions/min_length": 183.3, |
| "completions/min_terminated_length": 183.3, |
| "entropy": 0.1374943107366562, |
| "epoch": 1.0423452768729642, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 6.536373507057546e-06, |
| "loss": 0.0265, |
| "num_tokens": 24727370.0, |
| "reward": 0.996875, |
| "reward_std": 0.00883883461356163, |
| "rewards/qwen_accuracy_reward/mean": 0.996875, |
| "rewards/qwen_accuracy_reward/std": 0.01767766922712326, |
| "step": 320, |
| "step_time": 44.45478741144761 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0125, |
| "completions/max_length": 1119.1, |
| "completions/max_terminated_length": 1095.2, |
| "completions/mean_length": 461.95, |
| "completions/mean_terminated_length": 424.2075927734375, |
| "completions/min_length": 204.7, |
| "completions/min_terminated_length": 204.7, |
| "entropy": 0.15105342343449593, |
| "epoch": 1.0749185667752443, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 6.427795874049946e-06, |
| "loss": 0.0436, |
| "num_tokens": 25442066.0, |
| "reward": 0.978125, |
| "reward_std": 0.04218915030360222, |
| "rewards/qwen_accuracy_reward/mean": 0.978125, |
| "rewards/qwen_accuracy_reward/std": 0.07587221264839172, |
| "step": 330, |
| "step_time": 35.703045930247754 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1083.0, |
| "completions/max_terminated_length": 1083.0, |
| "completions/mean_length": 374.3875, |
| "completions/mean_terminated_length": 374.3875, |
| "completions/min_length": 168.6, |
| "completions/min_terminated_length": 168.6, |
| "entropy": 0.14668092131614685, |
| "epoch": 1.1074918566775245, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 6.319218241042345e-06, |
| "loss": 0.012, |
| "num_tokens": 26188086.0, |
| "reward": 0.946875, |
| "reward_std": 0.02651650384068489, |
| "rewards/qwen_accuracy_reward/mean": 0.946875, |
| "rewards/qwen_accuracy_reward/std": 0.1056659385561943, |
| "step": 340, |
| "step_time": 34.11438843393698 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0125, |
| "completions/max_length": 1756.2, |
| "completions/max_terminated_length": 1669.5, |
| "completions/mean_length": 522.075, |
| "completions/mean_terminated_length": 480.16993408203126, |
| "completions/min_length": 191.0, |
| "completions/min_terminated_length": 191.0, |
| "entropy": 0.16799205988645555, |
| "epoch": 1.1400651465798046, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 6.2106406080347455e-06, |
| "loss": 0.0095, |
| "num_tokens": 26977806.0, |
| "reward": 0.896875, |
| "reward_std": 0.02651650384068489, |
| "rewards/qwen_accuracy_reward/mean": 0.896875, |
| "rewards/qwen_accuracy_reward/std": 0.13685612380504608, |
| "step": 350, |
| "step_time": 53.24436394525692 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1369.8, |
| "completions/max_terminated_length": 1369.8, |
| "completions/mean_length": 441.03125, |
| "completions/mean_terminated_length": 441.03125, |
| "completions/min_length": 186.2, |
| "completions/min_terminated_length": 186.2, |
| "entropy": 0.16459481716156005, |
| "epoch": 1.1726384364820848, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 1.546875, |
| "learning_rate": 6.102062975027145e-06, |
| "loss": -0.0057, |
| "num_tokens": 27647848.0, |
| "reward": 0.959375, |
| "reward_std": 0.03061639815568924, |
| "rewards/qwen_accuracy_reward/mean": 0.959375, |
| "rewards/qwen_accuracy_reward/std": 0.09297246783971787, |
| "step": 360, |
| "step_time": 38.45815520407632 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1612.2, |
| "completions/max_terminated_length": 1579.0, |
| "completions/mean_length": 457.859375, |
| "completions/mean_terminated_length": 447.72197265625, |
| "completions/min_length": 166.2, |
| "completions/min_terminated_length": 166.2, |
| "entropy": 0.16912921741604806, |
| "epoch": 1.205211726384365, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 5.993485342019545e-06, |
| "loss": -0.0288, |
| "num_tokens": 28297795.0, |
| "reward": 0.925, |
| "reward_std": 0.0408231720328331, |
| "rewards/qwen_accuracy_reward/mean": 0.925, |
| "rewards/qwen_accuracy_reward/std": 0.16792239248752594, |
| "step": 370, |
| "step_time": 47.60929348124191 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1762.3, |
| "completions/max_terminated_length": 1711.7, |
| "completions/mean_length": 454.509375, |
| "completions/mean_terminated_length": 443.2507049560547, |
| "completions/min_length": 187.1, |
| "completions/min_terminated_length": 187.1, |
| "entropy": 0.16176492720842361, |
| "epoch": 1.237785016286645, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 5.884907709011944e-06, |
| "loss": 0.0015, |
| "num_tokens": 29009246.0, |
| "reward": 0.9625, |
| "reward_std": 0.013363061845302582, |
| "rewards/qwen_accuracy_reward/mean": 0.9625, |
| "rewards/qwen_accuracy_reward/std": 0.07759521007537842, |
| "step": 380, |
| "step_time": 53.365480937343094 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1627.8, |
| "completions/max_terminated_length": 1543.4, |
| "completions/mean_length": 474.075, |
| "completions/mean_terminated_length": 463.8240905761719, |
| "completions/min_length": 193.1, |
| "completions/min_terminated_length": 193.1, |
| "entropy": 0.17872475683689118, |
| "epoch": 1.2703583061889252, |
| "frac_reward_zero_std": 0.85, |
| "grad_norm": 0.0, |
| "learning_rate": 5.776330076004344e-06, |
| "loss": 0.0233, |
| "num_tokens": 29602982.0, |
| "reward": 0.86875, |
| "reward_std": 0.06123279631137848, |
| "rewards/qwen_accuracy_reward/mean": 0.86875, |
| "rewards/qwen_accuracy_reward/std": 0.23578283339738845, |
| "step": 390, |
| "step_time": 49.23055710773915 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1638.8, |
| "completions/max_terminated_length": 1631.1, |
| "completions/mean_length": 486.021875, |
| "completions/mean_terminated_length": 475.7992919921875, |
| "completions/min_length": 195.0, |
| "completions/min_terminated_length": 195.0, |
| "entropy": 0.15955362915992738, |
| "epoch": 1.3029315960912053, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 5.667752442996744e-06, |
| "loss": 0.0137, |
| "num_tokens": 30373933.0, |
| "reward": 0.946875, |
| "reward_std": 0.00883883461356163, |
| "rewards/qwen_accuracy_reward/mean": 0.946875, |
| "rewards/qwen_accuracy_reward/std": 0.1056659385561943, |
| "step": 400, |
| "step_time": 50.148436666186896 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1311.0, |
| "completions/max_terminated_length": 1311.0, |
| "completions/mean_length": 428.321875, |
| "completions/mean_terminated_length": 428.321875, |
| "completions/min_length": 169.8, |
| "completions/min_terminated_length": 169.8, |
| "entropy": 0.1585499659180641, |
| "epoch": 1.3355048859934853, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 5.559174809989143e-06, |
| "loss": -0.0122, |
| "num_tokens": 31204012.0, |
| "reward": 0.9625, |
| "reward_std": 0.042613483220338824, |
| "rewards/qwen_accuracy_reward/mean": 0.9625, |
| "rewards/qwen_accuracy_reward/std": 0.09328008741140366, |
| "step": 410, |
| "step_time": 40.44286519419402 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1511.4, |
| "completions/max_terminated_length": 1400.9, |
| "completions/mean_length": 477.434375, |
| "completions/mean_terminated_length": 455.53375244140625, |
| "completions/min_length": 214.7, |
| "completions/min_terminated_length": 214.7, |
| "entropy": 0.17368159890174867, |
| "epoch": 1.3680781758957654, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 5.4505971769815425e-06, |
| "loss": 0.0329, |
| "num_tokens": 31967207.0, |
| "reward": 0.99375, |
| "reward_std": 0.011572751402854919, |
| "rewards/qwen_accuracy_reward/mean": 0.99375, |
| "rewards/qwen_accuracy_reward/std": 0.024593468010425567, |
| "step": 420, |
| "step_time": 47.28061485029757 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1253.4, |
| "completions/max_terminated_length": 1253.4, |
| "completions/mean_length": 427.68125, |
| "completions/mean_terminated_length": 427.68125, |
| "completions/min_length": 184.1, |
| "completions/min_terminated_length": 184.1, |
| "entropy": 0.16302806735038758, |
| "epoch": 1.4006514657980456, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 5.342019543973942e-06, |
| "loss": 0.0154, |
| "num_tokens": 32547929.0, |
| "reward": 0.9875, |
| "reward_std": 0.02177756354212761, |
| "rewards/qwen_accuracy_reward/mean": 0.9875, |
| "rewards/qwen_accuracy_reward/std": 0.04729212671518326, |
| "step": 430, |
| "step_time": 39.09927195487544 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1847.1, |
| "completions/max_terminated_length": 1769.7, |
| "completions/mean_length": 478.76875, |
| "completions/mean_terminated_length": 468.09132690429686, |
| "completions/min_length": 192.7, |
| "completions/min_terminated_length": 192.7, |
| "entropy": 0.16723438948392869, |
| "epoch": 1.4332247557003257, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 5.233441910966341e-06, |
| "loss": 0.0242, |
| "num_tokens": 33257383.0, |
| "reward": 0.94375, |
| "reward_std": 0.02925042062997818, |
| "rewards/qwen_accuracy_reward/mean": 0.94375, |
| "rewards/qwen_accuracy_reward/std": 0.12826661467552186, |
| "step": 440, |
| "step_time": 54.6911054097116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1209.8, |
| "completions/max_terminated_length": 1209.8, |
| "completions/mean_length": 372.3625, |
| "completions/mean_terminated_length": 372.3625, |
| "completions/min_length": 177.4, |
| "completions/min_terminated_length": 177.4, |
| "entropy": 0.15510803908109666, |
| "epoch": 1.4657980456026058, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 5.124864277958741e-06, |
| "loss": 0.0094, |
| "num_tokens": 33972827.0, |
| "reward": 0.978125, |
| "reward_std": 0.036084231734275815, |
| "rewards/qwen_accuracy_reward/mean": 0.978125, |
| "rewards/qwen_accuracy_reward/std": 0.07880139350891113, |
| "step": 450, |
| "step_time": 38.53117633331567 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1607.4, |
| "completions/max_terminated_length": 1607.4, |
| "completions/mean_length": 427.8875, |
| "completions/mean_terminated_length": 427.8875, |
| "completions/min_length": 173.4, |
| "completions/min_terminated_length": 173.4, |
| "entropy": 0.15332257747650146, |
| "epoch": 1.498371335504886, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 5.016286644951141e-06, |
| "loss": 0.0191, |
| "num_tokens": 34752895.0, |
| "reward": 0.95, |
| "reward_std": 0.03535533845424652, |
| "rewards/qwen_accuracy_reward/mean": 0.95, |
| "rewards/qwen_accuracy_reward/std": 0.10367314666509628, |
| "step": 460, |
| "step_time": 46.25087994951755 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1709.7, |
| "completions/max_terminated_length": 1703.5, |
| "completions/mean_length": 474.934375, |
| "completions/mean_terminated_length": 464.940625, |
| "completions/min_length": 197.1, |
| "completions/min_terminated_length": 197.1, |
| "entropy": 0.15956022590398788, |
| "epoch": 1.5309446254071661, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 4.90770901194354e-06, |
| "loss": 0.0197, |
| "num_tokens": 35566530.0, |
| "reward": 0.95625, |
| "reward_std": 0.03335031494498253, |
| "rewards/qwen_accuracy_reward/mean": 0.95625, |
| "rewards/qwen_accuracy_reward/std": 0.11587972939014435, |
| "step": 470, |
| "step_time": 52.431317151151596 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.021875, |
| "completions/max_length": 1856.4, |
| "completions/max_terminated_length": 1411.0, |
| "completions/mean_length": 556.703125, |
| "completions/mean_terminated_length": 482.36143188476564, |
| "completions/min_length": 201.5, |
| "completions/min_terminated_length": 201.5, |
| "entropy": 0.1640054076910019, |
| "epoch": 1.5635179153094463, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 4.79913137893594e-06, |
| "loss": 0.0122, |
| "num_tokens": 36332819.0, |
| "reward": 0.90625, |
| "reward_std": 0.011572751402854919, |
| "rewards/qwen_accuracy_reward/mean": 0.90625, |
| "rewards/qwen_accuracy_reward/std": 0.17163818180561066, |
| "step": 480, |
| "step_time": 55.61120590567589 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1141.9, |
| "completions/max_terminated_length": 1141.9, |
| "completions/mean_length": 371.696875, |
| "completions/mean_terminated_length": 371.696875, |
| "completions/min_length": 172.6, |
| "completions/min_terminated_length": 172.6, |
| "entropy": 0.13739149868488312, |
| "epoch": 1.5960912052117264, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 4.690553745928339e-06, |
| "loss": 0.0168, |
| "num_tokens": 37149242.0, |
| "reward": 0.984375, |
| "reward_std": 0.022201896458864213, |
| "rewards/qwen_accuracy_reward/mean": 0.984375, |
| "rewards/qwen_accuracy_reward/std": 0.05127874463796615, |
| "step": 490, |
| "step_time": 37.01674561398104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 1862.1, |
| "completions/max_terminated_length": 1621.2, |
| "completions/mean_length": 530.93125, |
| "completions/mean_terminated_length": 478.1709289550781, |
| "completions/min_length": 202.8, |
| "completions/min_terminated_length": 202.8, |
| "entropy": 0.16189506649971008, |
| "epoch": 1.6286644951140063, |
| "frac_reward_zero_std": 0.85, |
| "grad_norm": 0.8828125, |
| "learning_rate": 4.5819761129207385e-06, |
| "loss": 0.0709, |
| "num_tokens": 37935508.0, |
| "reward": 0.946875, |
| "reward_std": 0.06396671310067177, |
| "rewards/qwen_accuracy_reward/mean": 0.946875, |
| "rewards/qwen_accuracy_reward/std": 0.1611790642142296, |
| "step": 500, |
| "step_time": 57.11876249546185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1254.1, |
| "completions/max_terminated_length": 1167.1, |
| "completions/mean_length": 411.725, |
| "completions/mean_terminated_length": 400.9833679199219, |
| "completions/min_length": 165.3, |
| "completions/min_terminated_length": 165.3, |
| "entropy": 0.14960483461618423, |
| "epoch": 1.6612377850162865, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 4.473398479913138e-06, |
| "loss": 0.0285, |
| "num_tokens": 38658348.0, |
| "reward": 0.99375, |
| "reward_std": 0.01767766922712326, |
| "rewards/qwen_accuracy_reward/mean": 0.99375, |
| "rewards/qwen_accuracy_reward/std": 0.03535533845424652, |
| "step": 510, |
| "step_time": 37.5699773571454 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 1664.6, |
| "completions/max_terminated_length": 1541.8, |
| "completions/mean_length": 507.371875, |
| "completions/mean_terminated_length": 477.1740295410156, |
| "completions/min_length": 199.8, |
| "completions/min_terminated_length": 199.8, |
| "entropy": 0.16616563498973846, |
| "epoch": 1.6938110749185666, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 4.364820846905538e-06, |
| "loss": 0.0223, |
| "num_tokens": 39415795.0, |
| "reward": 0.978125, |
| "reward_std": 0.036084231734275815, |
| "rewards/qwen_accuracy_reward/mean": 0.978125, |
| "rewards/qwen_accuracy_reward/std": 0.061483670771121976, |
| "step": 520, |
| "step_time": 49.47790257129818 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1499.9, |
| "completions/max_terminated_length": 1499.9, |
| "completions/mean_length": 381.7875, |
| "completions/mean_terminated_length": 381.7875, |
| "completions/min_length": 178.3, |
| "completions/min_terminated_length": 178.3, |
| "entropy": 0.1455918937921524, |
| "epoch": 1.7263843648208468, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 4.256243213897938e-06, |
| "loss": 0.0, |
| "num_tokens": 40196743.0, |
| "reward": 0.975, |
| "reward_std": 0.0, |
| "rewards/qwen_accuracy_reward/mean": 0.975, |
| "rewards/qwen_accuracy_reward/std": 0.04399413466453552, |
| "step": 530, |
| "step_time": 45.42760537136346 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 1877.8, |
| "completions/max_terminated_length": 1709.0, |
| "completions/mean_length": 515.225, |
| "completions/mean_terminated_length": 482.8147033691406, |
| "completions/min_length": 195.3, |
| "completions/min_terminated_length": 195.3, |
| "entropy": 0.16437555029988288, |
| "epoch": 1.758957654723127, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 4.147665580890337e-06, |
| "loss": 0.0303, |
| "num_tokens": 41000151.0, |
| "reward": 0.975, |
| "reward_std": 0.04261348247528076, |
| "rewards/qwen_accuracy_reward/mean": 0.975, |
| "rewards/qwen_accuracy_reward/std": 0.09354988187551498, |
| "step": 540, |
| "step_time": 57.456473257485776 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1165.0, |
| "completions/max_terminated_length": 1165.0, |
| "completions/mean_length": 423.43125, |
| "completions/mean_terminated_length": 423.43125, |
| "completions/min_length": 190.2, |
| "completions/min_terminated_length": 190.2, |
| "entropy": 0.16483787596225738, |
| "epoch": 1.791530944625407, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 4.039087947882737e-06, |
| "loss": -0.0021, |
| "num_tokens": 41720497.0, |
| "reward": 0.9125, |
| "reward_std": 0.013363061845302582, |
| "rewards/qwen_accuracy_reward/mean": 0.9125, |
| "rewards/qwen_accuracy_reward/std": 0.12839525938034058, |
| "step": 550, |
| "step_time": 36.695044124592094 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1311.7, |
| "completions/max_terminated_length": 1305.7, |
| "completions/mean_length": 479.98125, |
| "completions/mean_terminated_length": 470.1879028320312, |
| "completions/min_length": 199.5, |
| "completions/min_terminated_length": 199.5, |
| "entropy": 0.1649734303355217, |
| "epoch": 1.8241042345276872, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 3.9305103148751365e-06, |
| "loss": 0.0047, |
| "num_tokens": 42491363.0, |
| "reward": 0.9875, |
| "reward_std": 0.02925042062997818, |
| "rewards/qwen_accuracy_reward/mean": 0.9875, |
| "rewards/qwen_accuracy_reward/std": 0.059948806464672086, |
| "step": 560, |
| "step_time": 41.12004605270922 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2218.3, |
| "completions/max_terminated_length": 1861.4, |
| "completions/mean_length": 548.38125, |
| "completions/mean_terminated_length": 494.34682006835936, |
| "completions/min_length": 183.9, |
| "completions/min_terminated_length": 183.9, |
| "entropy": 0.17041560113430024, |
| "epoch": 1.8566775244299674, |
| "frac_reward_zero_std": 0.85, |
| "grad_norm": 1.40625, |
| "learning_rate": 3.8219326818675354e-06, |
| "loss": 0.0604, |
| "num_tokens": 43141245.0, |
| "reward": 0.91875, |
| "reward_std": 0.06260073557496071, |
| "rewards/qwen_accuracy_reward/mean": 0.91875, |
| "rewards/qwen_accuracy_reward/std": 0.19519128501415253, |
| "step": 570, |
| "step_time": 65.7936801508069 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1056.9, |
| "completions/max_terminated_length": 1056.9, |
| "completions/mean_length": 378.003125, |
| "completions/mean_terminated_length": 378.003125, |
| "completions/min_length": 188.2, |
| "completions/min_terminated_length": 188.2, |
| "entropy": 0.15374673902988434, |
| "epoch": 1.8892508143322475, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.7133550488599353e-06, |
| "loss": 0.0, |
| "num_tokens": 43920014.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/qwen_accuracy_reward/mean": 1.0, |
| "rewards/qwen_accuracy_reward/std": 0.0, |
| "step": 580, |
| "step_time": 34.265030450746416 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 1124.7, |
| "completions/max_terminated_length": 1011.9, |
| "completions/mean_length": 395.740625, |
| "completions/mean_terminated_length": 361.8580810546875, |
| "completions/min_length": 183.3, |
| "completions/min_terminated_length": 183.3, |
| "entropy": 0.15494132190942764, |
| "epoch": 1.9218241042345277, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 3.6047774158523346e-06, |
| "loss": 0.0321, |
| "num_tokens": 44712795.0, |
| "reward": 0.9875, |
| "reward_std": 0.02177756354212761, |
| "rewards/qwen_accuracy_reward/mean": 0.9875, |
| "rewards/qwen_accuracy_reward/std": 0.04729212671518326, |
| "step": 590, |
| "step_time": 37.20974960550666 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1213.0, |
| "completions/max_terminated_length": 1213.0, |
| "completions/mean_length": 404.8125, |
| "completions/mean_terminated_length": 404.8125, |
| "completions/min_length": 192.2, |
| "completions/min_terminated_length": 192.2, |
| "entropy": 0.13962563052773475, |
| "epoch": 1.9543973941368078, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 3.496199782844734e-06, |
| "loss": 0.0076, |
| "num_tokens": 45496631.0, |
| "reward": 0.9125, |
| "reward_std": 0.013363061845302582, |
| "rewards/qwen_accuracy_reward/mean": 0.9125, |
| "rewards/qwen_accuracy_reward/std": 0.16558347940444945, |
| "step": 600, |
| "step_time": 38.92747511789203 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01875, |
| "completions/max_length": 1518.7, |
| "completions/max_terminated_length": 916.1, |
| "completions/mean_length": 421.90625, |
| "completions/mean_terminated_length": 352.30673828125, |
| "completions/min_length": 176.6, |
| "completions/min_terminated_length": 176.6, |
| "entropy": 0.15325831845402718, |
| "epoch": 1.986970684039088, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 0.0, |
| "learning_rate": 3.387622149837134e-06, |
| "loss": 0.1029, |
| "num_tokens": 46185953.0, |
| "reward": 0.91875, |
| "reward_std": 0.04355512708425522, |
| "rewards/qwen_accuracy_reward/mean": 0.91875, |
| "rewards/qwen_accuracy_reward/std": 0.1526600480079651, |
| "step": 610, |
| "step_time": 45.04236122053116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1122.6, |
| "completions/max_terminated_length": 1122.6, |
| "completions/mean_length": 409.753125, |
| "completions/mean_terminated_length": 409.753125, |
| "completions/min_length": 187.6, |
| "completions/min_terminated_length": 187.6, |
| "entropy": 0.15086480602622032, |
| "epoch": 2.019543973941368, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 3.2790445168295332e-06, |
| "loss": -0.0012, |
| "num_tokens": 46967130.0, |
| "reward": 0.978125, |
| "reward_std": 0.00883883461356163, |
| "rewards/qwen_accuracy_reward/mean": 0.978125, |
| "rewards/qwen_accuracy_reward/std": 0.0420013427734375, |
| "step": 620, |
| "step_time": 36.174442971032114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1220.2, |
| "completions/max_terminated_length": 1154.4, |
| "completions/mean_length": 433.240625, |
| "completions/mean_terminated_length": 415.08563232421875, |
| "completions/min_length": 179.6, |
| "completions/min_terminated_length": 179.6, |
| "entropy": 0.1521160587668419, |
| "epoch": 2.0521172638436482, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.0, |
| "learning_rate": 3.1704668838219326e-06, |
| "loss": 0.02, |
| "num_tokens": 47658855.0, |
| "reward": 0.9625, |
| "reward_std": 0.05828612819314003, |
| "rewards/qwen_accuracy_reward/mean": 0.9625, |
| "rewards/qwen_accuracy_reward/std": 0.11276241540908813, |
| "step": 630, |
| "step_time": 39.28426020843908 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 904.7, |
| "completions/max_terminated_length": 904.7, |
| "completions/mean_length": 346.578125, |
| "completions/mean_terminated_length": 346.578125, |
| "completions/min_length": 186.1, |
| "completions/min_terminated_length": 186.1, |
| "entropy": 0.14097955524921418, |
| "epoch": 2.0846905537459284, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 3.061889250814333e-06, |
| "loss": 0.0, |
| "num_tokens": 48413328.0, |
| "reward": 0.975, |
| "reward_std": 0.0, |
| "rewards/qwen_accuracy_reward/mean": 0.975, |
| "rewards/qwen_accuracy_reward/std": 0.04399413466453552, |
| "step": 640, |
| "step_time": 31.522073939908296 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1237.6, |
| "completions/max_terminated_length": 1237.6, |
| "completions/mean_length": 428.15625, |
| "completions/mean_terminated_length": 428.15625, |
| "completions/min_length": 184.4, |
| "completions/min_terminated_length": 184.4, |
| "entropy": 0.15489777624607087, |
| "epoch": 2.1172638436482085, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.9533116178067322e-06, |
| "loss": 0.0, |
| "num_tokens": 49250122.0, |
| "reward": 0.95, |
| "reward_std": 0.0, |
| "rewards/qwen_accuracy_reward/mean": 0.95, |
| "rewards/qwen_accuracy_reward/std": 0.08798826932907104, |
| "step": 650, |
| "step_time": 38.97984252097085 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1062.7, |
| "completions/max_terminated_length": 1062.7, |
| "completions/mean_length": 399.328125, |
| "completions/mean_terminated_length": 399.328125, |
| "completions/min_length": 181.3, |
| "completions/min_terminated_length": 181.3, |
| "entropy": 0.14898339360952378, |
| "epoch": 2.1498371335504887, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 2.8447339847991316e-06, |
| "loss": 0.0039, |
| "num_tokens": 50033963.0, |
| "reward": 0.915625, |
| "reward_std": 0.03377464786171913, |
| "rewards/qwen_accuracy_reward/mean": 0.915625, |
| "rewards/qwen_accuracy_reward/std": 0.14567448943853378, |
| "step": 660, |
| "step_time": 33.721394206117836 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1256.5, |
| "completions/max_terminated_length": 1189.6, |
| "completions/mean_length": 410.984375, |
| "completions/mean_terminated_length": 399.96754150390626, |
| "completions/min_length": 195.5, |
| "completions/min_terminated_length": 195.5, |
| "entropy": 0.15526492446660994, |
| "epoch": 2.182410423452769, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 2.7361563517915314e-06, |
| "loss": 0.0314, |
| "num_tokens": 50775022.0, |
| "reward": 0.946875, |
| "reward_std": 0.00883883461356163, |
| "rewards/qwen_accuracy_reward/mean": 0.946875, |
| "rewards/qwen_accuracy_reward/std": 0.08967447578907013, |
| "step": 670, |
| "step_time": 39.73401907449588 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1545.4, |
| "completions/max_terminated_length": 1413.5, |
| "completions/mean_length": 461.90625, |
| "completions/mean_terminated_length": 439.9891723632812, |
| "completions/min_length": 191.5, |
| "completions/min_terminated_length": 191.5, |
| "entropy": 0.15921913534402848, |
| "epoch": 2.214983713355049, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 2.627578718783931e-06, |
| "loss": 0.0268, |
| "num_tokens": 51536384.0, |
| "reward": 0.96875, |
| "reward_std": 0.03471825420856476, |
| "rewards/qwen_accuracy_reward/mean": 0.96875, |
| "rewards/qwen_accuracy_reward/std": 0.08884271383285522, |
| "step": 680, |
| "step_time": 47.763417969178406 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01875, |
| "completions/max_length": 1956.4, |
| "completions/max_terminated_length": 1649.7, |
| "completions/mean_length": 517.278125, |
| "completions/mean_terminated_length": 450.4589599609375, |
| "completions/min_length": 182.3, |
| "completions/min_terminated_length": 182.3, |
| "entropy": 0.16205079928040506, |
| "epoch": 2.247557003257329, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.46875, |
| "learning_rate": 2.5190010857763302e-06, |
| "loss": 0.0888, |
| "num_tokens": 52323265.0, |
| "reward": 0.971875, |
| "reward_std": 0.05145231708884239, |
| "rewards/qwen_accuracy_reward/mean": 0.971875, |
| "rewards/qwen_accuracy_reward/std": 0.1004656806588173, |
| "step": 690, |
| "step_time": 58.18257061317563 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1702.6, |
| "completions/max_terminated_length": 1669.6, |
| "completions/mean_length": 443.4125, |
| "completions/mean_terminated_length": 432.5013061523438, |
| "completions/min_length": 183.3, |
| "completions/min_terminated_length": 183.3, |
| "entropy": 0.15109438076615334, |
| "epoch": 2.2801302931596092, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 2.4104234527687296e-06, |
| "loss": 0.0435, |
| "num_tokens": 53081109.0, |
| "reward": 0.9625, |
| "reward_std": 0.02925042062997818, |
| "rewards/qwen_accuracy_reward/mean": 0.9625, |
| "rewards/qwen_accuracy_reward/std": 0.10394294112920761, |
| "step": 700, |
| "step_time": 51.96710612634197 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1106.4, |
| "completions/max_terminated_length": 1106.4, |
| "completions/mean_length": 390.940625, |
| "completions/mean_terminated_length": 390.940625, |
| "completions/min_length": 186.6, |
| "completions/min_terminated_length": 186.6, |
| "entropy": 0.15517303124070167, |
| "epoch": 2.3127035830618894, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.0, |
| "learning_rate": 2.3018458197611294e-06, |
| "loss": 0.0292, |
| "num_tokens": 53864306.0, |
| "reward": 0.934375, |
| "reward_std": 0.04419417306780815, |
| "rewards/qwen_accuracy_reward/mean": 0.934375, |
| "rewards/qwen_accuracy_reward/std": 0.11064954251050949, |
| "step": 710, |
| "step_time": 32.47686854107305 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 923.7, |
| "completions/max_terminated_length": 923.7, |
| "completions/mean_length": 367.85625, |
| "completions/mean_terminated_length": 367.85625, |
| "completions/min_length": 179.6, |
| "completions/min_terminated_length": 179.6, |
| "entropy": 0.15150292664766313, |
| "epoch": 2.3452768729641695, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 2.193268186753529e-06, |
| "loss": 0.0006, |
| "num_tokens": 54577764.0, |
| "reward": 0.996875, |
| "reward_std": 0.00883883461356163, |
| "rewards/qwen_accuracy_reward/mean": 0.996875, |
| "rewards/qwen_accuracy_reward/std": 0.01767766922712326, |
| "step": 720, |
| "step_time": 30.78780706692487 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 1826.7, |
| "completions/max_terminated_length": 1750.5, |
| "completions/mean_length": 492.021875, |
| "completions/mean_terminated_length": 460.0340515136719, |
| "completions/min_length": 178.1, |
| "completions/min_terminated_length": 178.1, |
| "entropy": 0.15865328460931777, |
| "epoch": 2.3778501628664497, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 2.0846905537459286e-06, |
| "loss": 0.0263, |
| "num_tokens": 55272515.0, |
| "reward": 0.98125, |
| "reward_std": 0.03104073032736778, |
| "rewards/qwen_accuracy_reward/mean": 0.98125, |
| "rewards/qwen_accuracy_reward/std": 0.05456787198781967, |
| "step": 730, |
| "step_time": 55.55191990900785 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1124.4, |
| "completions/max_terminated_length": 1124.4, |
| "completions/mean_length": 408.946875, |
| "completions/mean_terminated_length": 408.946875, |
| "completions/min_length": 170.6, |
| "completions/min_terminated_length": 170.6, |
| "entropy": 0.16333993151783943, |
| "epoch": 2.41042345276873, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 1.976112920738328e-06, |
| "loss": 0.0104, |
| "num_tokens": 55921930.0, |
| "reward": 0.965625, |
| "reward_std": 0.01293872892856598, |
| "rewards/qwen_accuracy_reward/mean": 0.965625, |
| "rewards/qwen_accuracy_reward/std": 0.07360859215259552, |
| "step": 740, |
| "step_time": 34.814000389166175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1399.8, |
| "completions/max_terminated_length": 1388.3, |
| "completions/mean_length": 431.521875, |
| "completions/mean_terminated_length": 410.98146362304686, |
| "completions/min_length": 164.8, |
| "completions/min_terminated_length": 164.8, |
| "entropy": 0.14200911596417426, |
| "epoch": 2.44299674267101, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 1.8675352877307276e-06, |
| "loss": 0.019, |
| "num_tokens": 56772993.0, |
| "reward": 0.98125, |
| "reward_std": 0.02177756354212761, |
| "rewards/qwen_accuracy_reward/mean": 0.98125, |
| "rewards/qwen_accuracy_reward/std": 0.05456787198781967, |
| "step": 750, |
| "step_time": 43.50874480362982 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1684.0, |
| "completions/max_terminated_length": 1554.1, |
| "completions/mean_length": 515.384375, |
| "completions/mean_terminated_length": 494.017919921875, |
| "completions/min_length": 194.0, |
| "completions/min_terminated_length": 194.0, |
| "entropy": 0.17334669530391694, |
| "epoch": 2.47557003257329, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.328125, |
| "learning_rate": 1.7589576547231272e-06, |
| "loss": 0.0074, |
| "num_tokens": 57501516.0, |
| "reward": 0.959375, |
| "reward_std": 0.057342519611120225, |
| "rewards/qwen_accuracy_reward/mean": 0.959375, |
| "rewards/qwen_accuracy_reward/std": 0.09656921476125717, |
| "step": 760, |
| "step_time": 50.1735579572618 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.003125, |
| "completions/max_length": 1758.1, |
| "completions/max_terminated_length": 1663.3, |
| "completions/mean_length": 470.63125, |
| "completions/mean_terminated_length": 460.1435485839844, |
| "completions/min_length": 185.0, |
| "completions/min_terminated_length": 185.0, |
| "entropy": 0.16403108537197114, |
| "epoch": 2.5081433224755703, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 1.2421875, |
| "learning_rate": 1.6503800217155266e-06, |
| "loss": 0.0135, |
| "num_tokens": 58186406.0, |
| "reward": 0.99375, |
| "reward_std": 0.01767766922712326, |
| "rewards/qwen_accuracy_reward/mean": 0.99375, |
| "rewards/qwen_accuracy_reward/std": 0.03535533845424652, |
| "step": 770, |
| "step_time": 51.758546930458394 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1432.5, |
| "completions/max_terminated_length": 1432.5, |
| "completions/mean_length": 448.15, |
| "completions/mean_terminated_length": 448.15, |
| "completions/min_length": 169.4, |
| "completions/min_terminated_length": 169.4, |
| "entropy": 0.1642938271164894, |
| "epoch": 2.5407166123778504, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 1.5418023887079264e-06, |
| "loss": 0.0038, |
| "num_tokens": 58895790.0, |
| "reward": 0.978125, |
| "reward_std": 0.036084231734275815, |
| "rewards/qwen_accuracy_reward/mean": 0.978125, |
| "rewards/qwen_accuracy_reward/std": 0.06321553289890289, |
| "step": 780, |
| "step_time": 43.58511639842764 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 2056.4, |
| "completions/max_terminated_length": 2051.1, |
| "completions/mean_length": 555.934375, |
| "completions/mean_terminated_length": 527.9370727539062, |
| "completions/min_length": 191.2, |
| "completions/min_terminated_length": 191.2, |
| "entropy": 0.16091172024607658, |
| "epoch": 2.5732899022801305, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 0.0, |
| "learning_rate": 1.433224755700326e-06, |
| "loss": -0.0108, |
| "num_tokens": 59586505.0, |
| "reward": 0.89375, |
| "reward_std": 0.055127878487110135, |
| "rewards/qwen_accuracy_reward/mean": 0.89375, |
| "rewards/qwen_accuracy_reward/std": 0.2057945430278778, |
| "step": 790, |
| "step_time": 58.492029800172894 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 697.5, |
| "completions/max_terminated_length": 697.5, |
| "completions/mean_length": 322.95, |
| "completions/mean_terminated_length": 322.95, |
| "completions/min_length": 165.9, |
| "completions/min_terminated_length": 165.9, |
| "entropy": 0.14762159138917924, |
| "epoch": 2.6058631921824107, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 1.3246471226927254e-06, |
| "loss": -0.0081, |
| "num_tokens": 60234105.0, |
| "reward": 0.925, |
| "reward_std": 0.01767766922712326, |
| "rewards/qwen_accuracy_reward/mean": 0.925, |
| "rewards/qwen_accuracy_reward/std": 0.1476672813296318, |
| "step": 800, |
| "step_time": 23.70481554856524 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 2111.5, |
| "completions/max_terminated_length": 1792.0, |
| "completions/mean_length": 557.365625, |
| "completions/mean_terminated_length": 523.718637084961, |
| "completions/min_length": 207.9, |
| "completions/min_terminated_length": 207.9, |
| "entropy": 0.181430846452713, |
| "epoch": 2.6384364820846904, |
| "frac_reward_zero_std": 0.85, |
| "grad_norm": 0.0, |
| "learning_rate": 1.216069489685125e-06, |
| "loss": 0.09, |
| "num_tokens": 60907670.0, |
| "reward": 0.9125, |
| "reward_std": 0.06670062988996506, |
| "rewards/qwen_accuracy_reward/mean": 0.9125, |
| "rewards/qwen_accuracy_reward/std": 0.20204702019691467, |
| "step": 810, |
| "step_time": 57.087904060911384 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1913.1, |
| "completions/max_terminated_length": 1700.8, |
| "completions/mean_length": 471.440625, |
| "completions/mean_terminated_length": 449.9090637207031, |
| "completions/min_length": 188.8, |
| "completions/min_terminated_length": 188.8, |
| "entropy": 0.16233009248971939, |
| "epoch": 2.6710097719869705, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 1.3046875, |
| "learning_rate": 1.1074918566775244e-06, |
| "loss": 0.0312, |
| "num_tokens": 61661099.0, |
| "reward": 0.984375, |
| "reward_std": 0.03061639815568924, |
| "rewards/qwen_accuracy_reward/mean": 0.984375, |
| "rewards/qwen_accuracy_reward/std": 0.06496979594230652, |
| "step": 820, |
| "step_time": 56.08578431969509 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1505.7, |
| "completions/max_terminated_length": 1505.7, |
| "completions/mean_length": 430.6125, |
| "completions/mean_terminated_length": 430.6125, |
| "completions/min_length": 189.9, |
| "completions/min_terminated_length": 189.9, |
| "entropy": 0.16054447889328002, |
| "epoch": 2.7035830618892507, |
| "frac_reward_zero_std": 0.975, |
| "grad_norm": 0.0, |
| "learning_rate": 9.989142236699242e-07, |
| "loss": -0.0073, |
| "num_tokens": 62382591.0, |
| "reward": 0.99375, |
| "reward_std": 0.011572751402854919, |
| "rewards/qwen_accuracy_reward/mean": 0.99375, |
| "rewards/qwen_accuracy_reward/std": 0.024593468010425567, |
| "step": 830, |
| "step_time": 44.93511769743636 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1307.6, |
| "completions/max_terminated_length": 1207.0, |
| "completions/mean_length": 426.103125, |
| "completions/mean_terminated_length": 403.6597930908203, |
| "completions/min_length": 215.9, |
| "completions/min_terminated_length": 215.9, |
| "entropy": 0.16358136087656022, |
| "epoch": 2.736156351791531, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 8.903365906623236e-07, |
| "loss": 0.0463, |
| "num_tokens": 63060464.0, |
| "reward": 0.9875, |
| "reward_std": 0.02177756354212761, |
| "rewards/qwen_accuracy_reward/mean": 0.9875, |
| "rewards/qwen_accuracy_reward/std": 0.04729212671518326, |
| "step": 840, |
| "step_time": 40.77616196591407 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 691.6, |
| "completions/max_terminated_length": 691.6, |
| "completions/mean_length": 307.434375, |
| "completions/mean_terminated_length": 307.434375, |
| "completions/min_length": 162.8, |
| "completions/min_terminated_length": 162.8, |
| "entropy": 0.13495604172348977, |
| "epoch": 2.768729641693811, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 7.817589576547231e-07, |
| "loss": 0.0, |
| "num_tokens": 63820595.0, |
| "reward": 0.975, |
| "reward_std": 0.0, |
| "rewards/qwen_accuracy_reward/mean": 0.975, |
| "rewards/qwen_accuracy_reward/std": 0.04399413466453552, |
| "step": 850, |
| "step_time": 24.526741536986084 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1168.6, |
| "completions/max_terminated_length": 1168.6, |
| "completions/mean_length": 406.71875, |
| "completions/mean_terminated_length": 406.71875, |
| "completions/min_length": 198.2, |
| "completions/min_terminated_length": 198.2, |
| "entropy": 0.1559869095683098, |
| "epoch": 2.801302931596091, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 1.71875, |
| "learning_rate": 6.731813246471228e-07, |
| "loss": 0.0029, |
| "num_tokens": 64580849.0, |
| "reward": 0.953125, |
| "reward_std": 0.02041158601641655, |
| "rewards/qwen_accuracy_reward/mean": 0.953125, |
| "rewards/qwen_accuracy_reward/std": 0.10132758170366288, |
| "step": 860, |
| "step_time": 35.53059697123244 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01875, |
| "completions/max_length": 1577.3, |
| "completions/max_terminated_length": 1516.9, |
| "completions/mean_length": 498.10625, |
| "completions/mean_terminated_length": 433.7035827636719, |
| "completions/min_length": 175.3, |
| "completions/min_terminated_length": 175.3, |
| "entropy": 0.1583547368645668, |
| "epoch": 2.8338762214983713, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.0, |
| "learning_rate": 5.646036916395223e-07, |
| "loss": 0.0275, |
| "num_tokens": 65364547.0, |
| "reward": 0.965625, |
| "reward_std": 0.03787454217672348, |
| "rewards/qwen_accuracy_reward/mean": 0.965625, |
| "rewards/qwen_accuracy_reward/std": 0.09508474618196487, |
| "step": 870, |
| "step_time": 47.49830629490316 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 1447.2, |
| "completions/max_terminated_length": 1369.3, |
| "completions/mean_length": 457.55625, |
| "completions/mean_terminated_length": 435.249169921875, |
| "completions/min_length": 181.7, |
| "completions/min_terminated_length": 181.7, |
| "entropy": 0.16112774163484572, |
| "epoch": 2.8664495114006514, |
| "frac_reward_zero_std": 0.875, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.5602605863192187e-07, |
| "loss": 0.0422, |
| "num_tokens": 66148253.0, |
| "reward": 0.978125, |
| "reward_std": 0.04966200664639473, |
| "rewards/qwen_accuracy_reward/mean": 0.978125, |
| "rewards/qwen_accuracy_reward/std": 0.10221994370222091, |
| "step": 880, |
| "step_time": 43.340578782279046 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1843.3, |
| "completions/max_terminated_length": 1843.3, |
| "completions/mean_length": 497.509375, |
| "completions/mean_terminated_length": 497.509375, |
| "completions/min_length": 191.2, |
| "completions/min_terminated_length": 191.2, |
| "entropy": 0.17072843462228776, |
| "epoch": 2.8990228013029316, |
| "frac_reward_zero_std": 0.95, |
| "grad_norm": 0.0, |
| "learning_rate": 3.474484256243214e-07, |
| "loss": 0.0145, |
| "num_tokens": 66879112.0, |
| "reward": 0.965625, |
| "reward_std": 0.02041158601641655, |
| "rewards/qwen_accuracy_reward/mean": 0.965625, |
| "rewards/qwen_accuracy_reward/std": 0.08626527190208436, |
| "step": 890, |
| "step_time": 53.65781031670049 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 1572.3, |
| "completions/max_terminated_length": 1323.3, |
| "completions/mean_length": 452.584375, |
| "completions/mean_terminated_length": 420.08506469726564, |
| "completions/min_length": 194.6, |
| "completions/min_terminated_length": 194.6, |
| "entropy": 0.1676468499004841, |
| "epoch": 2.9315960912052117, |
| "frac_reward_zero_std": 0.925, |
| "grad_norm": 0.828125, |
| "learning_rate": 2.3887079261672097e-07, |
| "loss": 0.0376, |
| "num_tokens": 67548691.0, |
| "reward": 0.915625, |
| "reward_std": 0.036084231734275815, |
| "rewards/qwen_accuracy_reward/mean": 0.915625, |
| "rewards/qwen_accuracy_reward/std": 0.15649925023317338, |
| "step": 900, |
| "step_time": 47.34689696319401 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1042.7, |
| "completions/max_terminated_length": 1042.7, |
| "completions/mean_length": 393.828125, |
| "completions/mean_terminated_length": 393.828125, |
| "completions/min_length": 186.9, |
| "completions/min_terminated_length": 186.9, |
| "entropy": 0.15125710666179656, |
| "epoch": 2.964169381107492, |
| "frac_reward_zero_std": 0.9, |
| "grad_norm": 1.7109375, |
| "learning_rate": 1.3029315960912054e-07, |
| "loss": 0.0054, |
| "num_tokens": 68345092.0, |
| "reward": 0.959375, |
| "reward_std": 0.04218914955854416, |
| "rewards/qwen_accuracy_reward/mean": 0.959375, |
| "rewards/qwen_accuracy_reward/std": 0.11388693749904633, |
| "step": 910, |
| "step_time": 32.79988148277626 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 708.5, |
| "completions/max_terminated_length": 708.5, |
| "completions/mean_length": 309.83125, |
| "completions/mean_terminated_length": 309.83125, |
| "completions/min_length": 173.2, |
| "completions/min_terminated_length": 173.2, |
| "entropy": 0.13616923689842225, |
| "epoch": 2.996742671009772, |
| "frac_reward_zero_std": 1.0, |
| "grad_norm": 0.0, |
| "learning_rate": 2.171552660152009e-08, |
| "loss": 0.0, |
| "num_tokens": 69106806.0, |
| "reward": 1.0, |
| "reward_std": 0.0, |
| "rewards/qwen_accuracy_reward/mean": 1.0, |
| "rewards/qwen_accuracy_reward/std": 0.0, |
| "step": 920, |
| "step_time": 25.47872376209125 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 921, |
| "num_input_tokens_seen": 69178036, |
| "num_train_epochs": 3, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|