| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 50.0, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 389.0, | |
| "completions/mean_length": 377.5, | |
| "completions/min_length": 366.0, | |
| "epoch": 0.002, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2745305299758911, | |
| "kl": 0.0, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 0.0, | |
| "reward": 0.5, | |
| "reward_std": 0.7071067690849304, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.7071067690849304, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.25, | |
| "completions/max_length": 712.75, | |
| "completions/mean_length": 689.375, | |
| "completions/min_length": 666.0, | |
| "epoch": 0.01, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0006804907461628318, | |
| "kl": 9.946800855686888e-05, | |
| "learning_rate": 1e-05, | |
| "loss": 4.000145054305904e-06, | |
| "reward": 0.25, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.25, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0024464832618832587, | |
| "clip_ratio/high_mean": 0.0024464832618832587, | |
| "clip_ratio/low_mean": 0.0001640689093619585, | |
| "clip_ratio/low_min": 0.0001640689093619585, | |
| "clip_ratio/region_mean": 0.002610552171245217, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 497.2, | |
| "completions/mean_length": 485.6, | |
| "completions/min_length": 474.0, | |
| "epoch": 0.02, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.22628173232078552, | |
| "kl": 0.0002987155457958579, | |
| "learning_rate": 2e-05, | |
| "loss": -0.0003628176636993885, | |
| "reward": 0.6, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 421.6, | |
| "completions/mean_length": 390.2, | |
| "completions/min_length": 358.8, | |
| "epoch": 0.03, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0014656687853857875, | |
| "kl": 0.00022319573326967657, | |
| "learning_rate": 3e-05, | |
| "loss": -1.4230319357011467e-05, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 486.0, | |
| "completions/mean_length": 444.9, | |
| "completions/min_length": 403.8, | |
| "epoch": 0.04, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0009142484632320702, | |
| "kl": 0.0004241452901624143, | |
| "learning_rate": 4e-05, | |
| "loss": 1.7114212096203118e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0005540780373848974, | |
| "clip_ratio/high_mean": 0.0005540780373848974, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0005540780373848974, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 787.4, | |
| "completions/mean_length": 738.5, | |
| "completions/min_length": 689.6, | |
| "epoch": 0.05, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.0029386563692241907, | |
| "kl": 0.0017149186198366806, | |
| "learning_rate": 5e-05, | |
| "loss": 8.679315214976668e-05, | |
| "reward": 0.4, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0006403414998203516, | |
| "clip_ratio/high_mean": 0.0006403414998203516, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0006403414998203516, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 647.6, | |
| "completions/mean_length": 566.2, | |
| "completions/min_length": 484.8, | |
| "epoch": 0.06, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0006255562184378505, | |
| "kl": 0.0003066264180233702, | |
| "learning_rate": 4.9986331433523156e-05, | |
| "loss": 1.4322872448246927e-05, | |
| "reward": 0.3, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.3, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3, | |
| "completions/max_length": 686.4, | |
| "completions/mean_length": 664.7, | |
| "completions/min_length": 643.0, | |
| "epoch": 0.07, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0014924455899745226, | |
| "kl": 0.00036709415726363657, | |
| "learning_rate": 4.994534068046937e-05, | |
| "loss": 1.4717187150381506e-05, | |
| "reward": 0.4, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00040349699556827543, | |
| "clip_ratio/high_mean": 0.00040349699556827543, | |
| "clip_ratio/low_mean": 0.00040349699556827543, | |
| "clip_ratio/low_min": 0.00040349699556827543, | |
| "clip_ratio/region_mean": 0.0008069939911365509, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 681.6, | |
| "completions/mean_length": 587.4, | |
| "completions/min_length": 493.2, | |
| "epoch": 0.08, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0019065124215558171, | |
| "kl": 0.0004894518526270986, | |
| "learning_rate": 4.9877072563625285e-05, | |
| "loss": -0.00015065595507621766, | |
| "reward": 0.3, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.3, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 677.2, | |
| "completions/mean_length": 661.8, | |
| "completions/min_length": 646.4, | |
| "epoch": 0.09, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.000817921943962574, | |
| "kl": 0.0003075484826695174, | |
| "learning_rate": 4.978160173317438e-05, | |
| "loss": 1.24339887406677e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 656.8, | |
| "completions/mean_length": 621.3, | |
| "completions/min_length": 585.8, | |
| "epoch": 0.1, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.15634514391422272, | |
| "kl": 0.000459137320285663, | |
| "learning_rate": 4.965903258506806e-05, | |
| "loss": 2.121384022757411e-06, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4, | |
| "completions/max_length": 677.0, | |
| "completions/mean_length": 671.2, | |
| "completions/min_length": 665.4, | |
| "epoch": 0.11, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0011203172616660595, | |
| "kl": 0.0012636209139600396, | |
| "learning_rate": 4.9509499146870236e-05, | |
| "loss": 4.886850947514176e-05, | |
| "reward": 0.2, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.2, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00022246940061450006, | |
| "clip_ratio/high_mean": 0.00022246940061450006, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00022246940061450006, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 418.2, | |
| "completions/mean_length": 400.8, | |
| "completions/min_length": 383.4, | |
| "epoch": 0.12, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0010890079429373145, | |
| "kl": 0.0009534806886222214, | |
| "learning_rate": 4.933316493120015e-05, | |
| "loss": 2.114146773237735e-05, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3, | |
| "completions/max_length": 681.6, | |
| "completions/mean_length": 648.9, | |
| "completions/min_length": 616.2, | |
| "epoch": 0.13, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0018795138457790017, | |
| "kl": 0.0005977108958177269, | |
| "learning_rate": 4.913022275693372e-05, | |
| "loss": 2.4121845490299167e-05, | |
| "reward": 0.4, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0009542598738335073, | |
| "clip_ratio/high_mean": 0.0009542598738335073, | |
| "clip_ratio/low_mean": 0.00020920501556247472, | |
| "clip_ratio/low_min": 0.00020920501556247472, | |
| "clip_ratio/region_mean": 0.0011634649126790464, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 435.8, | |
| "completions/mean_length": 412.4, | |
| "completions/min_length": 389.0, | |
| "epoch": 0.14, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.25056585669517517, | |
| "kl": 0.0006330947682727129, | |
| "learning_rate": 4.8900894538358944e-05, | |
| "loss": -0.00019633164629340172, | |
| "reward": 0.4, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 611.0, | |
| "completions/mean_length": 575.0, | |
| "completions/min_length": 539.0, | |
| "epoch": 0.15, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002435472793877125, | |
| "kl": 0.0007646598271094263, | |
| "learning_rate": 4.864543104251587e-05, | |
| "loss": 3.094758721999824e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00010515246540307999, | |
| "clip_ratio/high_mean": 0.00010515246540307999, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00010515246540307999, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 668.6, | |
| "completions/mean_length": 606.7, | |
| "completions/min_length": 544.8, | |
| "epoch": 0.16, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.11095567792654037, | |
| "kl": 0.0007162548077758402, | |
| "learning_rate": 4.8364111614986527e-05, | |
| "loss": 7.679397240281104e-05, | |
| "reward": 0.3, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.3, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 568.8, | |
| "completions/mean_length": 527.3, | |
| "completions/min_length": 485.8, | |
| "epoch": 0.17, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0033143432810902596, | |
| "kl": 0.0006754565751180053, | |
| "learning_rate": 4.805724387443462e-05, | |
| "loss": 4.5427383156493306e-05, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0003456221194937825, | |
| "clip_ratio/high_mean": 0.0003456221194937825, | |
| "clip_ratio/low_mean": 0.00011520737316459418, | |
| "clip_ratio/low_min": 0.00011520737316459418, | |
| "clip_ratio/region_mean": 0.0004608294926583767, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 831.6, | |
| "completions/mean_length": 796.0, | |
| "completions/min_length": 760.4, | |
| "epoch": 0.18, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0037328507751226425, | |
| "kl": 0.0010223451943602413, | |
| "learning_rate": 4.7725163376229064e-05, | |
| "loss": 4.158227238804102e-05, | |
| "reward": 0.3, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.3, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 497.4, | |
| "completions/mean_length": 441.2, | |
| "completions/min_length": 385.0, | |
| "epoch": 0.19, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.002750825835391879, | |
| "kl": 0.0019137584429699927, | |
| "learning_rate": 4.736823324551909e-05, | |
| "loss": 0.00014175053220242262, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 95 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0005050505045801401, | |
| "clip_ratio/high_mean": 0.0005050505045801401, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0005050505045801401, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 508.2, | |
| "completions/mean_length": 493.9, | |
| "completions/min_length": 479.6, | |
| "epoch": 0.2, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.4193066656589508, | |
| "kl": 0.0017093931266572327, | |
| "learning_rate": 4.698684378016222e-05, | |
| "loss": 0.00012627228861674666, | |
| "reward": 0.8, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 360.8, | |
| "completions/mean_length": 341.7, | |
| "completions/min_length": 322.6, | |
| "epoch": 0.21, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.007469469215720892, | |
| "kl": 0.003066345490515232, | |
| "learning_rate": 4.6581412023939354e-05, | |
| "loss": 0.00012305844575166702, | |
| "reward": 1.0, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 1.0, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0001224739709869027, | |
| "clip_ratio/high_mean": 0.0001224739709869027, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0001224739709869027, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 611.2, | |
| "completions/mean_length": 572.1, | |
| "completions/min_length": 533.0, | |
| "epoch": 0.22, | |
| "frac_reward_zero_std": 0.4, | |
| "grad_norm": 0.0033601748291403055, | |
| "kl": 0.003409948293119669, | |
| "learning_rate": 4.6152381310523387e-05, | |
| "loss": 0.00023221683222800492, | |
| "reward": 0.5, | |
| "reward_std": 0.42426406145095824, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.42426406145095824, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 734.0, | |
| "completions/mean_length": 692.2, | |
| "completions/min_length": 650.4, | |
| "epoch": 0.23, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0015698346542194486, | |
| "kl": 0.0008670258859638125, | |
| "learning_rate": 4.5700220778700504e-05, | |
| "loss": 0.0001534310751594603, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 723.8, | |
| "completions/mean_length": 682.2, | |
| "completions/min_length": 640.6, | |
| "epoch": 0.24, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.010230440646409988, | |
| "kl": 0.005100146430777386, | |
| "learning_rate": 4.522542485937369e-05, | |
| "loss": 0.00020869788713753223, | |
| "reward": 0.4, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4, | |
| "completions/max_length": 922.4, | |
| "completions/mean_length": 872.0, | |
| "completions/min_length": 821.6, | |
| "epoch": 0.25, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002747906604781747, | |
| "kl": 0.0014180985395796596, | |
| "learning_rate": 4.4728512734909844e-05, | |
| "loss": 5.677485605701804e-05, | |
| "reward": 0.0, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.0, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 405.8, | |
| "completions/mean_length": 388.4, | |
| "completions/min_length": 371.0, | |
| "epoch": 0.26, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0014852522872388363, | |
| "kl": 0.001983049605041742, | |
| "learning_rate": 4.421002777142148e-05, | |
| "loss": 7.918149349279701e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0001583531266078353, | |
| "clip_ratio/low_min": 0.0001583531266078353, | |
| "clip_ratio/region_mean": 0.0001583531266078353, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 458.4, | |
| "completions/mean_length": 429.9, | |
| "completions/min_length": 401.4, | |
| "epoch": 0.27, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.01503363810479641, | |
| "kl": 0.005801378504838794, | |
| "learning_rate": 4.367053692460385e-05, | |
| "loss": 0.0004215865395963192, | |
| "reward": 0.6, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 645.2, | |
| "completions/mean_length": 537.3, | |
| "completions/min_length": 429.4, | |
| "epoch": 0.28, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002116028917953372, | |
| "kl": 0.005066322290804237, | |
| "learning_rate": 4.311063011977723e-05, | |
| "loss": 0.00020899884402751922, | |
| "reward": 0.4, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 375.8, | |
| "completions/mean_length": 354.6, | |
| "completions/min_length": 333.4, | |
| "epoch": 0.29, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.006887642201036215, | |
| "kl": 0.009827147470787168, | |
| "learning_rate": 4.2530919606812216e-05, | |
| "loss": 0.0003938500303775072, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 416.4, | |
| "completions/mean_length": 366.7, | |
| "completions/min_length": 317.0, | |
| "epoch": 0.3, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.002672386122867465, | |
| "kl": 0.008298561931587756, | |
| "learning_rate": 4.193203929064353e-05, | |
| "loss": 0.00047482880763709544, | |
| "reward": 0.9, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.9, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 640.8, | |
| "completions/mean_length": 594.7, | |
| "completions/min_length": 548.6, | |
| "epoch": 0.31, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0023163340520113707, | |
| "kl": 0.0016974479891359805, | |
| "learning_rate": 4.131464403810422e-05, | |
| "loss": 6.728884764015675e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 381.2, | |
| "completions/mean_length": 353.1, | |
| "completions/min_length": 325.0, | |
| "epoch": 0.32, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0056184823624789715, | |
| "kl": 0.006532504153437912, | |
| "learning_rate": 4.067940896183843e-05, | |
| "loss": 0.0001474126009270549, | |
| "reward": 0.9, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.9, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 499.4, | |
| "completions/mean_length": 479.6, | |
| "completions/min_length": 459.8, | |
| "epoch": 0.33, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.007484205532819033, | |
| "kl": 0.0038784807082265617, | |
| "learning_rate": 4.002702868207563e-05, | |
| "loss": 0.0001543789985589683, | |
| "reward": 0.4, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0006509372964501381, | |
| "clip_ratio/high_mean": 0.0006509372964501381, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0006509372964501381, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 658.2, | |
| "completions/mean_length": 576.8, | |
| "completions/min_length": 495.4, | |
| "epoch": 0.34, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.004718282260000706, | |
| "kl": 0.007649715105071664, | |
| "learning_rate": 3.935821656707359e-05, | |
| "loss": 0.0005094979424029589, | |
| "reward": 0.8, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 587.4, | |
| "completions/mean_length": 544.4, | |
| "completions/min_length": 501.4, | |
| "epoch": 0.35, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0019858903251588345, | |
| "kl": 0.0035309843719005586, | |
| "learning_rate": 3.867370395306068e-05, | |
| "loss": 6.014038226567209e-05, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 639.6, | |
| "completions/mean_length": 594.5, | |
| "completions/min_length": 549.4, | |
| "epoch": 0.36, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.015034261159598827, | |
| "kl": 0.003863858920522034, | |
| "learning_rate": 3.797423934453038e-05, | |
| "loss": 0.0001626830198802054, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3, | |
| "completions/max_length": 805.0, | |
| "completions/mean_length": 710.5, | |
| "completions/min_length": 616.0, | |
| "epoch": 0.37, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0022431982215493917, | |
| "kl": 0.0025493323453702034, | |
| "learning_rate": 3.726058759576271e-05, | |
| "loss": 0.0001080367248505354, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.00020140986889600755, | |
| "clip_ratio/low_min": 0.00020140986889600755, | |
| "clip_ratio/region_mean": 0.00020140986889600755, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 384.8, | |
| "completions/mean_length": 366.2, | |
| "completions/min_length": 347.6, | |
| "epoch": 0.38, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.019761426374316216, | |
| "kl": 0.011293478566221893, | |
| "learning_rate": 3.65335290744672e-05, | |
| "loss": 0.0005690994672477246, | |
| "reward": 0.9, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.9, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.00012698412174358965, | |
| "clip_ratio/low_min": 0.00012698412174358965, | |
| "clip_ratio/region_mean": 0.00012698412174358965, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 703.6, | |
| "completions/mean_length": 662.9, | |
| "completions/min_length": 622.2, | |
| "epoch": 0.39, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.002572572324424982, | |
| "kl": 0.0015339702018536626, | |
| "learning_rate": 3.579385880846232e-05, | |
| "loss": 2.8236012440174817e-06, | |
| "reward": 0.3, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.3, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 195 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 643.2, | |
| "completions/mean_length": 612.7, | |
| "completions/min_length": 582.2, | |
| "epoch": 0.4, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0031194768380373716, | |
| "kl": 0.002493513422086835, | |
| "learning_rate": 3.504238561632424e-05, | |
| "loss": 9.978280868381262e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.3, | |
| "completions/max_length": 674.2, | |
| "completions/mean_length": 638.7, | |
| "completions/min_length": 603.2, | |
| "epoch": 0.41, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0036480328999459743, | |
| "kl": 0.0019606892135925593, | |
| "learning_rate": 3.427993122295552e-05, | |
| "loss": 7.743847672827541e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 632.4, | |
| "completions/mean_length": 570.5, | |
| "completions/min_length": 508.6, | |
| "epoch": 0.42, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.00799116026610136, | |
| "kl": 0.003256951330695301, | |
| "learning_rate": 3.350732936104108e-05, | |
| "loss": 0.00010458981851115822, | |
| "reward": 0.3, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.3, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 315.6, | |
| "completions/mean_length": 300.6, | |
| "completions/min_length": 285.6, | |
| "epoch": 0.43, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.005798510741442442, | |
| "kl": 0.011395945539698004, | |
| "learning_rate": 3.272542485937369e-05, | |
| "loss": 0.00045509766787290575, | |
| "reward": 0.8, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4, | |
| "completions/max_length": 777.6, | |
| "completions/mean_length": 756.6, | |
| "completions/min_length": 735.6, | |
| "epoch": 0.44, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.00223241513594985, | |
| "kl": 0.0014477839809842407, | |
| "learning_rate": 3.1935072719046115e-05, | |
| "loss": 5.752563010901213e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00031645570416003463, | |
| "clip_ratio/high_mean": 0.00031645570416003463, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00031645570416003463, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 443.8, | |
| "completions/mean_length": 398.4, | |
| "completions/min_length": 353.0, | |
| "epoch": 0.45, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.0020589185878634453, | |
| "kl": 0.0025043860776349904, | |
| "learning_rate": 3.1137137178519985e-05, | |
| "loss": 0.0001688675722107291, | |
| "reward": 0.4, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio/high_max": 9.955201530829073e-05, | |
| "clip_ratio/high_mean": 9.955201530829073e-05, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 9.955201530829073e-05, | |
| "completions/clipped_ratio": 0.3, | |
| "completions/max_length": 747.4, | |
| "completions/mean_length": 715.4, | |
| "completions/min_length": 683.4, | |
| "epoch": 0.46, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.002850248944014311, | |
| "kl": 0.0052430763142183425, | |
| "learning_rate": 3.0332490768593675e-05, | |
| "loss": 0.00025521754287183285, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0004385964944958687, | |
| "clip_ratio/high_mean": 0.0004385964944958687, | |
| "clip_ratio/low_mean": 0.00021929824724793435, | |
| "clip_ratio/low_min": 0.00021929824724793435, | |
| "clip_ratio/region_mean": 0.000657894741743803, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 625.8, | |
| "completions/mean_length": 551.9, | |
| "completions/min_length": 478.0, | |
| "epoch": 0.47, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.005838941317051649, | |
| "kl": 0.002799734321888536, | |
| "learning_rate": 2.952201335830275e-05, | |
| "loss": 7.562300888821482e-05, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.000272479560226202, | |
| "clip_ratio/high_mean": 0.000272479560226202, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.000272479560226202, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 530.6, | |
| "completions/mean_length": 480.4, | |
| "completions/min_length": 430.2, | |
| "epoch": 0.48, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.003879491239786148, | |
| "kl": 0.0028692058520391585, | |
| "learning_rate": 2.870659119279605e-05, | |
| "loss": 0.0001448941882699728, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 482.6, | |
| "completions/mean_length": 463.6, | |
| "completions/min_length": 444.6, | |
| "epoch": 0.49, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.008210963569581509, | |
| "kl": 0.0075855673989281055, | |
| "learning_rate": 2.788711592423966e-05, | |
| "loss": 0.0003023243509232998, | |
| "reward": 0.8, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0002461538417264819, | |
| "clip_ratio/high_mean": 0.0002461538417264819, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0002461538417264819, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 775.4, | |
| "completions/mean_length": 716.8, | |
| "completions/min_length": 658.2, | |
| "epoch": 0.5, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.006750498432666063, | |
| "kl": 0.0025351812597364186, | |
| "learning_rate": 2.7064483636808313e-05, | |
| "loss": 0.00016423141350969673, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 619.8, | |
| "completions/mean_length": 591.4, | |
| "completions/min_length": 563.0, | |
| "epoch": 0.51, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.006483216769993305, | |
| "kl": 0.0026113510597497226, | |
| "learning_rate": 2.623959386683056e-05, | |
| "loss": 0.00010410962859168649, | |
| "reward": 0.8, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00032043447718024256, | |
| "clip_ratio/high_mean": 0.00032043447718024256, | |
| "clip_ratio/low_mean": 0.00020130849443376065, | |
| "clip_ratio/low_min": 0.00020130849443376065, | |
| "clip_ratio/region_mean": 0.0005217429948970676, | |
| "completions/clipped_ratio": 0.3, | |
| "completions/max_length": 782.6, | |
| "completions/mean_length": 706.5, | |
| "completions/min_length": 630.4, | |
| "epoch": 0.52, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.003508440451696515, | |
| "kl": 0.001330986130051315, | |
| "learning_rate": 2.5413348619158967e-05, | |
| "loss": -4.660175181925297e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00013029315741732716, | |
| "clip_ratio/high_mean": 0.00013029315741732716, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00013029315741732716, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 604.4, | |
| "completions/mean_length": 536.1, | |
| "completions/min_length": 467.8, | |
| "epoch": 0.53, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.1873970776796341, | |
| "kl": 0.009700851677916945, | |
| "learning_rate": 2.458665138084104e-05, | |
| "loss": 0.00033162124454975126, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 544.8, | |
| "completions/mean_length": 517.5, | |
| "completions/min_length": 490.2, | |
| "epoch": 0.54, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0017767059616744518, | |
| "kl": 0.002807480387855321, | |
| "learning_rate": 2.3760406133169443e-05, | |
| "loss": 0.00011274998541921377, | |
| "reward": 0.8, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.00019474197179079056, | |
| "clip_ratio/low_min": 0.00019474197179079056, | |
| "clip_ratio/region_mean": 0.00019474197179079056, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 641.0, | |
| "completions/mean_length": 620.8, | |
| "completions/min_length": 600.6, | |
| "epoch": 0.55, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.08176784217357635, | |
| "kl": 0.0031962784822098913, | |
| "learning_rate": 2.2935516363191693e-05, | |
| "loss": 0.0002264779293909669, | |
| "reward": 0.6, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0006584362126886845, | |
| "clip_ratio/high_mean": 0.0006584362126886845, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0006584362126886845, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 638.0, | |
| "completions/mean_length": 562.6, | |
| "completions/min_length": 487.2, | |
| "epoch": 0.56, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.007495217490941286, | |
| "kl": 0.004752782918512821, | |
| "learning_rate": 2.2112884075760347e-05, | |
| "loss": 8.390162838622928e-05, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00011280316393822432, | |
| "clip_ratio/high_mean": 0.00011280316393822432, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00011280316393822432, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 717.4, | |
| "completions/mean_length": 650.2, | |
| "completions/min_length": 583.0, | |
| "epoch": 0.57, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.0028083010111004114, | |
| "kl": 0.005990609969012439, | |
| "learning_rate": 2.1293408807203947e-05, | |
| "loss": 0.000368604133836925, | |
| "reward": 0.4, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0001277955248951912, | |
| "clip_ratio/high_mean": 0.0001277955248951912, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0001277955248951912, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 503.0, | |
| "completions/mean_length": 472.3, | |
| "completions/min_length": 441.6, | |
| "epoch": 0.58, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.17317219078540802, | |
| "kl": 0.003116553882136941, | |
| "learning_rate": 2.047798664169726e-05, | |
| "loss": 2.7030031196773054e-05, | |
| "reward": 0.9, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.9, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00011068069143220783, | |
| "clip_ratio/high_mean": 0.00011068069143220783, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00011068069143220783, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 701.6, | |
| "completions/mean_length": 638.9, | |
| "completions/min_length": 576.2, | |
| "epoch": 0.59, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0019411866087466478, | |
| "kl": 0.0012191154062747955, | |
| "learning_rate": 1.9667509231406334e-05, | |
| "loss": -5.089085607323795e-06, | |
| "reward": 0.3, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.3, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 588.6, | |
| "completions/mean_length": 567.2, | |
| "completions/min_length": 545.8, | |
| "epoch": 0.6, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0071156201884150505, | |
| "kl": 0.003254280146211386, | |
| "learning_rate": 1.8862862821480025e-05, | |
| "loss": 0.00012772842310369015, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00033927056938409805, | |
| "clip_ratio/high_mean": 0.00033927056938409805, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00033927056938409805, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 649.0, | |
| "completions/mean_length": 625.9, | |
| "completions/min_length": 602.8, | |
| "epoch": 0.61, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.003073514671996236, | |
| "kl": 0.002048709220252931, | |
| "learning_rate": 1.806492728095389e-05, | |
| "loss": 0.00016608801670372486, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 726.2, | |
| "completions/mean_length": 684.3, | |
| "completions/min_length": 642.4, | |
| "epoch": 0.62, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.002632194198668003, | |
| "kl": 0.004164765309542418, | |
| "learning_rate": 1.7274575140626318e-05, | |
| "loss": 0.0001662806374952197, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00016849199309945106, | |
| "clip_ratio/high_mean": 0.00016849199309945106, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00016849199309945106, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 581.6, | |
| "completions/mean_length": 551.1, | |
| "completions/min_length": 520.6, | |
| "epoch": 0.63, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.004793096799403429, | |
| "kl": 0.006928782898467034, | |
| "learning_rate": 1.6492670638958924e-05, | |
| "loss": 0.0002944141859188676, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0006410256493836642, | |
| "clip_ratio/high_mean": 0.0006410256493836642, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0006410256493836642, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 583.8, | |
| "completions/mean_length": 559.5, | |
| "completions/min_length": 535.2, | |
| "epoch": 0.64, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.009927814826369286, | |
| "kl": 0.005139771406538785, | |
| "learning_rate": 1.5720068777044476e-05, | |
| "loss": 2.336390898562968e-05, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 473.4, | |
| "completions/mean_length": 433.2, | |
| "completions/min_length": 393.0, | |
| "epoch": 0.65, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.009365738369524479, | |
| "kl": 0.010789648251375183, | |
| "learning_rate": 1.495761438367577e-05, | |
| "loss": 0.0004745126701891422, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4, | |
| "completions/max_length": 802.6, | |
| "completions/mean_length": 755.4, | |
| "completions/min_length": 708.2, | |
| "epoch": 0.66, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.004060312174260616, | |
| "kl": 0.0012661131797358394, | |
| "learning_rate": 1.4206141191537682e-05, | |
| "loss": 5.287175299599767e-05, | |
| "reward": 0.2, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.2, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 697.0, | |
| "completions/mean_length": 654.7, | |
| "completions/min_length": 612.4, | |
| "epoch": 0.67, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.003583623794838786, | |
| "kl": 0.0017941199708729982, | |
| "learning_rate": 1.346647092553281e-05, | |
| "loss": 0.00011949921026825905, | |
| "reward": 0.9, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.9, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00012666244292631746, | |
| "clip_ratio/high_mean": 0.00012666244292631746, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00012666244292631746, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 601.2, | |
| "completions/mean_length": 529.5, | |
| "completions/min_length": 457.8, | |
| "epoch": 0.68, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0036033187061548233, | |
| "kl": 0.001624487293884158, | |
| "learning_rate": 1.2739412404237306e-05, | |
| "loss": -2.4389610916841776e-05, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 498.8, | |
| "completions/mean_length": 485.2, | |
| "completions/min_length": 471.6, | |
| "epoch": 0.69, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.006147034000605345, | |
| "kl": 0.0014556913753040134, | |
| "learning_rate": 1.202576065546963e-05, | |
| "loss": 5.921515985392034e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0003992015961557627, | |
| "clip_ratio/high_mean": 0.0003992015961557627, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0003992015961557627, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 436.4, | |
| "completions/mean_length": 421.5, | |
| "completions/min_length": 406.6, | |
| "epoch": 0.7, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.006787001620978117, | |
| "kl": 0.0027022200636565687, | |
| "learning_rate": 1.1326296046939333e-05, | |
| "loss": 0.0002472905209288001, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 427.6, | |
| "completions/mean_length": 404.0, | |
| "completions/min_length": 380.4, | |
| "epoch": 0.71, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0022182271350175142, | |
| "kl": 0.002075655141379684, | |
| "learning_rate": 1.064178343292641e-05, | |
| "loss": 8.377792546525598e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0003294892841950059, | |
| "clip_ratio/high_mean": 0.0003294892841950059, | |
| "clip_ratio/low_mean": 0.00022050717379897832, | |
| "clip_ratio/low_min": 0.00022050717379897832, | |
| "clip_ratio/region_mean": 0.0005499964579939842, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 527.0, | |
| "completions/mean_length": 431.4, | |
| "completions/min_length": 335.8, | |
| "epoch": 0.72, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.005938891787081957, | |
| "kl": 0.003542816312983632, | |
| "learning_rate": 9.972971317924374e-06, | |
| "loss": -4.2559945723041895e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 753.0, | |
| "completions/mean_length": 721.9, | |
| "completions/min_length": 690.8, | |
| "epoch": 0.73, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0016718930564820766, | |
| "kl": 0.0013213358353823424, | |
| "learning_rate": 9.320591038161574e-06, | |
| "loss": 5.248577799648047e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0006450645858421921, | |
| "clip_ratio/high_mean": 0.0006450645858421921, | |
| "clip_ratio/low_mean": 0.0001163467182777822, | |
| "clip_ratio/low_min": 0.0001163467182777822, | |
| "clip_ratio/region_mean": 0.0007614112924784422, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 653.6, | |
| "completions/mean_length": 601.9, | |
| "completions/min_length": 550.2, | |
| "epoch": 0.74, | |
| "frac_reward_zero_std": 0.4, | |
| "grad_norm": 0.10866400599479675, | |
| "kl": 0.001310768094845116, | |
| "learning_rate": 8.685355961895784e-06, | |
| "loss": -0.0001609708764590323, | |
| "reward": 0.5, | |
| "reward_std": 0.42426406145095824, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.42426406145095824, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0003552397945895791, | |
| "clip_ratio/high_mean": 0.0003552397945895791, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0003552397945895791, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 585.6, | |
| "completions/mean_length": 557.0, | |
| "completions/min_length": 528.4, | |
| "epoch": 0.75, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.007570336107164621, | |
| "kl": 0.0019031562842428684, | |
| "learning_rate": 8.067960709356478e-06, | |
| "loss": 0.00010980634251609445, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 827.2, | |
| "completions/mean_length": 776.1, | |
| "completions/min_length": 725.0, | |
| "epoch": 0.76, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0032681154552847147, | |
| "kl": 0.0015106519451364875, | |
| "learning_rate": 7.469080393187786e-06, | |
| "loss": 6.0344923986122013e-05, | |
| "reward": 0.2, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.2, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 567.6, | |
| "completions/mean_length": 527.8, | |
| "completions/min_length": 488.0, | |
| "epoch": 0.77, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0029238348361104727, | |
| "kl": 0.001659035962074995, | |
| "learning_rate": 6.889369880222776e-06, | |
| "loss": 6.735894712619483e-05, | |
| "reward": 0.8, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 665.2, | |
| "completions/mean_length": 576.4, | |
| "completions/min_length": 487.6, | |
| "epoch": 0.78, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0015165195800364017, | |
| "kl": 0.0015694351401180028, | |
| "learning_rate": 6.329463075396161e-06, | |
| "loss": 6.140409386716783e-05, | |
| "reward": 0.4, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00020020019728690387, | |
| "clip_ratio/high_mean": 0.00020020019728690387, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00020020019728690387, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 624.0, | |
| "completions/mean_length": 603.8, | |
| "completions/min_length": 583.6, | |
| "epoch": 0.79, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.00223752879537642, | |
| "kl": 0.0014426506008021534, | |
| "learning_rate": 5.78997222857853e-06, | |
| "loss": 9.092516265809535e-05, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0008895917097106576, | |
| "clip_ratio/high_mean": 0.0008895917097106576, | |
| "clip_ratio/low_mean": 0.0002871500328183174, | |
| "clip_ratio/low_min": 0.0002871500328183174, | |
| "clip_ratio/region_mean": 0.001176741742528975, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 674.0, | |
| "completions/mean_length": 605.1, | |
| "completions/min_length": 536.2, | |
| "epoch": 0.8, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.006541391368955374, | |
| "kl": 0.002322370233014226, | |
| "learning_rate": 5.271487265090163e-06, | |
| "loss": 0.00022997541818767787, | |
| "reward": 0.4, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 405.2, | |
| "completions/mean_length": 385.8, | |
| "completions/min_length": 366.4, | |
| "epoch": 0.81, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.006315944250673056, | |
| "kl": 0.002596192993223667, | |
| "learning_rate": 4.7745751406263165e-06, | |
| "loss": 0.00014588373014703392, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 631.6, | |
| "completions/mean_length": 595.8, | |
| "completions/min_length": 560.0, | |
| "epoch": 0.82, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0020425012335181236, | |
| "kl": 0.0014671742217615246, | |
| "learning_rate": 4.299779221299499e-06, | |
| "loss": 5.95603312831372e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00021141648758202792, | |
| "clip_ratio/high_mean": 0.00021141648758202792, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00021141648758202792, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 426.2, | |
| "completions/mean_length": 407.7, | |
| "completions/min_length": 389.2, | |
| "epoch": 0.83, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.003026613499969244, | |
| "kl": 0.0029249578481540086, | |
| "learning_rate": 3.847618689476612e-06, | |
| "loss": -3.8415665039792656e-05, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 556.2, | |
| "completions/mean_length": 511.3, | |
| "completions/min_length": 466.4, | |
| "epoch": 0.84, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0027313604950904846, | |
| "kl": 0.00170407232362777, | |
| "learning_rate": 3.418587976060653e-06, | |
| "loss": 6.951598916202784e-05, | |
| "reward": 0.2, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.2, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00044247787445783615, | |
| "clip_ratio/high_mean": 0.00044247787445783615, | |
| "clip_ratio/low_mean": 0.00044247787445783615, | |
| "clip_ratio/low_min": 0.00044247787445783615, | |
| "clip_ratio/region_mean": 0.0008849557489156723, | |
| "completions/clipped_ratio": 0.3, | |
| "completions/max_length": 719.6, | |
| "completions/mean_length": 680.4, | |
| "completions/min_length": 641.2, | |
| "epoch": 0.85, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0022066642995923758, | |
| "kl": 0.004893560777418315, | |
| "learning_rate": 3.013156219837776e-06, | |
| "loss": 0.00014771391870453953, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 684.0, | |
| "completions/mean_length": 615.6, | |
| "completions/min_length": 547.2, | |
| "epoch": 0.86, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.004402415361255407, | |
| "kl": 0.0030957374721765516, | |
| "learning_rate": 2.6317667544809134e-06, | |
| "loss": 0.00012539406307041646, | |
| "reward": 0.8, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 491.6, | |
| "completions/mean_length": 429.4, | |
| "completions/min_length": 367.2, | |
| "epoch": 0.87, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0023908629082143307, | |
| "kl": 0.005757934390567243, | |
| "learning_rate": 2.2748366237709374e-06, | |
| "loss": 0.00023677514400333166, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.4, | |
| "completions/max_length": 941.0, | |
| "completions/mean_length": 845.1, | |
| "completions/min_length": 749.2, | |
| "epoch": 0.88, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0024610680993646383, | |
| "kl": 0.0023990799207240345, | |
| "learning_rate": 1.9427561255653816e-06, | |
| "loss": 8.56145576108247e-05, | |
| "reward": 0.1, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.1, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0003554502269253135, | |
| "clip_ratio/high_mean": 0.0003554502269253135, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0003554502269253135, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 717.8, | |
| "completions/mean_length": 628.7, | |
| "completions/min_length": 539.6, | |
| "epoch": 0.89, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0033430650364607573, | |
| "kl": 0.00213278106530197, | |
| "learning_rate": 1.6358883850134816e-06, | |
| "loss": 7.869623950682581e-05, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00014947683084756135, | |
| "clip_ratio/high_mean": 0.00014947683084756135, | |
| "clip_ratio/low_mean": 0.00014947683084756135, | |
| "clip_ratio/low_min": 0.00014947683084756135, | |
| "clip_ratio/region_mean": 0.0002989536616951227, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 561.2, | |
| "completions/mean_length": 541.5, | |
| "completions/min_length": 521.8, | |
| "epoch": 0.9, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.00285865506157279, | |
| "kl": 0.0031095960177481173, | |
| "learning_rate": 1.3545689574841342e-06, | |
| "loss": 0.00018892092630267142, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00017857142956927418, | |
| "clip_ratio/high_mean": 0.00017857142956927418, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00017857142956927418, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 673.4, | |
| "completions/mean_length": 612.5, | |
| "completions/min_length": 551.6, | |
| "epoch": 0.91, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.004231106955558062, | |
| "kl": 0.001816297578625381, | |
| "learning_rate": 1.0991054616410589e-06, | |
| "loss": 2.4761457461863755e-05, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 774.4, | |
| "completions/mean_length": 719.6, | |
| "completions/min_length": 664.8, | |
| "epoch": 0.92, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.0025418533477932215, | |
| "kl": 0.001939354185014963, | |
| "learning_rate": 8.697772430662859e-07, | |
| "loss": 7.737103151157498e-05, | |
| "reward": 0.4, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 511.0, | |
| "completions/mean_length": 471.9, | |
| "completions/min_length": 432.8, | |
| "epoch": 0.93, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.009396975859999657, | |
| "kl": 0.0025742474826984107, | |
| "learning_rate": 6.668350687998565e-07, | |
| "loss": 0.00010292576625943184, | |
| "reward": 0.8, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.000551610765978694, | |
| "clip_ratio/high_mean": 0.000551610765978694, | |
| "clip_ratio/low_mean": 9.881423320621253e-05, | |
| "clip_ratio/low_min": 9.881423320621253e-05, | |
| "clip_ratio/region_mean": 0.0006504249759018421, | |
| "completions/clipped_ratio": 0.2, | |
| "completions/max_length": 705.0, | |
| "completions/mean_length": 675.6, | |
| "completions/min_length": 646.2, | |
| "epoch": 0.94, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.1849575787782669, | |
| "kl": 0.0012461108970455825, | |
| "learning_rate": 4.905008531297661e-07, | |
| "loss": 3.4519674954935907e-06, | |
| "reward": 0.6, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0001932367100380361, | |
| "clip_ratio/low_min": 0.0001932367100380361, | |
| "clip_ratio/region_mean": 0.0001932367100380361, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 704.2, | |
| "completions/mean_length": 656.1, | |
| "completions/min_length": 608.0, | |
| "epoch": 0.95, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.17005588114261627, | |
| "kl": 0.0020033617503941057, | |
| "learning_rate": 3.4096741493194197e-07, | |
| "loss": 0.00018819719552993775, | |
| "reward": 0.4, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.4, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 394.0, | |
| "completions/mean_length": 366.8, | |
| "completions/min_length": 339.6, | |
| "epoch": 0.96, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.0033463104628026485, | |
| "kl": 0.0017193612293340266, | |
| "learning_rate": 2.1839826682562015e-07, | |
| "loss": 0.00022480501793324946, | |
| "reward": 0.5, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.5, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0005300353281199932, | |
| "clip_ratio/high_mean": 0.0005300353281199932, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0005300353281199932, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 701.2, | |
| "completions/mean_length": 642.3, | |
| "completions/min_length": 583.4, | |
| "epoch": 0.97, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.17250776290893555, | |
| "kl": 0.0015719524584710599, | |
| "learning_rate": 1.229274363747146e-07, | |
| "loss": 1.5123013872653246e-05, | |
| "reward": 0.8, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00020439447835087776, | |
| "clip_ratio/high_mean": 0.00020439447835087776, | |
| "clip_ratio/low_mean": 0.00025062656495720146, | |
| "clip_ratio/low_min": 0.00025062656495720146, | |
| "clip_ratio/region_mean": 0.00045502104330807924, | |
| "completions/clipped_ratio": 0.1, | |
| "completions/max_length": 558.2, | |
| "completions/mean_length": 510.3, | |
| "completions/min_length": 462.4, | |
| "epoch": 0.98, | |
| "frac_reward_zero_std": 0.6, | |
| "grad_norm": 0.3301871418952942, | |
| "kl": 0.0023324352921918036, | |
| "learning_rate": 5.4659319530636633e-08, | |
| "loss": 0.00013293407391756773, | |
| "reward": 0.8, | |
| "reward_std": 0.2828427076339722, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.8, | |
| "rewards/MathAccuracy/std": 0.2828427076339722, | |
| "step": 490 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 636.6, | |
| "completions/mean_length": 614.3, | |
| "completions/min_length": 592.0, | |
| "epoch": 0.99, | |
| "frac_reward_zero_std": 1.0, | |
| "grad_norm": 0.005240611266344786, | |
| "kl": 0.002130005625076592, | |
| "learning_rate": 1.3668566476848777e-08, | |
| "loss": 8.560363785363734e-05, | |
| "reward": 0.6, | |
| "reward_std": 0.0, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.6, | |
| "rewards/MathAccuracy/std": 0.0, | |
| "step": 495 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0001826483989134431, | |
| "clip_ratio/high_mean": 0.0001826483989134431, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0001826483989134431, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 640.0, | |
| "completions/mean_length": 551.5, | |
| "completions/min_length": 463.0, | |
| "epoch": 1.0, | |
| "frac_reward_zero_std": 0.8, | |
| "grad_norm": 0.15998604893684387, | |
| "kl": 0.002341361262369901, | |
| "learning_rate": 0.0, | |
| "loss": 0.00012646716786548495, | |
| "reward": 0.7, | |
| "reward_std": 0.1414213538169861, | |
| "rewards/Format/mean": 0.0, | |
| "rewards/Format/std": 0.0, | |
| "rewards/MathAccuracy/mean": 0.7, | |
| "rewards/MathAccuracy/std": 0.1414213538169861, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 500, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |