Qwen2.5-7B-Instruct-GRPO-Math / trainer_state.json
FutureMa's picture
Upload GRPO fine-tuned Qwen2.5-7B-Instruct model
bc4cc58 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 50.0,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 389.0,
"completions/mean_length": 377.5,
"completions/min_length": 366.0,
"epoch": 0.002,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2745305299758911,
"kl": 0.0,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.0,
"reward": 0.5,
"reward_std": 0.7071067690849304,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.7071067690849304,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 712.75,
"completions/mean_length": 689.375,
"completions/min_length": 666.0,
"epoch": 0.01,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0006804907461628318,
"kl": 9.946800855686888e-05,
"learning_rate": 1e-05,
"loss": 4.000145054305904e-06,
"reward": 0.25,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.25,
"rewards/MathAccuracy/std": 0.0,
"step": 5
},
{
"clip_ratio/high_max": 0.0024464832618832587,
"clip_ratio/high_mean": 0.0024464832618832587,
"clip_ratio/low_mean": 0.0001640689093619585,
"clip_ratio/low_min": 0.0001640689093619585,
"clip_ratio/region_mean": 0.002610552171245217,
"completions/clipped_ratio": 0.2,
"completions/max_length": 497.2,
"completions/mean_length": 485.6,
"completions/min_length": 474.0,
"epoch": 0.02,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.22628173232078552,
"kl": 0.0002987155457958579,
"learning_rate": 2e-05,
"loss": -0.0003628176636993885,
"reward": 0.6,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 421.6,
"completions/mean_length": 390.2,
"completions/min_length": 358.8,
"epoch": 0.03,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0014656687853857875,
"kl": 0.00022319573326967657,
"learning_rate": 3e-05,
"loss": -1.4230319357011467e-05,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 486.0,
"completions/mean_length": 444.9,
"completions/min_length": 403.8,
"epoch": 0.04,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0009142484632320702,
"kl": 0.0004241452901624143,
"learning_rate": 4e-05,
"loss": 1.7114212096203118e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0005540780373848974,
"clip_ratio/high_mean": 0.0005540780373848974,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005540780373848974,
"completions/clipped_ratio": 0.2,
"completions/max_length": 787.4,
"completions/mean_length": 738.5,
"completions/min_length": 689.6,
"epoch": 0.05,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.0029386563692241907,
"kl": 0.0017149186198366806,
"learning_rate": 5e-05,
"loss": 8.679315214976668e-05,
"reward": 0.4,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 25
},
{
"clip_ratio/high_max": 0.0006403414998203516,
"clip_ratio/high_mean": 0.0006403414998203516,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006403414998203516,
"completions/clipped_ratio": 0.1,
"completions/max_length": 647.6,
"completions/mean_length": 566.2,
"completions/min_length": 484.8,
"epoch": 0.06,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0006255562184378505,
"kl": 0.0003066264180233702,
"learning_rate": 4.9986331433523156e-05,
"loss": 1.4322872448246927e-05,
"reward": 0.3,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.3,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3,
"completions/max_length": 686.4,
"completions/mean_length": 664.7,
"completions/min_length": 643.0,
"epoch": 0.07,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0014924455899745226,
"kl": 0.00036709415726363657,
"learning_rate": 4.994534068046937e-05,
"loss": 1.4717187150381506e-05,
"reward": 0.4,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.0,
"step": 35
},
{
"clip_ratio/high_max": 0.00040349699556827543,
"clip_ratio/high_mean": 0.00040349699556827543,
"clip_ratio/low_mean": 0.00040349699556827543,
"clip_ratio/low_min": 0.00040349699556827543,
"clip_ratio/region_mean": 0.0008069939911365509,
"completions/clipped_ratio": 0.1,
"completions/max_length": 681.6,
"completions/mean_length": 587.4,
"completions/min_length": 493.2,
"epoch": 0.08,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0019065124215558171,
"kl": 0.0004894518526270986,
"learning_rate": 4.9877072563625285e-05,
"loss": -0.00015065595507621766,
"reward": 0.3,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.3,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2,
"completions/max_length": 677.2,
"completions/mean_length": 661.8,
"completions/min_length": 646.4,
"epoch": 0.09,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.000817921943962574,
"kl": 0.0003075484826695174,
"learning_rate": 4.978160173317438e-05,
"loss": 1.24339887406677e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 656.8,
"completions/mean_length": 621.3,
"completions/min_length": 585.8,
"epoch": 0.1,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.15634514391422272,
"kl": 0.000459137320285663,
"learning_rate": 4.965903258506806e-05,
"loss": 2.121384022757411e-06,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4,
"completions/max_length": 677.0,
"completions/mean_length": 671.2,
"completions/min_length": 665.4,
"epoch": 0.11,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0011203172616660595,
"kl": 0.0012636209139600396,
"learning_rate": 4.9509499146870236e-05,
"loss": 4.886850947514176e-05,
"reward": 0.2,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.2,
"rewards/MathAccuracy/std": 0.0,
"step": 55
},
{
"clip_ratio/high_max": 0.00022246940061450006,
"clip_ratio/high_mean": 0.00022246940061450006,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00022246940061450006,
"completions/clipped_ratio": 0.0,
"completions/max_length": 418.2,
"completions/mean_length": 400.8,
"completions/min_length": 383.4,
"epoch": 0.12,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0010890079429373145,
"kl": 0.0009534806886222214,
"learning_rate": 4.933316493120015e-05,
"loss": 2.114146773237735e-05,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3,
"completions/max_length": 681.6,
"completions/mean_length": 648.9,
"completions/min_length": 616.2,
"epoch": 0.13,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0018795138457790017,
"kl": 0.0005977108958177269,
"learning_rate": 4.913022275693372e-05,
"loss": 2.4121845490299167e-05,
"reward": 0.4,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.0,
"step": 65
},
{
"clip_ratio/high_max": 0.0009542598738335073,
"clip_ratio/high_mean": 0.0009542598738335073,
"clip_ratio/low_mean": 0.00020920501556247472,
"clip_ratio/low_min": 0.00020920501556247472,
"clip_ratio/region_mean": 0.0011634649126790464,
"completions/clipped_ratio": 0.0,
"completions/max_length": 435.8,
"completions/mean_length": 412.4,
"completions/min_length": 389.0,
"epoch": 0.14,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.25056585669517517,
"kl": 0.0006330947682727129,
"learning_rate": 4.8900894538358944e-05,
"loss": -0.00019633164629340172,
"reward": 0.4,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 611.0,
"completions/mean_length": 575.0,
"completions/min_length": 539.0,
"epoch": 0.15,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002435472793877125,
"kl": 0.0007646598271094263,
"learning_rate": 4.864543104251587e-05,
"loss": 3.094758721999824e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 75
},
{
"clip_ratio/high_max": 0.00010515246540307999,
"clip_ratio/high_mean": 0.00010515246540307999,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00010515246540307999,
"completions/clipped_ratio": 0.1,
"completions/max_length": 668.6,
"completions/mean_length": 606.7,
"completions/min_length": 544.8,
"epoch": 0.16,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.11095567792654037,
"kl": 0.0007162548077758402,
"learning_rate": 4.8364111614986527e-05,
"loss": 7.679397240281104e-05,
"reward": 0.3,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.3,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 568.8,
"completions/mean_length": 527.3,
"completions/min_length": 485.8,
"epoch": 0.17,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0033143432810902596,
"kl": 0.0006754565751180053,
"learning_rate": 4.805724387443462e-05,
"loss": 4.5427383156493306e-05,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 85
},
{
"clip_ratio/high_max": 0.0003456221194937825,
"clip_ratio/high_mean": 0.0003456221194937825,
"clip_ratio/low_mean": 0.00011520737316459418,
"clip_ratio/low_min": 0.00011520737316459418,
"clip_ratio/region_mean": 0.0004608294926583767,
"completions/clipped_ratio": 0.2,
"completions/max_length": 831.6,
"completions/mean_length": 796.0,
"completions/min_length": 760.4,
"epoch": 0.18,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0037328507751226425,
"kl": 0.0010223451943602413,
"learning_rate": 4.7725163376229064e-05,
"loss": 4.158227238804102e-05,
"reward": 0.3,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.3,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 497.4,
"completions/mean_length": 441.2,
"completions/min_length": 385.0,
"epoch": 0.19,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.002750825835391879,
"kl": 0.0019137584429699927,
"learning_rate": 4.736823324551909e-05,
"loss": 0.00014175053220242262,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 95
},
{
"clip_ratio/high_max": 0.0005050505045801401,
"clip_ratio/high_mean": 0.0005050505045801401,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005050505045801401,
"completions/clipped_ratio": 0.0,
"completions/max_length": 508.2,
"completions/mean_length": 493.9,
"completions/min_length": 479.6,
"epoch": 0.2,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.4193066656589508,
"kl": 0.0017093931266572327,
"learning_rate": 4.698684378016222e-05,
"loss": 0.00012627228861674666,
"reward": 0.8,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 360.8,
"completions/mean_length": 341.7,
"completions/min_length": 322.6,
"epoch": 0.21,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.007469469215720892,
"kl": 0.003066345490515232,
"learning_rate": 4.6581412023939354e-05,
"loss": 0.00012305844575166702,
"reward": 1.0,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 1.0,
"rewards/MathAccuracy/std": 0.0,
"step": 105
},
{
"clip_ratio/high_max": 0.0001224739709869027,
"clip_ratio/high_mean": 0.0001224739709869027,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0001224739709869027,
"completions/clipped_ratio": 0.0,
"completions/max_length": 611.2,
"completions/mean_length": 572.1,
"completions/min_length": 533.0,
"epoch": 0.22,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.0033601748291403055,
"kl": 0.003409948293119669,
"learning_rate": 4.6152381310523387e-05,
"loss": 0.00023221683222800492,
"reward": 0.5,
"reward_std": 0.42426406145095824,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.42426406145095824,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 734.0,
"completions/mean_length": 692.2,
"completions/min_length": 650.4,
"epoch": 0.23,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0015698346542194486,
"kl": 0.0008670258859638125,
"learning_rate": 4.5700220778700504e-05,
"loss": 0.0001534310751594603,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 723.8,
"completions/mean_length": 682.2,
"completions/min_length": 640.6,
"epoch": 0.24,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.010230440646409988,
"kl": 0.005100146430777386,
"learning_rate": 4.522542485937369e-05,
"loss": 0.00020869788713753223,
"reward": 0.4,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4,
"completions/max_length": 922.4,
"completions/mean_length": 872.0,
"completions/min_length": 821.6,
"epoch": 0.25,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002747906604781747,
"kl": 0.0014180985395796596,
"learning_rate": 4.4728512734909844e-05,
"loss": 5.677485605701804e-05,
"reward": 0.0,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.0,
"rewards/MathAccuracy/std": 0.0,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.8,
"completions/mean_length": 388.4,
"completions/min_length": 371.0,
"epoch": 0.26,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0014852522872388363,
"kl": 0.001983049605041742,
"learning_rate": 4.421002777142148e-05,
"loss": 7.918149349279701e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0001583531266078353,
"clip_ratio/low_min": 0.0001583531266078353,
"clip_ratio/region_mean": 0.0001583531266078353,
"completions/clipped_ratio": 0.0,
"completions/max_length": 458.4,
"completions/mean_length": 429.9,
"completions/min_length": 401.4,
"epoch": 0.27,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.01503363810479641,
"kl": 0.005801378504838794,
"learning_rate": 4.367053692460385e-05,
"loss": 0.0004215865395963192,
"reward": 0.6,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 645.2,
"completions/mean_length": 537.3,
"completions/min_length": 429.4,
"epoch": 0.28,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002116028917953372,
"kl": 0.005066322290804237,
"learning_rate": 4.311063011977723e-05,
"loss": 0.00020899884402751922,
"reward": 0.4,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 375.8,
"completions/mean_length": 354.6,
"completions/min_length": 333.4,
"epoch": 0.29,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006887642201036215,
"kl": 0.009827147470787168,
"learning_rate": 4.2530919606812216e-05,
"loss": 0.0003938500303775072,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 416.4,
"completions/mean_length": 366.7,
"completions/min_length": 317.0,
"epoch": 0.3,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.002672386122867465,
"kl": 0.008298561931587756,
"learning_rate": 4.193203929064353e-05,
"loss": 0.00047482880763709544,
"reward": 0.9,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.9,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 640.8,
"completions/mean_length": 594.7,
"completions/min_length": 548.6,
"epoch": 0.31,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0023163340520113707,
"kl": 0.0016974479891359805,
"learning_rate": 4.131464403810422e-05,
"loss": 6.728884764015675e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 381.2,
"completions/mean_length": 353.1,
"completions/min_length": 325.0,
"epoch": 0.32,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0056184823624789715,
"kl": 0.006532504153437912,
"learning_rate": 4.067940896183843e-05,
"loss": 0.0001474126009270549,
"reward": 0.9,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.9,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 499.4,
"completions/mean_length": 479.6,
"completions/min_length": 459.8,
"epoch": 0.33,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.007484205532819033,
"kl": 0.0038784807082265617,
"learning_rate": 4.002702868207563e-05,
"loss": 0.0001543789985589683,
"reward": 0.4,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.0,
"step": 165
},
{
"clip_ratio/high_max": 0.0006509372964501381,
"clip_ratio/high_mean": 0.0006509372964501381,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006509372964501381,
"completions/clipped_ratio": 0.1,
"completions/max_length": 658.2,
"completions/mean_length": 576.8,
"completions/min_length": 495.4,
"epoch": 0.34,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.004718282260000706,
"kl": 0.007649715105071664,
"learning_rate": 3.935821656707359e-05,
"loss": 0.0005094979424029589,
"reward": 0.8,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 587.4,
"completions/mean_length": 544.4,
"completions/min_length": 501.4,
"epoch": 0.35,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0019858903251588345,
"kl": 0.0035309843719005586,
"learning_rate": 3.867370395306068e-05,
"loss": 6.014038226567209e-05,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 639.6,
"completions/mean_length": 594.5,
"completions/min_length": 549.4,
"epoch": 0.36,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.015034261159598827,
"kl": 0.003863858920522034,
"learning_rate": 3.797423934453038e-05,
"loss": 0.0001626830198802054,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3,
"completions/max_length": 805.0,
"completions/mean_length": 710.5,
"completions/min_length": 616.0,
"epoch": 0.37,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0022431982215493917,
"kl": 0.0025493323453702034,
"learning_rate": 3.726058759576271e-05,
"loss": 0.0001080367248505354,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00020140986889600755,
"clip_ratio/low_min": 0.00020140986889600755,
"clip_ratio/region_mean": 0.00020140986889600755,
"completions/clipped_ratio": 0.0,
"completions/max_length": 384.8,
"completions/mean_length": 366.2,
"completions/min_length": 347.6,
"epoch": 0.38,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.019761426374316216,
"kl": 0.011293478566221893,
"learning_rate": 3.65335290744672e-05,
"loss": 0.0005690994672477246,
"reward": 0.9,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.9,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00012698412174358965,
"clip_ratio/low_min": 0.00012698412174358965,
"clip_ratio/region_mean": 0.00012698412174358965,
"completions/clipped_ratio": 0.2,
"completions/max_length": 703.6,
"completions/mean_length": 662.9,
"completions/min_length": 622.2,
"epoch": 0.39,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.002572572324424982,
"kl": 0.0015339702018536626,
"learning_rate": 3.579385880846232e-05,
"loss": 2.8236012440174817e-06,
"reward": 0.3,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.3,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2,
"completions/max_length": 643.2,
"completions/mean_length": 612.7,
"completions/min_length": 582.2,
"epoch": 0.4,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0031194768380373716,
"kl": 0.002493513422086835,
"learning_rate": 3.504238561632424e-05,
"loss": 9.978280868381262e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3,
"completions/max_length": 674.2,
"completions/mean_length": 638.7,
"completions/min_length": 603.2,
"epoch": 0.41,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0036480328999459743,
"kl": 0.0019606892135925593,
"learning_rate": 3.427993122295552e-05,
"loss": 7.743847672827541e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 632.4,
"completions/mean_length": 570.5,
"completions/min_length": 508.6,
"epoch": 0.42,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.00799116026610136,
"kl": 0.003256951330695301,
"learning_rate": 3.350732936104108e-05,
"loss": 0.00010458981851115822,
"reward": 0.3,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.3,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 315.6,
"completions/mean_length": 300.6,
"completions/min_length": 285.6,
"epoch": 0.43,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.005798510741442442,
"kl": 0.011395945539698004,
"learning_rate": 3.272542485937369e-05,
"loss": 0.00045509766787290575,
"reward": 0.8,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.0,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4,
"completions/max_length": 777.6,
"completions/mean_length": 756.6,
"completions/min_length": 735.6,
"epoch": 0.44,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.00223241513594985,
"kl": 0.0014477839809842407,
"learning_rate": 3.1935072719046115e-05,
"loss": 5.752563010901213e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 220
},
{
"clip_ratio/high_max": 0.00031645570416003463,
"clip_ratio/high_mean": 0.00031645570416003463,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00031645570416003463,
"completions/clipped_ratio": 0.0,
"completions/max_length": 443.8,
"completions/mean_length": 398.4,
"completions/min_length": 353.0,
"epoch": 0.45,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.0020589185878634453,
"kl": 0.0025043860776349904,
"learning_rate": 3.1137137178519985e-05,
"loss": 0.0001688675722107291,
"reward": 0.4,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 225
},
{
"clip_ratio/high_max": 9.955201530829073e-05,
"clip_ratio/high_mean": 9.955201530829073e-05,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 9.955201530829073e-05,
"completions/clipped_ratio": 0.3,
"completions/max_length": 747.4,
"completions/mean_length": 715.4,
"completions/min_length": 683.4,
"epoch": 0.46,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.002850248944014311,
"kl": 0.0052430763142183425,
"learning_rate": 3.0332490768593675e-05,
"loss": 0.00025521754287183285,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 230
},
{
"clip_ratio/high_max": 0.0004385964944958687,
"clip_ratio/high_mean": 0.0004385964944958687,
"clip_ratio/low_mean": 0.00021929824724793435,
"clip_ratio/low_min": 0.00021929824724793435,
"clip_ratio/region_mean": 0.000657894741743803,
"completions/clipped_ratio": 0.0,
"completions/max_length": 625.8,
"completions/mean_length": 551.9,
"completions/min_length": 478.0,
"epoch": 0.47,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.005838941317051649,
"kl": 0.002799734321888536,
"learning_rate": 2.952201335830275e-05,
"loss": 7.562300888821482e-05,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 235
},
{
"clip_ratio/high_max": 0.000272479560226202,
"clip_ratio/high_mean": 0.000272479560226202,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.000272479560226202,
"completions/clipped_ratio": 0.0,
"completions/max_length": 530.6,
"completions/mean_length": 480.4,
"completions/min_length": 430.2,
"epoch": 0.48,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.003879491239786148,
"kl": 0.0028692058520391585,
"learning_rate": 2.870659119279605e-05,
"loss": 0.0001448941882699728,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 482.6,
"completions/mean_length": 463.6,
"completions/min_length": 444.6,
"epoch": 0.49,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.008210963569581509,
"kl": 0.0075855673989281055,
"learning_rate": 2.788711592423966e-05,
"loss": 0.0003023243509232998,
"reward": 0.8,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.0,
"step": 245
},
{
"clip_ratio/high_max": 0.0002461538417264819,
"clip_ratio/high_mean": 0.0002461538417264819,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0002461538417264819,
"completions/clipped_ratio": 0.1,
"completions/max_length": 775.4,
"completions/mean_length": 716.8,
"completions/min_length": 658.2,
"epoch": 0.5,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.006750498432666063,
"kl": 0.0025351812597364186,
"learning_rate": 2.7064483636808313e-05,
"loss": 0.00016423141350969673,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2,
"completions/max_length": 619.8,
"completions/mean_length": 591.4,
"completions/min_length": 563.0,
"epoch": 0.51,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006483216769993305,
"kl": 0.0026113510597497226,
"learning_rate": 2.623959386683056e-05,
"loss": 0.00010410962859168649,
"reward": 0.8,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.0,
"step": 255
},
{
"clip_ratio/high_max": 0.00032043447718024256,
"clip_ratio/high_mean": 0.00032043447718024256,
"clip_ratio/low_mean": 0.00020130849443376065,
"clip_ratio/low_min": 0.00020130849443376065,
"clip_ratio/region_mean": 0.0005217429948970676,
"completions/clipped_ratio": 0.3,
"completions/max_length": 782.6,
"completions/mean_length": 706.5,
"completions/min_length": 630.4,
"epoch": 0.52,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.003508440451696515,
"kl": 0.001330986130051315,
"learning_rate": 2.5413348619158967e-05,
"loss": -4.660175181925297e-05,
"reward": 0.6,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 260
},
{
"clip_ratio/high_max": 0.00013029315741732716,
"clip_ratio/high_mean": 0.00013029315741732716,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00013029315741732716,
"completions/clipped_ratio": 0.1,
"completions/max_length": 604.4,
"completions/mean_length": 536.1,
"completions/min_length": 467.8,
"epoch": 0.53,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.1873970776796341,
"kl": 0.009700851677916945,
"learning_rate": 2.458665138084104e-05,
"loss": 0.00033162124454975126,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 544.8,
"completions/mean_length": 517.5,
"completions/min_length": 490.2,
"epoch": 0.54,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0017767059616744518,
"kl": 0.002807480387855321,
"learning_rate": 2.3760406133169443e-05,
"loss": 0.00011274998541921377,
"reward": 0.8,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.0,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.00019474197179079056,
"clip_ratio/low_min": 0.00019474197179079056,
"clip_ratio/region_mean": 0.00019474197179079056,
"completions/clipped_ratio": 0.1,
"completions/max_length": 641.0,
"completions/mean_length": 620.8,
"completions/min_length": 600.6,
"epoch": 0.55,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.08176784217357635,
"kl": 0.0031962784822098913,
"learning_rate": 2.2935516363191693e-05,
"loss": 0.0002264779293909669,
"reward": 0.6,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 275
},
{
"clip_ratio/high_max": 0.0006584362126886845,
"clip_ratio/high_mean": 0.0006584362126886845,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006584362126886845,
"completions/clipped_ratio": 0.1,
"completions/max_length": 638.0,
"completions/mean_length": 562.6,
"completions/min_length": 487.2,
"epoch": 0.56,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.007495217490941286,
"kl": 0.004752782918512821,
"learning_rate": 2.2112884075760347e-05,
"loss": 8.390162838622928e-05,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 280
},
{
"clip_ratio/high_max": 0.00011280316393822432,
"clip_ratio/high_mean": 0.00011280316393822432,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00011280316393822432,
"completions/clipped_ratio": 0.0,
"completions/max_length": 717.4,
"completions/mean_length": 650.2,
"completions/min_length": 583.0,
"epoch": 0.57,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.0028083010111004114,
"kl": 0.005990609969012439,
"learning_rate": 2.1293408807203947e-05,
"loss": 0.000368604133836925,
"reward": 0.4,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 285
},
{
"clip_ratio/high_max": 0.0001277955248951912,
"clip_ratio/high_mean": 0.0001277955248951912,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0001277955248951912,
"completions/clipped_ratio": 0.0,
"completions/max_length": 503.0,
"completions/mean_length": 472.3,
"completions/min_length": 441.6,
"epoch": 0.58,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.17317219078540802,
"kl": 0.003116553882136941,
"learning_rate": 2.047798664169726e-05,
"loss": 2.7030031196773054e-05,
"reward": 0.9,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.9,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 290
},
{
"clip_ratio/high_max": 0.00011068069143220783,
"clip_ratio/high_mean": 0.00011068069143220783,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00011068069143220783,
"completions/clipped_ratio": 0.0,
"completions/max_length": 701.6,
"completions/mean_length": 638.9,
"completions/min_length": 576.2,
"epoch": 0.59,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0019411866087466478,
"kl": 0.0012191154062747955,
"learning_rate": 1.9667509231406334e-05,
"loss": -5.089085607323795e-06,
"reward": 0.3,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.3,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 295
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2,
"completions/max_length": 588.6,
"completions/mean_length": 567.2,
"completions/min_length": 545.8,
"epoch": 0.6,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0071156201884150505,
"kl": 0.003254280146211386,
"learning_rate": 1.8862862821480025e-05,
"loss": 0.00012772842310369015,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 300
},
{
"clip_ratio/high_max": 0.00033927056938409805,
"clip_ratio/high_mean": 0.00033927056938409805,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00033927056938409805,
"completions/clipped_ratio": 0.2,
"completions/max_length": 649.0,
"completions/mean_length": 625.9,
"completions/min_length": 602.8,
"epoch": 0.61,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.003073514671996236,
"kl": 0.002048709220252931,
"learning_rate": 1.806492728095389e-05,
"loss": 0.00016608801670372486,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 726.2,
"completions/mean_length": 684.3,
"completions/min_length": 642.4,
"epoch": 0.62,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.002632194198668003,
"kl": 0.004164765309542418,
"learning_rate": 1.7274575140626318e-05,
"loss": 0.0001662806374952197,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 310
},
{
"clip_ratio/high_max": 0.00016849199309945106,
"clip_ratio/high_mean": 0.00016849199309945106,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00016849199309945106,
"completions/clipped_ratio": 0.0,
"completions/max_length": 581.6,
"completions/mean_length": 551.1,
"completions/min_length": 520.6,
"epoch": 0.63,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.004793096799403429,
"kl": 0.006928782898467034,
"learning_rate": 1.6492670638958924e-05,
"loss": 0.0002944141859188676,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 315
},
{
"clip_ratio/high_max": 0.0006410256493836642,
"clip_ratio/high_mean": 0.0006410256493836642,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0006410256493836642,
"completions/clipped_ratio": 0.2,
"completions/max_length": 583.8,
"completions/mean_length": 559.5,
"completions/min_length": 535.2,
"epoch": 0.64,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.009927814826369286,
"kl": 0.005139771406538785,
"learning_rate": 1.5720068777044476e-05,
"loss": 2.336390898562968e-05,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 473.4,
"completions/mean_length": 433.2,
"completions/min_length": 393.0,
"epoch": 0.65,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.009365738369524479,
"kl": 0.010789648251375183,
"learning_rate": 1.495761438367577e-05,
"loss": 0.0004745126701891422,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 325
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4,
"completions/max_length": 802.6,
"completions/mean_length": 755.4,
"completions/min_length": 708.2,
"epoch": 0.66,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004060312174260616,
"kl": 0.0012661131797358394,
"learning_rate": 1.4206141191537682e-05,
"loss": 5.287175299599767e-05,
"reward": 0.2,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.2,
"rewards/MathAccuracy/std": 0.0,
"step": 330
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 697.0,
"completions/mean_length": 654.7,
"completions/min_length": 612.4,
"epoch": 0.67,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.003583623794838786,
"kl": 0.0017941199708729982,
"learning_rate": 1.346647092553281e-05,
"loss": 0.00011949921026825905,
"reward": 0.9,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.9,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 335
},
{
"clip_ratio/high_max": 0.00012666244292631746,
"clip_ratio/high_mean": 0.00012666244292631746,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00012666244292631746,
"completions/clipped_ratio": 0.1,
"completions/max_length": 601.2,
"completions/mean_length": 529.5,
"completions/min_length": 457.8,
"epoch": 0.68,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0036033187061548233,
"kl": 0.001624487293884158,
"learning_rate": 1.2739412404237306e-05,
"loss": -2.4389610916841776e-05,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2,
"completions/max_length": 498.8,
"completions/mean_length": 485.2,
"completions/min_length": 471.6,
"epoch": 0.69,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.006147034000605345,
"kl": 0.0014556913753040134,
"learning_rate": 1.202576065546963e-05,
"loss": 5.921515985392034e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 345
},
{
"clip_ratio/high_max": 0.0003992015961557627,
"clip_ratio/high_mean": 0.0003992015961557627,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0003992015961557627,
"completions/clipped_ratio": 0.1,
"completions/max_length": 436.4,
"completions/mean_length": 421.5,
"completions/min_length": 406.6,
"epoch": 0.7,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.006787001620978117,
"kl": 0.0027022200636565687,
"learning_rate": 1.1326296046939333e-05,
"loss": 0.0002472905209288001,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 427.6,
"completions/mean_length": 404.0,
"completions/min_length": 380.4,
"epoch": 0.71,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0022182271350175142,
"kl": 0.002075655141379684,
"learning_rate": 1.064178343292641e-05,
"loss": 8.377792546525598e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 355
},
{
"clip_ratio/high_max": 0.0003294892841950059,
"clip_ratio/high_mean": 0.0003294892841950059,
"clip_ratio/low_mean": 0.00022050717379897832,
"clip_ratio/low_min": 0.00022050717379897832,
"clip_ratio/region_mean": 0.0005499964579939842,
"completions/clipped_ratio": 0.0,
"completions/max_length": 527.0,
"completions/mean_length": 431.4,
"completions/min_length": 335.8,
"epoch": 0.72,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.005938891787081957,
"kl": 0.003542816312983632,
"learning_rate": 9.972971317924374e-06,
"loss": -4.2559945723041895e-05,
"reward": 0.6,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2,
"completions/max_length": 753.0,
"completions/mean_length": 721.9,
"completions/min_length": 690.8,
"epoch": 0.73,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0016718930564820766,
"kl": 0.0013213358353823424,
"learning_rate": 9.320591038161574e-06,
"loss": 5.248577799648047e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 365
},
{
"clip_ratio/high_max": 0.0006450645858421921,
"clip_ratio/high_mean": 0.0006450645858421921,
"clip_ratio/low_mean": 0.0001163467182777822,
"clip_ratio/low_min": 0.0001163467182777822,
"clip_ratio/region_mean": 0.0007614112924784422,
"completions/clipped_ratio": 0.0,
"completions/max_length": 653.6,
"completions/mean_length": 601.9,
"completions/min_length": 550.2,
"epoch": 0.74,
"frac_reward_zero_std": 0.4,
"grad_norm": 0.10866400599479675,
"kl": 0.001310768094845116,
"learning_rate": 8.685355961895784e-06,
"loss": -0.0001609708764590323,
"reward": 0.5,
"reward_std": 0.42426406145095824,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.42426406145095824,
"step": 370
},
{
"clip_ratio/high_max": 0.0003552397945895791,
"clip_ratio/high_mean": 0.0003552397945895791,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0003552397945895791,
"completions/clipped_ratio": 0.2,
"completions/max_length": 585.6,
"completions/mean_length": 557.0,
"completions/min_length": 528.4,
"epoch": 0.75,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.007570336107164621,
"kl": 0.0019031562842428684,
"learning_rate": 8.067960709356478e-06,
"loss": 0.00010980634251609445,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 827.2,
"completions/mean_length": 776.1,
"completions/min_length": 725.0,
"epoch": 0.76,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0032681154552847147,
"kl": 0.0015106519451364875,
"learning_rate": 7.469080393187786e-06,
"loss": 6.0344923986122013e-05,
"reward": 0.2,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.2,
"rewards/MathAccuracy/std": 0.0,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 567.6,
"completions/mean_length": 527.8,
"completions/min_length": 488.0,
"epoch": 0.77,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0029238348361104727,
"kl": 0.001659035962074995,
"learning_rate": 6.889369880222776e-06,
"loss": 6.735894712619483e-05,
"reward": 0.8,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.0,
"step": 385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 665.2,
"completions/mean_length": 576.4,
"completions/min_length": 487.6,
"epoch": 0.78,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0015165195800364017,
"kl": 0.0015694351401180028,
"learning_rate": 6.329463075396161e-06,
"loss": 6.140409386716783e-05,
"reward": 0.4,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.0,
"step": 390
},
{
"clip_ratio/high_max": 0.00020020019728690387,
"clip_ratio/high_mean": 0.00020020019728690387,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00020020019728690387,
"completions/clipped_ratio": 0.1,
"completions/max_length": 624.0,
"completions/mean_length": 603.8,
"completions/min_length": 583.6,
"epoch": 0.79,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.00223752879537642,
"kl": 0.0014426506008021534,
"learning_rate": 5.78997222857853e-06,
"loss": 9.092516265809535e-05,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 395
},
{
"clip_ratio/high_max": 0.0008895917097106576,
"clip_ratio/high_mean": 0.0008895917097106576,
"clip_ratio/low_mean": 0.0002871500328183174,
"clip_ratio/low_min": 0.0002871500328183174,
"clip_ratio/region_mean": 0.001176741742528975,
"completions/clipped_ratio": 0.1,
"completions/max_length": 674.0,
"completions/mean_length": 605.1,
"completions/min_length": 536.2,
"epoch": 0.8,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.006541391368955374,
"kl": 0.002322370233014226,
"learning_rate": 5.271487265090163e-06,
"loss": 0.00022997541818767787,
"reward": 0.4,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 400
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 405.2,
"completions/mean_length": 385.8,
"completions/min_length": 366.4,
"epoch": 0.81,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.006315944250673056,
"kl": 0.002596192993223667,
"learning_rate": 4.7745751406263165e-06,
"loss": 0.00014588373014703392,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 405
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 631.6,
"completions/mean_length": 595.8,
"completions/min_length": 560.0,
"epoch": 0.82,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0020425012335181236,
"kl": 0.0014671742217615246,
"learning_rate": 4.299779221299499e-06,
"loss": 5.95603312831372e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 410
},
{
"clip_ratio/high_max": 0.00021141648758202792,
"clip_ratio/high_mean": 0.00021141648758202792,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00021141648758202792,
"completions/clipped_ratio": 0.0,
"completions/max_length": 426.2,
"completions/mean_length": 407.7,
"completions/min_length": 389.2,
"epoch": 0.83,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.003026613499969244,
"kl": 0.0029249578481540086,
"learning_rate": 3.847618689476612e-06,
"loss": -3.8415665039792656e-05,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 415
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 556.2,
"completions/mean_length": 511.3,
"completions/min_length": 466.4,
"epoch": 0.84,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0027313604950904846,
"kl": 0.00170407232362777,
"learning_rate": 3.418587976060653e-06,
"loss": 6.951598916202784e-05,
"reward": 0.2,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.2,
"rewards/MathAccuracy/std": 0.0,
"step": 420
},
{
"clip_ratio/high_max": 0.00044247787445783615,
"clip_ratio/high_mean": 0.00044247787445783615,
"clip_ratio/low_mean": 0.00044247787445783615,
"clip_ratio/low_min": 0.00044247787445783615,
"clip_ratio/region_mean": 0.0008849557489156723,
"completions/clipped_ratio": 0.3,
"completions/max_length": 719.6,
"completions/mean_length": 680.4,
"completions/min_length": 641.2,
"epoch": 0.85,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0022066642995923758,
"kl": 0.004893560777418315,
"learning_rate": 3.013156219837776e-06,
"loss": 0.00014771391870453953,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 425
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2,
"completions/max_length": 684.0,
"completions/mean_length": 615.6,
"completions/min_length": 547.2,
"epoch": 0.86,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.004402415361255407,
"kl": 0.0030957374721765516,
"learning_rate": 2.6317667544809134e-06,
"loss": 0.00012539406307041646,
"reward": 0.8,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.0,
"step": 430
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 491.6,
"completions/mean_length": 429.4,
"completions/min_length": 367.2,
"epoch": 0.87,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0023908629082143307,
"kl": 0.005757934390567243,
"learning_rate": 2.2748366237709374e-06,
"loss": 0.00023677514400333166,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 435
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.4,
"completions/max_length": 941.0,
"completions/mean_length": 845.1,
"completions/min_length": 749.2,
"epoch": 0.88,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0024610680993646383,
"kl": 0.0023990799207240345,
"learning_rate": 1.9427561255653816e-06,
"loss": 8.56145576108247e-05,
"reward": 0.1,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.1,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 440
},
{
"clip_ratio/high_max": 0.0003554502269253135,
"clip_ratio/high_mean": 0.0003554502269253135,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0003554502269253135,
"completions/clipped_ratio": 0.1,
"completions/max_length": 717.8,
"completions/mean_length": 628.7,
"completions/min_length": 539.6,
"epoch": 0.89,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0033430650364607573,
"kl": 0.00213278106530197,
"learning_rate": 1.6358883850134816e-06,
"loss": 7.869623950682581e-05,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 445
},
{
"clip_ratio/high_max": 0.00014947683084756135,
"clip_ratio/high_mean": 0.00014947683084756135,
"clip_ratio/low_mean": 0.00014947683084756135,
"clip_ratio/low_min": 0.00014947683084756135,
"clip_ratio/region_mean": 0.0002989536616951227,
"completions/clipped_ratio": 0.1,
"completions/max_length": 561.2,
"completions/mean_length": 541.5,
"completions/min_length": 521.8,
"epoch": 0.9,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.00285865506157279,
"kl": 0.0031095960177481173,
"learning_rate": 1.3545689574841342e-06,
"loss": 0.00018892092630267142,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 450
},
{
"clip_ratio/high_max": 0.00017857142956927418,
"clip_ratio/high_mean": 0.00017857142956927418,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.00017857142956927418,
"completions/clipped_ratio": 0.2,
"completions/max_length": 673.4,
"completions/mean_length": 612.5,
"completions/min_length": 551.6,
"epoch": 0.91,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.004231106955558062,
"kl": 0.001816297578625381,
"learning_rate": 1.0991054616410589e-06,
"loss": 2.4761457461863755e-05,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 455
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1,
"completions/max_length": 774.4,
"completions/mean_length": 719.6,
"completions/min_length": 664.8,
"epoch": 0.92,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.0025418533477932215,
"kl": 0.001939354185014963,
"learning_rate": 8.697772430662859e-07,
"loss": 7.737103151157498e-05,
"reward": 0.4,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.0,
"step": 460
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 511.0,
"completions/mean_length": 471.9,
"completions/min_length": 432.8,
"epoch": 0.93,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.009396975859999657,
"kl": 0.0025742474826984107,
"learning_rate": 6.668350687998565e-07,
"loss": 0.00010292576625943184,
"reward": 0.8,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.0,
"step": 465
},
{
"clip_ratio/high_max": 0.000551610765978694,
"clip_ratio/high_mean": 0.000551610765978694,
"clip_ratio/low_mean": 9.881423320621253e-05,
"clip_ratio/low_min": 9.881423320621253e-05,
"clip_ratio/region_mean": 0.0006504249759018421,
"completions/clipped_ratio": 0.2,
"completions/max_length": 705.0,
"completions/mean_length": 675.6,
"completions/min_length": 646.2,
"epoch": 0.94,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.1849575787782669,
"kl": 0.0012461108970455825,
"learning_rate": 4.905008531297661e-07,
"loss": 3.4519674954935907e-06,
"reward": 0.6,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 470
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0001932367100380361,
"clip_ratio/low_min": 0.0001932367100380361,
"clip_ratio/region_mean": 0.0001932367100380361,
"completions/clipped_ratio": 0.1,
"completions/max_length": 704.2,
"completions/mean_length": 656.1,
"completions/min_length": 608.0,
"epoch": 0.95,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.17005588114261627,
"kl": 0.0020033617503941057,
"learning_rate": 3.4096741493194197e-07,
"loss": 0.00018819719552993775,
"reward": 0.4,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.4,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 475
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 394.0,
"completions/mean_length": 366.8,
"completions/min_length": 339.6,
"epoch": 0.96,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.0033463104628026485,
"kl": 0.0017193612293340266,
"learning_rate": 2.1839826682562015e-07,
"loss": 0.00022480501793324946,
"reward": 0.5,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.5,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 480
},
{
"clip_ratio/high_max": 0.0005300353281199932,
"clip_ratio/high_mean": 0.0005300353281199932,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0005300353281199932,
"completions/clipped_ratio": 0.1,
"completions/max_length": 701.2,
"completions/mean_length": 642.3,
"completions/min_length": 583.4,
"epoch": 0.97,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.17250776290893555,
"kl": 0.0015719524584710599,
"learning_rate": 1.229274363747146e-07,
"loss": 1.5123013872653246e-05,
"reward": 0.8,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 485
},
{
"clip_ratio/high_max": 0.00020439447835087776,
"clip_ratio/high_mean": 0.00020439447835087776,
"clip_ratio/low_mean": 0.00025062656495720146,
"clip_ratio/low_min": 0.00025062656495720146,
"clip_ratio/region_mean": 0.00045502104330807924,
"completions/clipped_ratio": 0.1,
"completions/max_length": 558.2,
"completions/mean_length": 510.3,
"completions/min_length": 462.4,
"epoch": 0.98,
"frac_reward_zero_std": 0.6,
"grad_norm": 0.3301871418952942,
"kl": 0.0023324352921918036,
"learning_rate": 5.4659319530636633e-08,
"loss": 0.00013293407391756773,
"reward": 0.8,
"reward_std": 0.2828427076339722,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.8,
"rewards/MathAccuracy/std": 0.2828427076339722,
"step": 490
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 636.6,
"completions/mean_length": 614.3,
"completions/min_length": 592.0,
"epoch": 0.99,
"frac_reward_zero_std": 1.0,
"grad_norm": 0.005240611266344786,
"kl": 0.002130005625076592,
"learning_rate": 1.3668566476848777e-08,
"loss": 8.560363785363734e-05,
"reward": 0.6,
"reward_std": 0.0,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.6,
"rewards/MathAccuracy/std": 0.0,
"step": 495
},
{
"clip_ratio/high_max": 0.0001826483989134431,
"clip_ratio/high_mean": 0.0001826483989134431,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0001826483989134431,
"completions/clipped_ratio": 0.0,
"completions/max_length": 640.0,
"completions/mean_length": 551.5,
"completions/min_length": 463.0,
"epoch": 1.0,
"frac_reward_zero_std": 0.8,
"grad_norm": 0.15998604893684387,
"kl": 0.002341361262369901,
"learning_rate": 0.0,
"loss": 0.00012646716786548495,
"reward": 0.7,
"reward_std": 0.1414213538169861,
"rewards/Format/mean": 0.0,
"rewards/Format/std": 0.0,
"rewards/MathAccuracy/mean": 0.7,
"rewards/MathAccuracy/std": 0.1414213538169861,
"step": 500
}
],
"logging_steps": 5,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}