Qwen-2.5-Math-7B-Max-v3-accuracy / trainer_state.json
chenggong
Model save
193f56e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.9893390191897655,
"eval_steps": 60,
"global_step": 232,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 664.3515930175781,
"epoch": 0.017057569296375266,
"grad_norm": 0.11565207690000534,
"kl": 0.0,
"learning_rate": 5e-07,
"loss": 0.1249,
"reward": 0.8191964775323868,
"reward_std": 0.1755836745724082,
"rewards/accuracy_reward": 0.8191964775323868,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 661.427487373352,
"epoch": 0.08528784648187633,
"grad_norm": 0.1410389542579651,
"kl": 0.00010322034358978271,
"learning_rate": 5e-07,
"loss": 0.0815,
"reward": 0.7940848618745804,
"reward_std": 0.16921476647257805,
"rewards/accuracy_reward": 0.7940848618745804,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 648.45962600708,
"epoch": 0.17057569296375266,
"grad_norm": 0.09059495478868484,
"kl": 0.00012706518173217775,
"learning_rate": 5e-07,
"loss": 0.092,
"reward": 0.8165178954601288,
"reward_std": 0.1695016896352172,
"rewards/accuracy_reward": 0.8165178954601288,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 635.6861877441406,
"epoch": 0.255863539445629,
"grad_norm": 0.23655401170253754,
"kl": 0.0001537799835205078,
"learning_rate": 5e-07,
"loss": 0.1002,
"reward": 0.8232143238186836,
"reward_std": 0.17031898349523544,
"rewards/accuracy_reward": 0.8232143238186836,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 636.7087341308594,
"epoch": 0.3411513859275053,
"grad_norm": 0.15169048309326172,
"kl": 0.0003520965576171875,
"learning_rate": 5e-07,
"loss": 0.0965,
"reward": 0.8183036118745803,
"reward_std": 0.16691437950357796,
"rewards/accuracy_reward": 0.8183036118745803,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 660.6172164916992,
"epoch": 0.42643923240938164,
"grad_norm": 0.10549971461296082,
"kl": 0.00020837783813476562,
"learning_rate": 5e-07,
"loss": 0.0838,
"reward": 0.813392898440361,
"reward_std": 0.17468413366004826,
"rewards/accuracy_reward": 0.813392898440361,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 659.3672187805175,
"epoch": 0.511727078891258,
"grad_norm": 0.13681049644947052,
"kl": 0.00038820505142211914,
"learning_rate": 5e-07,
"loss": 0.0786,
"reward": 0.80357146859169,
"reward_std": 0.17490468453615904,
"rewards/accuracy_reward": 0.80357146859169,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 660.5917663574219,
"epoch": 0.5970149253731343,
"grad_norm": 0.09065572917461395,
"kl": 0.0004504680633544922,
"learning_rate": 5e-07,
"loss": 0.0824,
"reward": 0.8071428924798966,
"reward_std": 0.1621523329988122,
"rewards/accuracy_reward": 0.8071428924798966,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 638.1212364196778,
"epoch": 0.6823027718550106,
"grad_norm": 0.09261901676654816,
"kl": 0.0005172014236450196,
"learning_rate": 5e-07,
"loss": 0.0585,
"reward": 0.8138393208384513,
"reward_std": 0.1534264313057065,
"rewards/accuracy_reward": 0.8138393208384513,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 664.7774856567382,
"epoch": 0.767590618336887,
"grad_norm": 0.09141222387552261,
"kl": 0.00053253173828125,
"learning_rate": 5e-07,
"loss": 0.0687,
"reward": 0.8071428909897804,
"reward_std": 0.16072208830155432,
"rewards/accuracy_reward": 0.8071428909897804,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 662.5498062133789,
"epoch": 0.8528784648187633,
"grad_norm": 0.24577292799949646,
"kl": 0.0011467933654785156,
"learning_rate": 5e-07,
"loss": 0.0649,
"reward": 0.8042411059141159,
"reward_std": 0.16364638023078443,
"rewards/accuracy_reward": 0.8042411059141159,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 643.9475730895996,
"epoch": 0.9381663113006397,
"grad_norm": 0.10820304602384567,
"kl": 0.000740814208984375,
"learning_rate": 5e-07,
"loss": 0.0622,
"reward": 0.8261161133646965,
"reward_std": 0.15972621561959385,
"rewards/accuracy_reward": 0.8261161133646965,
"step": 55
},
{
"epoch": 1.0341151385927505,
"grad_norm": 0.1097937524318695,
"learning_rate": 5e-07,
"loss": 0.0662,
"step": 60
},
{
"epoch": 1.0341151385927505,
"eval_clip_ratio": 0.0,
"eval_completion_length": 638.2177686691284,
"eval_kl": 0.0012085437774658203,
"eval_loss": 0.027663394808769226,
"eval_reward": 0.7151227928698063,
"eval_reward_std": 0.2182473847642541,
"eval_rewards/accuracy_reward": 0.7151227928698063,
"eval_runtime": 835.396,
"eval_samples_per_second": 0.599,
"eval_steps_per_second": 0.006,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 630.2452568054199,
"epoch": 1.1194029850746268,
"grad_norm": 0.08139240741729736,
"kl": 0.0015056610107421875,
"learning_rate": 5e-07,
"loss": 0.0579,
"reward": 0.8170759312808513,
"reward_std": 0.16009651254862547,
"rewards/accuracy_reward": 0.8170759312808513,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 620.5440017700196,
"epoch": 1.2046908315565032,
"grad_norm": 0.10313019156455994,
"kl": 0.0016027450561523437,
"learning_rate": 5e-07,
"loss": 0.0586,
"reward": 0.8310268223285675,
"reward_std": 0.1424413041677326,
"rewards/accuracy_reward": 0.8310268223285675,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 649.887752532959,
"epoch": 1.2899786780383795,
"grad_norm": 0.09998168796300888,
"kl": 0.00717315673828125,
"learning_rate": 5e-07,
"loss": 0.0564,
"reward": 0.8100446775555611,
"reward_std": 0.1757219755090773,
"rewards/accuracy_reward": 0.8100446775555611,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 614.6263717651367,
"epoch": 1.375266524520256,
"grad_norm": 0.08961261808872223,
"kl": 0.0022918701171875,
"learning_rate": 5e-07,
"loss": 0.0376,
"reward": 0.8328125387430191,
"reward_std": 0.13861298179253936,
"rewards/accuracy_reward": 0.8328125387430191,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 631.008511352539,
"epoch": 1.4605543710021323,
"grad_norm": 0.1273442804813385,
"kl": 0.002947235107421875,
"learning_rate": 5e-07,
"loss": 0.0541,
"reward": 0.8229911118745804,
"reward_std": 0.14886255729943515,
"rewards/accuracy_reward": 0.8229911118745804,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 646.872346496582,
"epoch": 1.5458422174840085,
"grad_norm": 0.15443700551986694,
"kl": 0.0033596038818359377,
"learning_rate": 5e-07,
"loss": 0.0595,
"reward": 0.809821467101574,
"reward_std": 0.15138995712623,
"rewards/accuracy_reward": 0.809821467101574,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 631.2808311462402,
"epoch": 1.6311300639658848,
"grad_norm": 0.09066915512084961,
"kl": 0.004022216796875,
"learning_rate": 5e-07,
"loss": 0.0418,
"reward": 0.8258928924798965,
"reward_std": 0.1533732468262315,
"rewards/accuracy_reward": 0.8258928924798965,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 627.2167701721191,
"epoch": 1.716417910447761,
"grad_norm": 0.10236337780952454,
"kl": 0.011969375610351562,
"learning_rate": 5e-07,
"loss": 0.0372,
"reward": 0.8267857551574707,
"reward_std": 0.13705341126769782,
"rewards/accuracy_reward": 0.8267857551574707,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 643.0727920532227,
"epoch": 1.8017057569296375,
"grad_norm": 0.09229780733585358,
"kl": 0.00559539794921875,
"learning_rate": 5e-07,
"loss": 0.0289,
"reward": 0.8116071805357933,
"reward_std": 0.147033178107813,
"rewards/accuracy_reward": 0.8116071805357933,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 634.0770401000976,
"epoch": 1.886993603411514,
"grad_norm": 0.1279992163181305,
"kl": 0.006862640380859375,
"learning_rate": 5e-07,
"loss": 0.0364,
"reward": 0.8312500357627869,
"reward_std": 0.14459644490852952,
"rewards/accuracy_reward": 0.8312500357627869,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 643.7125259399414,
"epoch": 1.9722814498933903,
"grad_norm": 0.12527693808078766,
"kl": 0.00738983154296875,
"learning_rate": 5e-07,
"loss": 0.0394,
"reward": 0.8138393253087998,
"reward_std": 0.15650860401801764,
"rewards/accuracy_reward": 0.8138393253087998,
"step": 115
},
{
"epoch": 2.068230277185501,
"grad_norm": 0.13853299617767334,
"learning_rate": 5e-07,
"loss": 0.0354,
"step": 120
},
{
"epoch": 2.068230277185501,
"eval_clip_ratio": 0.0,
"eval_completion_length": 627.0459775924683,
"eval_kl": 0.009876251220703125,
"eval_loss": 0.023924430832266808,
"eval_reward": 0.7343750353902578,
"eval_reward_std": 0.19236661097966135,
"eval_rewards/accuracy_reward": 0.7343750353902578,
"eval_runtime": 697.2301,
"eval_samples_per_second": 0.717,
"eval_steps_per_second": 0.007,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 621.6253170013428,
"epoch": 2.1535181236673773,
"grad_norm": 0.11815498024225235,
"kl": 0.01000518798828125,
"learning_rate": 5e-07,
"loss": 0.0358,
"reward": 0.8255580753087998,
"reward_std": 0.14198732506483794,
"rewards/accuracy_reward": 0.8255580753087998,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 621.314752960205,
"epoch": 2.2388059701492535,
"grad_norm": 0.113522969186306,
"kl": 0.0125152587890625,
"learning_rate": 5e-07,
"loss": 0.0269,
"reward": 0.8386161178350449,
"reward_std": 0.14197837365791202,
"rewards/accuracy_reward": 0.8386161178350449,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 642.9511466979981,
"epoch": 2.3240938166311302,
"grad_norm": 0.14222967624664307,
"kl": 0.01525726318359375,
"learning_rate": 5e-07,
"loss": 0.0476,
"reward": 0.7872768193483353,
"reward_std": 0.14514056108891965,
"rewards/accuracy_reward": 0.7872768193483353,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 628.785961151123,
"epoch": 2.4093816631130065,
"grad_norm": 0.13704024255275726,
"kl": 0.01926116943359375,
"learning_rate": 5e-07,
"loss": 0.0403,
"reward": 0.8256696745753288,
"reward_std": 0.14226720854640007,
"rewards/accuracy_reward": 0.8256696745753288,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 614.7006958007812,
"epoch": 2.4946695095948828,
"grad_norm": 0.19874536991119385,
"kl": 0.0266326904296875,
"learning_rate": 5e-07,
"loss": 0.0278,
"reward": 0.8165178939700126,
"reward_std": 0.16517118187621235,
"rewards/accuracy_reward": 0.8165178939700126,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 649.61431350708,
"epoch": 2.579957356076759,
"grad_norm": 0.40368160605430603,
"kl": 0.0365936279296875,
"learning_rate": 5e-07,
"loss": 0.0341,
"reward": 0.7767857447266578,
"reward_std": 0.1682931227609515,
"rewards/accuracy_reward": 0.7767857447266578,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 620.872575378418,
"epoch": 2.6652452025586353,
"grad_norm": 0.37761253118515015,
"kl": 0.049951171875,
"learning_rate": 5e-07,
"loss": 0.0415,
"reward": 0.7785714641213417,
"reward_std": 0.19512954521924258,
"rewards/accuracy_reward": 0.7785714641213417,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 617.9544944763184,
"epoch": 2.750533049040512,
"grad_norm": 0.44903331995010376,
"kl": 0.0691650390625,
"learning_rate": 5e-07,
"loss": 0.0422,
"reward": 0.7671875387430191,
"reward_std": 0.19579849690198897,
"rewards/accuracy_reward": 0.7671875387430191,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 616.282169342041,
"epoch": 2.835820895522388,
"grad_norm": 0.7222861647605896,
"kl": 0.10626220703125,
"learning_rate": 5e-07,
"loss": 0.0487,
"reward": 0.7156250298023223,
"reward_std": 0.2289330180734396,
"rewards/accuracy_reward": 0.7156250298023223,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 649.1158721923828,
"epoch": 2.9211087420042645,
"grad_norm": 1.717586636543274,
"kl": 0.194580078125,
"learning_rate": 5e-07,
"loss": 0.0679,
"reward": 0.614955385774374,
"reward_std": 0.2752906741574407,
"rewards/accuracy_reward": 0.614955385774374,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 786.9139389038086,
"epoch": 3.0170575692963753,
"grad_norm": 1.529920220375061,
"kl": 0.439892578125,
"learning_rate": 5e-07,
"loss": 0.1198,
"reward": 0.368080372735858,
"reward_std": 0.29059169851243494,
"rewards/accuracy_reward": 0.368080372735858,
"step": 175
},
{
"epoch": 3.1023454157782515,
"grad_norm": 1.5960689783096313,
"learning_rate": 5e-07,
"loss": 0.0887,
"step": 180
},
{
"epoch": 3.1023454157782515,
"eval_clip_ratio": 0.0,
"eval_completion_length": 791.9263305664062,
"eval_kl": 2.4365234375,
"eval_loss": 0.06115880608558655,
"eval_reward": 0.08565848605940118,
"eval_reward_std": 0.13261561130639166,
"eval_rewards/accuracy_reward": 0.08565848605940118,
"eval_runtime": 821.1595,
"eval_samples_per_second": 0.609,
"eval_steps_per_second": 0.006,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 818.8542762756348,
"epoch": 3.1876332622601278,
"grad_norm": 11.959312438964844,
"kl": 2.426806640625,
"learning_rate": 5e-07,
"loss": 0.0522,
"reward": 0.10647321877768263,
"reward_std": 0.14978813820052891,
"rewards/accuracy_reward": 0.10647321877768263,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 679.0966827392579,
"epoch": 3.272921108742004,
"grad_norm": 19.53175163269043,
"kl": 3.7345703125,
"learning_rate": 5e-07,
"loss": 0.0359,
"reward": 0.039732144516892734,
"reward_std": 0.07693687449209392,
"rewards/accuracy_reward": 0.039732144516892734,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 561.9143058776856,
"epoch": 3.3582089552238807,
"grad_norm": 8.676216125488281,
"kl": 5.15078125,
"learning_rate": 5e-07,
"loss": 0.0272,
"reward": 0.026116072735749184,
"reward_std": 0.05382296503521502,
"rewards/accuracy_reward": 0.026116072735749184,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 466.9301559448242,
"epoch": 3.443496801705757,
"grad_norm": 16.412755966186523,
"kl": 7.519140625,
"learning_rate": 5e-07,
"loss": 0.0222,
"reward": 0.02700892973225564,
"reward_std": 0.058131046639755365,
"rewards/accuracy_reward": 0.02700892973225564,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 409.43640365600584,
"epoch": 3.5287846481876333,
"grad_norm": 10.202103614807129,
"kl": 4.88203125,
"learning_rate": 5e-07,
"loss": 0.0116,
"reward": 0.02410714393481612,
"reward_std": 0.051837433129549026,
"rewards/accuracy_reward": 0.02410714393481612,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 366.6959976196289,
"epoch": 3.6140724946695095,
"grad_norm": 33.47189712524414,
"kl": 6.95859375,
"learning_rate": 5e-07,
"loss": 0.0144,
"reward": 0.026116072852164506,
"reward_std": 0.04902788205072284,
"rewards/accuracy_reward": 0.026116072852164506,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 356.4863983154297,
"epoch": 3.699360341151386,
"grad_norm": 7.119285583496094,
"kl": 3.1953125,
"learning_rate": 5e-07,
"loss": 0.0123,
"reward": 0.0292410729220137,
"reward_std": 0.05716597293503582,
"rewards/accuracy_reward": 0.0292410729220137,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 343.4513526916504,
"epoch": 3.7846481876332625,
"grad_norm": 15.441688537597656,
"kl": 4.012109375,
"learning_rate": 5e-07,
"loss": 0.0177,
"reward": 0.028571429941803218,
"reward_std": 0.058770314510911706,
"rewards/accuracy_reward": 0.028571429941803218,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 310.74108505249023,
"epoch": 3.8699360341151388,
"grad_norm": 7.061368942260742,
"kl": 5.580859375,
"learning_rate": 5e-07,
"loss": 0.0081,
"reward": 0.02857143001165241,
"reward_std": 0.06492680269293487,
"rewards/accuracy_reward": 0.02857143001165241,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 314.79555015563966,
"epoch": 3.955223880597015,
"grad_norm": 22.824426651000977,
"kl": 8.496484375,
"learning_rate": 5e-07,
"loss": 0.0108,
"reward": 0.033258930104784666,
"reward_std": 0.06678469418548047,
"rewards/accuracy_reward": 0.033258930104784666,
"step": 230
},
{
"clip_ratio": 0.0,
"completion_length": 311.46373558044434,
"epoch": 3.9893390191897655,
"kl": 4.9599609375,
"reward": 0.042968751688022166,
"reward_std": 0.08240398659836501,
"rewards/accuracy_reward": 0.042968751688022166,
"step": 232,
"total_flos": 0.0,
"train_loss": 0.0500773029434013,
"train_runtime": 52194.0457,
"train_samples_per_second": 0.575,
"train_steps_per_second": 0.004
}
],
"logging_steps": 5,
"max_steps": 232,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}