Qwen-2.5-Math-7B-Max-v5-accuracy / trainer_state.json
chenggong
Model save
5b47d40 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.997867803837953,
"eval_steps": 116,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 625.9241256713867,
"epoch": 0.008528784648187633,
"grad_norm": 0.20551569759845734,
"kl": 0.0,
"learning_rate": 2.127659574468085e-08,
"loss": 0.0447,
"reward": 0.7433036118745804,
"reward_std": 0.190913749858737,
"rewards/accuracy_reward": 0.7299107536673546,
"rewards/format_reward": 0.013392857741564512,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 583.8616371154785,
"epoch": 0.042643923240938165,
"grad_norm": 0.5290549397468567,
"kl": 7.251650094985962e-05,
"learning_rate": 1.0638297872340425e-07,
"loss": 0.054,
"reward": 0.7550223553553224,
"reward_std": 0.237873874604702,
"rewards/accuracy_reward": 0.7466518199071288,
"rewards/format_reward": 0.008370536146685481,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 599.1326179504395,
"epoch": 0.08528784648187633,
"grad_norm": 0.28203362226486206,
"kl": 8.721351623535157e-05,
"learning_rate": 2.127659574468085e-07,
"loss": 0.0351,
"reward": 0.727232176065445,
"reward_std": 0.2079640648327768,
"rewards/accuracy_reward": 0.7214286044239998,
"rewards/format_reward": 0.005803571781143546,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 575.5317222595215,
"epoch": 0.1279317697228145,
"grad_norm": 0.22249764204025269,
"kl": 0.00016361474990844727,
"learning_rate": 3.1914893617021275e-07,
"loss": 0.03,
"reward": 0.7473214611411094,
"reward_std": 0.22900055218487977,
"rewards/accuracy_reward": 0.7375000298023224,
"rewards/format_reward": 0.009821429150179029,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 582.1107414245605,
"epoch": 0.17057569296375266,
"grad_norm": 0.3668869137763977,
"kl": 0.00010156631469726562,
"learning_rate": 4.25531914893617e-07,
"loss": 0.0314,
"reward": 0.7700893200933934,
"reward_std": 0.21237293258309364,
"rewards/accuracy_reward": 0.7607143215835095,
"rewards/format_reward": 0.009375000512227416,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 565.0393089294433,
"epoch": 0.21321961620469082,
"grad_norm": 0.27896830439567566,
"kl": 0.00015020370483398438,
"learning_rate": 5.319148936170212e-07,
"loss": 0.036,
"reward": 0.7875000402331352,
"reward_std": 0.21531264819204807,
"rewards/accuracy_reward": 0.7767857536673546,
"rewards/format_reward": 0.010714286286383868,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 576.9236869812012,
"epoch": 0.255863539445629,
"grad_norm": 0.41902390122413635,
"kl": 0.0003345251083374023,
"learning_rate": 6.382978723404255e-07,
"loss": 0.0403,
"reward": 0.775000037252903,
"reward_std": 0.23194959200918674,
"rewards/accuracy_reward": 0.7656250312924385,
"rewards/format_reward": 0.00937500041909516,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 569.61029586792,
"epoch": 0.29850746268656714,
"grad_norm": 0.16671252250671387,
"kl": 0.13303523063659667,
"learning_rate": 7.446808510638297e-07,
"loss": 0.0383,
"reward": 0.7870536059141159,
"reward_std": 0.21664966912940145,
"rewards/accuracy_reward": 0.7705357536673546,
"rewards/format_reward": 0.0165178578812629,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 577.1330627441406,
"epoch": 0.3411513859275053,
"grad_norm": 0.4918578565120697,
"kl": 0.0008090019226074218,
"learning_rate": 8.51063829787234e-07,
"loss": 0.0388,
"reward": 0.8004464715719223,
"reward_std": 0.23178436178714037,
"rewards/accuracy_reward": 0.771428607404232,
"rewards/format_reward": 0.02901785881258547,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 591.5464546203614,
"epoch": 0.3837953091684435,
"grad_norm": 0.5081108808517456,
"kl": 0.005326557159423828,
"learning_rate": 9.574468085106384e-07,
"loss": 0.0432,
"reward": 0.8508928954601288,
"reward_std": 0.2962774943560362,
"rewards/accuracy_reward": 0.7718750327825546,
"rewards/format_reward": 0.07901786002330483,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 590.6241355895996,
"epoch": 0.42643923240938164,
"grad_norm": 1.225230097770691,
"kl": 0.02999114990234375,
"learning_rate": 9.998747147528373e-07,
"loss": 0.0284,
"reward": 0.9977679073810577,
"reward_std": 0.4272202838212252,
"rewards/accuracy_reward": 0.701339316368103,
"rewards/format_reward": 0.2964285858441144,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 584.9955612182617,
"epoch": 0.4690831556503198,
"grad_norm": 1.62312912940979,
"kl": 0.04905548095703125,
"learning_rate": 9.991093100466482e-07,
"loss": 0.052,
"reward": 1.2584822058677674,
"reward_std": 0.4769852660596371,
"rewards/accuracy_reward": 0.7388393208384514,
"rewards/format_reward": 0.5196428842842579,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 619.11029586792,
"epoch": 0.511727078891258,
"grad_norm": 0.35674989223480225,
"kl": 0.02668609619140625,
"learning_rate": 9.976491676662678e-07,
"loss": 0.0348,
"reward": 1.3142857804894448,
"reward_std": 0.4189721491187811,
"rewards/accuracy_reward": 0.7379464611411095,
"rewards/format_reward": 0.5763393118977547,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 606.5446723937988,
"epoch": 0.5543710021321961,
"grad_norm": 0.7614251375198364,
"kl": 0.1965301513671875,
"learning_rate": 9.95496320064109e-07,
"loss": 0.0326,
"reward": 1.4531250655651093,
"reward_std": 0.39104298427700995,
"rewards/accuracy_reward": 0.7379464611411095,
"rewards/format_reward": 0.7151786029338837,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 619.4513656616211,
"epoch": 0.5970149253731343,
"grad_norm": 0.5526299476623535,
"kl": 0.0309814453125,
"learning_rate": 9.926537639070456e-07,
"loss": 0.0332,
"reward": 1.5433036401867866,
"reward_std": 0.345558512583375,
"rewards/accuracy_reward": 0.7343750335276127,
"rewards/format_reward": 0.8089286044239998,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 601.5808296203613,
"epoch": 0.6396588486140725,
"grad_norm": 0.3345666825771332,
"kl": 0.02752227783203125,
"learning_rate": 9.891254559051884e-07,
"loss": 0.0323,
"reward": 1.6361607968807221,
"reward_std": 0.3074555268511176,
"rewards/accuracy_reward": 0.764285746216774,
"rewards/format_reward": 0.8718750342726708,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 611.288419342041,
"epoch": 0.6823027718550106,
"grad_norm": 0.35215917229652405,
"kl": 0.02867279052734375,
"learning_rate": 9.849163073043223e-07,
"loss": 0.0428,
"reward": 1.6526786476373672,
"reward_std": 0.28676611334085467,
"rewards/accuracy_reward": 0.7589286133646965,
"rewards/format_reward": 0.8937500417232513,
"step": 80
},
{
"clip_ratio": 0.0,
"completion_length": 626.8102943420411,
"epoch": 0.7249466950959488,
"grad_norm": 0.3329945206642151,
"kl": 0.01969757080078125,
"learning_rate": 9.800321770496724e-07,
"loss": 0.028,
"reward": 1.6875000774860383,
"reward_std": 0.2533166547305882,
"rewards/accuracy_reward": 0.7848214626312255,
"rewards/format_reward": 0.9026786088943481,
"step": 85
},
{
"clip_ratio": 0.0,
"completion_length": 606.9535957336426,
"epoch": 0.767590618336887,
"grad_norm": 0.29067689180374146,
"kl": 0.0251251220703125,
"learning_rate": 9.744798636305187e-07,
"loss": 0.024,
"reward": 1.662500074505806,
"reward_std": 0.2734585601836443,
"rewards/accuracy_reward": 0.7611607506871223,
"rewards/format_reward": 0.9013393223285675,
"step": 90
},
{
"clip_ratio": 0.0,
"completion_length": 608.546459197998,
"epoch": 0.8102345415778252,
"grad_norm": 0.33922263979911804,
"kl": 0.02018585205078125,
"learning_rate": 9.68267095617003e-07,
"loss": 0.0242,
"reward": 1.6745536416769027,
"reward_std": 0.24658216908574104,
"rewards/accuracy_reward": 0.7522321805357933,
"rewards/format_reward": 0.9223214715719223,
"step": 95
},
{
"clip_ratio": 0.0,
"completion_length": 609.9174369812011,
"epoch": 0.8528784648187633,
"grad_norm": 0.5688201785087585,
"kl": 0.02324676513671875,
"learning_rate": 9.614025209023083e-07,
"loss": 0.0296,
"reward": 1.6991072326898575,
"reward_std": 0.24431310119107366,
"rewards/accuracy_reward": 0.781250037252903,
"rewards/format_reward": 0.9178571790456772,
"step": 100
},
{
"clip_ratio": 0.0,
"completion_length": 589.7357414245605,
"epoch": 0.8955223880597015,
"grad_norm": 0.3856689929962158,
"kl": 0.02101287841796875,
"learning_rate": 9.538956946651815e-07,
"loss": 0.0288,
"reward": 1.7343750864267349,
"reward_std": 0.2345518351532519,
"rewards/accuracy_reward": 0.8107143267989159,
"rewards/format_reward": 0.9236607521772384,
"step": 105
},
{
"clip_ratio": 0.0,
"completion_length": 600.2009185791015,
"epoch": 0.9381663113006397,
"grad_norm": 0.3187831938266754,
"kl": 0.02591400146484375,
"learning_rate": 9.457570660695539e-07,
"loss": 0.0116,
"reward": 1.734821507334709,
"reward_std": 0.22256441051140427,
"rewards/accuracy_reward": 0.8138393193483353,
"rewards/format_reward": 0.9209821775555611,
"step": 110
},
{
"clip_ratio": 0.0,
"completion_length": 577.4808288574219,
"epoch": 0.9808102345415778,
"grad_norm": 0.19666177034378052,
"kl": 0.0201080322265625,
"learning_rate": 9.369979637197774e-07,
"loss": 0.0232,
"reward": 1.7446429431438446,
"reward_std": 0.2030067172832787,
"rewards/accuracy_reward": 0.7982143253087998,
"rewards/format_reward": 0.9464286059141159,
"step": 115
},
{
"epoch": 0.9893390191897654,
"eval_clip_ratio": 0.0,
"eval_completion_length": 591.1813688732329,
"eval_kl": 0.027180020771329364,
"eval_loss": 0.00365327182225883,
"eval_reward": 1.684807332735213,
"eval_reward_std": 0.22303236411913993,
"eval_rewards/accuracy_reward": 0.7500000307484279,
"eval_rewards/format_reward": 0.9348072892143613,
"eval_runtime": 686.4197,
"eval_samples_per_second": 0.728,
"eval_steps_per_second": 0.013,
"step": 116
},
{
"clip_ratio": 0.0,
"completion_length": 598.285604095459,
"epoch": 1.0255863539445629,
"grad_norm": 0.17957638204097748,
"kl": 0.0184295654296875,
"learning_rate": 9.276305798917158e-07,
"loss": 0.0077,
"reward": 1.723214367032051,
"reward_std": 0.22227218970656396,
"rewards/accuracy_reward": 0.7964286059141159,
"rewards/format_reward": 0.9267857506871223,
"step": 120
},
{
"clip_ratio": 0.0,
"completion_length": 582.7888671875,
"epoch": 1.068230277185501,
"grad_norm": 0.35526904463768005,
"kl": 0.0201629638671875,
"learning_rate": 9.176679535616476e-07,
"loss": 0.0216,
"reward": 1.751785784959793,
"reward_std": 0.2007270947098732,
"rewards/accuracy_reward": 0.8142857477068901,
"rewards/format_reward": 0.9375000387430191,
"step": 125
},
{
"clip_ratio": 0.0,
"completion_length": 572.4165481567383,
"epoch": 1.1108742004264391,
"grad_norm": 0.44200047850608826,
"kl": 0.362432861328125,
"learning_rate": 9.071239522565976e-07,
"loss": 0.021,
"reward": 1.732142946124077,
"reward_std": 0.21233755089342593,
"rewards/accuracy_reward": 0.8000000350177288,
"rewards/format_reward": 0.9321428954601287,
"step": 130
},
{
"clip_ratio": 0.0,
"completion_length": 551.3214492797852,
"epoch": 1.1535181236673775,
"grad_norm": 0.1597866415977478,
"kl": 0.0220245361328125,
"learning_rate": 8.960132527513642e-07,
"loss": 0.0171,
"reward": 1.7830357879400254,
"reward_std": 0.1866126311942935,
"rewards/accuracy_reward": 0.8316964685916901,
"rewards/format_reward": 0.9513393223285675,
"step": 135
},
{
"clip_ratio": 0.0,
"completion_length": 581.4509185791015,
"epoch": 1.1961620469083156,
"grad_norm": 0.24776776134967804,
"kl": 0.0202850341796875,
"learning_rate": 8.8435132063911e-07,
"loss": 0.0073,
"reward": 1.7156250864267348,
"reward_std": 0.18961388804018497,
"rewards/accuracy_reward": 0.77857146859169,
"rewards/format_reward": 0.9370535984635353,
"step": 140
},
{
"clip_ratio": 0.0,
"completion_length": 600.3785957336426,
"epoch": 1.2388059701492538,
"grad_norm": 0.38918188214302063,
"kl": 0.0415618896484375,
"learning_rate": 8.721543888039532e-07,
"loss": 0.0098,
"reward": 1.7325893640518188,
"reward_std": 0.20605442952364683,
"rewards/accuracy_reward": 0.7897321820259094,
"rewards/format_reward": 0.942857176065445,
"step": 145
},
{
"clip_ratio": 0.0,
"completion_length": 586.5723472595215,
"epoch": 1.2814498933901919,
"grad_norm": 0.39274245500564575,
"kl": 0.0493682861328125,
"learning_rate": 8.594394348255237e-07,
"loss": 0.0246,
"reward": 1.7558036506175996,
"reward_std": 0.2199950136244297,
"rewards/accuracy_reward": 0.8058036044239998,
"rewards/format_reward": 0.9500000298023223,
"step": 150
},
{
"clip_ratio": 0.0,
"completion_length": 564.6044891357421,
"epoch": 1.32409381663113,
"grad_norm": 0.7922531962394714,
"kl": 0.0613037109375,
"learning_rate": 8.462241573469377e-07,
"loss": 0.0263,
"reward": 1.7375000715255737,
"reward_std": 0.17584939412772654,
"rewards/accuracy_reward": 0.8004464656114578,
"rewards/format_reward": 0.9370536029338836,
"step": 155
},
{
"clip_ratio": 0.0,
"completion_length": 575.3759223937989,
"epoch": 1.3667377398720681,
"grad_norm": 10.02066707611084,
"kl": 0.46389617919921877,
"learning_rate": 8.325269514390834e-07,
"loss": 0.0185,
"reward": 1.7901786595582962,
"reward_std": 0.17941874554380774,
"rewards/accuracy_reward": 0.8223214641213417,
"rewards/format_reward": 0.9678571701049805,
"step": 160
},
{
"clip_ratio": 0.0,
"completion_length": 572.3335052490235,
"epoch": 1.4093816631130065,
"grad_norm": 0.1462014764547348,
"kl": 0.05237274169921875,
"learning_rate": 8.183668829955111e-07,
"loss": 0.0162,
"reward": 1.7723215103149415,
"reward_std": 0.1762597480788827,
"rewards/accuracy_reward": 0.816071467101574,
"rewards/format_reward": 0.9562500298023224,
"step": 165
},
{
"clip_ratio": 0.0,
"completion_length": 575.169223022461,
"epoch": 1.4520255863539446,
"grad_norm": 1.8751811981201172,
"kl": 0.18811492919921874,
"learning_rate": 8.037636621935684e-07,
"loss": 0.0151,
"reward": 1.7419643700122833,
"reward_std": 0.1959962229244411,
"rewards/accuracy_reward": 0.7982143238186836,
"rewards/format_reward": 0.9437500312924385,
"step": 170
},
{
"clip_ratio": 0.0,
"completion_length": 588.2317245483398,
"epoch": 1.4946695095948828,
"grad_norm": 0.17714911699295044,
"kl": 0.132525634765625,
"learning_rate": 7.887376160587213e-07,
"loss": 0.0172,
"reward": 1.7156250804662705,
"reward_std": 0.19640195239335298,
"rewards/accuracy_reward": 0.7674107566475868,
"rewards/format_reward": 0.948214316368103,
"step": 175
},
{
"clip_ratio": 0.0,
"completion_length": 559.4638603210449,
"epoch": 1.537313432835821,
"grad_norm": 0.21344700455665588,
"kl": 0.0337677001953125,
"learning_rate": 7.733096601702507e-07,
"loss": 0.0098,
"reward": 1.788839367032051,
"reward_std": 0.1706329697743058,
"rewards/accuracy_reward": 0.8178571820259094,
"rewards/format_reward": 0.9709821701049804,
"step": 180
},
{
"clip_ratio": 0.0,
"completion_length": 574.9826164245605,
"epoch": 1.579957356076759,
"grad_norm": 0.3246748745441437,
"kl": 0.0662689208984375,
"learning_rate": 7.575012695477076e-07,
"loss": 0.0171,
"reward": 1.764285796880722,
"reward_std": 0.18098030481487512,
"rewards/accuracy_reward": 0.8044643223285675,
"rewards/format_reward": 0.9598214566707611,
"step": 185
},
{
"clip_ratio": 0.0,
"completion_length": 591.3893157958985,
"epoch": 1.6226012793176974,
"grad_norm": 0.47960391640663147,
"kl": 0.056695556640625,
"learning_rate": 7.413344487586542e-07,
"loss": 0.0212,
"reward": 1.7665179401636124,
"reward_std": 0.22513661198318005,
"rewards/accuracy_reward": 0.8071428954601287,
"rewards/format_reward": 0.9593750357627868,
"step": 190
},
{
"clip_ratio": 0.0,
"completion_length": 576.8236846923828,
"epoch": 1.6652452025586353,
"grad_norm": 0.9285232424736023,
"kl": 0.2149566650390625,
"learning_rate": 7.248317012892968e-07,
"loss": 0.0264,
"reward": 1.7383929401636125,
"reward_std": 0.21417219610884786,
"rewards/accuracy_reward": 0.7852678969502449,
"rewards/format_reward": 0.9531250342726707,
"step": 195
},
{
"clip_ratio": 0.0,
"completion_length": 569.6236854553223,
"epoch": 1.7078891257995736,
"grad_norm": 19.19864845275879,
"kl": 0.493951416015625,
"learning_rate": 7.08015998220647e-07,
"loss": 0.025,
"reward": 1.7593750774860382,
"reward_std": 0.1970167408697307,
"rewards/accuracy_reward": 0.7933036029338837,
"rewards/format_reward": 0.9660714626312256,
"step": 200
},
{
"clip_ratio": 0.0,
"completion_length": 579.9201164245605,
"epoch": 1.7505330490405118,
"grad_norm": 8.568023681640625,
"kl": 0.36058349609375,
"learning_rate": 6.909107462538111e-07,
"loss": 0.0273,
"reward": 1.7156250894069671,
"reward_std": 0.2314098752103746,
"rewards/accuracy_reward": 0.762946467101574,
"rewards/format_reward": 0.9526785984635353,
"step": 205
},
{
"clip_ratio": 0.0,
"completion_length": 585.3098495483398,
"epoch": 1.79317697228145,
"grad_norm": 1.4387701749801636,
"kl": 0.8639892578125,
"learning_rate": 6.735397551289178e-07,
"loss": 0.0267,
"reward": 1.6991072207689286,
"reward_std": 0.25122642405331136,
"rewards/accuracy_reward": 0.740625037252903,
"rewards/format_reward": 0.9584821745753288,
"step": 210
},
{
"clip_ratio": 0.0,
"completion_length": 573.5219017028809,
"epoch": 1.835820895522388,
"grad_norm": 8.508292198181152,
"kl": 1.6532470703125,
"learning_rate": 6.559272044830316e-07,
"loss": 0.0335,
"reward": 1.7223215013742448,
"reward_std": 0.23952382281422616,
"rewards/accuracy_reward": 0.7700893215835094,
"rewards/format_reward": 0.9522321805357933,
"step": 215
},
{
"clip_ratio": 0.0,
"completion_length": 577.850471496582,
"epoch": 1.8784648187633262,
"grad_norm": 3.9072465896606445,
"kl": 0.9760009765625,
"learning_rate": 6.380976101931879e-07,
"loss": 0.0349,
"reward": 1.6732143580913543,
"reward_std": 0.2966056760400534,
"rewards/accuracy_reward": 0.7321428880095482,
"rewards/format_reward": 0.9410714641213417,
"step": 220
},
{
"clip_ratio": 0.0,
"completion_length": 571.9951133728027,
"epoch": 1.9211087420042645,
"grad_norm": 13.385315895080566,
"kl": 2.62431640625,
"learning_rate": 6.200757902513962e-07,
"loss": 0.0609,
"reward": 1.6495536506175994,
"reward_std": 0.29212585240602496,
"rewards/accuracy_reward": 0.7040178939700127,
"rewards/format_reward": 0.9455357491970062,
"step": 225
},
{
"clip_ratio": 0.0,
"completion_length": 602.8174324035645,
"epoch": 1.9637526652452024,
"grad_norm": 3.510239362716675,
"kl": 1.172021484375,
"learning_rate": 6.018868302191139e-07,
"loss": 0.0426,
"reward": 1.5660715043544768,
"reward_std": 0.343078551068902,
"rewards/accuracy_reward": 0.6276786014437675,
"rewards/format_reward": 0.9383928880095482,
"step": 230
},
{
"epoch": 1.9808102345415777,
"eval_clip_ratio": 0.0,
"eval_completion_length": 590.3601776607453,
"eval_kl": 2.955357142857143,
"eval_loss": 0.06222715228796005,
"eval_reward": 1.3279479032471067,
"eval_reward_std": 0.4392576685973576,
"eval_rewards/accuracy_reward": 0.46938777679488775,
"eval_rewards/format_reward": 0.858560131655799,
"eval_runtime": 674.3732,
"eval_samples_per_second": 0.741,
"eval_steps_per_second": 0.013,
"step": 232
},
{
"clip_ratio": 0.0,
"completion_length": 605.6790473937988,
"epoch": 2.008528784648188,
"grad_norm": 6.303433418273926,
"kl": 3.3275390625,
"learning_rate": 5.835560483092742e-07,
"loss": 0.0882,
"reward": 1.3517857775092126,
"reward_std": 0.4619227208197117,
"rewards/accuracy_reward": 0.5165178820490837,
"rewards/format_reward": 0.8352678924798965,
"step": 235
},
{
"clip_ratio": 0.0,
"completion_length": 588.4384216308594,
"epoch": 2.0511727078891258,
"grad_norm": 20.91950225830078,
"kl": 7.1640625,
"learning_rate": 5.651089601444752e-07,
"loss": 0.1247,
"reward": 1.1812500566244126,
"reward_std": 0.5094705298542976,
"rewards/accuracy_reward": 0.43125002160668374,
"rewards/format_reward": 0.7500000283122062,
"step": 240
},
{
"clip_ratio": 0.0,
"completion_length": 582.0741310119629,
"epoch": 2.093816631130064,
"grad_norm": 13.203470230102539,
"kl": 6.3,
"learning_rate": 5.465712432403811e-07,
"loss": 0.1256,
"reward": 1.2410714894533157,
"reward_std": 0.5110540725290775,
"rewards/accuracy_reward": 0.46830358877778056,
"rewards/format_reward": 0.7727678924798965,
"step": 245
},
{
"clip_ratio": 0.0,
"completion_length": 590.8152046203613,
"epoch": 2.136460554371002,
"grad_norm": 88.20843505859375,
"kl": 11.7703125,
"learning_rate": 5.279687012637798e-07,
"loss": 0.2042,
"reward": 1.3339286342263221,
"reward_std": 0.5208067961037159,
"rewards/accuracy_reward": 0.5129464477300644,
"rewards/format_reward": 0.8209821820259094,
"step": 250
},
{
"clip_ratio": 0.0,
"completion_length": 582.5169929504394,
"epoch": 2.1791044776119404,
"grad_norm": 19.759809494018555,
"kl": 4.6005859375,
"learning_rate": 5.093272281150382e-07,
"loss": 0.0949,
"reward": 1.3361607685685157,
"reward_std": 0.5283136948943138,
"rewards/accuracy_reward": 0.5258928835391998,
"rewards/format_reward": 0.8102678999304771,
"step": 255
},
{
"clip_ratio": 0.0,
"completion_length": 574.399136352539,
"epoch": 2.2217484008528783,
"grad_norm": 73.8835678100586,
"kl": 13.4765625,
"learning_rate": 4.906727718849618e-07,
"loss": 0.209,
"reward": 1.168750050663948,
"reward_std": 0.5519715771079063,
"rewards/accuracy_reward": 0.4321428779512644,
"rewards/format_reward": 0.7366071745753289,
"step": 260
},
{
"clip_ratio": 0.0,
"completion_length": 569.409400177002,
"epoch": 2.2643923240938166,
"grad_norm": 16.285621643066406,
"kl": 15.2921875,
"learning_rate": 4.7203129873622036e-07,
"loss": 0.2319,
"reward": 1.1008929148316384,
"reward_std": 0.5826808042824269,
"rewards/accuracy_reward": 0.39241073541343213,
"rewards/format_reward": 0.7084821775555611,
"step": 265
},
{
"clip_ratio": 0.0,
"completion_length": 563.786190032959,
"epoch": 2.307036247334755,
"grad_norm": 5.511695861816406,
"kl": 3.4419921875,
"learning_rate": 4.534287567596188e-07,
"loss": 0.0542,
"reward": 1.3165179178118707,
"reward_std": 0.49827431738376615,
"rewards/accuracy_reward": 0.5075893081724644,
"rewards/format_reward": 0.8089286148548126,
"step": 270
},
{
"clip_ratio": 0.0,
"completion_length": 558.287523651123,
"epoch": 2.349680170575693,
"grad_norm": 7.120125770568848,
"kl": 5.0576171875,
"learning_rate": 4.348910398555249e-07,
"loss": 0.0723,
"reward": 1.3750000685453414,
"reward_std": 0.5270605705678463,
"rewards/accuracy_reward": 0.5455357402563095,
"rewards/format_reward": 0.8294643267989159,
"step": 275
},
{
"clip_ratio": 0.0,
"completion_length": 573.1196708679199,
"epoch": 2.3923240938166312,
"grad_norm": 9.884531021118164,
"kl": 10.0318359375,
"learning_rate": 4.1644395169072575e-07,
"loss": 0.1729,
"reward": 1.2901786297559739,
"reward_std": 0.5422291226685048,
"rewards/accuracy_reward": 0.48660716265439985,
"rewards/format_reward": 0.8035714611411094,
"step": 280
},
{
"clip_ratio": 0.0,
"completion_length": 557.1683288574219,
"epoch": 2.434968017057569,
"grad_norm": 8.612386703491211,
"kl": 4.2802734375,
"learning_rate": 3.9811316978088615e-07,
"loss": 0.0639,
"reward": 1.4187500596046447,
"reward_std": 0.4759579010307789,
"rewards/accuracy_reward": 0.5776785999536515,
"rewards/format_reward": 0.8410714656114578,
"step": 285
},
{
"clip_ratio": 0.0,
"completion_length": 546.3219017028808,
"epoch": 2.4776119402985075,
"grad_norm": 31.60814094543457,
"kl": 9.052734375,
"learning_rate": 3.799242097486038e-07,
"loss": 0.1517,
"reward": 1.3687500596046447,
"reward_std": 0.5219749353826046,
"rewards/accuracy_reward": 0.5669643111526966,
"rewards/format_reward": 0.8017857521772385,
"step": 290
},
{
"clip_ratio": 0.0,
"completion_length": 548.1094017028809,
"epoch": 2.520255863539446,
"grad_norm": 59.2335090637207,
"kl": 10.06796875,
"learning_rate": 3.619023898068123e-07,
"loss": 0.1374,
"reward": 1.2995536252856255,
"reward_std": 0.51812051422894,
"rewards/accuracy_reward": 0.5620535910129547,
"rewards/format_reward": 0.7375000342726707,
"step": 295
},
{
"clip_ratio": 0.0,
"completion_length": 558.5031517028808,
"epoch": 2.5628997867803838,
"grad_norm": 31.365585327148438,
"kl": 3.1970703125,
"learning_rate": 3.4407279551696846e-07,
"loss": 0.0461,
"reward": 1.2352679088711738,
"reward_std": 0.531840232014656,
"rewards/accuracy_reward": 0.5383928880095482,
"rewards/format_reward": 0.6968750298023224,
"step": 300
},
{
"clip_ratio": 0.0,
"completion_length": 548.0366317749024,
"epoch": 2.605543710021322,
"grad_norm": 13.923192977905273,
"kl": 6.5228515625,
"learning_rate": 3.2646024487108213e-07,
"loss": 0.0853,
"reward": 1.2236607685685157,
"reward_std": 0.5477135334163904,
"rewards/accuracy_reward": 0.5044643051922322,
"rewards/format_reward": 0.7191964611411095,
"step": 305
},
{
"clip_ratio": 0.0,
"completion_length": 517.9973487854004,
"epoch": 2.64818763326226,
"grad_norm": 11.7457914352417,
"kl": 11.935546875,
"learning_rate": 3.0908925374618887e-07,
"loss": 0.1622,
"reward": 1.2687500640749931,
"reward_std": 0.5634565785527229,
"rewards/accuracy_reward": 0.5196428790688514,
"rewards/format_reward": 0.7491071805357933,
"step": 310
},
{
"clip_ratio": 0.0,
"completion_length": 542.1607376098633,
"epoch": 2.6908315565031984,
"grad_norm": 10.802907943725586,
"kl": 7.883203125,
"learning_rate": 2.91984001779353e-07,
"loss": 0.1125,
"reward": 1.2767857626080512,
"reward_std": 0.5758342906832695,
"rewards/accuracy_reward": 0.5187500260770321,
"rewards/format_reward": 0.7580357491970062,
"step": 315
},
{
"clip_ratio": 0.0,
"completion_length": 536.2406478881836,
"epoch": 2.7334754797441363,
"grad_norm": 9.723703384399414,
"kl": 6.530859375,
"learning_rate": 2.751682987107029e-07,
"loss": 0.0812,
"reward": 1.3321429282426833,
"reward_std": 0.526002112776041,
"rewards/accuracy_reward": 0.5491071671247483,
"rewards/format_reward": 0.7830357521772384,
"step": 320
},
{
"clip_ratio": 0.0,
"completion_length": 521.1303764343262,
"epoch": 2.7761194029850746,
"grad_norm": 18.712772369384766,
"kl": 10.8078125,
"learning_rate": 2.5866555124134577e-07,
"loss": 0.1505,
"reward": 1.2674107685685159,
"reward_std": 0.5784162662923336,
"rewards/accuracy_reward": 0.5366071693599224,
"rewards/format_reward": 0.730803607404232,
"step": 325
},
{
"clip_ratio": 0.0,
"completion_length": 535.4544891357422,
"epoch": 2.818763326226013,
"grad_norm": 7.585498809814453,
"kl": 9.569921875,
"learning_rate": 2.424987304522924e-07,
"loss": 0.1261,
"reward": 1.19464291036129,
"reward_std": 0.5549623288214207,
"rewards/accuracy_reward": 0.5209821693599224,
"rewards/format_reward": 0.6736607439815998,
"step": 330
},
{
"clip_ratio": 0.0,
"completion_length": 528.2076141357422,
"epoch": 2.861407249466951,
"grad_norm": 9.224991798400879,
"kl": 6.6015625,
"learning_rate": 2.2669033982974944e-07,
"loss": 0.0755,
"reward": 1.228125052154064,
"reward_std": 0.5089043751358986,
"rewards/accuracy_reward": 0.550000024586916,
"rewards/format_reward": 0.6781250283122062,
"step": 335
},
{
"clip_ratio": 0.0,
"completion_length": 530.958950805664,
"epoch": 2.9040511727078893,
"grad_norm": 24.710325241088867,
"kl": 7.4828125,
"learning_rate": 2.1126238394127867e-07,
"loss": 0.114,
"reward": 1.2035714864730835,
"reward_std": 0.5295904573053122,
"rewards/accuracy_reward": 0.5526785984635353,
"rewards/format_reward": 0.6508928887546063,
"step": 340
},
{
"clip_ratio": 0.0,
"completion_length": 529.0553779602051,
"epoch": 2.946695095948827,
"grad_norm": 36.00743865966797,
"kl": 11.071875,
"learning_rate": 1.9623633780643155e-07,
"loss": 0.188,
"reward": 1.220535770058632,
"reward_std": 0.5231191631406545,
"rewards/accuracy_reward": 0.5361607357859611,
"rewards/format_reward": 0.6843750298023223,
"step": 345
},
{
"epoch": 2.9722814498933903,
"eval_clip_ratio": 0.0,
"eval_completion_length": 531.799803234282,
"eval_kl": 9.749503968253968,
"eval_loss": 0.13944962620735168,
"eval_reward": 1.1026077540147872,
"eval_reward_std": 0.4790610531492839,
"eval_rewards/accuracy_reward": 0.4600340352644996,
"eval_rewards/format_reward": 0.642573726082605,
"eval_runtime": 734.1918,
"eval_samples_per_second": 0.681,
"eval_steps_per_second": 0.012,
"step": 348
},
{
"clip_ratio": 0.0,
"completion_length": 531.3727928161621,
"epoch": 2.9893390191897655,
"grad_norm": 17.88933753967285,
"kl": 10.34296875,
"learning_rate": 1.8163311700448898e-07,
"loss": 0.1236,
"reward": 1.1388393327593804,
"reward_std": 0.4919752091169357,
"rewards/accuracy_reward": 0.49821431189775467,
"rewards/format_reward": 0.6406250275671482,
"step": 350
},
{
"clip_ratio": 0.0,
"completion_length": 524.6839851379394,
"epoch": 3.0341151385927505,
"grad_norm": 8.39860725402832,
"kl": 7.53515625,
"learning_rate": 1.674730485609166e-07,
"loss": 0.099,
"reward": 1.140625049173832,
"reward_std": 0.5006550896912814,
"rewards/accuracy_reward": 0.5415178820490837,
"rewards/format_reward": 0.5991071693599224,
"step": 355
},
{
"clip_ratio": 0.0,
"completion_length": 531.0768104553223,
"epoch": 3.076759061833689,
"grad_norm": 18.39265251159668,
"kl": 8.1181640625,
"learning_rate": 1.537758426530622e-07,
"loss": 0.1106,
"reward": 1.1013393431901932,
"reward_std": 0.4775242738425732,
"rewards/accuracy_reward": 0.5392857365310192,
"rewards/format_reward": 0.5620535992085933,
"step": 360
},
{
"clip_ratio": 0.0,
"completion_length": 534.2736846923829,
"epoch": 3.1194029850746268,
"grad_norm": 11.617506980895996,
"kl": 7.2947265625,
"learning_rate": 1.4056056517447634e-07,
"loss": 0.0915,
"reward": 1.0933036252856254,
"reward_std": 0.4881337985396385,
"rewards/accuracy_reward": 0.532589315623045,
"rewards/format_reward": 0.5607143111526967,
"step": 365
},
{
"clip_ratio": 0.0,
"completion_length": 538.0062713623047,
"epoch": 3.162046908315565,
"grad_norm": 11.465629577636719,
"kl": 7.7173828125,
"learning_rate": 1.2784561119604682e-07,
"loss": 0.0985,
"reward": 1.10089291036129,
"reward_std": 0.4965208202600479,
"rewards/accuracy_reward": 0.5200893104076385,
"rewards/format_reward": 0.5808035977184772,
"step": 370
},
{
"clip_ratio": 0.0,
"completion_length": 531.9433242797852,
"epoch": 3.204690831556503,
"grad_norm": 23.9652156829834,
"kl": 9.834765625,
"learning_rate": 1.156486793608899e-07,
"loss": 0.1229,
"reward": 1.101339338719845,
"reward_std": 0.452479437738657,
"rewards/accuracy_reward": 0.5160714529454709,
"rewards/format_reward": 0.5852678865194321,
"step": 375
},
{
"clip_ratio": 0.0,
"completion_length": 534.8468994140625,
"epoch": 3.2473347547974414,
"grad_norm": 18.34585189819336,
"kl": 10.09140625,
"learning_rate": 1.0398674724863581e-07,
"loss": 0.1464,
"reward": 1.1111607685685159,
"reward_std": 0.5048069790005684,
"rewards/accuracy_reward": 0.5276785962283611,
"rewards/format_reward": 0.5834821693599224,
"step": 380
},
{
"clip_ratio": 0.0,
"completion_length": 536.6924369812011,
"epoch": 3.2899786780383797,
"grad_norm": 14.767237663269043,
"kl": 9.25234375,
"learning_rate": 9.287604774340235e-08,
"loss": 0.1232,
"reward": 1.101339329779148,
"reward_std": 0.49512304849922656,
"rewards/accuracy_reward": 0.5138393111526967,
"rewards/format_reward": 0.5875000298023224,
"step": 385
},
{
"clip_ratio": 0.0,
"completion_length": 532.2884185791015,
"epoch": 3.3326226012793176,
"grad_norm": 9.985774993896484,
"kl": 7.86640625,
"learning_rate": 8.233204643835234e-08,
"loss": 0.1138,
"reward": 1.075446480512619,
"reward_std": 0.46411947570741174,
"rewards/accuracy_reward": 0.5026785962283611,
"rewards/format_reward": 0.5727678835391998,
"step": 390
},
{
"clip_ratio": 0.0,
"completion_length": 527.2410995483399,
"epoch": 3.375266524520256,
"grad_norm": 10.612527847290039,
"kl": 8.09375,
"learning_rate": 7.236942010828429e-08,
"loss": 0.0785,
"reward": 1.1446428999304772,
"reward_std": 0.48770338781177996,
"rewards/accuracy_reward": 0.5575893074274063,
"rewards/format_reward": 0.5870535977184772,
"step": 395
},
{
"clip_ratio": 0.0,
"completion_length": 519.1937744140625,
"epoch": 3.417910447761194,
"grad_norm": 14.665472984313965,
"kl": 11.00234375,
"learning_rate": 6.300203628022271e-08,
"loss": 0.152,
"reward": 1.1830357760190964,
"reward_std": 0.5015905275940895,
"rewards/accuracy_reward": 0.5790178842842579,
"rewards/format_reward": 0.6040178872644901,
"step": 400
},
{
"clip_ratio": 0.0,
"completion_length": 531.4053771972656,
"epoch": 3.4605543710021323,
"grad_norm": 8.385228157043457,
"kl": 9.28125,
"learning_rate": 5.42429339304461e-08,
"loss": 0.1379,
"reward": 1.1437500521540642,
"reward_std": 0.46195379123091695,
"rewards/accuracy_reward": 0.5531250216066838,
"rewards/format_reward": 0.5906250216066837,
"step": 405
},
{
"clip_ratio": 0.0,
"completion_length": 531.0544860839843,
"epoch": 3.50319829424307,
"grad_norm": 18.6485652923584,
"kl": 8.909765625,
"learning_rate": 4.610430533481857e-08,
"loss": 0.1119,
"reward": 1.1084821969270706,
"reward_std": 0.4929712563753128,
"rewards/accuracy_reward": 0.5491071715950966,
"rewards/format_reward": 0.5593750216066837,
"step": 410
},
{
"clip_ratio": 0.0,
"completion_length": 533.8643081665039,
"epoch": 3.5458422174840085,
"grad_norm": 84.65238189697266,
"kl": 8.594140625,
"learning_rate": 3.859747909769162e-08,
"loss": 0.1078,
"reward": 1.0660714849829673,
"reward_std": 0.473931773006916,
"rewards/accuracy_reward": 0.5361607417464256,
"rewards/format_reward": 0.5299107395112514,
"step": 415
},
{
"clip_ratio": 0.0,
"completion_length": 536.6745796203613,
"epoch": 3.588486140724947,
"grad_norm": 19.54568862915039,
"kl": 9.418359375,
"learning_rate": 3.173290438299697e-08,
"loss": 0.1327,
"reward": 1.0656250432133674,
"reward_std": 0.4773729760199785,
"rewards/accuracy_reward": 0.5245535988360643,
"rewards/format_reward": 0.5410714514553547,
"step": 420
},
{
"clip_ratio": 0.0,
"completion_length": 517.5031486511231,
"epoch": 3.631130063965885,
"grad_norm": 22.7406005859375,
"kl": 8.5650390625,
"learning_rate": 2.5520136369481194e-08,
"loss": 0.1112,
"reward": 1.1593750432133674,
"reward_std": 0.45310505069792273,
"rewards/accuracy_reward": 0.5647321693599224,
"rewards/format_reward": 0.5946428835391998,
"step": 425
},
{
"clip_ratio": 0.0,
"completion_length": 513.7027023315429,
"epoch": 3.673773987206823,
"grad_norm": 25.179290771484375,
"kl": 8.64296875,
"learning_rate": 1.996782295032745e-08,
"loss": 0.1274,
"reward": 1.1517857566475869,
"reward_std": 0.4889927223324776,
"rewards/accuracy_reward": 0.5678571730852127,
"rewards/format_reward": 0.583928594738245,
"step": 430
},
{
"clip_ratio": 0.0,
"completion_length": 527.8647552490235,
"epoch": 3.716417910447761,
"grad_norm": 14.159469604492188,
"kl": 7.8265625,
"learning_rate": 1.508369269567783e-08,
"loss": 0.1046,
"reward": 1.1281250417232513,
"reward_std": 0.5101183526217937,
"rewards/accuracy_reward": 0.546428595483303,
"rewards/format_reward": 0.5816964566707611,
"step": 435
},
{
"clip_ratio": 0.0,
"completion_length": 523.8451080322266,
"epoch": 3.7590618336886994,
"grad_norm": 18.611404418945312,
"kl": 9.108984375,
"learning_rate": 1.0874544094811422e-08,
"loss": 0.1173,
"reward": 1.0482143327593803,
"reward_std": 0.45587412640452385,
"rewards/accuracy_reward": 0.5294643141329288,
"rewards/format_reward": 0.518750024214387,
"step": 440
},
{
"clip_ratio": 0.0,
"completion_length": 528.406273651123,
"epoch": 3.8017057569296373,
"grad_norm": 9.024343490600586,
"kl": 8.72421875,
"learning_rate": 7.346236092954316e-09,
"loss": 0.103,
"reward": 1.0665179088711738,
"reward_std": 0.47449378967285155,
"rewards/accuracy_reward": 0.5200893059372902,
"rewards/format_reward": 0.5464285977184773,
"step": 445
},
{
"clip_ratio": 0.0,
"completion_length": 522.9732391357422,
"epoch": 3.8443496801705757,
"grad_norm": 15.007638931274414,
"kl": 7.5888671875,
"learning_rate": 4.50367993589107e-09,
"loss": 0.1024,
"reward": 1.1848214849829675,
"reward_std": 0.4590866263955832,
"rewards/accuracy_reward": 0.5825893096625805,
"rewards/format_reward": 0.6022321686148644,
"step": 450
},
{
"clip_ratio": 0.0,
"completion_length": 539.4236862182618,
"epoch": 3.886993603411514,
"grad_norm": 11.34084701538086,
"kl": 8.90546875,
"learning_rate": 2.3508323337321224e-09,
"loss": 0.1158,
"reward": 1.0486607655882836,
"reward_std": 0.4737320654094219,
"rewards/accuracy_reward": 0.5111607365310192,
"rewards/format_reward": 0.5375000245869159,
"step": 455
},
{
"clip_ratio": 0.0,
"completion_length": 534.0035942077636,
"epoch": 3.929637526652452,
"grad_norm": 18.918825149536133,
"kl": 7.205078125,
"learning_rate": 8.906899533517864e-10,
"loss": 0.0902,
"reward": 1.1607143417000771,
"reward_std": 0.46907868683338166,
"rewards/accuracy_reward": 0.5776785992085933,
"rewards/format_reward": 0.5830357417464256,
"step": 460
},
{
"epoch": 3.9637526652452024,
"eval_clip_ratio": 0.0,
"eval_completion_length": 529.8555946955605,
"eval_kl": 8.018105158730158,
"eval_loss": 0.10982762277126312,
"eval_reward": 1.0456349707785106,
"eval_reward_std": 0.43395746865915874,
"eval_rewards/accuracy_reward": 0.48384356072970797,
"eval_rewards/format_reward": 0.561791407683539,
"eval_runtime": 659.4598,
"eval_samples_per_second": 0.758,
"eval_steps_per_second": 0.014,
"step": 464
},
{
"clip_ratio": 0.0,
"completion_length": 526.7964500427246,
"epoch": 3.9722814498933903,
"grad_norm": 13.803497314453125,
"kl": 7.684765625,
"learning_rate": 1.252852471625987e-10,
"loss": 0.0773,
"reward": 1.129464340209961,
"reward_std": 0.445505191385746,
"rewards/accuracy_reward": 0.5602678842842579,
"rewards/format_reward": 0.5691964529454708,
"step": 465
},
{
"clip_ratio": 0.0,
"completion_length": 529.8702189127604,
"epoch": 3.997867803837953,
"kl": 8.166666666666666,
"reward": 1.1056548183163006,
"reward_std": 0.48472560321291286,
"rewards/accuracy_reward": 0.5610119315485159,
"rewards/format_reward": 0.5446428805589676,
"step": 468,
"total_flos": 0.0,
"train_loss": 0.07395310898940279,
"train_runtime": 53156.2352,
"train_samples_per_second": 0.564,
"train_steps_per_second": 0.009
}
],
"logging_steps": 5,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}