Qwen-2.5-3B-Simple-RL / trainer_state.json
JeffP111's picture
Model save
f087f56 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 40,
"global_step": 201,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"completion_length": 547.1242248535157,
"epoch": 0.07462686567164178,
"grad_norm": 1.2733972072601318,
"kl": 0.00047130584716796874,
"learning_rate": 7.142857142857143e-07,
"loss": 0.0,
"reward": 0.5517856992781163,
"reward_std": 0.27616733238101004,
"rewards/accuracy_reward": 0.3724489748477936,
"rewards/format_reward": 0.17933673106599599,
"step": 5,
"success_rate": 0.37244899198412895
},
{
"completion_length": 460.88060302734374,
"epoch": 0.14925373134328357,
"grad_norm": 4.886548042297363,
"kl": 0.024450111389160156,
"learning_rate": 1.4285714285714286e-06,
"loss": 0.001,
"reward": 0.6818877421319485,
"reward_std": 0.2652753606438637,
"rewards/accuracy_reward": 0.3150510139763355,
"rewards/format_reward": 0.36683672592043876,
"step": 10,
"success_rate": 0.3150510285049677
},
{
"completion_length": 488.90407409667966,
"epoch": 0.22388059701492538,
"grad_norm": 0.48002028465270996,
"kl": 0.018406105041503907,
"learning_rate": 2.142857142857143e-06,
"loss": 0.0007,
"reward": 0.6711734563112259,
"reward_std": 0.24843686558306216,
"rewards/accuracy_reward": 0.3446428500115871,
"rewards/format_reward": 0.32653060741722584,
"step": 15,
"success_rate": 0.34464287348091605
},
{
"completion_length": 490.06836166381834,
"epoch": 0.29850746268656714,
"grad_norm": 0.5253956317901611,
"kl": 0.009164047241210938,
"learning_rate": 2.8571428571428573e-06,
"loss": 0.0004,
"reward": 0.7178571283817291,
"reward_std": 0.26088091973215344,
"rewards/accuracy_reward": 0.40714284889400004,
"rewards/format_reward": 0.31071427930146456,
"step": 20,
"success_rate": 0.40714286640286446
},
{
"completion_length": 342.0505027770996,
"epoch": 0.373134328358209,
"grad_norm": 0.4979659616947174,
"kl": 0.0306243896484375,
"learning_rate": 2.9963460753897363e-06,
"loss": 0.0012,
"reward": 0.9278060972690583,
"reward_std": 0.24459648579359056,
"rewards/accuracy_reward": 0.18979591503739357,
"rewards/format_reward": 0.7380101881921292,
"step": 25,
"success_rate": 0.1897959278896451
},
{
"completion_length": 323.44387130737306,
"epoch": 0.44776119402985076,
"grad_norm": 0.9019961357116699,
"kl": 0.0350311279296875,
"learning_rate": 2.981532510892707e-06,
"loss": 0.0014,
"reward": 0.9882652848958969,
"reward_std": 0.2664802324026823,
"rewards/accuracy_reward": 0.15943877436220646,
"rewards/format_reward": 0.8288265079259872,
"step": 30,
"success_rate": 0.15943878153339028
},
{
"completion_length": 281.84744491577146,
"epoch": 0.5223880597014925,
"grad_norm": 0.42541709542274475,
"kl": 0.046868896484375,
"learning_rate": 2.9554435894139947e-06,
"loss": 0.0019,
"reward": 1.116326493024826,
"reward_std": 0.28026170562952757,
"rewards/accuracy_reward": 0.19030611962080002,
"rewards/format_reward": 0.9260203883051872,
"step": 35,
"success_rate": 0.19030613116919995
},
{
"completion_length": 280.4670871734619,
"epoch": 0.5970149253731343,
"grad_norm": 0.786392331123352,
"kl": 0.054290771484375,
"learning_rate": 2.9182778633989753e-06,
"loss": 0.0022,
"reward": 1.1989795714616776,
"reward_std": 0.3117813114076853,
"rewards/accuracy_reward": 0.24183672983199359,
"rewards/format_reward": 0.957142835855484,
"step": 40,
"success_rate": 0.24183674417436124
},
{
"epoch": 0.5970149253731343,
"eval_completion_length": 256.7967397167696,
"eval_kl": 0.06267662687674581,
"eval_loss": 0.002505573211237788,
"eval_reward": 1.2279956347449532,
"eval_reward_std": 0.283521942211596,
"eval_rewards/accuracy_reward": 0.2540759276182458,
"eval_rewards/format_reward": 0.9739197237864553,
"eval_runtime": 5454.0827,
"eval_samples_per_second": 0.917,
"eval_steps_per_second": 0.066,
"eval_success_rate": 0.2544749851166869,
"step": 40
},
{
"completion_length": 237.62269897460936,
"epoch": 0.6716417910447762,
"grad_norm": 0.7149950861930847,
"kl": 0.0769775390625,
"learning_rate": 2.8703181864639013e-06,
"loss": 0.0031,
"reward": 1.2594387471675872,
"reward_std": 0.28514467738568783,
"rewards/accuracy_reward": 0.2785714233294129,
"rewards/format_reward": 0.9808673411607742,
"step": 45,
"success_rate": 0.2785714427009225
},
{
"completion_length": 309.43800468444823,
"epoch": 0.746268656716418,
"grad_norm": 2.802035093307495,
"kl": 0.081781005859375,
"learning_rate": 2.811929560709094e-06,
"loss": 0.0033,
"reward": 1.3290816009044648,
"reward_std": 0.3365088116377592,
"rewards/accuracy_reward": 0.3614795859903097,
"rewards/format_reward": 0.9676020219922066,
"step": 50,
"success_rate": 0.3596938900649548
},
{
"completion_length": 303.08392372131345,
"epoch": 0.8208955223880597,
"grad_norm": 0.3051517605781555,
"kl": 0.075457763671875,
"learning_rate": 2.7435563588325624e-06,
"loss": 0.003,
"reward": 1.3415815979242325,
"reward_std": 0.3468840003013611,
"rewards/accuracy_reward": 0.3770408075302839,
"rewards/format_reward": 0.9645407989621162,
"step": 55,
"success_rate": 0.3752551130950451
},
{
"completion_length": 294.10815811157227,
"epoch": 0.8955223880597015,
"grad_norm": 0.33523619174957275,
"kl": 0.097998046875,
"learning_rate": 2.6657189421854562e-06,
"loss": 0.0039,
"reward": 1.3653060972690583,
"reward_std": 0.33864556923508643,
"rewards/accuracy_reward": 0.38443876840174196,
"rewards/format_reward": 0.9808673366904259,
"step": 60,
"success_rate": 0.3844387885183096
},
{
"completion_length": 304.046932220459,
"epoch": 0.9701492537313433,
"grad_norm": 0.3474382162094116,
"kl": 0.096484375,
"learning_rate": 2.5790097005079765e-06,
"loss": 0.0039,
"reward": 1.4303571164608002,
"reward_std": 0.32698816806077957,
"rewards/accuracy_reward": 0.454846927523613,
"rewards/format_reward": 0.9755101934075355,
"step": 65,
"success_rate": 0.4548469439148903
},
{
"completion_length": 348.61254768371583,
"epoch": 1.044776119402985,
"grad_norm": 0.28319722414016724,
"kl": 0.0887451171875,
"learning_rate": 2.484088543485761e-06,
"loss": 0.0035,
"reward": 1.4494387701153755,
"reward_std": 0.34128306433558464,
"rewards/accuracy_reward": 0.48418366685509684,
"rewards/format_reward": 0.9652550905942917,
"step": 70,
"success_rate": 0.5017857238650322
},
{
"completion_length": 374.4933601379395,
"epoch": 1.1194029850746268,
"grad_norm": 0.27081194519996643,
"kl": 25395.28706665039,
"learning_rate": 2.3816778784387097e-06,
"loss": 1014.577,
"reward": 1.4698979407548904,
"reward_std": 0.3440066184848547,
"rewards/accuracy_reward": 0.5155612148344517,
"rewards/format_reward": 0.9543367087841034,
"step": 75,
"success_rate": 0.5155612342059612
},
{
"completion_length": 361.42677841186526,
"epoch": 1.1940298507462686,
"grad_norm": 0.31254705786705017,
"kl": 0.087762451171875,
"learning_rate": 2.2725571123650813e-06,
"loss": 0.0035,
"reward": 1.5135203808546067,
"reward_std": 0.35050575956702235,
"rewards/accuracy_reward": 0.566326516866684,
"rewards/format_reward": 0.947193855047226,
"step": 80,
"success_rate": 0.5645408242940902
},
{
"epoch": 1.1940298507462686,
"eval_completion_length": 375.1230107738985,
"eval_kl": 0.2485719819308659,
"eval_loss": 0.009942025877535343,
"eval_reward": 1.4252650491352188,
"eval_reward_std": 0.35333527712016133,
"eval_rewards/accuracy_reward": 0.4842093172769307,
"eval_rewards/format_reward": 0.9410557287365364,
"eval_runtime": 6454.9469,
"eval_samples_per_second": 0.775,
"eval_steps_per_second": 0.055,
"eval_success_rate": 0.485406461291473,
"step": 80
},
{
"completion_length": 378.264786529541,
"epoch": 1.2686567164179103,
"grad_norm": 0.25367024540901184,
"kl": 0.08892822265625,
"learning_rate": 2.157556720183616e-06,
"loss": 0.0036,
"reward": 1.4543367117643355,
"reward_std": 0.3721345618367195,
"rewards/accuracy_reward": 0.5137754999101162,
"rewards/format_reward": 0.9405611962080002,
"step": 85,
"success_rate": 0.5137755192816258
},
{
"completion_length": 377.1609634399414,
"epoch": 1.3432835820895521,
"grad_norm": 0.2705287039279938,
"kl": 0.30084228515625,
"learning_rate": 2.03755192431795e-06,
"loss": 0.012,
"reward": 1.5015305757522583,
"reward_std": 0.34179753065109253,
"rewards/accuracy_reward": 0.5466836676001549,
"rewards/format_reward": 0.954846915602684,
"step": 90,
"success_rate": 0.5466836795210839
},
{
"completion_length": 395.2339202880859,
"epoch": 1.417910447761194,
"grad_norm": 0.2481299340724945,
"kl": 0.08546142578125,
"learning_rate": 1.9134560337254986e-06,
"loss": 0.0034,
"reward": 1.5122448593378066,
"reward_std": 0.32668328285217285,
"rewards/accuracy_reward": 0.5599489718675613,
"rewards/format_reward": 0.9522958919405937,
"step": 95,
"success_rate": 0.559948992729187
},
{
"completion_length": 404.2803482055664,
"epoch": 1.4925373134328357,
"grad_norm": 0.22587481141090393,
"kl": 0.08536376953125,
"learning_rate": 1.7862134930648174e-06,
"loss": 0.0034,
"reward": 1.5109693586826325,
"reward_std": 0.3110779445618391,
"rewards/accuracy_reward": 0.5604591690003872,
"rewards/format_reward": 0.9505101799964905,
"step": 100,
"success_rate": 0.560459190607071
},
{
"completion_length": 446.25024871826173,
"epoch": 1.5671641791044775,
"grad_norm": 0.2071794718503952,
"kl": 0.077789306640625,
"learning_rate": 1.6567926949014804e-06,
"loss": 0.0031,
"reward": 1.521683645248413,
"reward_std": 0.32725758776068686,
"rewards/accuracy_reward": 0.5826530493795872,
"rewards/format_reward": 0.9390305906534195,
"step": 105,
"success_rate": 0.5826530683785677
},
{
"completion_length": 444.8515205383301,
"epoch": 1.6417910447761193,
"grad_norm": 0.2457750141620636,
"kl": 0.081201171875,
"learning_rate": 1.5261786096559255e-06,
"loss": 0.0032,
"reward": 1.5280611962080002,
"reward_std": 0.33711482025682926,
"rewards/accuracy_reward": 0.5818877436220646,
"rewards/format_reward": 0.9461734414100647,
"step": 110,
"success_rate": 0.5818877592682838
},
{
"completion_length": 428.1119789123535,
"epoch": 1.716417910447761,
"grad_norm": 0.8355852365493774,
"kl": 0.08641357421875,
"learning_rate": 1.395365289383812e-06,
"loss": 0.0035,
"reward": 1.5198979318141936,
"reward_std": 0.33952501937747004,
"rewards/accuracy_reward": 0.5668367221951485,
"rewards/format_reward": 0.9530612006783485,
"step": 115,
"success_rate": 0.5668367445468903
},
{
"completion_length": 416.2086639404297,
"epoch": 1.7910447761194028,
"grad_norm": 0.24691729247570038,
"kl": 0.0869140625,
"learning_rate": 1.2653483024396534e-06,
"loss": 0.0035,
"reward": 1.5033163011074067,
"reward_std": 0.33274373821914194,
"rewards/accuracy_reward": 0.5446428425610066,
"rewards/format_reward": 0.9586734384298324,
"step": 120,
"success_rate": 0.5446428678929806
},
{
"epoch": 1.7910447761194028,
"eval_completion_length": 416.68637263974665,
"eval_kl": 0.08460700178945531,
"eval_loss": 0.0033832318149507046,
"eval_reward": 1.4522859107848651,
"eval_reward_std": 0.33377148831190345,
"eval_rewards/accuracy_reward": 0.4971211837739918,
"eval_rewards/format_reward": 0.9551647285509376,
"eval_runtime": 6690.2624,
"eval_samples_per_second": 0.747,
"eval_steps_per_second": 0.054,
"eval_success_rate": 0.49851784963348056,
"step": 120
},
{
"completion_length": 399.90585861206057,
"epoch": 1.8656716417910446,
"grad_norm": 0.2619114816188812,
"kl": 0.08975830078125,
"learning_rate": 1.1371171566004986e-06,
"loss": 0.0036,
"reward": 1.4979591608047484,
"reward_std": 0.32781863324344157,
"rewards/accuracy_reward": 0.5415816225111485,
"rewards/format_reward": 0.9563775300979614,
"step": 125,
"success_rate": 0.5415816411376
},
{
"completion_length": 413.0073921203613,
"epoch": 1.9402985074626866,
"grad_norm": 0.31122443079948425,
"kl": 0.0883056640625,
"learning_rate": 1.0116477683142654e-06,
"loss": 0.0035,
"reward": 1.5224489539861679,
"reward_std": 0.32416500747203825,
"rewards/accuracy_reward": 0.5683673366904258,
"rewards/format_reward": 0.954081603884697,
"step": 130,
"success_rate": 0.568367350846529
},
{
"completion_length": 425.88346633911135,
"epoch": 2.014925373134328,
"grad_norm": 0.20976552367210388,
"kl": 0.080792236328125,
"learning_rate": 8.898950353863e-07,
"loss": 0.0032,
"reward": 1.5138265073299408,
"reward_std": 0.3204653847962618,
"rewards/accuracy_reward": 0.5654081603512168,
"rewards/format_reward": 0.9484183505177498,
"step": 135,
"success_rate": 0.5627551212906837
},
{
"completion_length": 431.6691268920898,
"epoch": 2.08955223880597,
"grad_norm": 0.24750804901123047,
"kl": 0.082379150390625,
"learning_rate": 7.727855696304945e-07,
"loss": 0.0033,
"reward": 1.506122413277626,
"reward_std": 0.32942725978791715,
"rewards/accuracy_reward": 0.5604591719806195,
"rewards/format_reward": 0.9456632405519485,
"step": 140,
"success_rate": 0.5586734853684903
},
{
"completion_length": 438.63060150146487,
"epoch": 2.1641791044776117,
"grad_norm": 0.25727197527885437,
"kl": 0.0776123046875,
"learning_rate": 6.6121064479388e-07,
"loss": 0.0031,
"reward": 1.4864795625209808,
"reward_std": 0.328120681270957,
"rewards/accuracy_reward": 0.5380101919174194,
"rewards/format_reward": 0.9484693706035614,
"step": 145,
"success_rate": 0.5380102179944515
},
{
"completion_length": 403.204328918457,
"epoch": 2.2388059701492535,
"grad_norm": 0.24068014323711395,
"kl": 0.08330078125,
"learning_rate": 5.560194134252441e-07,
"loss": 0.0033,
"reward": 1.5372448682785034,
"reward_std": 0.3228706333786249,
"rewards/accuracy_reward": 0.5795918248593808,
"rewards/format_reward": 0.9576530396938324,
"step": 150,
"success_rate": 0.5795918427407741
},
{
"completion_length": 407.8196342468262,
"epoch": 2.3134328358208958,
"grad_norm": 0.2765465974807739,
"kl": 0.0848876953125,
"learning_rate": 4.5801244431150397e-07,
"loss": 0.0034,
"reward": 1.5224489510059356,
"reward_std": 0.313472930341959,
"rewards/accuracy_reward": 0.5660714194178581,
"rewards/format_reward": 0.9563775330781936,
"step": 155,
"success_rate": 0.5660714313387871
},
{
"completion_length": 392.9127471923828,
"epoch": 2.388059701492537,
"grad_norm": 0.28657880425453186,
"kl": 0.09124755859375,
"learning_rate": 3.67935629665842e-07,
"loss": 0.0036,
"reward": 1.561479565501213,
"reward_std": 0.3107341818511486,
"rewards/accuracy_reward": 0.5979591712355614,
"rewards/format_reward": 0.9635203838348388,
"step": 160,
"success_rate": 0.5979591906070709
},
{
"epoch": 2.388059701492537,
"eval_completion_length": 415.15222543045127,
"eval_kl": 0.08421828626920391,
"eval_loss": 0.003366992576047778,
"eval_reward": 1.4807034238090728,
"eval_reward_std": 0.3203208099780136,
"eval_rewards/accuracy_reward": 0.5245125879788531,
"eval_rewards/format_reward": 0.956190837162167,
"eval_runtime": 6690.7517,
"eval_samples_per_second": 0.747,
"eval_steps_per_second": 0.054,
"eval_success_rate": 0.5258094906890193,
"step": 160
},
{
"completion_length": 432.94667587280276,
"epoch": 2.4626865671641793,
"grad_norm": 0.23113694787025452,
"kl": 0.0843994140625,
"learning_rate": 2.86474508437579e-07,
"loss": 0.0034,
"reward": 1.4971938461065293,
"reward_std": 0.3238256432116032,
"rewards/accuracy_reward": 0.5499999865889549,
"rewards/format_reward": 0.9471938535571098,
"step": 165,
"success_rate": 0.5499999992549419
},
{
"completion_length": 414.23238906860354,
"epoch": 2.5373134328358207,
"grad_norm": 0.2992941439151764,
"kl": 0.087384033203125,
"learning_rate": 2.1424904894683168e-07,
"loss": 0.0035,
"reward": 1.5561224222183228,
"reward_std": 0.3134998256340623,
"rewards/accuracy_reward": 0.600255086272955,
"rewards/format_reward": 0.9558673143386841,
"step": 170,
"success_rate": 0.5984694063663483
},
{
"completion_length": 438.1839202880859,
"epoch": 2.611940298507463,
"grad_norm": 0.2166847288608551,
"kl": 0.085552978515625,
"learning_rate": 1.5180893055124977e-07,
"loss": 0.0034,
"reward": 1.5079081356525421,
"reward_std": 0.33949046954512596,
"rewards/accuracy_reward": 0.5670918263494968,
"rewards/format_reward": 0.9408163040876388,
"step": 175,
"success_rate": 0.5670918501913548
},
{
"completion_length": 424.2168281555176,
"epoch": 2.6865671641791042,
"grad_norm": 0.21476835012435913,
"kl": 0.08350830078125,
"learning_rate": 9.962936025419756e-08,
"loss": 0.0033,
"reward": 1.5306122213602067,
"reward_std": 0.31606815941631794,
"rewards/accuracy_reward": 0.5747448861598968,
"rewards/format_reward": 0.9558673217892647,
"step": 180,
"success_rate": 0.5747449062764645
},
{
"completion_length": 418.3198890686035,
"epoch": 2.7611940298507465,
"grad_norm": 0.2472531795501709,
"kl": 0.081756591796875,
"learning_rate": 5.810745609252166e-08,
"loss": 0.0033,
"reward": 1.5512754768133163,
"reward_std": 0.29889940060675146,
"rewards/accuracy_reward": 0.5910714209079743,
"rewards/format_reward": 0.9602040618658065,
"step": 185,
"success_rate": 0.5910714328289032
},
{
"completion_length": 431.39743728637694,
"epoch": 2.835820895522388,
"grad_norm": 0.2179958075284958,
"kl": 0.0802490234375,
"learning_rate": 2.7559224828504036e-08,
"loss": 0.0032,
"reward": 1.5283162951469422,
"reward_std": 0.3320562928915024,
"rewards/accuracy_reward": 0.5762754924595356,
"rewards/format_reward": 0.9520407900214195,
"step": 190,
"success_rate": 0.5762755177915097
},
{
"completion_length": 427.4851936340332,
"epoch": 2.91044776119403,
"grad_norm": 0.235799178481102,
"kl": 0.0825927734375,
"learning_rate": 8.217156947590065e-09,
"loss": 0.0033,
"reward": 1.5198979258537293,
"reward_std": 0.30454444214701654,
"rewards/accuracy_reward": 0.5729591682553291,
"rewards/format_reward": 0.9469387531280518,
"step": 195,
"success_rate": 0.5729591898620129
},
{
"completion_length": 437.469376373291,
"epoch": 2.9850746268656714,
"grad_norm": 0.22140200436115265,
"kl": 0.079302978515625,
"learning_rate": 2.2845726541309565e-10,
"loss": 0.0032,
"reward": 1.5232142567634583,
"reward_std": 0.32379055954515934,
"rewards/accuracy_reward": 0.5744897864758969,
"rewards/format_reward": 0.9487244680523872,
"step": 200,
"success_rate": 0.5744898058474064
},
{
"epoch": 2.9850746268656714,
"eval_completion_length": 427.15005846929284,
"eval_kl": 0.10578449611557263,
"eval_loss": 0.004237522836774588,
"eval_reward": 1.480674920468357,
"eval_reward_std": 0.32418302708830915,
"eval_rewards/accuracy_reward": 0.528075465443414,
"eval_rewards/format_reward": 0.9525994578553312,
"eval_runtime": 6859.1519,
"eval_samples_per_second": 0.729,
"eval_steps_per_second": 0.052,
"eval_success_rate": 0.5293723670505611,
"step": 200
},
{
"completion_length": 396.71250343322754,
"epoch": 3.0,
"kl": 0.073760986328125,
"reward": 1.6749999970197678,
"reward_std": 0.246222835034132,
"rewards/accuracy_reward": 0.6750000044703484,
"rewards/format_reward": 1.0,
"step": 201,
"success_rate": 0.5982142873108387,
"total_flos": 0.0,
"train_loss": 25.241294363718474,
"train_runtime": 70314.3192,
"train_samples_per_second": 0.32,
"train_steps_per_second": 0.003
}
],
"logging_steps": 5,
"max_steps": 201,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}