{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 100,
"global_step": 478,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0041841004184100415,
"grad_norm": 41.53185103984857,
"learning_rate": 4.1666666666666667e-07,
"loss": 0.4749,
"step": 1
},
{
"epoch": 0.02092050209205021,
"grad_norm": 38.30127983273649,
"learning_rate": 2.0833333333333334e-06,
"loss": 0.45,
"step": 5
},
{
"epoch": 0.04184100418410042,
"grad_norm": 18.219981518260038,
"learning_rate": 4.166666666666667e-06,
"loss": 0.4009,
"step": 10
},
{
"epoch": 0.06276150627615062,
"grad_norm": 19.993422812493172,
"learning_rate": 6.25e-06,
"loss": 0.2968,
"step": 15
},
{
"epoch": 0.08368200836820083,
"grad_norm": 2.728691896454265,
"learning_rate": 8.333333333333334e-06,
"loss": 0.2389,
"step": 20
},
{
"epoch": 0.10460251046025104,
"grad_norm": 1.885623294377404,
"learning_rate": 1.0416666666666668e-05,
"loss": 0.2226,
"step": 25
},
{
"epoch": 0.12552301255230125,
"grad_norm": 1.884968683506505,
"learning_rate": 1.25e-05,
"loss": 0.2151,
"step": 30
},
{
"epoch": 0.14644351464435146,
"grad_norm": 1.7305199472110575,
"learning_rate": 1.4583333333333333e-05,
"loss": 0.2033,
"step": 35
},
{
"epoch": 0.16736401673640167,
"grad_norm": 1.624704502269397,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.1962,
"step": 40
},
{
"epoch": 0.18828451882845187,
"grad_norm": 1.0713867108877762,
"learning_rate": 1.8750000000000002e-05,
"loss": 0.1981,
"step": 45
},
{
"epoch": 0.20920502092050208,
"grad_norm": 0.9779064413853102,
"learning_rate": 1.9998932457674904e-05,
"loss": 0.1765,
"step": 50
},
{
"epoch": 0.2301255230125523,
"grad_norm": 0.9775409995700168,
"learning_rate": 1.9986925223989665e-05,
"loss": 0.2064,
"step": 55
},
{
"epoch": 0.2510460251046025,
"grad_norm": 0.9714894285452347,
"learning_rate": 1.996159240342547e-05,
"loss": 0.2019,
"step": 60
},
{
"epoch": 0.2719665271966527,
"grad_norm": 1.0717005150854204,
"learning_rate": 1.9922967797647357e-05,
"loss": 0.2068,
"step": 65
},
{
"epoch": 0.2928870292887029,
"grad_norm": 0.9560722147267008,
"learning_rate": 1.9871102943592717e-05,
"loss": 0.2068,
"step": 70
},
{
"epoch": 0.3138075313807531,
"grad_norm": 1.0251690501367547,
"learning_rate": 1.9806067044705375e-05,
"loss": 0.206,
"step": 75
},
{
"epoch": 0.33472803347280333,
"grad_norm": 0.8930867203257072,
"learning_rate": 1.9727946878597193e-05,
"loss": 0.2133,
"step": 80
},
{
"epoch": 0.35564853556485354,
"grad_norm": 0.978887003337326,
"learning_rate": 1.963684668126046e-05,
"loss": 0.2138,
"step": 85
},
{
"epoch": 0.37656903765690375,
"grad_norm": 0.9045993049904391,
"learning_rate": 1.9532888007985408e-05,
"loss": 0.2101,
"step": 90
},
{
"epoch": 0.39748953974895396,
"grad_norm": 0.8416364765226602,
"learning_rate": 1.9416209571168648e-05,
"loss": 0.1986,
"step": 95
},
{
"epoch": 0.41841004184100417,
"grad_norm": 0.889013264056005,
"learning_rate": 1.9286967055228744e-05,
"loss": 0.2025,
"step": 100
},
{
"epoch": 0.41841004184100417,
"eval_loss": 0.2196013182401657,
"eval_runtime": 4.704,
"eval_samples_per_second": 63.776,
"eval_steps_per_second": 2.126,
"step": 100
},
{
"epoch": 0.4393305439330544,
"grad_norm": 0.8700387414372234,
"learning_rate": 1.9145332908875984e-05,
"loss": 0.2135,
"step": 105
},
{
"epoch": 0.4602510460251046,
"grad_norm": 0.9841056505818219,
"learning_rate": 1.89914961150135e-05,
"loss": 0.2236,
"step": 110
},
{
"epoch": 0.4811715481171548,
"grad_norm": 0.9183128471339689,
"learning_rate": 1.8825661938576784e-05,
"loss": 0.1981,
"step": 115
},
{
"epoch": 0.502092050209205,
"grad_norm": 0.9064275588483083,
"learning_rate": 1.864805165264799e-05,
"loss": 0.2112,
"step": 120
},
{
"epoch": 0.5230125523012552,
"grad_norm": 0.9277328098243097,
"learning_rate": 1.8458902243210558e-05,
"loss": 0.2108,
"step": 125
},
{
"epoch": 0.5439330543933054,
"grad_norm": 0.8390131309861991,
"learning_rate": 1.8258466092938042e-05,
"loss": 0.2036,
"step": 130
},
{
"epoch": 0.5648535564853556,
"grad_norm": 0.8475693099997708,
"learning_rate": 1.8047010644439074e-05,
"loss": 0.2055,
"step": 135
},
{
"epoch": 0.5857740585774058,
"grad_norm": 0.8174296090345491,
"learning_rate": 1.7824818043407828e-05,
"loss": 0.1965,
"step": 140
},
{
"epoch": 0.606694560669456,
"grad_norm": 0.9073247456446722,
"learning_rate": 1.75921847621561e-05,
"loss": 0.2189,
"step": 145
},
{
"epoch": 0.6276150627615062,
"grad_norm": 0.8921810755026803,
"learning_rate": 1.7349421204029343e-05,
"loss": 0.2083,
"step": 150
},
{
"epoch": 0.6485355648535565,
"grad_norm": 0.8941222203338465,
"learning_rate": 1.7096851289234448e-05,
"loss": 0.1969,
"step": 155
},
{
"epoch": 0.6694560669456067,
"grad_norm": 0.804546952924198,
"learning_rate": 1.6834812022632e-05,
"loss": 0.1967,
"step": 160
},
{
"epoch": 0.6903765690376569,
"grad_norm": 0.9037696185427722,
"learning_rate": 1.656365304406953e-05,
"loss": 0.2203,
"step": 165
},
{
"epoch": 0.7112970711297071,
"grad_norm": 0.8167977368613387,
"learning_rate": 1.6283736161855995e-05,
"loss": 0.2086,
"step": 170
},
{
"epoch": 0.7322175732217573,
"grad_norm": 0.9715422112753858,
"learning_rate": 1.5995434869999723e-05,
"loss": 0.2079,
"step": 175
},
{
"epoch": 0.7531380753138075,
"grad_norm": 0.8466883455587352,
"learning_rate": 1.5699133849854164e-05,
"loss": 0.2023,
"step": 180
},
{
"epoch": 0.7740585774058577,
"grad_norm": 0.8429311671633376,
"learning_rate": 1.5395228456836298e-05,
"loss": 0.2133,
"step": 185
},
{
"epoch": 0.7949790794979079,
"grad_norm": 0.7670187426056939,
"learning_rate": 1.5084124192902612e-05,
"loss": 0.2111,
"step": 190
},
{
"epoch": 0.8158995815899581,
"grad_norm": 0.8839232649853831,
"learning_rate": 1.4766236165486526e-05,
"loss": 0.2066,
"step": 195
},
{
"epoch": 0.8368200836820083,
"grad_norm": 0.8327080522078059,
"learning_rate": 1.4441988533619182e-05,
"loss": 0.2033,
"step": 200
},
{
"epoch": 0.8368200836820083,
"eval_loss": 0.21050043404102325,
"eval_runtime": 4.6998,
"eval_samples_per_second": 63.833,
"eval_steps_per_second": 2.128,
"step": 200
},
{
"epoch": 0.8577405857740585,
"grad_norm": 0.882685513711128,
"learning_rate": 1.4111813941972672e-05,
"loss": 0.2115,
"step": 205
},
{
"epoch": 0.8786610878661087,
"grad_norm": 0.7869283802680849,
"learning_rate": 1.3776152943580846e-05,
"loss": 0.2032,
"step": 210
},
{
"epoch": 0.899581589958159,
"grad_norm": 0.9323518972909932,
"learning_rate": 1.3435453412007949e-05,
"loss": 0.2065,
"step": 215
},
{
"epoch": 0.9205020920502092,
"grad_norm": 0.8687693359849569,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.1992,
"step": 220
},
{
"epoch": 0.9414225941422594,
"grad_norm": 0.788589875497053,
"learning_rate": 1.2740763251662585e-05,
"loss": 0.2088,
"step": 225
},
{
"epoch": 0.9623430962343096,
"grad_norm": 0.8338643090434609,
"learning_rate": 1.2387699550235419e-05,
"loss": 0.2134,
"step": 230
},
{
"epoch": 0.9832635983263598,
"grad_norm": 0.8945112364204731,
"learning_rate": 1.2031449933515625e-05,
"loss": 0.2143,
"step": 235
},
{
"epoch": 1.00418410041841,
"grad_norm": 0.7637171354512544,
"learning_rate": 1.1672489746527979e-05,
"loss": 0.1895,
"step": 240
},
{
"epoch": 1.0251046025104602,
"grad_norm": 0.7512141249987404,
"learning_rate": 1.1311297951020028e-05,
"loss": 0.11,
"step": 245
},
{
"epoch": 1.0460251046025104,
"grad_norm": 0.831833076591956,
"learning_rate": 1.0948356486381829e-05,
"loss": 0.1,
"step": 250
},
{
"epoch": 1.0669456066945606,
"grad_norm": 0.7206258555841986,
"learning_rate": 1.0584149626592662e-05,
"loss": 0.1037,
"step": 255
},
{
"epoch": 1.0878661087866108,
"grad_norm": 0.7085622045220055,
"learning_rate": 1.0219163334052682e-05,
"loss": 0.0973,
"step": 260
},
{
"epoch": 1.108786610878661,
"grad_norm": 0.7831952386854,
"learning_rate": 9.853884611161709e-06,
"loss": 0.0973,
"step": 265
},
{
"epoch": 1.1297071129707112,
"grad_norm": 0.8240407306584351,
"learning_rate": 9.48880085051033e-06,
"loss": 0.1013,
"step": 270
},
{
"epoch": 1.1506276150627615,
"grad_norm": 0.6482126525278735,
"learning_rate": 9.124399184550377e-06,
"loss": 0.0918,
"step": 275
},
{
"epoch": 1.1715481171548117,
"grad_norm": 0.7359956824319477,
"learning_rate": 8.76116583561252e-06,
"loss": 0.1009,
"step": 280
},
{
"epoch": 1.1924686192468619,
"grad_norm": 0.7488157028333433,
"learning_rate": 8.399585467138215e-06,
"loss": 0.104,
"step": 285
},
{
"epoch": 1.213389121338912,
"grad_norm": 0.9216117075330189,
"learning_rate": 8.040140536991688e-06,
"loss": 0.1088,
"step": 290
},
{
"epoch": 1.2343096234309623,
"grad_norm": 0.7078309437741438,
"learning_rate": 7.683310653714857e-06,
"loss": 0.0961,
"step": 295
},
{
"epoch": 1.2552301255230125,
"grad_norm": 0.6400397127293158,
"learning_rate": 7.329571936584072e-06,
"loss": 0.0953,
"step": 300
},
{
"epoch": 1.2552301255230125,
"eval_loss": 0.21791227161884308,
"eval_runtime": 4.7021,
"eval_samples_per_second": 63.801,
"eval_steps_per_second": 2.127,
"step": 300
},
{
"epoch": 1.2761506276150627,
"grad_norm": 0.7946545562826351,
"learning_rate": 6.979396380322621e-06,
"loss": 0.1012,
"step": 305
},
{
"epoch": 1.297071129707113,
"grad_norm": 0.8682581025303368,
"learning_rate": 6.63325122531663e-06,
"loss": 0.0969,
"step": 310
},
{
"epoch": 1.3179916317991631,
"grad_norm": 0.6258337390507666,
"learning_rate": 6.291598334174685e-06,
"loss": 0.0882,
"step": 315
},
{
"epoch": 1.3389121338912133,
"grad_norm": 0.6131224054121029,
"learning_rate": 5.954893575463064e-06,
"loss": 0.0944,
"step": 320
},
{
"epoch": 1.3598326359832635,
"grad_norm": 0.7854353505666302,
"learning_rate": 5.623586215438813e-06,
"loss": 0.0958,
"step": 325
},
{
"epoch": 1.3807531380753137,
"grad_norm": 0.6627539948719601,
"learning_rate": 5.298118318592316e-06,
"loss": 0.0986,
"step": 330
},
{
"epoch": 1.401673640167364,
"grad_norm": 0.6727466516033451,
"learning_rate": 4.978924157799208e-06,
"loss": 0.0871,
"step": 335
},
{
"epoch": 1.4225941422594142,
"grad_norm": 0.837001485916376,
"learning_rate": 4.666429634868651e-06,
"loss": 0.0919,
"step": 340
},
{
"epoch": 1.4435146443514644,
"grad_norm": 0.6312831436608192,
"learning_rate": 4.361051712261173e-06,
"loss": 0.0956,
"step": 345
},
{
"epoch": 1.4644351464435146,
"grad_norm": 0.7493453791258211,
"learning_rate": 4.063197856734295e-06,
"loss": 0.0973,
"step": 350
},
{
"epoch": 1.4853556485355648,
"grad_norm": 0.6718965888684427,
"learning_rate": 3.773265495658309e-06,
"loss": 0.0935,
"step": 355
},
{
"epoch": 1.506276150627615,
"grad_norm": 0.669199054440991,
"learning_rate": 3.491641486727645e-06,
"loss": 0.0934,
"step": 360
},
{
"epoch": 1.5271966527196654,
"grad_norm": 0.6071433879794146,
"learning_rate": 3.2187016017753714e-06,
"loss": 0.0856,
"step": 365
},
{
"epoch": 1.5481171548117154,
"grad_norm": 0.657064753562051,
"learning_rate": 2.954810025379633e-06,
"loss": 0.0946,
"step": 370
},
{
"epoch": 1.5690376569037658,
"grad_norm": 0.7108276426518368,
"learning_rate": 2.700318868930977e-06,
"loss": 0.0908,
"step": 375
},
{
"epoch": 1.5899581589958158,
"grad_norm": 0.6801812109704454,
"learning_rate": 2.455567700808974e-06,
"loss": 0.0867,
"step": 380
},
{
"epoch": 1.6108786610878663,
"grad_norm": 0.7675050726089606,
"learning_rate": 2.2208830932950175e-06,
"loss": 0.0947,
"step": 385
},
{
"epoch": 1.6317991631799162,
"grad_norm": 0.6895822123846358,
"learning_rate": 1.996578186825876e-06,
"loss": 0.0897,
"step": 390
},
{
"epoch": 1.6527196652719667,
"grad_norm": 0.7665469889085529,
"learning_rate": 1.7829522721693738e-06,
"loss": 0.0942,
"step": 395
},
{
"epoch": 1.6736401673640167,
"grad_norm": 0.7844644313627527,
"learning_rate": 1.5802903910797584e-06,
"loss": 0.0963,
"step": 400
},
{
"epoch": 1.6736401673640167,
"eval_loss": 0.21011365950107574,
"eval_runtime": 4.7026,
"eval_samples_per_second": 63.794,
"eval_steps_per_second": 2.126,
"step": 400
},
{
"epoch": 1.694560669456067,
"grad_norm": 0.8226521847258956,
"learning_rate": 1.3888629559655497e-06,
"loss": 0.0933,
"step": 405
},
{
"epoch": 1.715481171548117,
"grad_norm": 0.7519758533069896,
"learning_rate": 1.2089253890773789e-06,
"loss": 0.0931,
"step": 410
},
{
"epoch": 1.7364016736401675,
"grad_norm": 0.6928321056405649,
"learning_rate": 1.0407177816972558e-06,
"loss": 0.0921,
"step": 415
},
{
"epoch": 1.7573221757322175,
"grad_norm": 0.7598493011363774,
"learning_rate": 8.844645737839874e-07,
"loss": 0.0836,
"step": 420
},
{
"epoch": 1.778242677824268,
"grad_norm": 0.7883660509749922,
"learning_rate": 7.403742545021986e-07,
"loss": 0.0957,
"step": 425
},
{
"epoch": 1.799163179916318,
"grad_norm": 0.6379425807939199,
"learning_rate": 6.086390840345758e-07,
"loss": 0.0855,
"step": 430
},
{
"epoch": 1.8200836820083683,
"grad_norm": 0.7094514927837703,
"learning_rate": 4.894348370484648e-07,
"loss": 0.0961,
"step": 435
},
{
"epoch": 1.8410041841004183,
"grad_norm": 0.7307395509986969,
"learning_rate": 3.8292056815916965e-07,
"loss": 0.0906,
"step": 440
},
{
"epoch": 1.8619246861924688,
"grad_norm": 0.698574360214073,
"learning_rate": 2.8923839970285473e-07,
"loss": 0.0935,
"step": 445
},
{
"epoch": 1.8828451882845187,
"grad_norm": 0.6707455680501093,
"learning_rate": 2.0851333210225032e-07,
"loss": 0.0883,
"step": 450
},
{
"epoch": 1.9037656903765692,
"grad_norm": 0.7392245964188288,
"learning_rate": 1.408530770781813e-07,
"loss": 0.093,
"step": 455
},
{
"epoch": 1.9246861924686192,
"grad_norm": 0.8076174792599771,
"learning_rate": 8.634791392946429e-08,
"loss": 0.0883,
"step": 460
},
{
"epoch": 1.9456066945606696,
"grad_norm": 0.7903152829386576,
"learning_rate": 4.5070569072952485e-08,
"loss": 0.0896,
"step": 465
},
{
"epoch": 1.9665271966527196,
"grad_norm": 0.7075539268549017,
"learning_rate": 1.7076119004429958e-08,
"loss": 0.0899,
"step": 470
},
{
"epoch": 1.98744769874477,
"grad_norm": 0.6839920154541665,
"learning_rate": 2.401916809872118e-09,
"loss": 0.086,
"step": 475
},
{
"epoch": 2.0,
"step": 478,
"total_flos": 29254447685632.0,
"train_loss": 0.15641464851017278,
"train_runtime": 1194.3029,
"train_samples_per_second": 12.777,
"train_steps_per_second": 0.4
}
],
"logging_steps": 5,
"max_steps": 478,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 29254447685632.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}