{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.8989712476919018,
"eval_steps": 30,
"global_step": 900,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.021102611448166712,
"grad_norm": 5.49099588394165,
"learning_rate": 1.8947368421052634e-05,
"loss": 2.4799,
"step": 10
},
{
"epoch": 0.042205222896333425,
"grad_norm": 0.7057839035987854,
"learning_rate": 4e-05,
"loss": 0.9017,
"step": 20
},
{
"epoch": 0.06330783434450013,
"grad_norm": 0.7793235778808594,
"learning_rate": 6.105263157894737e-05,
"loss": 0.5561,
"step": 30
},
{
"epoch": 0.06330783434450013,
"eval_loss": 0.49612972140312195,
"eval_runtime": 44.7367,
"eval_samples_per_second": 4.471,
"eval_steps_per_second": 4.471,
"step": 30
},
{
"epoch": 0.08441044579266685,
"grad_norm": 0.6479541063308716,
"learning_rate": 8.210526315789474e-05,
"loss": 0.4088,
"step": 40
},
{
"epoch": 0.10551305724083355,
"grad_norm": 0.6190423965454102,
"learning_rate": 0.00010315789473684211,
"loss": 0.3332,
"step": 50
},
{
"epoch": 0.12661566868900026,
"grad_norm": 0.48822861909866333,
"learning_rate": 0.00012421052631578949,
"loss": 0.2752,
"step": 60
},
{
"epoch": 0.12661566868900026,
"eval_loss": 0.25475138425827026,
"eval_runtime": 44.1798,
"eval_samples_per_second": 4.527,
"eval_steps_per_second": 4.527,
"step": 60
},
{
"epoch": 0.14771828013716698,
"grad_norm": 0.3956296443939209,
"learning_rate": 0.00014526315789473686,
"loss": 0.2283,
"step": 70
},
{
"epoch": 0.1688208915853337,
"grad_norm": 0.6851626038551331,
"learning_rate": 0.00016631578947368423,
"loss": 0.2017,
"step": 80
},
{
"epoch": 0.1899235030335004,
"grad_norm": 2.741124153137207,
"learning_rate": 0.0001873684210526316,
"loss": 0.1877,
"step": 90
},
{
"epoch": 0.1899235030335004,
"eval_loss": 0.19585472345352173,
"eval_runtime": 44.05,
"eval_samples_per_second": 4.54,
"eval_steps_per_second": 4.54,
"step": 90
},
{
"epoch": 0.2110261144816671,
"grad_norm": 0.4686296880245209,
"learning_rate": 0.00019998914864890175,
"loss": 0.1862,
"step": 100
},
{
"epoch": 0.23212872592983383,
"grad_norm": 0.5210604071617126,
"learning_rate": 0.0001998670979935533,
"loss": 0.1754,
"step": 110
},
{
"epoch": 0.2532313373780005,
"grad_norm": 0.2621477246284485,
"learning_rate": 0.00019960959858204754,
"loss": 0.1767,
"step": 120
},
{
"epoch": 0.2532313373780005,
"eval_loss": 0.16388529539108276,
"eval_runtime": 44.3724,
"eval_samples_per_second": 4.507,
"eval_steps_per_second": 4.507,
"step": 120
},
{
"epoch": 0.27433394882616724,
"grad_norm": 0.3740817904472351,
"learning_rate": 0.00019921699965828662,
"loss": 0.1666,
"step": 130
},
{
"epoch": 0.29543656027433396,
"grad_norm": 0.5232918858528137,
"learning_rate": 0.00019868983370030348,
"loss": 0.1624,
"step": 140
},
{
"epoch": 0.3165391717225007,
"grad_norm": 0.2019444853067398,
"learning_rate": 0.00019802881569806706,
"loss": 0.1647,
"step": 150
},
{
"epoch": 0.3165391717225007,
"eval_loss": 0.15114803612232208,
"eval_runtime": 44.2313,
"eval_samples_per_second": 4.522,
"eval_steps_per_second": 4.522,
"step": 150
},
{
"epoch": 0.3376417831706674,
"grad_norm": 0.23078428208827972,
"learning_rate": 0.00019723484218374865,
"loss": 0.142,
"step": 160
},
{
"epoch": 0.35874439461883406,
"grad_norm": 0.15399664640426636,
"learning_rate": 0.00019630899001576405,
"loss": 0.1472,
"step": 170
},
{
"epoch": 0.3798470060670008,
"grad_norm": 0.21795395016670227,
"learning_rate": 0.0001952525149182412,
"loss": 0.1511,
"step": 180
},
{
"epoch": 0.3798470060670008,
"eval_loss": 0.14088743925094604,
"eval_runtime": 44.23,
"eval_samples_per_second": 4.522,
"eval_steps_per_second": 4.522,
"step": 180
},
{
"epoch": 0.4009496175151675,
"grad_norm": 0.2159528136253357,
"learning_rate": 0.00019406684977789395,
"loss": 0.1426,
"step": 190
},
{
"epoch": 0.4220522289633342,
"grad_norm": 0.154087632894516,
"learning_rate": 0.00019275360270061217,
"loss": 0.1469,
"step": 200
},
{
"epoch": 0.44315484041150094,
"grad_norm": 0.1741837114095688,
"learning_rate": 0.0001913145548304034,
"loss": 0.139,
"step": 210
},
{
"epoch": 0.44315484041150094,
"eval_loss": 0.13762199878692627,
"eval_runtime": 44.3745,
"eval_samples_per_second": 4.507,
"eval_steps_per_second": 4.507,
"step": 210
},
{
"epoch": 0.46425745185966766,
"grad_norm": 0.14827653765678406,
"learning_rate": 0.00018975165793364503,
"loss": 0.1391,
"step": 220
},
{
"epoch": 0.4853600633078343,
"grad_norm": 0.152383491396904,
"learning_rate": 0.00018806703175192283,
"loss": 0.1418,
"step": 230
},
{
"epoch": 0.506462674756001,
"grad_norm": 0.2013552486896515,
"learning_rate": 0.0001862629611270464,
"loss": 0.1442,
"step": 240
},
{
"epoch": 0.506462674756001,
"eval_loss": 0.136888787150383,
"eval_runtime": 44.4633,
"eval_samples_per_second": 4.498,
"eval_steps_per_second": 4.498,
"step": 240
},
{
"epoch": 0.5275652862041678,
"grad_norm": 0.16066157817840576,
"learning_rate": 0.00018434189290214106,
"loss": 0.1424,
"step": 250
},
{
"epoch": 0.5486678976523345,
"grad_norm": 0.1520007848739624,
"learning_rate": 0.00018230643260301838,
"loss": 0.1608,
"step": 260
},
{
"epoch": 0.5697705091005012,
"grad_norm": 0.1666969507932663,
"learning_rate": 0.00018015934090432757,
"loss": 0.1342,
"step": 270
},
{
"epoch": 0.5697705091005012,
"eval_loss": 0.13377884030342102,
"eval_runtime": 44.2414,
"eval_samples_per_second": 4.521,
"eval_steps_per_second": 4.521,
"step": 270
},
{
"epoch": 0.5908731205486679,
"grad_norm": 0.1368287354707718,
"learning_rate": 0.00017790352988527984,
"loss": 0.1367,
"step": 280
},
{
"epoch": 0.6119757319968346,
"grad_norm": 0.13659009337425232,
"learning_rate": 0.000175542059080024,
"loss": 0.14,
"step": 290
},
{
"epoch": 0.6330783434450014,
"grad_norm": 0.19124054908752441,
"learning_rate": 0.00017307813132803066,
"loss": 0.1403,
"step": 300
},
{
"epoch": 0.6330783434450014,
"eval_loss": 0.13211439549922943,
"eval_runtime": 44.7668,
"eval_samples_per_second": 4.468,
"eval_steps_per_second": 4.468,
"step": 300
},
{
"epoch": 0.6541809548931681,
"grad_norm": 0.14924179017543793,
"learning_rate": 0.0001705150884301129,
"loss": 0.1317,
"step": 310
},
{
"epoch": 0.6752835663413348,
"grad_norm": 0.1274843066930771,
"learning_rate": 0.00016785640661597467,
"loss": 0.1477,
"step": 320
},
{
"epoch": 0.6963861777895014,
"grad_norm": 0.15062034130096436,
"learning_rate": 0.00016510569182943524,
"loss": 0.1367,
"step": 330
},
{
"epoch": 0.6963861777895014,
"eval_loss": 0.13036254048347473,
"eval_runtime": 44.6273,
"eval_samples_per_second": 4.482,
"eval_steps_per_second": 4.482,
"step": 330
},
{
"epoch": 0.7174887892376681,
"grad_norm": 0.13609440624713898,
"learning_rate": 0.00016226667483772275,
"loss": 0.1294,
"step": 340
},
{
"epoch": 0.7385914006858348,
"grad_norm": 0.12146595865488052,
"learning_rate": 0.00015934320617147214,
"loss": 0.1356,
"step": 350
},
{
"epoch": 0.7596940121340016,
"grad_norm": 0.33670490980148315,
"learning_rate": 0.0001563392509022882,
"loss": 0.1348,
"step": 360
},
{
"epoch": 0.7596940121340016,
"eval_loss": 0.1280602663755417,
"eval_runtime": 44.5897,
"eval_samples_per_second": 4.485,
"eval_steps_per_second": 4.485,
"step": 360
},
{
"epoch": 0.7807966235821683,
"grad_norm": 0.11047045141458511,
"learning_rate": 0.00015325888326495833,
"loss": 0.1306,
"step": 370
},
{
"epoch": 0.801899235030335,
"grad_norm": 0.24782629311084747,
"learning_rate": 0.0001501062811316082,
"loss": 0.1421,
"step": 380
},
{
"epoch": 0.8230018464785017,
"grad_norm": 0.15822850167751312,
"learning_rate": 0.0001468857203452953,
"loss": 0.1345,
"step": 390
},
{
"epoch": 0.8230018464785017,
"eval_loss": 0.12854796648025513,
"eval_runtime": 44.5908,
"eval_samples_per_second": 4.485,
"eval_steps_per_second": 4.485,
"step": 390
},
{
"epoch": 0.8441044579266684,
"grad_norm": 0.17262572050094604,
"learning_rate": 0.00014360156892072518,
"loss": 0.138,
"step": 400
},
{
"epoch": 0.8652070693748352,
"grad_norm": 0.16533038020133972,
"learning_rate": 0.00014025828111995635,
"loss": 0.13,
"step": 410
},
{
"epoch": 0.8863096808230019,
"grad_norm": 0.10585814714431763,
"learning_rate": 0.00013686039141112886,
"loss": 0.1267,
"step": 420
},
{
"epoch": 0.8863096808230019,
"eval_loss": 0.12870755791664124,
"eval_runtime": 44.5358,
"eval_samples_per_second": 4.491,
"eval_steps_per_second": 4.491,
"step": 420
},
{
"epoch": 0.9074122922711686,
"grad_norm": 0.12024762481451035,
"learning_rate": 0.00013341250831840998,
"loss": 0.1394,
"step": 430
},
{
"epoch": 0.9285149037193353,
"grad_norm": 0.11544947326183319,
"learning_rate": 0.0001299193081714986,
"loss": 0.1409,
"step": 440
},
{
"epoch": 0.9496175151675019,
"grad_norm": 0.2589576840400696,
"learning_rate": 0.0001263855287631654,
"loss": 0.1352,
"step": 450
},
{
"epoch": 0.9496175151675019,
"eval_loss": 0.12487487494945526,
"eval_runtime": 44.5323,
"eval_samples_per_second": 4.491,
"eval_steps_per_second": 4.491,
"step": 450
},
{
"epoch": 0.9707201266156686,
"grad_norm": 0.13384610414505005,
"learning_rate": 0.00012281596292343163,
"loss": 0.1231,
"step": 460
},
{
"epoch": 0.9918227380638354,
"grad_norm": 0.10393290221691132,
"learning_rate": 0.00011921545201910099,
"loss": 0.1347,
"step": 470
},
{
"epoch": 1.0126615668689,
"grad_norm": 0.11627262830734253,
"learning_rate": 0.00011558887938746194,
"loss": 0.1305,
"step": 480
},
{
"epoch": 1.0126615668689,
"eval_loss": 0.12428628653287888,
"eval_runtime": 44.5885,
"eval_samples_per_second": 4.485,
"eval_steps_per_second": 4.485,
"step": 480
},
{
"epoch": 1.0337641783170668,
"grad_norm": 0.13240262866020203,
"learning_rate": 0.00011194116371306573,
"loss": 0.1308,
"step": 490
},
{
"epoch": 1.0548667897652335,
"grad_norm": 0.15070736408233643,
"learning_rate": 0.00010827725235656294,
"loss": 0.13,
"step": 500
},
{
"epoch": 1.0759694012134002,
"grad_norm": 0.1023663654923439,
"learning_rate": 0.00010460211464464757,
"loss": 0.1291,
"step": 510
},
{
"epoch": 1.0759694012134002,
"eval_loss": 0.12311653047800064,
"eval_runtime": 44.4742,
"eval_samples_per_second": 4.497,
"eval_steps_per_second": 4.497,
"step": 510
},
{
"epoch": 1.097072012661567,
"grad_norm": 0.11538528650999069,
"learning_rate": 0.00010092073513020834,
"loss": 0.1314,
"step": 520
},
{
"epoch": 1.1181746241097337,
"grad_norm": 0.09773898124694824,
"learning_rate": 9.723810683182883e-05,
"loss": 0.1238,
"step": 530
},
{
"epoch": 1.1392772355579002,
"grad_norm": 0.1469469964504242,
"learning_rate": 9.355922446180593e-05,
"loss": 0.1302,
"step": 540
},
{
"epoch": 1.1392772355579002,
"eval_loss": 0.12340555340051651,
"eval_runtime": 44.7791,
"eval_samples_per_second": 4.466,
"eval_steps_per_second": 4.466,
"step": 540
},
{
"epoch": 1.160379847006067,
"grad_norm": 0.08568503707647324,
"learning_rate": 8.988907765187104e-05,
"loss": 0.1291,
"step": 550
},
{
"epoch": 1.1814824584542336,
"grad_norm": 0.11329666525125504,
"learning_rate": 8.623264418580185e-05,
"loss": 0.1243,
"step": 560
},
{
"epoch": 1.2025850699024003,
"grad_norm": 0.20136046409606934,
"learning_rate": 8.259488324810359e-05,
"loss": 0.1236,
"step": 570
},
{
"epoch": 1.2025850699024003,
"eval_loss": 0.12261851131916046,
"eval_runtime": 44.5714,
"eval_samples_per_second": 4.487,
"eval_steps_per_second": 4.487,
"step": 570
},
{
"epoch": 1.223687681350567,
"grad_norm": 0.09609243273735046,
"learning_rate": 7.89807286979162e-05,
"loss": 0.125,
"step": 580
},
{
"epoch": 1.2447902927987338,
"grad_norm": 0.15356111526489258,
"learning_rate": 7.539508237726986e-05,
"loss": 0.1268,
"step": 590
},
{
"epoch": 1.2658929042469005,
"grad_norm": 0.09328490495681763,
"learning_rate": 7.184280746276537e-05,
"loss": 0.1239,
"step": 600
},
{
"epoch": 1.2658929042469005,
"eval_loss": 0.1222267746925354,
"eval_runtime": 44.5091,
"eval_samples_per_second": 4.493,
"eval_steps_per_second": 4.493,
"step": 600
},
{
"epoch": 1.2869955156950672,
"grad_norm": 0.10134255886077881,
"learning_rate": 6.832872186969583e-05,
"loss": 0.122,
"step": 610
},
{
"epoch": 1.308098127143234,
"grad_norm": 0.08854757249355316,
"learning_rate": 6.485759171755574e-05,
"loss": 0.1271,
"step": 620
},
{
"epoch": 1.3292007385914006,
"grad_norm": 0.12832631170749664,
"learning_rate": 6.143412486580051e-05,
"loss": 0.1243,
"step": 630
},
{
"epoch": 1.3292007385914006,
"eval_loss": 0.1208844780921936,
"eval_runtime": 44.7617,
"eval_samples_per_second": 4.468,
"eval_steps_per_second": 4.468,
"step": 630
},
{
"epoch": 1.3503033500395674,
"grad_norm": 0.10490237921476364,
"learning_rate": 5.8062964528623096e-05,
"loss": 0.1233,
"step": 640
},
{
"epoch": 1.371405961487734,
"grad_norm": 0.09632379561662674,
"learning_rate": 5.474868297740874e-05,
"loss": 0.1246,
"step": 650
},
{
"epoch": 1.3925085729359008,
"grad_norm": 0.09513814002275467,
"learning_rate": 5.149577533940836e-05,
"loss": 0.125,
"step": 660
},
{
"epoch": 1.3925085729359008,
"eval_loss": 0.12093473225831985,
"eval_runtime": 44.6914,
"eval_samples_per_second": 4.475,
"eval_steps_per_second": 4.475,
"step": 660
},
{
"epoch": 1.4136111843840675,
"grad_norm": 0.08354990929365158,
"learning_rate": 4.8308653501042166e-05,
"loss": 0.1247,
"step": 670
},
{
"epoch": 1.4347137958322342,
"grad_norm": 0.07765179127454758,
"learning_rate": 4.519164012410171e-05,
"loss": 0.1225,
"step": 680
},
{
"epoch": 1.455816407280401,
"grad_norm": 0.09267658740282059,
"learning_rate": 4.214896278296646e-05,
"loss": 0.1269,
"step": 690
},
{
"epoch": 1.455816407280401,
"eval_loss": 0.12080849707126617,
"eval_runtime": 44.6277,
"eval_samples_per_second": 4.482,
"eval_steps_per_second": 4.482,
"step": 690
},
{
"epoch": 1.4769190187285677,
"grad_norm": 0.10476211458444595,
"learning_rate": 3.9184748230786584e-05,
"loss": 0.1207,
"step": 700
},
{
"epoch": 1.4980216301767344,
"grad_norm": 0.09438898414373398,
"learning_rate": 3.6303016802408594e-05,
"loss": 0.1243,
"step": 710
},
{
"epoch": 1.5191242416249011,
"grad_norm": 0.09003426134586334,
"learning_rate": 3.3507676961634796e-05,
"loss": 0.1225,
"step": 720
},
{
"epoch": 1.5191242416249011,
"eval_loss": 0.11986906081438065,
"eval_runtime": 44.9813,
"eval_samples_per_second": 4.446,
"eval_steps_per_second": 4.446,
"step": 720
},
{
"epoch": 1.5402268530730678,
"grad_norm": 0.09385745972394943,
"learning_rate": 3.080252000021264e-05,
"loss": 0.1262,
"step": 730
},
{
"epoch": 1.5613294645212346,
"grad_norm": 0.09874516725540161,
"learning_rate": 2.8191214895743424e-05,
"loss": 0.1195,
"step": 740
},
{
"epoch": 1.5824320759694013,
"grad_norm": 0.08888901770114899,
"learning_rate": 2.5677303335484025e-05,
"loss": 0.1176,
"step": 750
},
{
"epoch": 1.5824320759694013,
"eval_loss": 0.11973254382610321,
"eval_runtime": 44.7792,
"eval_samples_per_second": 4.466,
"eval_steps_per_second": 4.466,
"step": 750
},
{
"epoch": 1.603534687417568,
"grad_norm": 0.08025766164064407,
"learning_rate": 2.3264194912791605e-05,
"loss": 0.1294,
"step": 760
},
{
"epoch": 1.6246372988657347,
"grad_norm": 0.1188935935497284,
"learning_rate": 2.0955162502726135e-05,
"loss": 0.1186,
"step": 770
},
{
"epoch": 1.6457399103139014,
"grad_norm": 0.0846625491976738,
"learning_rate": 1.8753337823082084e-05,
"loss": 0.1227,
"step": 780
},
{
"epoch": 1.6457399103139014,
"eval_loss": 0.11954256147146225,
"eval_runtime": 44.8428,
"eval_samples_per_second": 4.46,
"eval_steps_per_second": 4.46,
"step": 780
},
{
"epoch": 1.6668425217620682,
"grad_norm": 0.09665997326374054,
"learning_rate": 1.666170718687069e-05,
"loss": 0.121,
"step": 790
},
{
"epoch": 1.6879451332102349,
"grad_norm": 0.10237396508455276,
"learning_rate": 1.4683107452013223e-05,
"loss": 0.122,
"step": 800
},
{
"epoch": 1.7090477446584016,
"grad_norm": 0.09061301499605179,
"learning_rate": 1.2820222173738628e-05,
"loss": 0.1133,
"step": 810
},
{
"epoch": 1.7090477446584016,
"eval_loss": 0.11932696402072906,
"eval_runtime": 44.6177,
"eval_samples_per_second": 4.483,
"eval_steps_per_second": 4.483,
"step": 810
},
{
"epoch": 1.7301503561065683,
"grad_norm": 0.09903734177350998,
"learning_rate": 1.1075577964904104e-05,
"loss": 0.1259,
"step": 820
},
{
"epoch": 1.751252967554735,
"grad_norm": 0.0887165367603302,
"learning_rate": 9.451541069175273e-06,
"loss": 0.116,
"step": 830
},
{
"epoch": 1.7723555790029017,
"grad_norm": 0.09350364655256271,
"learning_rate": 7.950314151713056e-06,
"loss": 0.1241,
"step": 840
},
{
"epoch": 1.7723555790029017,
"eval_loss": 0.11908172816038132,
"eval_runtime": 44.7048,
"eval_samples_per_second": 4.474,
"eval_steps_per_second": 4.474,
"step": 840
},
{
"epoch": 1.7934581904510685,
"grad_norm": 0.10343047231435776,
"learning_rate": 6.57393331172097e-06,
"loss": 0.1183,
"step": 850
},
{
"epoch": 1.8145608018992352,
"grad_norm": 0.08541911840438843,
"learning_rate": 5.324265320903843e-06,
"loss": 0.1258,
"step": 860
},
{
"epoch": 1.835663413347402,
"grad_norm": 0.10567805916070938,
"learning_rate": 4.203005091583801e-06,
"loss": 0.1204,
"step": 870
},
{
"epoch": 1.835663413347402,
"eval_loss": 0.11883310228586197,
"eval_runtime": 44.6887,
"eval_samples_per_second": 4.475,
"eval_steps_per_second": 4.475,
"step": 870
},
{
"epoch": 1.8567660247955684,
"grad_norm": 0.10206745564937592,
"learning_rate": 3.2116733779075094e-06,
"loss": 0.1273,
"step": 880
},
{
"epoch": 1.8778686362437351,
"grad_norm": 0.09496993571519852,
"learning_rate": 2.351614713262418e-06,
"loss": 0.1239,
"step": 890
},
{
"epoch": 1.8989712476919018,
"grad_norm": 0.13183720409870148,
"learning_rate": 1.623995586699334e-06,
"loss": 0.122,
"step": 900
},
{
"epoch": 1.8989712476919018,
"eval_loss": 0.11881602555513382,
"eval_runtime": 44.6717,
"eval_samples_per_second": 4.477,
"eval_steps_per_second": 4.477,
"step": 900
}
],
"logging_steps": 10,
"max_steps": 948,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.348133794370519e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}