{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9860291834833903,
"eval_steps": 30,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024837007140639553,
"grad_norm": 1.6665902137756348,
"learning_rate": 4.390243902439025e-05,
"loss": 1.8541,
"step": 10
},
{
"epoch": 0.04967401428127911,
"grad_norm": 0.7774304747581482,
"learning_rate": 9.26829268292683e-05,
"loss": 0.5737,
"step": 20
},
{
"epoch": 0.07451102142191866,
"grad_norm": 0.5949566960334778,
"learning_rate": 0.00014146341463414634,
"loss": 0.3491,
"step": 30
},
{
"epoch": 0.07451102142191866,
"eval_loss": 0.2992604076862335,
"eval_runtime": 38.9257,
"eval_samples_per_second": 4.367,
"eval_steps_per_second": 4.367,
"step": 30
},
{
"epoch": 0.09934802856255821,
"grad_norm": 0.518982470035553,
"learning_rate": 0.0001902439024390244,
"loss": 0.2738,
"step": 40
},
{
"epoch": 0.12418503570319776,
"grad_norm": 0.40878191590309143,
"learning_rate": 0.00019994603803069594,
"loss": 0.2339,
"step": 50
},
{
"epoch": 0.14902204284383733,
"grad_norm": 0.3464236259460449,
"learning_rate": 0.00019972691733857883,
"loss": 0.199,
"step": 60
},
{
"epoch": 0.14902204284383733,
"eval_loss": 0.1967637687921524,
"eval_runtime": 38.4564,
"eval_samples_per_second": 4.421,
"eval_steps_per_second": 4.421,
"step": 60
},
{
"epoch": 0.17385904998447688,
"grad_norm": 0.27865490317344666,
"learning_rate": 0.00019933963450321945,
"loss": 0.1858,
"step": 70
},
{
"epoch": 0.19869605712511643,
"grad_norm": 0.31942445039749146,
"learning_rate": 0.00019878484257109083,
"loss": 0.1773,
"step": 80
},
{
"epoch": 0.22353306426575598,
"grad_norm": 0.2719893157482147,
"learning_rate": 0.00019806347704689778,
"loss": 0.1689,
"step": 90
},
{
"epoch": 0.22353306426575598,
"eval_loss": 0.16391794383525848,
"eval_runtime": 38.3889,
"eval_samples_per_second": 4.428,
"eval_steps_per_second": 4.428,
"step": 90
},
{
"epoch": 0.24837007140639553,
"grad_norm": 0.2718851566314697,
"learning_rate": 0.00019717675431610415,
"loss": 0.1679,
"step": 100
},
{
"epoch": 0.2732070785470351,
"grad_norm": 0.403368204832077,
"learning_rate": 0.0001961261695938319,
"loss": 0.146,
"step": 110
},
{
"epoch": 0.29804408568767465,
"grad_norm": 0.1546928435564041,
"learning_rate": 0.00019491349440359015,
"loss": 0.153,
"step": 120
},
{
"epoch": 0.29804408568767465,
"eval_loss": 0.14231818914413452,
"eval_runtime": 38.5329,
"eval_samples_per_second": 4.412,
"eval_steps_per_second": 4.412,
"step": 120
},
{
"epoch": 0.3228810928283142,
"grad_norm": 0.3227437138557434,
"learning_rate": 0.0001935407735900857,
"loss": 0.1483,
"step": 130
},
{
"epoch": 0.34771809996895375,
"grad_norm": 0.1824907660484314,
"learning_rate": 0.00019201032187115234,
"loss": 0.1519,
"step": 140
},
{
"epoch": 0.3725551071095933,
"grad_norm": 0.19652438163757324,
"learning_rate": 0.0001903247199346129,
"loss": 0.1455,
"step": 150
},
{
"epoch": 0.3725551071095933,
"eval_loss": 0.13592763245105743,
"eval_runtime": 38.412,
"eval_samples_per_second": 4.426,
"eval_steps_per_second": 4.426,
"step": 150
},
{
"epoch": 0.39739211425023285,
"grad_norm": 0.18074853718280792,
"learning_rate": 0.00018848681008665582,
"loss": 0.1466,
"step": 160
},
{
"epoch": 0.4222291213908724,
"grad_norm": 0.3232487142086029,
"learning_rate": 0.0001864996914590638,
"loss": 0.1408,
"step": 170
},
{
"epoch": 0.44706612853151195,
"grad_norm": 0.15204332768917084,
"learning_rate": 0.00018436671478337666,
"loss": 0.1549,
"step": 180
},
{
"epoch": 0.44706612853151195,
"eval_loss": 0.13638228178024292,
"eval_runtime": 38.3896,
"eval_samples_per_second": 4.428,
"eval_steps_per_second": 4.428,
"step": 180
},
{
"epoch": 0.47190313567215153,
"grad_norm": 0.2306644767522812,
"learning_rate": 0.00018209147674079983,
"loss": 0.1444,
"step": 190
},
{
"epoch": 0.49674014281279105,
"grad_norm": 0.14888478815555573,
"learning_rate": 0.00017967781389738625,
"loss": 0.1455,
"step": 200
},
{
"epoch": 0.5215771499534306,
"grad_norm": 0.13995014131069183,
"learning_rate": 0.00017712979623471807,
"loss": 0.1413,
"step": 210
},
{
"epoch": 0.5215771499534306,
"eval_loss": 0.13154758512973785,
"eval_runtime": 38.4362,
"eval_samples_per_second": 4.423,
"eval_steps_per_second": 4.423,
"step": 210
},
{
"epoch": 0.5464141570940702,
"grad_norm": 0.1627102494239807,
"learning_rate": 0.000174451720286997,
"loss": 0.1396,
"step": 220
},
{
"epoch": 0.5712511642347097,
"grad_norm": 0.13829496502876282,
"learning_rate": 0.0001716481018961156,
"loss": 0.1444,
"step": 230
},
{
"epoch": 0.5960881713753493,
"grad_norm": 0.14025089144706726,
"learning_rate": 0.00016872366859692627,
"loss": 0.1474,
"step": 240
},
{
"epoch": 0.5960881713753493,
"eval_loss": 0.13241083920001984,
"eval_runtime": 38.4912,
"eval_samples_per_second": 4.417,
"eval_steps_per_second": 4.417,
"step": 240
},
{
"epoch": 0.6209251785159888,
"grad_norm": 0.11233100295066833,
"learning_rate": 0.00016568335164554812,
"loss": 0.1383,
"step": 250
},
{
"epoch": 0.6457621856566284,
"grad_norm": 0.12720470130443573,
"learning_rate": 0.0001625322777041534,
"loss": 0.1359,
"step": 260
},
{
"epoch": 0.670599192797268,
"grad_norm": 0.11088231950998306,
"learning_rate": 0.0001592757601962555,
"loss": 0.1437,
"step": 270
},
{
"epoch": 0.670599192797268,
"eval_loss": 0.12654946744441986,
"eval_runtime": 38.7011,
"eval_samples_per_second": 4.393,
"eval_steps_per_second": 4.393,
"step": 270
},
{
"epoch": 0.6954361999379075,
"grad_norm": 0.13584497570991516,
"learning_rate": 0.0001559192903470747,
"loss": 0.1312,
"step": 280
},
{
"epoch": 0.720273207078547,
"grad_norm": 0.32189086079597473,
"learning_rate": 0.00015246852792409033,
"loss": 0.1414,
"step": 290
},
{
"epoch": 0.7451102142191866,
"grad_norm": 0.1525665819644928,
"learning_rate": 0.00014892929169339235,
"loss": 0.1496,
"step": 300
},
{
"epoch": 0.7451102142191866,
"eval_loss": 0.12869440019130707,
"eval_runtime": 38.7851,
"eval_samples_per_second": 4.383,
"eval_steps_per_second": 4.383,
"step": 300
},
{
"epoch": 0.7699472213598262,
"grad_norm": 0.12182191759347916,
"learning_rate": 0.00014530754960792553,
"loss": 0.1436,
"step": 310
},
{
"epoch": 0.7947842285004657,
"grad_norm": 1.4180772304534912,
"learning_rate": 0.0001416094087441704,
"loss": 0.145,
"step": 320
},
{
"epoch": 0.8196212356411052,
"grad_norm": 0.14027242362499237,
"learning_rate": 0.00013784110500423104,
"loss": 0.1486,
"step": 330
},
{
"epoch": 0.8196212356411052,
"eval_loss": 0.12838058173656464,
"eval_runtime": 38.6316,
"eval_samples_per_second": 4.401,
"eval_steps_per_second": 4.401,
"step": 330
},
{
"epoch": 0.8444582427817448,
"grad_norm": 0.13633093237876892,
"learning_rate": 0.00013400899260069323,
"loss": 0.1413,
"step": 340
},
{
"epoch": 0.8692952499223844,
"grad_norm": 0.11211191117763519,
"learning_rate": 0.00013011953334198466,
"loss": 0.1361,
"step": 350
},
{
"epoch": 0.8941322570630239,
"grad_norm": 0.30127570033073425,
"learning_rate": 0.00012617928573630406,
"loss": 0.1363,
"step": 360
},
{
"epoch": 0.8941322570630239,
"eval_loss": 0.12504999339580536,
"eval_runtime": 38.8649,
"eval_samples_per_second": 4.374,
"eval_steps_per_second": 4.374,
"step": 360
},
{
"epoch": 0.9189692642036634,
"grad_norm": 0.10214308649301529,
"learning_rate": 0.00012219489393249262,
"loss": 0.1483,
"step": 370
},
{
"epoch": 0.9438062713443031,
"grad_norm": 0.09070255607366562,
"learning_rate": 0.00011817307651649616,
"loss": 0.1349,
"step": 380
},
{
"epoch": 0.9686432784849426,
"grad_norm": 0.10173656791448593,
"learning_rate": 0.00011412061518230914,
"loss": 0.1421,
"step": 390
},
{
"epoch": 0.9686432784849426,
"eval_loss": 0.12429468333721161,
"eval_runtime": 38.7145,
"eval_samples_per_second": 4.391,
"eval_steps_per_second": 4.391,
"step": 390
},
{
"epoch": 0.9934802856255821,
"grad_norm": 0.10592233389616013,
"learning_rate": 0.00011004434329650452,
"loss": 0.1296,
"step": 400
},
{
"epoch": 1.0173859049984477,
"grad_norm": 0.15041767060756683,
"learning_rate": 0.00010595113437563176,
"loss": 0.1367,
"step": 410
},
{
"epoch": 1.0422229121390871,
"grad_norm": 0.10861553996801376,
"learning_rate": 0.00010184789049591299,
"loss": 0.1353,
"step": 420
},
{
"epoch": 1.0422229121390871,
"eval_loss": 0.12352242320775986,
"eval_runtime": 38.7533,
"eval_samples_per_second": 4.387,
"eval_steps_per_second": 4.387,
"step": 420
},
{
"epoch": 1.0670599192797268,
"grad_norm": 0.0957934781908989,
"learning_rate": 9.774153065478121e-05,
"loss": 0.134,
"step": 430
},
{
"epoch": 1.0918969264203664,
"grad_norm": 0.09080129116773605,
"learning_rate": 9.36389791038851e-05,
"loss": 0.1329,
"step": 440
},
{
"epoch": 1.1167339335610058,
"grad_norm": 0.12591005861759186,
"learning_rate": 8.954715367323468e-05,
"loss": 0.121,
"step": 450
},
{
"epoch": 1.1167339335610058,
"eval_loss": 0.12231362611055374,
"eval_runtime": 38.7245,
"eval_samples_per_second": 4.39,
"eval_steps_per_second": 4.39,
"step": 450
},
{
"epoch": 1.1415709407016454,
"grad_norm": 0.08681875467300415,
"learning_rate": 8.547295410617453e-05,
"loss": 0.1305,
"step": 460
},
{
"epoch": 1.166407947842285,
"grad_norm": 0.0953899621963501,
"learning_rate": 8.142325042485592e-05,
"loss": 0.1309,
"step": 470
},
{
"epoch": 1.1912449549829245,
"grad_norm": 0.07845437526702881,
"learning_rate": 7.740487134582525e-05,
"loss": 0.1298,
"step": 480
},
{
"epoch": 1.1912449549829245,
"eval_loss": 0.12134242057800293,
"eval_runtime": 38.8319,
"eval_samples_per_second": 4.378,
"eval_steps_per_second": 4.378,
"step": 480
},
{
"epoch": 1.2160819621235641,
"grad_norm": 0.11160232126712799,
"learning_rate": 7.342459276526302e-05,
"loss": 0.1348,
"step": 490
},
{
"epoch": 1.2409189692642038,
"grad_norm": 0.09501124173402786,
"learning_rate": 6.948912633329007e-05,
"loss": 0.1321,
"step": 500
},
{
"epoch": 1.2657559764048432,
"grad_norm": 0.07836316525936127,
"learning_rate": 6.560510813660719e-05,
"loss": 0.1246,
"step": 510
},
{
"epoch": 1.2657559764048432,
"eval_loss": 0.1200186014175415,
"eval_runtime": 38.566,
"eval_samples_per_second": 4.408,
"eval_steps_per_second": 4.408,
"step": 510
},
{
"epoch": 1.2905929835454828,
"grad_norm": 0.08444702625274658,
"learning_rate": 6.177908750855164e-05,
"loss": 0.1293,
"step": 520
},
{
"epoch": 1.3154299906861224,
"grad_norm": 0.07452095299959183,
"learning_rate": 5.8017515985439465e-05,
"loss": 0.1319,
"step": 530
},
{
"epoch": 1.3402669978267618,
"grad_norm": 0.06832710653543472,
"learning_rate": 5.4326736427815946e-05,
"loss": 0.1298,
"step": 540
},
{
"epoch": 1.3402669978267618,
"eval_loss": 0.1198083758354187,
"eval_runtime": 38.7172,
"eval_samples_per_second": 4.391,
"eval_steps_per_second": 4.391,
"step": 540
},
{
"epoch": 1.3651040049674015,
"grad_norm": 0.10138995200395584,
"learning_rate": 5.071297232495769e-05,
"loss": 0.1274,
"step": 550
},
{
"epoch": 1.389941012108041,
"grad_norm": 0.08988513052463531,
"learning_rate": 4.7182317300661796e-05,
"loss": 0.1309,
"step": 560
},
{
"epoch": 1.4147780192486805,
"grad_norm": 0.08663639426231384,
"learning_rate": 4.374072483801769e-05,
"loss": 0.1305,
"step": 570
},
{
"epoch": 1.4147780192486805,
"eval_loss": 0.1200200691819191,
"eval_runtime": 38.6578,
"eval_samples_per_second": 4.398,
"eval_steps_per_second": 4.398,
"step": 570
},
{
"epoch": 1.4396150263893202,
"grad_norm": 0.0839414894580841,
"learning_rate": 4.039399824048777e-05,
"loss": 0.1332,
"step": 580
},
{
"epoch": 1.4644520335299596,
"grad_norm": 0.07634599506855011,
"learning_rate": 3.714778084622492e-05,
"loss": 0.1275,
"step": 590
},
{
"epoch": 1.4892890406705992,
"grad_norm": 0.08726586401462555,
"learning_rate": 3.400754651212776e-05,
"loss": 0.1302,
"step": 600
},
{
"epoch": 1.4892890406705992,
"eval_loss": 0.11870752274990082,
"eval_runtime": 38.731,
"eval_samples_per_second": 4.389,
"eval_steps_per_second": 4.389,
"step": 600
},
{
"epoch": 1.5141260478112386,
"grad_norm": 0.08947139978408813,
"learning_rate": 3.097859038367947e-05,
"loss": 0.1296,
"step": 610
},
{
"epoch": 1.5389630549518785,
"grad_norm": 0.08135833591222763,
"learning_rate": 2.8066019966134904e-05,
"loss": 0.1281,
"step": 620
},
{
"epoch": 1.5638000620925179,
"grad_norm": 0.08194943517446518,
"learning_rate": 2.527474651211089e-05,
"loss": 0.1296,
"step": 630
},
{
"epoch": 1.5638000620925179,
"eval_loss": 0.11857092380523682,
"eval_runtime": 38.7225,
"eval_samples_per_second": 4.39,
"eval_steps_per_second": 4.39,
"step": 630
},
{
"epoch": 1.5886370692331573,
"grad_norm": 0.08808406442403793,
"learning_rate": 2.260947674010372e-05,
"loss": 0.1299,
"step": 640
},
{
"epoch": 1.613474076373797,
"grad_norm": 0.08768365532159805,
"learning_rate": 2.0074704897896558e-05,
"loss": 0.1242,
"step": 650
},
{
"epoch": 1.6383110835144366,
"grad_norm": 0.09054244309663773,
"learning_rate": 1.767470518424129e-05,
"loss": 0.1167,
"step": 660
},
{
"epoch": 1.6383110835144366,
"eval_loss": 0.11811664700508118,
"eval_runtime": 38.7626,
"eval_samples_per_second": 4.386,
"eval_steps_per_second": 4.386,
"step": 660
},
{
"epoch": 1.663148090655076,
"grad_norm": 0.06916210800409317,
"learning_rate": 1.541352454159237e-05,
"loss": 0.1286,
"step": 670
},
{
"epoch": 1.6879850977957156,
"grad_norm": 0.08965995907783508,
"learning_rate": 1.3294975832046353e-05,
"loss": 0.1293,
"step": 680
},
{
"epoch": 1.7128221049363552,
"grad_norm": 0.09365396201610565,
"learning_rate": 1.1322631407993811e-05,
"loss": 0.128,
"step": 690
},
{
"epoch": 1.7128221049363552,
"eval_loss": 0.1179969310760498,
"eval_runtime": 38.7285,
"eval_samples_per_second": 4.39,
"eval_steps_per_second": 4.39,
"step": 690
},
{
"epoch": 1.7376591120769946,
"grad_norm": 0.06408954411745071,
"learning_rate": 9.499817088325102e-06,
"loss": 0.1292,
"step": 700
},
{
"epoch": 1.7624961192176343,
"grad_norm": 0.08411859720945358,
"learning_rate": 7.829606550347313e-06,
"loss": 0.1238,
"step": 710
},
{
"epoch": 1.787333126358274,
"grad_norm": 0.08746035397052765,
"learning_rate": 6.314816146868952e-06,
"loss": 0.1354,
"step": 720
},
{
"epoch": 1.787333126358274,
"eval_loss": 0.11767658591270447,
"eval_runtime": 38.9009,
"eval_samples_per_second": 4.37,
"eval_steps_per_second": 4.37,
"step": 720
},
{
"epoch": 1.8121701334989133,
"grad_norm": 0.10084281116724014,
"learning_rate": 4.958000157192022e-06,
"loss": 0.1277,
"step": 730
},
{
"epoch": 1.837007140639553,
"grad_norm": 0.08554716408252716,
"learning_rate": 3.761446480019315e-06,
"loss": 0.1287,
"step": 740
},
{
"epoch": 1.8618441477801926,
"grad_norm": 0.08761674165725708,
"learning_rate": 2.7271727755395214e-06,
"loss": 0.1289,
"step": 750
},
{
"epoch": 1.8618441477801926,
"eval_loss": 0.11756357550621033,
"eval_runtime": 38.7293,
"eval_samples_per_second": 4.389,
"eval_steps_per_second": 4.389,
"step": 750
},
{
"epoch": 1.886681154920832,
"grad_norm": 0.08505561947822571,
"learning_rate": 1.8569230631958256e-06,
"loss": 0.1245,
"step": 760
},
{
"epoch": 1.9115181620614716,
"grad_norm": 0.08099253475666046,
"learning_rate": 1.1521647808744873e-06,
"loss": 0.1215,
"step": 770
},
{
"epoch": 1.9363551692021113,
"grad_norm": 0.08459154516458511,
"learning_rate": 6.140863104726391e-07,
"loss": 0.13,
"step": 780
},
{
"epoch": 1.9363551692021113,
"eval_loss": 0.11756289005279541,
"eval_runtime": 38.7944,
"eval_samples_per_second": 4.382,
"eval_steps_per_second": 4.382,
"step": 780
},
{
"epoch": 1.9611921763427507,
"grad_norm": 0.08602018654346466,
"learning_rate": 2.4359497401758024e-07,
"loss": 0.1288,
"step": 790
},
{
"epoch": 1.9860291834833903,
"grad_norm": 0.08176976442337036,
"learning_rate": 4.131550371655468e-08,
"loss": 0.128,
"step": 800
}
],
"logging_steps": 10,
"max_steps": 806,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2864376597719245e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}