{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9857231533209188,
"eval_steps": 30,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024829298572315334,
"grad_norm": 5.036550998687744,
"learning_rate": 2.2222222222222223e-05,
"loss": 2.4126,
"step": 10
},
{
"epoch": 0.04965859714463067,
"grad_norm": 0.6553554534912109,
"learning_rate": 4.691358024691358e-05,
"loss": 0.8516,
"step": 20
},
{
"epoch": 0.074487895716946,
"grad_norm": 0.7089270353317261,
"learning_rate": 7.160493827160494e-05,
"loss": 0.5451,
"step": 30
},
{
"epoch": 0.074487895716946,
"eval_loss": 0.46651971340179443,
"eval_runtime": 40.7643,
"eval_samples_per_second": 4.17,
"eval_steps_per_second": 2.085,
"step": 30
},
{
"epoch": 0.09931719428926133,
"grad_norm": 0.5849066376686096,
"learning_rate": 9.62962962962963e-05,
"loss": 0.3903,
"step": 40
},
{
"epoch": 0.12414649286157665,
"grad_norm": 0.6945697069168091,
"learning_rate": 0.00012098765432098766,
"loss": 0.3218,
"step": 50
},
{
"epoch": 0.148975791433892,
"grad_norm": 0.43555283546447754,
"learning_rate": 0.00014567901234567902,
"loss": 0.2579,
"step": 60
},
{
"epoch": 0.148975791433892,
"eval_loss": 0.2548002004623413,
"eval_runtime": 40.1887,
"eval_samples_per_second": 4.23,
"eval_steps_per_second": 2.115,
"step": 60
},
{
"epoch": 0.17380509000620734,
"grad_norm": 0.4026840627193451,
"learning_rate": 0.00017037037037037037,
"loss": 0.23,
"step": 70
},
{
"epoch": 0.19863438857852267,
"grad_norm": 0.635771632194519,
"learning_rate": 0.00019506172839506175,
"loss": 0.2108,
"step": 80
},
{
"epoch": 0.22346368715083798,
"grad_norm": 0.43689030408859253,
"learning_rate": 0.0001999399199592735,
"loss": 0.1878,
"step": 90
},
{
"epoch": 0.22346368715083798,
"eval_loss": 0.186412051320076,
"eval_runtime": 40.2208,
"eval_samples_per_second": 4.227,
"eval_steps_per_second": 2.113,
"step": 90
},
{
"epoch": 0.2482929857231533,
"grad_norm": 0.3414112627506256,
"learning_rate": 0.00019969596851644327,
"loss": 0.178,
"step": 100
},
{
"epoch": 0.27312228429546864,
"grad_norm": 0.47279804944992065,
"learning_rate": 0.00019926484830975113,
"loss": 0.1595,
"step": 110
},
{
"epoch": 0.297951582867784,
"grad_norm": 0.2666519582271576,
"learning_rate": 0.0001986473687223383,
"loss": 0.159,
"step": 120
},
{
"epoch": 0.297951582867784,
"eval_loss": 0.152946338057518,
"eval_runtime": 40.3051,
"eval_samples_per_second": 4.218,
"eval_steps_per_second": 2.109,
"step": 120
},
{
"epoch": 0.3227808814400993,
"grad_norm": 0.2780194580554962,
"learning_rate": 0.00019784468900761095,
"loss": 0.1491,
"step": 130
},
{
"epoch": 0.34761018001241467,
"grad_norm": 0.5531139969825745,
"learning_rate": 0.0001968583161128631,
"loss": 0.1523,
"step": 140
},
{
"epoch": 0.37243947858473,
"grad_norm": 0.3274007737636566,
"learning_rate": 0.00019569010185014062,
"loss": 0.1447,
"step": 150
},
{
"epoch": 0.37243947858473,
"eval_loss": 0.1445446014404297,
"eval_runtime": 40.255,
"eval_samples_per_second": 4.223,
"eval_steps_per_second": 2.112,
"step": 150
},
{
"epoch": 0.39726877715704534,
"grad_norm": 0.2487361580133438,
"learning_rate": 0.00019434223941965738,
"loss": 0.1509,
"step": 160
},
{
"epoch": 0.42209807572936064,
"grad_norm": 0.5522840023040771,
"learning_rate": 0.00019281725929229127,
"loss": 0.1433,
"step": 170
},
{
"epoch": 0.44692737430167595,
"grad_norm": 0.1760244369506836,
"learning_rate": 0.00019111802445888936,
"loss": 0.1434,
"step": 180
},
{
"epoch": 0.44692737430167595,
"eval_loss": 0.13944680988788605,
"eval_runtime": 40.2246,
"eval_samples_per_second": 4.226,
"eval_steps_per_second": 2.113,
"step": 180
},
{
"epoch": 0.4717566728739913,
"grad_norm": 0.2646051347255707,
"learning_rate": 0.00018924772505530174,
"loss": 0.1366,
"step": 190
},
{
"epoch": 0.4965859714463066,
"grad_norm": 0.3032621741294861,
"learning_rate": 0.000187209872373235,
"loss": 0.1359,
"step": 200
},
{
"epoch": 0.521415270018622,
"grad_norm": 0.5465778112411499,
"learning_rate": 0.00018500829226816853,
"loss": 0.143,
"step": 210
},
{
"epoch": 0.521415270018622,
"eval_loss": 0.13419194519519806,
"eval_runtime": 40.1269,
"eval_samples_per_second": 4.237,
"eval_steps_per_second": 2.118,
"step": 210
},
{
"epoch": 0.5462445685909373,
"grad_norm": 0.1879195123910904,
"learning_rate": 0.0001826471179767111,
"loss": 0.1364,
"step": 220
},
{
"epoch": 0.5710738671632526,
"grad_norm": 0.19969278573989868,
"learning_rate": 0.0001801307823568806,
"loss": 0.1407,
"step": 230
},
{
"epoch": 0.595903165735568,
"grad_norm": 0.15893523395061493,
"learning_rate": 0.00017746400956587653,
"loss": 0.1397,
"step": 240
},
{
"epoch": 0.595903165735568,
"eval_loss": 0.13155966997146606,
"eval_runtime": 40.1893,
"eval_samples_per_second": 4.23,
"eval_steps_per_second": 2.115,
"step": 240
},
{
"epoch": 0.6207324643078833,
"grad_norm": 0.1424490511417389,
"learning_rate": 0.00017465180619096832,
"loss": 0.1337,
"step": 250
},
{
"epoch": 0.6455617628801986,
"grad_norm": 0.13029974699020386,
"learning_rate": 0.00017169945185015106,
"loss": 0.1376,
"step": 260
},
{
"epoch": 0.6703910614525139,
"grad_norm": 0.13530579209327698,
"learning_rate": 0.00016861248928021411,
"loss": 0.129,
"step": 270
},
{
"epoch": 0.6703910614525139,
"eval_loss": 0.12780845165252686,
"eval_runtime": 40.2498,
"eval_samples_per_second": 4.224,
"eval_steps_per_second": 2.112,
"step": 270
},
{
"epoch": 0.6952203600248293,
"grad_norm": 0.2645304501056671,
"learning_rate": 0.00016539671393083215,
"loss": 0.1246,
"step": 280
},
{
"epoch": 0.7200496585971446,
"grad_norm": 0.15367014706134796,
"learning_rate": 0.00016205816308421386,
"loss": 0.1273,
"step": 290
},
{
"epoch": 0.74487895716946,
"grad_norm": 0.2134842574596405,
"learning_rate": 0.0001586031045207354,
"loss": 0.1361,
"step": 300
},
{
"epoch": 0.74487895716946,
"eval_loss": 0.1297762393951416,
"eval_runtime": 40.2241,
"eval_samples_per_second": 4.226,
"eval_steps_per_second": 2.113,
"step": 300
},
{
"epoch": 0.7697082557417753,
"grad_norm": 0.13907591998577118,
"learning_rate": 0.00015503802475183773,
"loss": 0.14,
"step": 310
},
{
"epoch": 0.7945375543140907,
"grad_norm": 0.10886333137750626,
"learning_rate": 0.00015136961684227904,
"loss": 0.1351,
"step": 320
},
{
"epoch": 0.819366852886406,
"grad_norm": 0.1071273609995842,
"learning_rate": 0.00014760476784460514,
"loss": 0.1288,
"step": 330
},
{
"epoch": 0.819366852886406,
"eval_loss": 0.1265447735786438,
"eval_runtime": 40.2437,
"eval_samples_per_second": 4.224,
"eval_steps_per_second": 2.112,
"step": 330
},
{
"epoch": 0.8441961514587213,
"grad_norm": 0.13940832018852234,
"learning_rate": 0.0001437505458694277,
"loss": 0.1331,
"step": 340
},
{
"epoch": 0.8690254500310366,
"grad_norm": 0.12029105424880981,
"learning_rate": 0.00013981418681578546,
"loss": 0.1297,
"step": 350
},
{
"epoch": 0.8938547486033519,
"grad_norm": 0.09277268499135971,
"learning_rate": 0.0001358030807864995,
"loss": 0.1259,
"step": 360
},
{
"epoch": 0.8938547486033519,
"eval_loss": 0.12379591166973114,
"eval_runtime": 40.2294,
"eval_samples_per_second": 4.226,
"eval_steps_per_second": 2.113,
"step": 360
},
{
"epoch": 0.9186840471756673,
"grad_norm": 0.172864630818367,
"learning_rate": 0.00013172475821402748,
"loss": 0.1301,
"step": 370
},
{
"epoch": 0.9435133457479826,
"grad_norm": 0.10042418539524078,
"learning_rate": 0.00012758687572286367,
"loss": 0.1271,
"step": 380
},
{
"epoch": 0.9683426443202979,
"grad_norm": 0.09972112625837326,
"learning_rate": 0.00012339720175502642,
"loss": 0.1352,
"step": 390
},
{
"epoch": 0.9683426443202979,
"eval_loss": 0.12407374382019043,
"eval_runtime": 40.2691,
"eval_samples_per_second": 4.222,
"eval_steps_per_second": 2.111,
"step": 390
},
{
"epoch": 0.9931719428926132,
"grad_norm": 0.10841402411460876,
"learning_rate": 0.0001191636019856198,
"loss": 0.1254,
"step": 400
},
{
"epoch": 1.0173805090006207,
"grad_norm": 0.14438092708587646,
"learning_rate": 0.00011489402455585076,
"loss": 0.1321,
"step": 410
},
{
"epoch": 1.042209807572936,
"grad_norm": 0.10602834075689316,
"learning_rate": 0.00011059648515122424,
"loss": 0.1211,
"step": 420
},
{
"epoch": 1.042209807572936,
"eval_loss": 0.12403523921966553,
"eval_runtime": 40.5744,
"eval_samples_per_second": 4.19,
"eval_steps_per_second": 2.095,
"step": 420
},
{
"epoch": 1.0670391061452513,
"grad_norm": 0.10185902565717697,
"learning_rate": 0.00010627905195293135,
"loss": 0.1237,
"step": 430
},
{
"epoch": 1.0918684047175666,
"grad_norm": 0.09817427396774292,
"learning_rate": 0.00010194983049068212,
"loss": 0.1138,
"step": 440
},
{
"epoch": 1.1166977032898822,
"grad_norm": 0.14020408689975739,
"learning_rate": 9.76169484254204e-05,
"loss": 0.118,
"step": 450
},
{
"epoch": 1.1166977032898822,
"eval_loss": 0.12372539937496185,
"eval_runtime": 40.514,
"eval_samples_per_second": 4.196,
"eval_steps_per_second": 2.098,
"step": 450
},
{
"epoch": 1.1415270018621975,
"grad_norm": 0.09354697167873383,
"learning_rate": 9.328854029048984e-05,
"loss": 0.1241,
"step": 460
},
{
"epoch": 1.1663563004345128,
"grad_norm": 0.10786397010087967,
"learning_rate": 8.897273221989714e-05,
"loss": 0.1254,
"step": 470
},
{
"epoch": 1.191185599006828,
"grad_norm": 0.08708823472261429,
"learning_rate": 8.467762669234495e-05,
"loss": 0.1214,
"step": 480
},
{
"epoch": 1.191185599006828,
"eval_loss": 0.12238769233226776,
"eval_runtime": 40.5104,
"eval_samples_per_second": 4.196,
"eval_steps_per_second": 2.098,
"step": 480
},
{
"epoch": 1.2160148975791434,
"grad_norm": 0.11436637490987778,
"learning_rate": 8.041128731967444e-05,
"loss": 0.1278,
"step": 490
},
{
"epoch": 1.2408441961514587,
"grad_norm": 0.10331734269857407,
"learning_rate": 7.61817237082768e-05,
"loss": 0.1242,
"step": 500
},
{
"epoch": 1.265673494723774,
"grad_norm": 0.09123562276363373,
"learning_rate": 7.199687642189387e-05,
"loss": 0.1191,
"step": 510
},
{
"epoch": 1.265673494723774,
"eval_loss": 0.1218603253364563,
"eval_runtime": 40.5038,
"eval_samples_per_second": 4.197,
"eval_steps_per_second": 2.099,
"step": 510
},
{
"epoch": 1.2905027932960893,
"grad_norm": 0.08754425495862961,
"learning_rate": 6.786460207403978e-05,
"loss": 0.12,
"step": 520
},
{
"epoch": 1.3153320918684046,
"grad_norm": 0.08568098396062851,
"learning_rate": 6.379265857802969e-05,
"loss": 0.1205,
"step": 530
},
{
"epoch": 1.34016139044072,
"grad_norm": 0.09707140177488327,
"learning_rate": 5.9788690582308404e-05,
"loss": 0.1277,
"step": 540
},
{
"epoch": 1.34016139044072,
"eval_loss": 0.1208547055721283,
"eval_runtime": 40.5912,
"eval_samples_per_second": 4.188,
"eval_steps_per_second": 2.094,
"step": 540
},
{
"epoch": 1.3649906890130354,
"grad_norm": 0.11941556632518768,
"learning_rate": 5.586021511842136e-05,
"loss": 0.1143,
"step": 550
},
{
"epoch": 1.3898199875853507,
"grad_norm": 0.11442070454359055,
"learning_rate": 5.201460748857369e-05,
"loss": 0.1215,
"step": 560
},
{
"epoch": 1.414649286157666,
"grad_norm": 0.09243914484977722,
"learning_rate": 4.8259087419270756e-05,
"loss": 0.124,
"step": 570
},
{
"epoch": 1.414649286157666,
"eval_loss": 0.12113272398710251,
"eval_runtime": 40.5079,
"eval_samples_per_second": 4.197,
"eval_steps_per_second": 2.098,
"step": 570
},
{
"epoch": 1.4394785847299814,
"grad_norm": 0.09338078647851944,
"learning_rate": 4.460070550703612e-05,
"loss": 0.1248,
"step": 580
},
{
"epoch": 1.4643078833022967,
"grad_norm": 0.0838402733206749,
"learning_rate": 4.1046329981653086e-05,
"loss": 0.1174,
"step": 590
},
{
"epoch": 1.489137181874612,
"grad_norm": 0.09681010991334915,
"learning_rate": 3.7602633811781166e-05,
"loss": 0.1204,
"step": 600
},
{
"epoch": 1.489137181874612,
"eval_loss": 0.12003795057535172,
"eval_runtime": 40.5066,
"eval_samples_per_second": 4.197,
"eval_steps_per_second": 2.098,
"step": 600
},
{
"epoch": 1.5139664804469275,
"grad_norm": 0.10349903255701065,
"learning_rate": 3.4276082177154535e-05,
"loss": 0.1254,
"step": 610
},
{
"epoch": 1.5387957790192428,
"grad_norm": 0.07736501842737198,
"learning_rate": 3.1072920330882647e-05,
"loss": 0.1207,
"step": 620
},
{
"epoch": 1.563625077591558,
"grad_norm": 0.09067176282405853,
"learning_rate": 2.7999161874640022e-05,
"loss": 0.1286,
"step": 630
},
{
"epoch": 1.563625077591558,
"eval_loss": 0.11949945241212845,
"eval_runtime": 40.5526,
"eval_samples_per_second": 4.192,
"eval_steps_per_second": 2.096,
"step": 630
},
{
"epoch": 1.5884543761638734,
"grad_norm": 0.10042094439268112,
"learning_rate": 2.506057746875753e-05,
"loss": 0.1194,
"step": 640
},
{
"epoch": 1.6132836747361887,
"grad_norm": 0.0972597673535347,
"learning_rate": 2.226268399841055e-05,
"loss": 0.1212,
"step": 650
},
{
"epoch": 1.638112973308504,
"grad_norm": 0.09647821635007858,
"learning_rate": 1.9610734216243522e-05,
"loss": 0.1095,
"step": 660
},
{
"epoch": 1.638112973308504,
"eval_loss": 0.11894174665212631,
"eval_runtime": 40.5349,
"eval_samples_per_second": 4.194,
"eval_steps_per_second": 2.097,
"step": 660
},
{
"epoch": 1.6629422718808193,
"grad_norm": 0.06773627549409866,
"learning_rate": 1.710970688087561e-05,
"loss": 0.1194,
"step": 670
},
{
"epoch": 1.6877715704531346,
"grad_norm": 0.12803693115711212,
"learning_rate": 1.4764297409801764e-05,
"loss": 0.1175,
"step": 680
},
{
"epoch": 1.71260086902545,
"grad_norm": 0.10929796099662781,
"learning_rate": 1.2578909064236889e-05,
"loss": 0.1222,
"step": 690
},
{
"epoch": 1.71260086902545,
"eval_loss": 0.11896785348653793,
"eval_runtime": 40.5323,
"eval_samples_per_second": 4.194,
"eval_steps_per_second": 2.097,
"step": 690
},
{
"epoch": 1.7374301675977653,
"grad_norm": 0.06872426718473434,
"learning_rate": 1.0557644682453039e-05,
"loss": 0.1246,
"step": 700
},
{
"epoch": 1.7622594661700806,
"grad_norm": 0.10362172871828079,
"learning_rate": 8.70429897712921e-06,
"loss": 0.1165,
"step": 710
},
{
"epoch": 1.7870887647423959,
"grad_norm": 0.0892619714140892,
"learning_rate": 7.022351411174866e-06,
"loss": 0.1292,
"step": 720
},
{
"epoch": 1.7870887647423959,
"eval_loss": 0.11873549222946167,
"eval_runtime": 40.5842,
"eval_samples_per_second": 4.189,
"eval_steps_per_second": 2.094,
"step": 720
},
{
"epoch": 1.8119180633147114,
"grad_norm": 0.10651155561208725,
"learning_rate": 5.51495966540182e-06,
"loss": 0.1182,
"step": 730
},
{
"epoch": 1.8367473618870267,
"grad_norm": 0.09011874347925186,
"learning_rate": 4.1849537103084925e-06,
"loss": 0.1221,
"step": 740
},
{
"epoch": 1.861576660459342,
"grad_norm": 0.09894613921642303,
"learning_rate": 3.034830493105956e-06,
"loss": 0.1149,
"step": 750
},
{
"epoch": 1.861576660459342,
"eval_loss": 0.11866023391485214,
"eval_runtime": 40.5331,
"eval_samples_per_second": 4.194,
"eval_steps_per_second": 2.097,
"step": 750
},
{
"epoch": 1.8864059590316573,
"grad_norm": 0.10858767479658127,
"learning_rate": 2.066749249960498e-06,
"loss": 0.1212,
"step": 760
},
{
"epoch": 1.9112352576039728,
"grad_norm": 0.07641536742448807,
"learning_rate": 1.2825274522532792e-06,
"loss": 0.1107,
"step": 770
},
{
"epoch": 1.9360645561762881,
"grad_norm": 0.11793581396341324,
"learning_rate": 6.836373944677954e-07,
"loss": 0.1144,
"step": 780
},
{
"epoch": 1.9360645561762881,
"eval_loss": 0.1186189278960228,
"eval_runtime": 40.4999,
"eval_samples_per_second": 4.198,
"eval_steps_per_second": 2.099,
"step": 780
},
{
"epoch": 1.9608938547486034,
"grad_norm": 0.1014862135052681,
"learning_rate": 2.712034301107114e-07,
"loss": 0.1264,
"step": 790
},
{
"epoch": 1.9857231533209188,
"grad_norm": 0.09327876567840576,
"learning_rate": 4.599986085573882e-08,
"loss": 0.1182,
"step": 800
}
],
"logging_steps": 10,
"max_steps": 806,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.5263477755117363e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}