{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9860291834833903,
"eval_steps": 30,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024837007140639553,
"grad_norm": 5.0680694580078125,
"learning_rate": 2.2222222222222223e-05,
"loss": 2.4793,
"step": 10
},
{
"epoch": 0.04967401428127911,
"grad_norm": 0.6718662977218628,
"learning_rate": 4.691358024691358e-05,
"loss": 0.8831,
"step": 20
},
{
"epoch": 0.07451102142191866,
"grad_norm": 0.7071403861045837,
"learning_rate": 7.160493827160494e-05,
"loss": 0.5408,
"step": 30
},
{
"epoch": 0.07451102142191866,
"eval_loss": 0.4726891815662384,
"eval_runtime": 38.6732,
"eval_samples_per_second": 4.396,
"eval_steps_per_second": 4.396,
"step": 30
},
{
"epoch": 0.09934802856255821,
"grad_norm": 0.5646480321884155,
"learning_rate": 9.62962962962963e-05,
"loss": 0.3935,
"step": 40
},
{
"epoch": 0.12418503570319776,
"grad_norm": 0.5533119440078735,
"learning_rate": 0.00012098765432098766,
"loss": 0.3001,
"step": 50
},
{
"epoch": 0.14902204284383733,
"grad_norm": 0.4143104553222656,
"learning_rate": 0.00014567901234567902,
"loss": 0.2543,
"step": 60
},
{
"epoch": 0.14902204284383733,
"eval_loss": 0.2523055374622345,
"eval_runtime": 38.0335,
"eval_samples_per_second": 4.47,
"eval_steps_per_second": 4.47,
"step": 60
},
{
"epoch": 0.17385904998447688,
"grad_norm": 0.47011032700538635,
"learning_rate": 0.00017037037037037037,
"loss": 0.2306,
"step": 70
},
{
"epoch": 0.19869605712511643,
"grad_norm": 0.6400521397590637,
"learning_rate": 0.00019506172839506175,
"loss": 0.2104,
"step": 80
},
{
"epoch": 0.22353306426575598,
"grad_norm": 0.6687613129615784,
"learning_rate": 0.0001999399199592735,
"loss": 0.19,
"step": 90
},
{
"epoch": 0.22353306426575598,
"eval_loss": 0.18850156664848328,
"eval_runtime": 38.0935,
"eval_samples_per_second": 4.463,
"eval_steps_per_second": 4.463,
"step": 90
},
{
"epoch": 0.24837007140639553,
"grad_norm": 0.24749410152435303,
"learning_rate": 0.00019969596851644327,
"loss": 0.1835,
"step": 100
},
{
"epoch": 0.2732070785470351,
"grad_norm": 0.43560871481895447,
"learning_rate": 0.00019926484830975113,
"loss": 0.1577,
"step": 110
},
{
"epoch": 0.29804408568767465,
"grad_norm": 0.6888434290885925,
"learning_rate": 0.0001986473687223383,
"loss": 0.1585,
"step": 120
},
{
"epoch": 0.29804408568767465,
"eval_loss": 0.15052124857902527,
"eval_runtime": 38.1444,
"eval_samples_per_second": 4.457,
"eval_steps_per_second": 4.457,
"step": 120
},
{
"epoch": 0.3228810928283142,
"grad_norm": 0.2602541446685791,
"learning_rate": 0.00019784468900761095,
"loss": 0.1548,
"step": 130
},
{
"epoch": 0.34771809996895375,
"grad_norm": 0.23804056644439697,
"learning_rate": 0.0001968583161128631,
"loss": 0.1581,
"step": 140
},
{
"epoch": 0.3725551071095933,
"grad_norm": 0.20728255808353424,
"learning_rate": 0.00019569010185014062,
"loss": 0.1487,
"step": 150
},
{
"epoch": 0.3725551071095933,
"eval_loss": 0.14161638915538788,
"eval_runtime": 38.1143,
"eval_samples_per_second": 4.46,
"eval_steps_per_second": 4.46,
"step": 150
},
{
"epoch": 0.39739211425023285,
"grad_norm": 0.2241806834936142,
"learning_rate": 0.00019434223941965738,
"loss": 0.1521,
"step": 160
},
{
"epoch": 0.4222291213908724,
"grad_norm": 0.24058522284030914,
"learning_rate": 0.00019281725929229127,
"loss": 0.1438,
"step": 170
},
{
"epoch": 0.44706612853151195,
"grad_norm": 0.17390510439872742,
"learning_rate": 0.00019111802445888936,
"loss": 0.1513,
"step": 180
},
{
"epoch": 0.44706612853151195,
"eval_loss": 0.1333342343568802,
"eval_runtime": 38.1525,
"eval_samples_per_second": 4.456,
"eval_steps_per_second": 4.456,
"step": 180
},
{
"epoch": 0.47190313567215153,
"grad_norm": 0.1597527265548706,
"learning_rate": 0.00018924772505530174,
"loss": 0.1416,
"step": 190
},
{
"epoch": 0.49674014281279105,
"grad_norm": 0.16071690618991852,
"learning_rate": 0.000187209872373235,
"loss": 0.1437,
"step": 200
},
{
"epoch": 0.5215771499534306,
"grad_norm": 0.15209132432937622,
"learning_rate": 0.00018500829226816853,
"loss": 0.1421,
"step": 210
},
{
"epoch": 0.5215771499534306,
"eval_loss": 0.13083834946155548,
"eval_runtime": 38.1005,
"eval_samples_per_second": 4.462,
"eval_steps_per_second": 4.462,
"step": 210
},
{
"epoch": 0.5464141570940702,
"grad_norm": 0.17985881865024567,
"learning_rate": 0.0001826471179767111,
"loss": 0.1395,
"step": 220
},
{
"epoch": 0.5712511642347097,
"grad_norm": 0.1529396027326584,
"learning_rate": 0.0001801307823568806,
"loss": 0.1443,
"step": 230
},
{
"epoch": 0.5960881713753493,
"grad_norm": 0.14357714354991913,
"learning_rate": 0.00017746400956587653,
"loss": 0.1478,
"step": 240
},
{
"epoch": 0.5960881713753493,
"eval_loss": 0.12620185315608978,
"eval_runtime": 38.1161,
"eval_samples_per_second": 4.46,
"eval_steps_per_second": 4.46,
"step": 240
},
{
"epoch": 0.6209251785159888,
"grad_norm": 0.15918347239494324,
"learning_rate": 0.00017465180619096832,
"loss": 0.1338,
"step": 250
},
{
"epoch": 0.6457621856566284,
"grad_norm": 0.25342991948127747,
"learning_rate": 0.00017169945185015106,
"loss": 0.1353,
"step": 260
},
{
"epoch": 0.670599192797268,
"grad_norm": 0.1387968510389328,
"learning_rate": 0.00016861248928021411,
"loss": 0.1507,
"step": 270
},
{
"epoch": 0.670599192797268,
"eval_loss": 0.12623676657676697,
"eval_runtime": 38.5722,
"eval_samples_per_second": 4.407,
"eval_steps_per_second": 4.407,
"step": 270
},
{
"epoch": 0.6954361999379075,
"grad_norm": 0.1433294266462326,
"learning_rate": 0.00016539671393083215,
"loss": 0.1342,
"step": 280
},
{
"epoch": 0.720273207078547,
"grad_norm": 0.16263321042060852,
"learning_rate": 0.00016205816308421386,
"loss": 0.1361,
"step": 290
},
{
"epoch": 0.7451102142191866,
"grad_norm": 0.15614983439445496,
"learning_rate": 0.0001586031045207354,
"loss": 0.1457,
"step": 300
},
{
"epoch": 0.7451102142191866,
"eval_loss": 0.1330355405807495,
"eval_runtime": 38.7441,
"eval_samples_per_second": 4.388,
"eval_steps_per_second": 4.388,
"step": 300
},
{
"epoch": 0.7699472213598262,
"grad_norm": 0.13199648261070251,
"learning_rate": 0.00015503802475183773,
"loss": 0.1429,
"step": 310
},
{
"epoch": 0.7947842285004657,
"grad_norm": 0.11272416263818741,
"learning_rate": 0.00015136961684227904,
"loss": 0.1402,
"step": 320
},
{
"epoch": 0.8196212356411052,
"grad_norm": 0.19195938110351562,
"learning_rate": 0.00014760476784460514,
"loss": 0.144,
"step": 330
},
{
"epoch": 0.8196212356411052,
"eval_loss": 0.12694701552391052,
"eval_runtime": 38.5526,
"eval_samples_per_second": 4.41,
"eval_steps_per_second": 4.41,
"step": 330
},
{
"epoch": 0.8444582427817448,
"grad_norm": 0.1164827048778534,
"learning_rate": 0.0001437505458694277,
"loss": 0.1404,
"step": 340
},
{
"epoch": 0.8692952499223844,
"grad_norm": 0.15164071321487427,
"learning_rate": 0.00013981418681578546,
"loss": 0.1332,
"step": 350
},
{
"epoch": 0.8941322570630239,
"grad_norm": 0.11416257917881012,
"learning_rate": 0.0001358030807864995,
"loss": 0.1333,
"step": 360
},
{
"epoch": 0.8941322570630239,
"eval_loss": 0.12437459081411362,
"eval_runtime": 38.4955,
"eval_samples_per_second": 4.416,
"eval_steps_per_second": 4.416,
"step": 360
},
{
"epoch": 0.9189692642036634,
"grad_norm": 0.13110032677650452,
"learning_rate": 0.00013172475821402748,
"loss": 0.147,
"step": 370
},
{
"epoch": 0.9438062713443031,
"grad_norm": 0.09652693569660187,
"learning_rate": 0.00012758687572286367,
"loss": 0.1333,
"step": 380
},
{
"epoch": 0.9686432784849426,
"grad_norm": 0.09613075852394104,
"learning_rate": 0.00012339720175502642,
"loss": 0.1402,
"step": 390
},
{
"epoch": 0.9686432784849426,
"eval_loss": 0.12314330041408539,
"eval_runtime": 38.4882,
"eval_samples_per_second": 4.417,
"eval_steps_per_second": 4.417,
"step": 390
},
{
"epoch": 0.9934802856255821,
"grad_norm": 0.10631036758422852,
"learning_rate": 0.0001191636019856198,
"loss": 0.1287,
"step": 400
},
{
"epoch": 1.0173859049984477,
"grad_norm": 0.46618208289146423,
"learning_rate": 0.00011489402455585076,
"loss": 0.138,
"step": 410
},
{
"epoch": 1.0422229121390871,
"grad_norm": 0.30142149329185486,
"learning_rate": 0.00011059648515122424,
"loss": 0.1365,
"step": 420
},
{
"epoch": 1.0422229121390871,
"eval_loss": 0.12314148247241974,
"eval_runtime": 38.5184,
"eval_samples_per_second": 4.413,
"eval_steps_per_second": 4.413,
"step": 420
},
{
"epoch": 1.0670599192797268,
"grad_norm": 0.3135342597961426,
"learning_rate": 0.00010627905195293135,
"loss": 0.1328,
"step": 430
},
{
"epoch": 1.0918969264203664,
"grad_norm": 0.0945952981710434,
"learning_rate": 0.00010194983049068212,
"loss": 0.134,
"step": 440
},
{
"epoch": 1.1167339335610058,
"grad_norm": 0.1430775374174118,
"learning_rate": 9.76169484254204e-05,
"loss": 0.1201,
"step": 450
},
{
"epoch": 1.1167339335610058,
"eval_loss": 0.1227191910147667,
"eval_runtime": 38.5653,
"eval_samples_per_second": 4.408,
"eval_steps_per_second": 4.408,
"step": 450
},
{
"epoch": 1.1415709407016454,
"grad_norm": 0.09269551187753677,
"learning_rate": 9.328854029048984e-05,
"loss": 0.1305,
"step": 460
},
{
"epoch": 1.166407947842285,
"grad_norm": 0.10537844151258469,
"learning_rate": 8.897273221989714e-05,
"loss": 0.1315,
"step": 470
},
{
"epoch": 1.1912449549829245,
"grad_norm": 0.08408211916685104,
"learning_rate": 8.467762669234495e-05,
"loss": 0.1294,
"step": 480
},
{
"epoch": 1.1912449549829245,
"eval_loss": 0.12096220254898071,
"eval_runtime": 38.4954,
"eval_samples_per_second": 4.416,
"eval_steps_per_second": 4.416,
"step": 480
},
{
"epoch": 1.2160819621235641,
"grad_norm": 0.1160721555352211,
"learning_rate": 8.041128731967444e-05,
"loss": 0.1341,
"step": 490
},
{
"epoch": 1.2409189692642038,
"grad_norm": 0.09623311460018158,
"learning_rate": 7.61817237082768e-05,
"loss": 0.1321,
"step": 500
},
{
"epoch": 1.2657559764048432,
"grad_norm": 0.3000585436820984,
"learning_rate": 7.199687642189387e-05,
"loss": 0.1249,
"step": 510
},
{
"epoch": 1.2657559764048432,
"eval_loss": 0.12054261565208435,
"eval_runtime": 38.4372,
"eval_samples_per_second": 4.423,
"eval_steps_per_second": 4.423,
"step": 510
},
{
"epoch": 1.2905929835454828,
"grad_norm": 0.0921676829457283,
"learning_rate": 6.786460207403978e-05,
"loss": 0.1294,
"step": 520
},
{
"epoch": 1.3154299906861224,
"grad_norm": 0.08408155292272568,
"learning_rate": 6.379265857802969e-05,
"loss": 0.1318,
"step": 530
},
{
"epoch": 1.3402669978267618,
"grad_norm": 0.19716006517410278,
"learning_rate": 5.9788690582308404e-05,
"loss": 0.1305,
"step": 540
},
{
"epoch": 1.3402669978267618,
"eval_loss": 0.12028669565916061,
"eval_runtime": 38.6144,
"eval_samples_per_second": 4.402,
"eval_steps_per_second": 4.402,
"step": 540
},
{
"epoch": 1.3651040049674015,
"grad_norm": 0.14016658067703247,
"learning_rate": 5.586021511842136e-05,
"loss": 0.1271,
"step": 550
},
{
"epoch": 1.389941012108041,
"grad_norm": 0.11117364466190338,
"learning_rate": 5.201460748857369e-05,
"loss": 0.1306,
"step": 560
},
{
"epoch": 1.4147780192486805,
"grad_norm": 0.09563940018415451,
"learning_rate": 4.8259087419270756e-05,
"loss": 0.1302,
"step": 570
},
{
"epoch": 1.4147780192486805,
"eval_loss": 0.12124165892601013,
"eval_runtime": 38.671,
"eval_samples_per_second": 4.396,
"eval_steps_per_second": 4.396,
"step": 570
},
{
"epoch": 1.4396150263893202,
"grad_norm": 0.08976716548204422,
"learning_rate": 4.460070550703612e-05,
"loss": 0.1332,
"step": 580
},
{
"epoch": 1.4644520335299596,
"grad_norm": 0.0839507058262825,
"learning_rate": 4.1046329981653086e-05,
"loss": 0.1276,
"step": 590
},
{
"epoch": 1.4892890406705992,
"grad_norm": 0.09810927510261536,
"learning_rate": 3.7602633811781166e-05,
"loss": 0.1298,
"step": 600
},
{
"epoch": 1.4892890406705992,
"eval_loss": 0.11950553208589554,
"eval_runtime": 38.7353,
"eval_samples_per_second": 4.389,
"eval_steps_per_second": 4.389,
"step": 600
},
{
"epoch": 1.5141260478112386,
"grad_norm": 0.10148163884878159,
"learning_rate": 3.4276082177154535e-05,
"loss": 0.1303,
"step": 610
},
{
"epoch": 1.5389630549518785,
"grad_norm": 0.10012848675251007,
"learning_rate": 3.1072920330882647e-05,
"loss": 0.128,
"step": 620
},
{
"epoch": 1.5638000620925179,
"grad_norm": 0.09456542879343033,
"learning_rate": 2.7999161874640022e-05,
"loss": 0.1293,
"step": 630
},
{
"epoch": 1.5638000620925179,
"eval_loss": 0.11886715888977051,
"eval_runtime": 38.5823,
"eval_samples_per_second": 4.406,
"eval_steps_per_second": 4.406,
"step": 630
},
{
"epoch": 1.5886370692331573,
"grad_norm": 0.09369179606437683,
"learning_rate": 2.506057746875753e-05,
"loss": 0.1299,
"step": 640
},
{
"epoch": 1.613474076373797,
"grad_norm": 0.09706632047891617,
"learning_rate": 2.226268399841055e-05,
"loss": 0.1244,
"step": 650
},
{
"epoch": 1.6383110835144366,
"grad_norm": 0.08659979701042175,
"learning_rate": 1.9610734216243522e-05,
"loss": 0.1166,
"step": 660
},
{
"epoch": 1.6383110835144366,
"eval_loss": 0.11844287067651749,
"eval_runtime": 38.4977,
"eval_samples_per_second": 4.416,
"eval_steps_per_second": 4.416,
"step": 660
},
{
"epoch": 1.663148090655076,
"grad_norm": 0.0770510733127594,
"learning_rate": 1.710970688087561e-05,
"loss": 0.1284,
"step": 670
},
{
"epoch": 1.6879850977957156,
"grad_norm": 0.10163529217243195,
"learning_rate": 1.4764297409801764e-05,
"loss": 0.1298,
"step": 680
},
{
"epoch": 1.7128221049363552,
"grad_norm": 0.10533007979393005,
"learning_rate": 1.2578909064236889e-05,
"loss": 0.1282,
"step": 690
},
{
"epoch": 1.7128221049363552,
"eval_loss": 0.11826732009649277,
"eval_runtime": 38.6249,
"eval_samples_per_second": 4.401,
"eval_steps_per_second": 4.401,
"step": 690
},
{
"epoch": 1.7376591120769946,
"grad_norm": 0.07278448343276978,
"learning_rate": 1.0557644682453039e-05,
"loss": 0.1292,
"step": 700
},
{
"epoch": 1.7624961192176343,
"grad_norm": 0.09515662491321564,
"learning_rate": 8.70429897712921e-06,
"loss": 0.1236,
"step": 710
},
{
"epoch": 1.787333126358274,
"grad_norm": 0.08752795308828354,
"learning_rate": 7.022351411174866e-06,
"loss": 0.1353,
"step": 720
},
{
"epoch": 1.787333126358274,
"eval_loss": 0.11793459951877594,
"eval_runtime": 38.65,
"eval_samples_per_second": 4.398,
"eval_steps_per_second": 4.398,
"step": 720
},
{
"epoch": 1.8121701334989133,
"grad_norm": 0.10476204007863998,
"learning_rate": 5.51495966540182e-06,
"loss": 0.1279,
"step": 730
},
{
"epoch": 1.837007140639553,
"grad_norm": 0.11006776243448257,
"learning_rate": 4.1849537103084925e-06,
"loss": 0.1286,
"step": 740
},
{
"epoch": 1.8618441477801926,
"grad_norm": 0.0957954004406929,
"learning_rate": 3.034830493105956e-06,
"loss": 0.1292,
"step": 750
},
{
"epoch": 1.8618441477801926,
"eval_loss": 0.11800022423267365,
"eval_runtime": 38.6325,
"eval_samples_per_second": 4.4,
"eval_steps_per_second": 4.4,
"step": 750
},
{
"epoch": 1.886681154920832,
"grad_norm": 0.09181220084428787,
"learning_rate": 2.066749249960498e-06,
"loss": 0.1249,
"step": 760
},
{
"epoch": 1.9115181620614716,
"grad_norm": 0.08642429113388062,
"learning_rate": 1.2825274522532792e-06,
"loss": 0.1218,
"step": 770
},
{
"epoch": 1.9363551692021113,
"grad_norm": 0.0925707072019577,
"learning_rate": 6.836373944677954e-07,
"loss": 0.1299,
"step": 780
},
{
"epoch": 1.9363551692021113,
"eval_loss": 0.1179690733551979,
"eval_runtime": 38.5796,
"eval_samples_per_second": 4.406,
"eval_steps_per_second": 4.406,
"step": 780
},
{
"epoch": 1.9611921763427507,
"grad_norm": 0.09501737356185913,
"learning_rate": 2.712034301107114e-07,
"loss": 0.1289,
"step": 790
},
{
"epoch": 1.9860291834833903,
"grad_norm": 0.09520602226257324,
"learning_rate": 4.599986085573882e-08,
"loss": 0.128,
"step": 800
}
],
"logging_steps": 10,
"max_steps": 806,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.2428317974302515e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}