{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9857231533209188,
"eval_steps": 30,
"global_step": 800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.024829298572315334,
"grad_norm": 7.621621131896973,
"learning_rate": 2.222222222222222e-06,
"loss": 2.9278,
"step": 10
},
{
"epoch": 0.04965859714463067,
"grad_norm": 6.982970237731934,
"learning_rate": 4.691358024691358e-06,
"loss": 2.7139,
"step": 20
},
{
"epoch": 0.074487895716946,
"grad_norm": 6.744388580322266,
"learning_rate": 7.160493827160494e-06,
"loss": 2.5972,
"step": 30
},
{
"epoch": 0.074487895716946,
"eval_loss": 2.417867660522461,
"eval_runtime": 41.6844,
"eval_samples_per_second": 4.078,
"eval_steps_per_second": 2.039,
"step": 30
},
{
"epoch": 0.09931719428926133,
"grad_norm": 5.335218906402588,
"learning_rate": 9.62962962962963e-06,
"loss": 2.0383,
"step": 40
},
{
"epoch": 0.12414649286157665,
"grad_norm": 2.143411874771118,
"learning_rate": 1.2098765432098767e-05,
"loss": 1.3351,
"step": 50
},
{
"epoch": 0.148975791433892,
"grad_norm": 0.8463016748428345,
"learning_rate": 1.4567901234567903e-05,
"loss": 0.9604,
"step": 60
},
{
"epoch": 0.148975791433892,
"eval_loss": 0.9155183434486389,
"eval_runtime": 41.0238,
"eval_samples_per_second": 4.144,
"eval_steps_per_second": 2.072,
"step": 60
},
{
"epoch": 0.17380509000620734,
"grad_norm": 0.595171332359314,
"learning_rate": 1.7037037037037038e-05,
"loss": 0.7447,
"step": 70
},
{
"epoch": 0.19863438857852267,
"grad_norm": 0.5561698079109192,
"learning_rate": 1.9506172839506175e-05,
"loss": 0.6919,
"step": 80
},
{
"epoch": 0.22346368715083798,
"grad_norm": 0.5283234715461731,
"learning_rate": 1.999399199592735e-05,
"loss": 0.6338,
"step": 90
},
{
"epoch": 0.22346368715083798,
"eval_loss": 0.6309370994567871,
"eval_runtime": 41.0557,
"eval_samples_per_second": 4.141,
"eval_steps_per_second": 2.07,
"step": 90
},
{
"epoch": 0.2482929857231533,
"grad_norm": 0.5148088335990906,
"learning_rate": 1.996959685164433e-05,
"loss": 0.5407,
"step": 100
},
{
"epoch": 0.27312228429546864,
"grad_norm": 0.7455502152442932,
"learning_rate": 1.9926484830975116e-05,
"loss": 0.5309,
"step": 110
},
{
"epoch": 0.297951582867784,
"grad_norm": 0.5646942853927612,
"learning_rate": 1.986473687223383e-05,
"loss": 0.474,
"step": 120
},
{
"epoch": 0.297951582867784,
"eval_loss": 0.4833507835865021,
"eval_runtime": 41.0325,
"eval_samples_per_second": 4.143,
"eval_steps_per_second": 2.072,
"step": 120
},
{
"epoch": 0.3227808814400993,
"grad_norm": 0.5670679211616516,
"learning_rate": 1.9784468900761097e-05,
"loss": 0.4259,
"step": 130
},
{
"epoch": 0.34761018001241467,
"grad_norm": 0.7423049211502075,
"learning_rate": 1.9685831611286312e-05,
"loss": 0.412,
"step": 140
},
{
"epoch": 0.37243947858473,
"grad_norm": 0.6956289410591125,
"learning_rate": 1.9569010185014062e-05,
"loss": 0.377,
"step": 150
},
{
"epoch": 0.37243947858473,
"eval_loss": 0.38657110929489136,
"eval_runtime": 40.924,
"eval_samples_per_second": 4.154,
"eval_steps_per_second": 2.077,
"step": 150
},
{
"epoch": 0.39726877715704534,
"grad_norm": 0.8065354824066162,
"learning_rate": 1.9434223941965738e-05,
"loss": 0.3361,
"step": 160
},
{
"epoch": 0.42209807572936064,
"grad_norm": 0.7633559703826904,
"learning_rate": 1.9281725929229127e-05,
"loss": 0.3348,
"step": 170
},
{
"epoch": 0.44692737430167595,
"grad_norm": 0.7941247224807739,
"learning_rate": 1.9111802445888936e-05,
"loss": 0.2987,
"step": 180
},
{
"epoch": 0.44692737430167595,
"eval_loss": 0.3138997256755829,
"eval_runtime": 41.0146,
"eval_samples_per_second": 4.145,
"eval_steps_per_second": 2.072,
"step": 180
},
{
"epoch": 0.4717566728739913,
"grad_norm": 1.1606348752975464,
"learning_rate": 1.8924772505530177e-05,
"loss": 0.2776,
"step": 190
},
{
"epoch": 0.4965859714463066,
"grad_norm": 1.1247198581695557,
"learning_rate": 1.8720987237323497e-05,
"loss": 0.2788,
"step": 200
},
{
"epoch": 0.521415270018622,
"grad_norm": 0.8864783644676208,
"learning_rate": 1.8500829226816853e-05,
"loss": 0.2588,
"step": 210
},
{
"epoch": 0.521415270018622,
"eval_loss": 0.26379144191741943,
"eval_runtime": 41.1029,
"eval_samples_per_second": 4.136,
"eval_steps_per_second": 2.068,
"step": 210
},
{
"epoch": 0.5462445685909373,
"grad_norm": 0.9481696486473083,
"learning_rate": 1.826471179767111e-05,
"loss": 0.2359,
"step": 220
},
{
"epoch": 0.5710738671632526,
"grad_norm": 1.218240737915039,
"learning_rate": 1.801307823568806e-05,
"loss": 0.2406,
"step": 230
},
{
"epoch": 0.595903165735568,
"grad_norm": 0.9764422178268433,
"learning_rate": 1.7746400956587653e-05,
"loss": 0.2241,
"step": 240
},
{
"epoch": 0.595903165735568,
"eval_loss": 0.2281169593334198,
"eval_runtime": 41.0529,
"eval_samples_per_second": 4.141,
"eval_steps_per_second": 2.071,
"step": 240
},
{
"epoch": 0.6207324643078833,
"grad_norm": 1.1048036813735962,
"learning_rate": 1.7465180619096834e-05,
"loss": 0.2199,
"step": 250
},
{
"epoch": 0.6455617628801986,
"grad_norm": 0.973822832107544,
"learning_rate": 1.7169945185015106e-05,
"loss": 0.2025,
"step": 260
},
{
"epoch": 0.6703910614525139,
"grad_norm": 0.758234977722168,
"learning_rate": 1.686124892802141e-05,
"loss": 0.1932,
"step": 270
},
{
"epoch": 0.6703910614525139,
"eval_loss": 0.20496493577957153,
"eval_runtime": 41.1068,
"eval_samples_per_second": 4.136,
"eval_steps_per_second": 2.068,
"step": 270
},
{
"epoch": 0.6952203600248293,
"grad_norm": 1.2279834747314453,
"learning_rate": 1.6539671393083218e-05,
"loss": 0.1943,
"step": 280
},
{
"epoch": 0.7200496585971446,
"grad_norm": 0.7134841084480286,
"learning_rate": 1.6205816308421386e-05,
"loss": 0.1796,
"step": 290
},
{
"epoch": 0.74487895716946,
"grad_norm": 0.9929455518722534,
"learning_rate": 1.586031045207354e-05,
"loss": 0.1874,
"step": 300
},
{
"epoch": 0.74487895716946,
"eval_loss": 0.19150954484939575,
"eval_runtime": 41.0713,
"eval_samples_per_second": 4.139,
"eval_steps_per_second": 2.07,
"step": 300
},
{
"epoch": 0.7697082557417753,
"grad_norm": 0.8576317429542542,
"learning_rate": 1.5503802475183773e-05,
"loss": 0.1852,
"step": 310
},
{
"epoch": 0.7945375543140907,
"grad_norm": 1.0153837203979492,
"learning_rate": 1.5136961684227905e-05,
"loss": 0.1814,
"step": 320
},
{
"epoch": 0.819366852886406,
"grad_norm": 0.8694589734077454,
"learning_rate": 1.4760476784460514e-05,
"loss": 0.1841,
"step": 330
},
{
"epoch": 0.819366852886406,
"eval_loss": 0.17898762226104736,
"eval_runtime": 41.0055,
"eval_samples_per_second": 4.146,
"eval_steps_per_second": 2.073,
"step": 330
},
{
"epoch": 0.8441961514587213,
"grad_norm": 1.1345281600952148,
"learning_rate": 1.4375054586942771e-05,
"loss": 0.1725,
"step": 340
},
{
"epoch": 0.8690254500310366,
"grad_norm": 1.0460193157196045,
"learning_rate": 1.3981418681578546e-05,
"loss": 0.1605,
"step": 350
},
{
"epoch": 0.8938547486033519,
"grad_norm": 1.0834463834762573,
"learning_rate": 1.3580308078649948e-05,
"loss": 0.1652,
"step": 360
},
{
"epoch": 0.8938547486033519,
"eval_loss": 0.1723683923482895,
"eval_runtime": 41.0124,
"eval_samples_per_second": 4.145,
"eval_steps_per_second": 2.073,
"step": 360
},
{
"epoch": 0.9186840471756673,
"grad_norm": 0.7139394283294678,
"learning_rate": 1.3172475821402748e-05,
"loss": 0.1742,
"step": 370
},
{
"epoch": 0.9435133457479826,
"grad_norm": 0.8907492756843567,
"learning_rate": 1.2758687572286367e-05,
"loss": 0.154,
"step": 380
},
{
"epoch": 0.9683426443202979,
"grad_norm": 0.7731947302818298,
"learning_rate": 1.2339720175502643e-05,
"loss": 0.1627,
"step": 390
},
{
"epoch": 0.9683426443202979,
"eval_loss": 0.16476133465766907,
"eval_runtime": 41.0851,
"eval_samples_per_second": 4.138,
"eval_steps_per_second": 2.069,
"step": 390
},
{
"epoch": 0.9931719428926132,
"grad_norm": 0.8102223873138428,
"learning_rate": 1.191636019856198e-05,
"loss": 0.1546,
"step": 400
},
{
"epoch": 1.0173805090006207,
"grad_norm": 0.9590178728103638,
"learning_rate": 1.1489402455585078e-05,
"loss": 0.1635,
"step": 410
},
{
"epoch": 1.042209807572936,
"grad_norm": 1.270085334777832,
"learning_rate": 1.1059648515122426e-05,
"loss": 0.1578,
"step": 420
},
{
"epoch": 1.042209807572936,
"eval_loss": 0.16066311299800873,
"eval_runtime": 41.0613,
"eval_samples_per_second": 4.14,
"eval_steps_per_second": 2.07,
"step": 420
},
{
"epoch": 1.0670391061452513,
"grad_norm": 0.7239245176315308,
"learning_rate": 1.0627905195293135e-05,
"loss": 0.1509,
"step": 430
},
{
"epoch": 1.0918684047175666,
"grad_norm": 1.2316311597824097,
"learning_rate": 1.0194983049068212e-05,
"loss": 0.1493,
"step": 440
},
{
"epoch": 1.1166977032898822,
"grad_norm": 1.034386157989502,
"learning_rate": 9.761694842542042e-06,
"loss": 0.1427,
"step": 450
},
{
"epoch": 1.1166977032898822,
"eval_loss": 0.15721318125724792,
"eval_runtime": 40.9543,
"eval_samples_per_second": 4.151,
"eval_steps_per_second": 2.075,
"step": 450
},
{
"epoch": 1.1415270018621975,
"grad_norm": 1.0646947622299194,
"learning_rate": 9.328854029048985e-06,
"loss": 0.1528,
"step": 460
},
{
"epoch": 1.1663563004345128,
"grad_norm": 0.8461800813674927,
"learning_rate": 8.897273221989715e-06,
"loss": 0.1505,
"step": 470
},
{
"epoch": 1.191185599006828,
"grad_norm": 0.6844385862350464,
"learning_rate": 8.467762669234496e-06,
"loss": 0.1472,
"step": 480
},
{
"epoch": 1.191185599006828,
"eval_loss": 0.15322649478912354,
"eval_runtime": 41.0165,
"eval_samples_per_second": 4.145,
"eval_steps_per_second": 2.072,
"step": 480
},
{
"epoch": 1.2160148975791434,
"grad_norm": 0.8525738716125488,
"learning_rate": 8.041128731967445e-06,
"loss": 0.1519,
"step": 490
},
{
"epoch": 1.2408441961514587,
"grad_norm": 0.7886703014373779,
"learning_rate": 7.61817237082768e-06,
"loss": 0.1519,
"step": 500
},
{
"epoch": 1.265673494723774,
"grad_norm": 0.7268862128257751,
"learning_rate": 7.199687642189388e-06,
"loss": 0.142,
"step": 510
},
{
"epoch": 1.265673494723774,
"eval_loss": 0.1511753350496292,
"eval_runtime": 41.1138,
"eval_samples_per_second": 4.135,
"eval_steps_per_second": 2.067,
"step": 510
},
{
"epoch": 1.2905027932960893,
"grad_norm": 0.7751151323318481,
"learning_rate": 6.7864602074039775e-06,
"loss": 0.1471,
"step": 520
},
{
"epoch": 1.3153320918684046,
"grad_norm": 0.8264702558517456,
"learning_rate": 6.37926585780297e-06,
"loss": 0.1438,
"step": 530
},
{
"epoch": 1.34016139044072,
"grad_norm": 0.5469579100608826,
"learning_rate": 5.978869058230841e-06,
"loss": 0.1493,
"step": 540
},
{
"epoch": 1.34016139044072,
"eval_loss": 0.1491098254919052,
"eval_runtime": 41.1584,
"eval_samples_per_second": 4.13,
"eval_steps_per_second": 2.065,
"step": 540
},
{
"epoch": 1.3649906890130354,
"grad_norm": 0.9022724032402039,
"learning_rate": 5.586021511842136e-06,
"loss": 0.1371,
"step": 550
},
{
"epoch": 1.3898199875853507,
"grad_norm": 0.8355094790458679,
"learning_rate": 5.201460748857369e-06,
"loss": 0.1409,
"step": 560
},
{
"epoch": 1.414649286157666,
"grad_norm": 0.7820518016815186,
"learning_rate": 4.825908741927076e-06,
"loss": 0.1417,
"step": 570
},
{
"epoch": 1.414649286157666,
"eval_loss": 0.14838995039463043,
"eval_runtime": 41.137,
"eval_samples_per_second": 4.133,
"eval_steps_per_second": 2.066,
"step": 570
},
{
"epoch": 1.4394785847299814,
"grad_norm": 0.7794449925422668,
"learning_rate": 4.4600705507036125e-06,
"loss": 0.1433,
"step": 580
},
{
"epoch": 1.4643078833022967,
"grad_norm": 0.7465994358062744,
"learning_rate": 4.104632998165309e-06,
"loss": 0.1445,
"step": 590
},
{
"epoch": 1.489137181874612,
"grad_norm": 0.8334828615188599,
"learning_rate": 3.7602633811781165e-06,
"loss": 0.1458,
"step": 600
},
{
"epoch": 1.489137181874612,
"eval_loss": 0.1462916135787964,
"eval_runtime": 41.086,
"eval_samples_per_second": 4.138,
"eval_steps_per_second": 2.069,
"step": 600
},
{
"epoch": 1.5139664804469275,
"grad_norm": 0.8605223894119263,
"learning_rate": 3.4276082177154536e-06,
"loss": 0.1447,
"step": 610
},
{
"epoch": 1.5387957790192428,
"grad_norm": 0.5568763017654419,
"learning_rate": 3.107292033088265e-06,
"loss": 0.1384,
"step": 620
},
{
"epoch": 1.563625077591558,
"grad_norm": 0.7917467951774597,
"learning_rate": 2.7999161874640026e-06,
"loss": 0.1481,
"step": 630
},
{
"epoch": 1.563625077591558,
"eval_loss": 0.14508940279483795,
"eval_runtime": 41.1236,
"eval_samples_per_second": 4.134,
"eval_steps_per_second": 2.067,
"step": 630
},
{
"epoch": 1.5884543761638734,
"grad_norm": 0.7165929675102234,
"learning_rate": 2.506057746875753e-06,
"loss": 0.1422,
"step": 640
},
{
"epoch": 1.6132836747361887,
"grad_norm": 0.8443304896354675,
"learning_rate": 2.226268399841055e-06,
"loss": 0.1406,
"step": 650
},
{
"epoch": 1.638112973308504,
"grad_norm": 1.0195279121398926,
"learning_rate": 1.961073421624352e-06,
"loss": 0.1403,
"step": 660
},
{
"epoch": 1.638112973308504,
"eval_loss": 0.1443227380514145,
"eval_runtime": 41.1177,
"eval_samples_per_second": 4.134,
"eval_steps_per_second": 2.067,
"step": 660
},
{
"epoch": 1.6629422718808193,
"grad_norm": 0.7534502148628235,
"learning_rate": 1.710970688087561e-06,
"loss": 0.1398,
"step": 670
},
{
"epoch": 1.6877715704531346,
"grad_norm": 0.8134092092514038,
"learning_rate": 1.4764297409801764e-06,
"loss": 0.1377,
"step": 680
},
{
"epoch": 1.71260086902545,
"grad_norm": 0.7144195437431335,
"learning_rate": 1.2578909064236887e-06,
"loss": 0.1457,
"step": 690
},
{
"epoch": 1.71260086902545,
"eval_loss": 0.1440444439649582,
"eval_runtime": 41.1467,
"eval_samples_per_second": 4.132,
"eval_steps_per_second": 2.066,
"step": 690
},
{
"epoch": 1.7374301675977653,
"grad_norm": 0.5405588150024414,
"learning_rate": 1.055764468245304e-06,
"loss": 0.1406,
"step": 700
},
{
"epoch": 1.7622594661700806,
"grad_norm": 0.6921040415763855,
"learning_rate": 8.70429897712921e-07,
"loss": 0.1366,
"step": 710
},
{
"epoch": 1.7870887647423959,
"grad_norm": 0.7384780645370483,
"learning_rate": 7.022351411174866e-07,
"loss": 0.1473,
"step": 720
},
{
"epoch": 1.7870887647423959,
"eval_loss": 0.1436140090227127,
"eval_runtime": 41.1175,
"eval_samples_per_second": 4.134,
"eval_steps_per_second": 2.067,
"step": 720
},
{
"epoch": 1.8119180633147114,
"grad_norm": 0.7989315986633301,
"learning_rate": 5.51495966540182e-07,
"loss": 0.1368,
"step": 730
},
{
"epoch": 1.8367473618870267,
"grad_norm": 0.8284072875976562,
"learning_rate": 4.1849537103084924e-07,
"loss": 0.1445,
"step": 740
},
{
"epoch": 1.861576660459342,
"grad_norm": 0.8498286604881287,
"learning_rate": 3.0348304931059556e-07,
"loss": 0.1347,
"step": 750
},
{
"epoch": 1.861576660459342,
"eval_loss": 0.14337773621082306,
"eval_runtime": 41.0893,
"eval_samples_per_second": 4.137,
"eval_steps_per_second": 2.069,
"step": 750
},
{
"epoch": 1.8864059590316573,
"grad_norm": 0.7535120844841003,
"learning_rate": 2.066749249960498e-07,
"loss": 0.1422,
"step": 760
},
{
"epoch": 1.9112352576039728,
"grad_norm": 0.8711650371551514,
"learning_rate": 1.2825274522532795e-07,
"loss": 0.129,
"step": 770
},
{
"epoch": 1.9360645561762881,
"grad_norm": 0.751017689704895,
"learning_rate": 6.836373944677954e-08,
"loss": 0.1423,
"step": 780
},
{
"epoch": 1.9360645561762881,
"eval_loss": 0.1432679146528244,
"eval_runtime": 41.1344,
"eval_samples_per_second": 4.133,
"eval_steps_per_second": 2.066,
"step": 780
},
{
"epoch": 1.9608938547486034,
"grad_norm": 0.7860766053199768,
"learning_rate": 2.7120343011071138e-08,
"loss": 0.1492,
"step": 790
},
{
"epoch": 1.9857231533209188,
"grad_norm": 0.7165619134902954,
"learning_rate": 4.599986085573882e-09,
"loss": 0.138,
"step": 800
}
],
"logging_steps": 10,
"max_steps": 806,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.4861828204940288e+17,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}