{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.761904761904763,
"eval_steps": 500,
"global_step": 390,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.12698412698412698,
"grad_norm": 0.296919047832489,
"learning_rate": 6.41025641025641e-06,
"loss": 3.8313,
"step": 5
},
{
"epoch": 0.25396825396825395,
"grad_norm": 0.33771219849586487,
"learning_rate": 1.282051282051282e-05,
"loss": 3.8502,
"step": 10
},
{
"epoch": 0.38095238095238093,
"grad_norm": 0.41062507033348083,
"learning_rate": 1.923076923076923e-05,
"loss": 3.8537,
"step": 15
},
{
"epoch": 0.5079365079365079,
"grad_norm": 0.5127325057983398,
"learning_rate": 2.564102564102564e-05,
"loss": 3.6949,
"step": 20
},
{
"epoch": 0.6349206349206349,
"grad_norm": 0.5786504149436951,
"learning_rate": 3.205128205128206e-05,
"loss": 3.5371,
"step": 25
},
{
"epoch": 0.7619047619047619,
"grad_norm": 0.6281372904777527,
"learning_rate": 3.846153846153846e-05,
"loss": 3.3822,
"step": 30
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.677841305732727,
"learning_rate": 4.4871794871794874e-05,
"loss": 3.0836,
"step": 35
},
{
"epoch": 1.0,
"grad_norm": 0.7423370480537415,
"learning_rate": 4.999899863449631e-05,
"loss": 2.5068,
"step": 40
},
{
"epoch": 1.126984126984127,
"grad_norm": 0.6677795648574829,
"learning_rate": 4.9963959264103544e-05,
"loss": 1.9914,
"step": 45
},
{
"epoch": 1.253968253968254,
"grad_norm": 0.7547211050987244,
"learning_rate": 4.98789318082748e-05,
"loss": 1.7329,
"step": 50
},
{
"epoch": 1.380952380952381,
"grad_norm": 0.6714099645614624,
"learning_rate": 4.974408652685072e-05,
"loss": 1.4401,
"step": 55
},
{
"epoch": 1.507936507936508,
"grad_norm": 0.6784941554069519,
"learning_rate": 4.955969343539162e-05,
"loss": 1.3861,
"step": 60
},
{
"epoch": 1.6349206349206349,
"grad_norm": 0.8798362016677856,
"learning_rate": 4.9326121764495596e-05,
"loss": 1.1406,
"step": 65
},
{
"epoch": 1.7619047619047619,
"grad_norm": 0.7543187737464905,
"learning_rate": 4.90438392204474e-05,
"loss": 1.2,
"step": 70
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.7632691860198975,
"learning_rate": 4.8713411048678635e-05,
"loss": 1.1137,
"step": 75
},
{
"epoch": 2.0,
"grad_norm": 0.8194475173950195,
"learning_rate": 4.83354989019146e-05,
"loss": 0.8988,
"step": 80
},
{
"epoch": 2.126984126984127,
"grad_norm": 0.5129795670509338,
"learning_rate": 4.791085951527408e-05,
"loss": 1.0144,
"step": 85
},
{
"epoch": 2.253968253968254,
"grad_norm": 0.5384055972099304,
"learning_rate": 4.744034319097535e-05,
"loss": 0.9417,
"step": 90
},
{
"epoch": 2.380952380952381,
"grad_norm": 0.5197637677192688,
"learning_rate": 4.692489209568234e-05,
"loss": 0.9124,
"step": 95
},
{
"epoch": 2.507936507936508,
"grad_norm": 0.5490031838417053,
"learning_rate": 4.636553837390051e-05,
"loss": 1.0367,
"step": 100
},
{
"epoch": 2.634920634920635,
"grad_norm": 0.4497690796852112,
"learning_rate": 4.5763402081200294e-05,
"loss": 0.9391,
"step": 105
},
{
"epoch": 2.761904761904762,
"grad_norm": 0.40242263674736023,
"learning_rate": 4.511968894140639e-05,
"loss": 0.8668,
"step": 110
},
{
"epoch": 2.888888888888889,
"grad_norm": 0.4423123598098755,
"learning_rate": 4.443568793224415e-05,
"loss": 0.9419,
"step": 115
},
{
"epoch": 3.0,
"grad_norm": 0.7325336933135986,
"learning_rate": 4.371276870427753e-05,
"loss": 0.9379,
"step": 120
},
{
"epoch": 3.126984126984127,
"grad_norm": 0.4649835526943207,
"learning_rate": 4.295237883830685e-05,
"loss": 0.8399,
"step": 125
},
{
"epoch": 3.253968253968254,
"grad_norm": 0.49590322375297546,
"learning_rate": 4.215604094671835e-05,
"loss": 0.8626,
"step": 130
},
{
"epoch": 3.380952380952381,
"grad_norm": 0.5896856188774109,
"learning_rate": 4.132534962458962e-05,
"loss": 0.8826,
"step": 135
},
{
"epoch": 3.507936507936508,
"grad_norm": 0.4864223301410675,
"learning_rate": 4.0461968256656376e-05,
"loss": 0.8538,
"step": 140
},
{
"epoch": 3.634920634920635,
"grad_norm": 0.5413251519203186,
"learning_rate": 3.956762568653378e-05,
"loss": 0.8788,
"step": 145
},
{
"epoch": 3.761904761904762,
"grad_norm": 0.5263978838920593,
"learning_rate": 3.8644112754862614e-05,
"loss": 0.8279,
"step": 150
},
{
"epoch": 3.888888888888889,
"grad_norm": 0.5732172131538391,
"learning_rate": 3.76932787133117e-05,
"loss": 0.7522,
"step": 155
},
{
"epoch": 4.0,
"grad_norm": 0.8840337991714478,
"learning_rate": 3.6717027521617595e-05,
"loss": 0.8394,
"step": 160
},
{
"epoch": 4.1269841269841265,
"grad_norm": 0.5274865031242371,
"learning_rate": 3.5717314035076355e-05,
"loss": 0.8122,
"step": 165
},
{
"epoch": 4.253968253968254,
"grad_norm": 0.57438725233078,
"learning_rate": 3.4696140090121376e-05,
"loss": 0.8541,
"step": 170
},
{
"epoch": 4.380952380952381,
"grad_norm": 0.6474905610084534,
"learning_rate": 3.365555049582582e-05,
"loss": 0.7446,
"step": 175
},
{
"epoch": 4.507936507936508,
"grad_norm": 0.6062578558921814,
"learning_rate": 3.2597628939356175e-05,
"loss": 0.7282,
"step": 180
},
{
"epoch": 4.634920634920634,
"grad_norm": 0.7624045610427856,
"learning_rate": 3.152449381357593e-05,
"loss": 0.6846,
"step": 185
},
{
"epoch": 4.761904761904762,
"grad_norm": 0.6446404457092285,
"learning_rate": 3.0438293975154186e-05,
"loss": 0.7597,
"step": 190
},
{
"epoch": 4.888888888888889,
"grad_norm": 0.7280287146568298,
"learning_rate": 2.9341204441673266e-05,
"loss": 0.6788,
"step": 195
},
{
"epoch": 5.0,
"grad_norm": 1.213476538658142,
"learning_rate": 2.8235422036351382e-05,
"loss": 0.7306,
"step": 200
},
{
"epoch": 5.1269841269841265,
"grad_norm": 0.6391934156417847,
"learning_rate": 2.712316098910162e-05,
"loss": 0.5925,
"step": 205
},
{
"epoch": 5.253968253968254,
"grad_norm": 0.6667851209640503,
"learning_rate": 2.600664850273538e-05,
"loss": 0.6754,
"step": 210
},
{
"epoch": 5.380952380952381,
"grad_norm": 0.8730446100234985,
"learning_rate": 2.4888120293188916e-05,
"loss": 0.7179,
"step": 215
},
{
"epoch": 5.507936507936508,
"grad_norm": 0.8416940569877625,
"learning_rate": 2.3769816112703047e-05,
"loss": 0.6565,
"step": 220
},
{
"epoch": 5.634920634920634,
"grad_norm": 0.7638925909996033,
"learning_rate": 2.265397526492052e-05,
"loss": 0.629,
"step": 225
},
{
"epoch": 5.761904761904762,
"grad_norm": 0.8404481410980225,
"learning_rate": 2.154283212088168e-05,
"loss": 0.6779,
"step": 230
},
{
"epoch": 5.888888888888889,
"grad_norm": 0.7644646167755127,
"learning_rate": 2.043861164489719e-05,
"loss": 0.6363,
"step": 235
},
{
"epoch": 6.0,
"grad_norm": 1.407417893409729,
"learning_rate": 1.934352493925695e-05,
"loss": 0.6873,
"step": 240
},
{
"epoch": 6.1269841269841265,
"grad_norm": 0.8645577430725098,
"learning_rate": 1.825976481669641e-05,
"loss": 0.6057,
"step": 245
},
{
"epoch": 6.253968253968254,
"grad_norm": 0.8925438523292542,
"learning_rate": 1.7189501409486062e-05,
"loss": 0.5787,
"step": 250
},
{
"epoch": 6.380952380952381,
"grad_norm": 1.0913372039794922,
"learning_rate": 1.613487782393661e-05,
"loss": 0.5813,
"step": 255
},
{
"epoch": 6.507936507936508,
"grad_norm": 1.0974910259246826,
"learning_rate": 1.509800584902108e-05,
"loss": 0.5534,
"step": 260
},
{
"epoch": 6.634920634920634,
"grad_norm": 0.9944404363632202,
"learning_rate": 1.4080961727707184e-05,
"loss": 0.5976,
"step": 265
},
{
"epoch": 6.761904761904762,
"grad_norm": 0.9145245552062988,
"learning_rate": 1.3085781999467303e-05,
"loss": 0.5592,
"step": 270
},
{
"epoch": 6.888888888888889,
"grad_norm": 1.0390015840530396,
"learning_rate": 1.2114459422291205e-05,
"loss": 0.5811,
"step": 275
},
{
"epoch": 7.0,
"grad_norm": 1.7172333002090454,
"learning_rate": 1.116893898236716e-05,
"loss": 0.6019,
"step": 280
},
{
"epoch": 7.1269841269841265,
"grad_norm": 0.9733636975288391,
"learning_rate": 1.0251113999421935e-05,
"loss": 0.4985,
"step": 285
},
{
"epoch": 7.253968253968254,
"grad_norm": 1.0347152948379517,
"learning_rate": 9.362822335518063e-06,
"loss": 0.5585,
"step": 290
},
{
"epoch": 7.380952380952381,
"grad_norm": 0.8855388164520264,
"learning_rate": 8.505842714900297e-06,
"loss": 0.5163,
"step": 295
},
{
"epoch": 7.507936507936508,
"grad_norm": 1.2762815952301025,
"learning_rate": 7.681891162260015e-06,
"loss": 0.5637,
"step": 300
},
{
"epoch": 7.634920634920634,
"grad_norm": 1.1591953039169312,
"learning_rate": 6.892617566550044e-06,
"loss": 0.4971,
"step": 305
},
{
"epoch": 7.761904761904762,
"grad_norm": 1.12722647190094,
"learning_rate": 6.1396023772302465e-06,
"loss": 0.5231,
"step": 310
},
{
"epoch": 7.888888888888889,
"grad_norm": 1.0456628799438477,
"learning_rate": 5.424353439559446e-06,
"loss": 0.5106,
"step": 315
},
{
"epoch": 8.0,
"grad_norm": 2.141434669494629,
"learning_rate": 4.748302975270838e-06,
"loss": 0.5518,
"step": 320
},
{
"epoch": 8.126984126984127,
"grad_norm": 1.1136949062347412,
"learning_rate": 4.112804714676594e-06,
"loss": 0.5119,
"step": 325
},
{
"epoch": 8.253968253968253,
"grad_norm": 1.1117428541183472,
"learning_rate": 3.5191311859445796e-06,
"loss": 0.484,
"step": 330
},
{
"epoch": 8.380952380952381,
"grad_norm": 1.1094739437103271,
"learning_rate": 2.9684711669750313e-06,
"loss": 0.4984,
"step": 335
},
{
"epoch": 8.507936507936508,
"grad_norm": 1.3133466243743896,
"learning_rate": 2.4619273049796e-06,
"loss": 0.5241,
"step": 340
},
{
"epoch": 8.634920634920634,
"grad_norm": 1.1593372821807861,
"learning_rate": 2.0005139085293945e-06,
"loss": 0.5127,
"step": 345
},
{
"epoch": 8.761904761904763,
"grad_norm": 1.176182746887207,
"learning_rate": 1.5851549164932116e-06,
"loss": 0.4987,
"step": 350
},
{
"epoch": 8.88888888888889,
"grad_norm": 1.2116988897323608,
"learning_rate": 1.2166820479329572e-06,
"loss": 0.4652,
"step": 355
},
{
"epoch": 9.0,
"grad_norm": 1.6440435647964478,
"learning_rate": 8.958331366609423e-07,
"loss": 0.4911,
"step": 360
},
{
"epoch": 9.126984126984127,
"grad_norm": 1.0842829942703247,
"learning_rate": 6.232506537939941e-07,
"loss": 0.4461,
"step": 365
},
{
"epoch": 9.253968253968253,
"grad_norm": 1.2127434015274048,
"learning_rate": 3.994804212627462e-07,
"loss": 0.5129,
"step": 370
},
{
"epoch": 9.380952380952381,
"grad_norm": 1.1075149774551392,
"learning_rate": 2.2497051885228827e-07,
"loss": 0.4841,
"step": 375
},
{
"epoch": 9.507936507936508,
"grad_norm": 1.1152573823928833,
"learning_rate": 1.0007038696262516e-07,
"loss": 0.4762,
"step": 380
},
{
"epoch": 9.634920634920634,
"grad_norm": 1.2555053234100342,
"learning_rate": 2.5030126885694506e-08,
"loss": 0.4963,
"step": 385
},
{
"epoch": 9.761904761904763,
"grad_norm": 1.1572444438934326,
"learning_rate": 0.0,
"loss": 0.4631,
"step": 390
},
{
"epoch": 9.761904761904763,
"step": 390,
"total_flos": 3.320475505655808e+16,
"train_loss": 1.0253899280841534,
"train_runtime": 4620.778,
"train_samples_per_second": 2.727,
"train_steps_per_second": 0.084
}
],
"logging_steps": 5,
"max_steps": 390,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 39,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.320475505655808e+16,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}