{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 126,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023827252419955324,
"grad_norm": 27.642908096313477,
"learning_rate": 0.0,
"loss": 3.3263,
"step": 1
},
{
"epoch": 0.04765450483991065,
"grad_norm": 29.03249740600586,
"learning_rate": 2.8571428571428573e-06,
"loss": 3.393,
"step": 2
},
{
"epoch": 0.0953090096798213,
"grad_norm": 4.636140823364258,
"learning_rate": 8.571428571428571e-06,
"loss": 2.6817,
"step": 4
},
{
"epoch": 0.14296351451973194,
"grad_norm": 4.222386360168457,
"learning_rate": 1.4285714285714287e-05,
"loss": 2.5385,
"step": 6
},
{
"epoch": 0.1906180193596426,
"grad_norm": 3.460394859313965,
"learning_rate": 2e-05,
"loss": 2.352,
"step": 8
},
{
"epoch": 0.23827252419955325,
"grad_norm": 3.7368428707122803,
"learning_rate": 1.998606410321534e-05,
"loss": 2.1794,
"step": 10
},
{
"epoch": 0.2859270290394639,
"grad_norm": 2.6725502014160156,
"learning_rate": 1.9944295254705187e-05,
"loss": 2.043,
"step": 12
},
{
"epoch": 0.33358153387937456,
"grad_norm": 2.636101484298706,
"learning_rate": 1.9874809871741877e-05,
"loss": 1.8958,
"step": 14
},
{
"epoch": 0.3812360387192852,
"grad_norm": 2.869380474090576,
"learning_rate": 1.977780162255041e-05,
"loss": 1.8137,
"step": 16
},
{
"epoch": 0.4288905435591958,
"grad_norm": 2.8045756816864014,
"learning_rate": 1.9653540886520387e-05,
"loss": 1.7214,
"step": 18
},
{
"epoch": 0.4765450483991065,
"grad_norm": 1.602889060974121,
"learning_rate": 1.9502374000610152e-05,
"loss": 1.6312,
"step": 20
},
{
"epoch": 0.5241995532390171,
"grad_norm": 2.921908378601074,
"learning_rate": 1.932472229404356e-05,
"loss": 1.5755,
"step": 22
},
{
"epoch": 0.5718540580789278,
"grad_norm": 3.2532429695129395,
"learning_rate": 1.912108091398988e-05,
"loss": 1.5273,
"step": 24
},
{
"epoch": 0.6195085629188384,
"grad_norm": 2.755262613296509,
"learning_rate": 1.8892017445499812e-05,
"loss": 1.4395,
"step": 26
},
{
"epoch": 0.6671630677587491,
"grad_norm": 2.0402467250823975,
"learning_rate": 1.8638170329544164e-05,
"loss": 1.4345,
"step": 28
},
{
"epoch": 0.7148175725986597,
"grad_norm": 2.27799129486084,
"learning_rate": 1.8360247083564343e-05,
"loss": 1.3749,
"step": 30
},
{
"epoch": 0.7624720774385704,
"grad_norm": 1.5731786489486694,
"learning_rate": 1.805902232949435e-05,
"loss": 1.28,
"step": 32
},
{
"epoch": 0.810126582278481,
"grad_norm": 2.364778757095337,
"learning_rate": 1.773533563475053e-05,
"loss": 1.2666,
"step": 34
},
{
"epoch": 0.8577810871183916,
"grad_norm": 1.5619275569915771,
"learning_rate": 1.7390089172206594e-05,
"loss": 1.2227,
"step": 36
},
{
"epoch": 0.9054355919583023,
"grad_norm": 1.949645757675171,
"learning_rate": 1.7024245205675986e-05,
"loss": 1.1678,
"step": 38
},
{
"epoch": 0.953090096798213,
"grad_norm": 1.5181773900985718,
"learning_rate": 1.6638823407910085e-05,
"loss": 1.1625,
"step": 40
},
{
"epoch": 1.0,
"grad_norm": 1.2812515497207642,
"learning_rate": 1.6234898018587336e-05,
"loss": 1.1757,
"step": 42
},
{
"epoch": 1.0476545048399106,
"grad_norm": 1.1844559907913208,
"learning_rate": 1.58135948502146e-05,
"loss": 1.1938,
"step": 44
},
{
"epoch": 1.0953090096798213,
"grad_norm": 0.7573416233062744,
"learning_rate": 1.5376088150285777e-05,
"loss": 1.157,
"step": 46
},
{
"epoch": 1.1429635145197319,
"grad_norm": 0.7566890120506287,
"learning_rate": 1.4923597328443423e-05,
"loss": 1.1033,
"step": 48
},
{
"epoch": 1.1906180193596425,
"grad_norm": 0.7004701495170593,
"learning_rate": 1.4457383557765385e-05,
"loss": 1.1247,
"step": 50
},
{
"epoch": 1.2382725241995534,
"grad_norm": 0.9414187669754028,
"learning_rate": 1.397874625964921e-05,
"loss": 1.0881,
"step": 52
},
{
"epoch": 1.2859270290394638,
"grad_norm": 0.6709342002868652,
"learning_rate": 1.348901948209167e-05,
"loss": 1.0797,
"step": 54
},
{
"epoch": 1.3335815338793746,
"grad_norm": 0.8439044952392578,
"learning_rate": 1.2989568181457704e-05,
"loss": 1.0723,
"step": 56
},
{
"epoch": 1.3812360387192852,
"grad_norm": 0.4865539073944092,
"learning_rate": 1.248178441810224e-05,
"loss": 1.0495,
"step": 58
},
{
"epoch": 1.4288905435591959,
"grad_norm": 0.5796261429786682,
"learning_rate": 1.1967083476448282e-05,
"loss": 1.0648,
"step": 60
},
{
"epoch": 1.4765450483991065,
"grad_norm": 0.6459540128707886,
"learning_rate": 1.1446899920335407e-05,
"loss": 1.0539,
"step": 62
},
{
"epoch": 1.5241995532390171,
"grad_norm": 0.41356927156448364,
"learning_rate": 1.092268359463302e-05,
"loss": 1.0292,
"step": 64
},
{
"epoch": 1.5718540580789278,
"grad_norm": 0.45745816826820374,
"learning_rate": 1.0395895584262696e-05,
"loss": 1.002,
"step": 66
},
{
"epoch": 1.6195085629188384,
"grad_norm": 0.4945538640022278,
"learning_rate": 9.868004141892412e-06,
"loss": 1.0291,
"step": 68
},
{
"epoch": 1.6671630677587492,
"grad_norm": 0.4044688045978546,
"learning_rate": 9.340480595653047e-06,
"loss": 1.0213,
"step": 70
},
{
"epoch": 1.7148175725986596,
"grad_norm": 0.40503114461898804,
"learning_rate": 8.814795248282974e-06,
"loss": 1.0027,
"step": 72
},
{
"epoch": 1.7624720774385705,
"grad_norm": 0.409345805644989,
"learning_rate": 8.292413279130625e-06,
"loss": 1.0292,
"step": 74
},
{
"epoch": 1.810126582278481,
"grad_norm": 0.3667221963405609,
"learning_rate": 7.774790660436857e-06,
"loss": 0.9908,
"step": 76
},
{
"epoch": 1.8577810871183917,
"grad_norm": 0.37784790992736816,
"learning_rate": 7.263370099279173e-06,
"loss": 1.0006,
"step": 78
},
{
"epoch": 1.9054355919583021,
"grad_norm": 0.33597901463508606,
"learning_rate": 6.759577016488343e-06,
"loss": 0.9708,
"step": 80
},
{
"epoch": 1.953090096798213,
"grad_norm": 0.37733909487724304,
"learning_rate": 6.264815573744884e-06,
"loss": 0.9695,
"step": 82
},
{
"epoch": 2.0,
"grad_norm": 0.32598474621772766,
"learning_rate": 5.780464759928623e-06,
"loss": 1.0016,
"step": 84
},
{
"epoch": 2.047654504839911,
"grad_norm": 0.3468402922153473,
"learning_rate": 5.307874547629339e-06,
"loss": 0.9625,
"step": 86
},
{
"epoch": 2.0953090096798213,
"grad_norm": 0.3330307602882385,
"learning_rate": 4.848362130531039e-06,
"loss": 0.9447,
"step": 88
},
{
"epoch": 2.142963514519732,
"grad_norm": 0.33839651942253113,
"learning_rate": 4.403208252156921e-06,
"loss": 0.9482,
"step": 90
},
{
"epoch": 2.1906180193596425,
"grad_norm": 0.3227793276309967,
"learning_rate": 3.973653636207437e-06,
"loss": 0.979,
"step": 92
},
{
"epoch": 2.2382725241995534,
"grad_norm": 0.33203473687171936,
"learning_rate": 3.560895528440844e-06,
"loss": 0.9626,
"step": 94
},
{
"epoch": 2.2859270290394638,
"grad_norm": 0.29434484243392944,
"learning_rate": 3.1660843597345137e-06,
"loss": 0.9814,
"step": 96
},
{
"epoch": 2.3335815338793746,
"grad_norm": 0.2951570451259613,
"learning_rate": 2.7903205396277546e-06,
"loss": 0.9368,
"step": 98
},
{
"epoch": 2.381236038719285,
"grad_norm": 0.3088631331920624,
"learning_rate": 2.4346513892830427e-06,
"loss": 0.952,
"step": 100
},
{
"epoch": 2.428890543559196,
"grad_norm": 0.3212229609489441,
"learning_rate": 2.100068222414121e-06,
"loss": 0.924,
"step": 102
},
{
"epoch": 2.4765450483991067,
"grad_norm": 0.28245487809181213,
"learning_rate": 1.7875035823168641e-06,
"loss": 0.9435,
"step": 104
},
{
"epoch": 2.524199553239017,
"grad_norm": 0.2676449716091156,
"learning_rate": 1.4978286427038602e-06,
"loss": 0.92,
"step": 106
},
{
"epoch": 2.5718540580789275,
"grad_norm": 0.26664310693740845,
"learning_rate": 1.2318507795870138e-06,
"loss": 0.9385,
"step": 108
},
{
"epoch": 2.6195085629188384,
"grad_norm": 0.263298362493515,
"learning_rate": 9.903113209758098e-07,
"loss": 0.931,
"step": 110
},
{
"epoch": 2.6671630677587492,
"grad_norm": 0.27026382088661194,
"learning_rate": 7.738834806631712e-07,
"loss": 0.947,
"step": 112
},
{
"epoch": 2.7148175725986596,
"grad_norm": 0.2611420452594757,
"learning_rate": 5.831704818578842e-07,
"loss": 0.9346,
"step": 114
},
{
"epoch": 2.7624720774385705,
"grad_norm": 0.25799694657325745,
"learning_rate": 4.187038758933204e-07,
"loss": 0.9363,
"step": 116
},
{
"epoch": 2.810126582278481,
"grad_norm": 0.2518501877784729,
"learning_rate": 2.809420606985236e-07,
"loss": 0.9359,
"step": 118
},
{
"epoch": 2.8577810871183917,
"grad_norm": 0.2569182813167572,
"learning_rate": 1.7026900316098217e-07,
"loss": 0.9399,
"step": 120
},
{
"epoch": 2.905435591958302,
"grad_norm": 0.24561701714992523,
"learning_rate": 8.699316894203225e-08,
"loss": 0.9467,
"step": 122
},
{
"epoch": 2.953090096798213,
"grad_norm": 0.2518835961818695,
"learning_rate": 3.134666272774034e-08,
"loss": 0.9405,
"step": 124
},
{
"epoch": 3.0,
"grad_norm": 0.25132957100868225,
"learning_rate": 3.4845813115114147e-09,
"loss": 0.9466,
"step": 126
}
],
"logging_steps": 2,
"max_steps": 126,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 300.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.312585296119595e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}