sft-ALL-checkpoint-2000 / trainer_state.json
nickhe's picture
Upload folder using huggingface_hub
9d4e561 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.1008142690965492,
"eval_steps": 500,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07754943776657619,
"grad_norm": 0.41543584930406274,
"learning_rate": 1.984472049689441e-05,
"loss": 0.3571,
"step": 50
},
{
"epoch": 0.15509887553315238,
"grad_norm": 0.2360007761618504,
"learning_rate": 1.9689440993788823e-05,
"loss": 0.1481,
"step": 100
},
{
"epoch": 0.23264831329972857,
"grad_norm": 0.2459469865789585,
"learning_rate": 1.9534161490683232e-05,
"loss": 0.1413,
"step": 150
},
{
"epoch": 0.31019775106630476,
"grad_norm": 0.1836701486527724,
"learning_rate": 1.937888198757764e-05,
"loss": 0.139,
"step": 200
},
{
"epoch": 0.38774718883288095,
"grad_norm": 0.2053715213473303,
"learning_rate": 1.922360248447205e-05,
"loss": 0.1359,
"step": 250
},
{
"epoch": 0.46529662659945714,
"grad_norm": 0.20210517044823975,
"learning_rate": 1.906832298136646e-05,
"loss": 0.1343,
"step": 300
},
{
"epoch": 0.5428460643660333,
"grad_norm": 0.17133333163173686,
"learning_rate": 1.891304347826087e-05,
"loss": 0.1336,
"step": 350
},
{
"epoch": 0.6203955021326095,
"grad_norm": 0.16513031282114732,
"learning_rate": 1.875776397515528e-05,
"loss": 0.1323,
"step": 400
},
{
"epoch": 0.6979449398991857,
"grad_norm": 0.16414324471781971,
"learning_rate": 1.8602484472049693e-05,
"loss": 0.1318,
"step": 450
},
{
"epoch": 0.7754943776657619,
"grad_norm": 0.16672201846671922,
"learning_rate": 1.84472049689441e-05,
"loss": 0.1307,
"step": 500
},
{
"epoch": 0.8530438154323381,
"grad_norm": 0.1588831815209266,
"learning_rate": 1.829192546583851e-05,
"loss": 0.1301,
"step": 550
},
{
"epoch": 0.9305932531989143,
"grad_norm": 0.17229438485787515,
"learning_rate": 1.8136645962732923e-05,
"loss": 0.13,
"step": 600
},
{
"epoch": 1.0077549437766575,
"grad_norm": 0.1626649495069495,
"learning_rate": 1.798136645962733e-05,
"loss": 0.1284,
"step": 650
},
{
"epoch": 1.0853043815432337,
"grad_norm": 0.16302373242598406,
"learning_rate": 1.782608695652174e-05,
"loss": 0.1256,
"step": 700
},
{
"epoch": 1.16285381930981,
"grad_norm": 0.15938032051749196,
"learning_rate": 1.767080745341615e-05,
"loss": 0.1252,
"step": 750
},
{
"epoch": 1.240403257076386,
"grad_norm": 0.19138770209482472,
"learning_rate": 1.751552795031056e-05,
"loss": 0.1244,
"step": 800
},
{
"epoch": 1.3179526948429623,
"grad_norm": 0.15984339587089894,
"learning_rate": 1.736024844720497e-05,
"loss": 0.1253,
"step": 850
},
{
"epoch": 1.3955021326095385,
"grad_norm": 0.1502502706654219,
"learning_rate": 1.720496894409938e-05,
"loss": 0.1248,
"step": 900
},
{
"epoch": 1.4730515703761147,
"grad_norm": 0.13661135477508957,
"learning_rate": 1.704968944099379e-05,
"loss": 0.125,
"step": 950
},
{
"epoch": 1.5506010081426909,
"grad_norm": 0.24839381800982097,
"learning_rate": 1.68944099378882e-05,
"loss": 0.1253,
"step": 1000
},
{
"epoch": 1.628150445909267,
"grad_norm": 0.12815184233515442,
"learning_rate": 1.673913043478261e-05,
"loss": 0.1243,
"step": 1050
},
{
"epoch": 1.7056998836758432,
"grad_norm": 0.13153520379094585,
"learning_rate": 1.658385093167702e-05,
"loss": 0.1237,
"step": 1100
},
{
"epoch": 1.7832493214424194,
"grad_norm": 0.1189084339669079,
"learning_rate": 1.642857142857143e-05,
"loss": 0.1245,
"step": 1150
},
{
"epoch": 1.8607987592089956,
"grad_norm": 0.15491708781159905,
"learning_rate": 1.627329192546584e-05,
"loss": 0.1235,
"step": 1200
},
{
"epoch": 1.9383481969755718,
"grad_norm": 0.12739351593431672,
"learning_rate": 1.611801242236025e-05,
"loss": 0.1243,
"step": 1250
},
{
"epoch": 2.015509887553315,
"grad_norm": 0.12465194041174449,
"learning_rate": 1.596273291925466e-05,
"loss": 0.1219,
"step": 1300
},
{
"epoch": 2.0930593253198913,
"grad_norm": 0.1404295274618665,
"learning_rate": 1.580745341614907e-05,
"loss": 0.1186,
"step": 1350
},
{
"epoch": 2.1706087630864674,
"grad_norm": 0.1359342551816161,
"learning_rate": 1.565217391304348e-05,
"loss": 0.1179,
"step": 1400
},
{
"epoch": 2.2481582008530436,
"grad_norm": 0.15332233562241915,
"learning_rate": 1.549689440993789e-05,
"loss": 0.1185,
"step": 1450
},
{
"epoch": 2.32570763861962,
"grad_norm": 0.11859966428735469,
"learning_rate": 1.5341614906832298e-05,
"loss": 0.1185,
"step": 1500
},
{
"epoch": 2.403257076386196,
"grad_norm": 0.1493931915889296,
"learning_rate": 1.5186335403726709e-05,
"loss": 0.1186,
"step": 1550
},
{
"epoch": 2.480806514152772,
"grad_norm": 0.1319324405407719,
"learning_rate": 1.5031055900621118e-05,
"loss": 0.1189,
"step": 1600
},
{
"epoch": 2.5583559519193484,
"grad_norm": 0.12024679968154829,
"learning_rate": 1.4875776397515529e-05,
"loss": 0.1187,
"step": 1650
},
{
"epoch": 2.6359053896859246,
"grad_norm": 0.11739796193835754,
"learning_rate": 1.472049689440994e-05,
"loss": 0.1187,
"step": 1700
},
{
"epoch": 2.7134548274525008,
"grad_norm": 0.13178912063786213,
"learning_rate": 1.456521739130435e-05,
"loss": 0.1187,
"step": 1750
},
{
"epoch": 2.791004265219077,
"grad_norm": 0.11853885559187641,
"learning_rate": 1.4409937888198759e-05,
"loss": 0.119,
"step": 1800
},
{
"epoch": 2.868553702985653,
"grad_norm": 0.12827528414635014,
"learning_rate": 1.425465838509317e-05,
"loss": 0.1187,
"step": 1850
},
{
"epoch": 2.9461031407522293,
"grad_norm": 0.13153122815676796,
"learning_rate": 1.409937888198758e-05,
"loss": 0.1188,
"step": 1900
},
{
"epoch": 3.023264831329973,
"grad_norm": 0.15315655570709347,
"learning_rate": 1.3944099378881988e-05,
"loss": 0.1156,
"step": 1950
},
{
"epoch": 3.1008142690965492,
"grad_norm": 0.11997442542975086,
"learning_rate": 1.3788819875776398e-05,
"loss": 0.1105,
"step": 2000
}
],
"logging_steps": 50,
"max_steps": 6440,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.0334530111995904e+16,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}