{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8737651998737894,
"eval_steps": 500,
"global_step": 9000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 9.708502220819883e-05,
"grad_norm": 96.5,
"learning_rate": 1.9379844961240311e-07,
"loss": 4.2495036125183105,
"memory(GiB)": 112.92,
"step": 1,
"token_acc": 0.40126382306477093,
"train_speed(iter/s)": 0.130363
},
{
"epoch": 0.0248537656852989,
"grad_norm": 6.84375,
"learning_rate": 4.96124031007752e-05,
"loss": 2.0199908088235294,
"memory(GiB)": 138.16,
"step": 256,
"token_acc": 0.5958056756256362,
"train_speed(iter/s)": 0.135217
},
{
"epoch": 0.0497075313705978,
"grad_norm": 2.6875,
"learning_rate": 9.92248062015504e-05,
"loss": 1.4577598571777344,
"memory(GiB)": 138.17,
"step": 512,
"token_acc": 0.6785023086521644,
"train_speed(iter/s)": 0.135385
},
{
"epoch": 0.0745612970558967,
"grad_norm": 2.296875,
"learning_rate": 9.983643805989245e-05,
"loss": 1.4217802286148071,
"memory(GiB)": 138.17,
"step": 768,
"token_acc": 0.6849496734466487,
"train_speed(iter/s)": 0.134581
},
{
"epoch": 0.0994150627411956,
"grad_norm": 1.8984375,
"learning_rate": 9.933643638247476e-05,
"loss": 1.3426355123519897,
"memory(GiB)": 138.17,
"step": 1024,
"token_acc": 0.699673728686982,
"train_speed(iter/s)": 0.132515
},
{
"epoch": 0.1242688284264945,
"grad_norm": 1.84375,
"learning_rate": 9.850332959964666e-05,
"loss": 1.2444982528686523,
"memory(GiB)": 138.17,
"step": 1280,
"token_acc": 0.7187271993751019,
"train_speed(iter/s)": 0.133056
},
{
"epoch": 0.1491225941117934,
"grad_norm": 1.859375,
"learning_rate": 9.73427426033351e-05,
"loss": 1.1624003648757935,
"memory(GiB)": 138.17,
"step": 1536,
"token_acc": 0.7339836063834865,
"train_speed(iter/s)": 0.133627
},
{
"epoch": 0.1739763597970923,
"grad_norm": 1.7421875,
"learning_rate": 9.58625113355353e-05,
"loss": 1.095345377922058,
"memory(GiB)": 138.17,
"step": 1792,
"token_acc": 0.7477444378213578,
"train_speed(iter/s)": 0.133066
},
{
"epoch": 0.1988301254823912,
"grad_norm": 1.796875,
"learning_rate": 9.407262988233549e-05,
"loss": 1.0396682024002075,
"memory(GiB)": 138.17,
"step": 2048,
"token_acc": 0.7577293820771361,
"train_speed(iter/s)": 0.132805
},
{
"epoch": 0.2236838911676901,
"grad_norm": 2.078125,
"learning_rate": 9.19851829967875e-05,
"loss": 0.9765125513076782,
"memory(GiB)": 138.17,
"step": 2304,
"token_acc": 0.7712227904219364,
"train_speed(iter/s)": 0.133242
},
{
"epoch": 0.248537656852989,
"grad_norm": 1.5703125,
"learning_rate": 8.961426450620912e-05,
"loss": 0.920336127281189,
"memory(GiB)": 138.17,
"step": 2560,
"token_acc": 0.7830755957422817,
"train_speed(iter/s)": 0.133383
},
{
"epoch": 0.2733914225382879,
"grad_norm": 1.59375,
"learning_rate": 8.69758821548079e-05,
"loss": 0.8801365494728088,
"memory(GiB)": 138.17,
"step": 2816,
"token_acc": 0.7913349866408025,
"train_speed(iter/s)": 0.133433
},
{
"epoch": 0.2982451882235868,
"grad_norm": 1.1640625,
"learning_rate": 8.408784952410122e-05,
"loss": 0.8334779739379883,
"memory(GiB)": 138.17,
"step": 3072,
"token_acc": 0.8006175937055493,
"train_speed(iter/s)": 0.132889
},
{
"epoch": 0.3230989539088857,
"grad_norm": 1.6875,
"learning_rate": 8.096966576085406e-05,
"loss": 0.7884229421615601,
"memory(GiB)": 138.17,
"step": 3328,
"token_acc": 0.8102322071595001,
"train_speed(iter/s)": 0.133073
},
{
"epoch": 0.3479527195941846,
"grad_norm": 1.4921875,
"learning_rate": 7.764238392457582e-05,
"loss": 0.7397578954696655,
"memory(GiB)": 138.17,
"step": 3584,
"token_acc": 0.8212528591555482,
"train_speed(iter/s)": 0.133335
},
{
"epoch": 0.3728064852794835,
"grad_norm": 2.1875,
"learning_rate": 7.412846884345582e-05,
"loss": 0.7087571024894714,
"memory(GiB)": 138.17,
"step": 3840,
"token_acc": 0.8286589691203703,
"train_speed(iter/s)": 0.133468
},
{
"epoch": 0.3976602509647824,
"grad_norm": 1.1953125,
"learning_rate": 7.045164543845158e-05,
"loss": 0.6600534319877625,
"memory(GiB)": 138.17,
"step": 4096,
"token_acc": 0.8389953998490116,
"train_speed(iter/s)": 0.133269
},
{
"epoch": 0.4225140166500813,
"grad_norm": 1.984375,
"learning_rate": 6.663673853960154e-05,
"loss": 0.6196721196174622,
"memory(GiB)": 138.17,
"step": 4352,
"token_acc": 0.8484769522886115,
"train_speed(iter/s)": 0.133418
},
{
"epoch": 0.4473677823353802,
"grad_norm": 1.203125,
"learning_rate": 6.270950527607537e-05,
"loss": 0.5864973068237305,
"memory(GiB)": 138.17,
"step": 4608,
"token_acc": 0.8560292743162837,
"train_speed(iter/s)": 0.133475
},
{
"epoch": 0.4722215480206791,
"grad_norm": 1.203125,
"learning_rate": 5.86964611716145e-05,
"loss": 0.5385364294052124,
"memory(GiB)": 138.17,
"step": 4864,
"token_acc": 0.86720534525908,
"train_speed(iter/s)": 0.133422
},
{
"epoch": 0.497075313705978,
"grad_norm": 1.359375,
"learning_rate": 5.4624701119515856e-05,
"loss": 0.49772173166275024,
"memory(GiB)": 138.17,
"step": 5120,
"token_acc": 0.8767477774531491,
"train_speed(iter/s)": 0.133344
},
{
"epoch": 0.5219290793912769,
"grad_norm": 1.40625,
"learning_rate": 5.0521716445882614e-05,
"loss": 0.46582430601119995,
"memory(GiB)": 138.17,
"step": 5376,
"token_acc": 0.8850724068459814,
"train_speed(iter/s)": 0.13342
},
{
"epoch": 0.5467828450765758,
"grad_norm": 1.4140625,
"learning_rate": 4.64152092962774e-05,
"loss": 0.4441249668598175,
"memory(GiB)": 138.17,
"step": 5632,
"token_acc": 0.8896486479315997,
"train_speed(iter/s)": 0.133319
},
{
"epoch": 0.5716366107618747,
"grad_norm": 1.8203125,
"learning_rate": 4.2332905598984413e-05,
"loss": 0.40981537103652954,
"memory(GiB)": 138.17,
"step": 5888,
"token_acc": 0.8990366693094052,
"train_speed(iter/s)": 0.133434
},
{
"epoch": 0.5964903764471736,
"grad_norm": 1.5234375,
"learning_rate": 3.830236786769761e-05,
"loss": 0.3865773379802704,
"memory(GiB)": 138.17,
"step": 6144,
"token_acc": 0.9034815882027802,
"train_speed(iter/s)": 0.133189
},
{
"epoch": 0.6213441421324725,
"grad_norm": 1.03125,
"learning_rate": 3.4350809107536214e-05,
"loss": 0.36623093485832214,
"memory(GiB)": 138.17,
"step": 6400,
"token_acc": 0.9082446782242596,
"train_speed(iter/s)": 0.133233
},
{
"epoch": 0.6461979078177714,
"grad_norm": 2.203125,
"learning_rate": 3.0504909080839294e-05,
"loss": 0.34115397930145264,
"memory(GiB)": 138.17,
"step": 6656,
"token_acc": 0.914435132291292,
"train_speed(iter/s)": 0.133334
},
{
"epoch": 0.6710516735030703,
"grad_norm": 1.6015625,
"learning_rate": 2.6790634173258577e-05,
"loss": 0.3342404067516327,
"memory(GiB)": 138.17,
"step": 6912,
"token_acc": 0.916860246202295,
"train_speed(iter/s)": 0.133453
},
{
"epoch": 0.6959054391883692,
"grad_norm": 1.3515625,
"learning_rate": 2.323306207636102e-05,
"loss": 0.3142353296279907,
"memory(GiB)": 138.17,
"step": 7168,
"token_acc": 0.9221213834353058,
"train_speed(iter/s)": 0.133271
},
{
"epoch": 0.7207592048736681,
"grad_norm": 2.03125,
"learning_rate": 1.9856212470432345e-05,
"loss": 0.30621492862701416,
"memory(GiB)": 138.17,
"step": 7424,
"token_acc": 0.9249788937888913,
"train_speed(iter/s)": 0.133231
},
{
"epoch": 0.745612970558967,
"grad_norm": 1.5078125,
"learning_rate": 1.6682884850661395e-05,
"loss": 0.2921682596206665,
"memory(GiB)": 138.17,
"step": 7680,
"token_acc": 0.9275989615640866,
"train_speed(iter/s)": 0.133307
},
{
"epoch": 0.7704667362442659,
"grad_norm": 1.3671875,
"learning_rate": 1.3734504591655495e-05,
"loss": 0.2854159474372864,
"memory(GiB)": 138.17,
"step": 7936,
"token_acc": 0.9285902741314146,
"train_speed(iter/s)": 0.133282
},
{
"epoch": 0.7953205019295648,
"grad_norm": 2.0625,
"learning_rate": 1.1030978289613726e-05,
"loss": 0.28136610984802246,
"memory(GiB)": 138.17,
"step": 8192,
"token_acc": 0.9303321847535716,
"train_speed(iter/s)": 0.132989
},
{
"epoch": 0.8201742676148637,
"grad_norm": 1.28125,
"learning_rate": 8.590559358845118e-06,
"loss": 0.2735920548439026,
"memory(GiB)": 138.17,
"step": 8448,
"token_acc": 0.931173780136838,
"train_speed(iter/s)": 0.13298
},
{
"epoch": 0.8450280333001626,
"grad_norm": 1.3671875,
"learning_rate": 6.4297247900848125e-06,
"loss": 0.2691897451877594,
"memory(GiB)": 138.17,
"step": 8704,
"token_acc": 0.9327428202220522,
"train_speed(iter/s)": 0.132967
},
{
"epoch": 0.8698817989854615,
"grad_norm": 1.5234375,
"learning_rate": 4.563063902699582e-06,
"loss": 0.26750853657722473,
"memory(GiB)": 138.17,
"step": 8960,
"token_acc": 0.9330804530345964,
"train_speed(iter/s)": 0.132959
}
],
"logging_steps": 256,
"max_steps": 10301,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1037325459482546e+19,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}