| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8737651998737894, | |
| "eval_steps": 500, | |
| "global_step": 9000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 9.708502220819883e-05, | |
| "grad_norm": 96.5, | |
| "learning_rate": 1.9379844961240311e-07, | |
| "loss": 4.2495036125183105, | |
| "memory(GiB)": 112.92, | |
| "step": 1, | |
| "token_acc": 0.40126382306477093, | |
| "train_speed(iter/s)": 0.130363 | |
| }, | |
| { | |
| "epoch": 0.0248537656852989, | |
| "grad_norm": 6.84375, | |
| "learning_rate": 4.96124031007752e-05, | |
| "loss": 2.0199908088235294, | |
| "memory(GiB)": 138.16, | |
| "step": 256, | |
| "token_acc": 0.5958056756256362, | |
| "train_speed(iter/s)": 0.135217 | |
| }, | |
| { | |
| "epoch": 0.0497075313705978, | |
| "grad_norm": 2.6875, | |
| "learning_rate": 9.92248062015504e-05, | |
| "loss": 1.4577598571777344, | |
| "memory(GiB)": 138.17, | |
| "step": 512, | |
| "token_acc": 0.6785023086521644, | |
| "train_speed(iter/s)": 0.135385 | |
| }, | |
| { | |
| "epoch": 0.0745612970558967, | |
| "grad_norm": 2.296875, | |
| "learning_rate": 9.983643805989245e-05, | |
| "loss": 1.4217802286148071, | |
| "memory(GiB)": 138.17, | |
| "step": 768, | |
| "token_acc": 0.6849496734466487, | |
| "train_speed(iter/s)": 0.134581 | |
| }, | |
| { | |
| "epoch": 0.0994150627411956, | |
| "grad_norm": 1.8984375, | |
| "learning_rate": 9.933643638247476e-05, | |
| "loss": 1.3426355123519897, | |
| "memory(GiB)": 138.17, | |
| "step": 1024, | |
| "token_acc": 0.699673728686982, | |
| "train_speed(iter/s)": 0.132515 | |
| }, | |
| { | |
| "epoch": 0.1242688284264945, | |
| "grad_norm": 1.84375, | |
| "learning_rate": 9.850332959964666e-05, | |
| "loss": 1.2444982528686523, | |
| "memory(GiB)": 138.17, | |
| "step": 1280, | |
| "token_acc": 0.7187271993751019, | |
| "train_speed(iter/s)": 0.133056 | |
| }, | |
| { | |
| "epoch": 0.1491225941117934, | |
| "grad_norm": 1.859375, | |
| "learning_rate": 9.73427426033351e-05, | |
| "loss": 1.1624003648757935, | |
| "memory(GiB)": 138.17, | |
| "step": 1536, | |
| "token_acc": 0.7339836063834865, | |
| "train_speed(iter/s)": 0.133627 | |
| }, | |
| { | |
| "epoch": 0.1739763597970923, | |
| "grad_norm": 1.7421875, | |
| "learning_rate": 9.58625113355353e-05, | |
| "loss": 1.095345377922058, | |
| "memory(GiB)": 138.17, | |
| "step": 1792, | |
| "token_acc": 0.7477444378213578, | |
| "train_speed(iter/s)": 0.133066 | |
| }, | |
| { | |
| "epoch": 0.1988301254823912, | |
| "grad_norm": 1.796875, | |
| "learning_rate": 9.407262988233549e-05, | |
| "loss": 1.0396682024002075, | |
| "memory(GiB)": 138.17, | |
| "step": 2048, | |
| "token_acc": 0.7577293820771361, | |
| "train_speed(iter/s)": 0.132805 | |
| }, | |
| { | |
| "epoch": 0.2236838911676901, | |
| "grad_norm": 2.078125, | |
| "learning_rate": 9.19851829967875e-05, | |
| "loss": 0.9765125513076782, | |
| "memory(GiB)": 138.17, | |
| "step": 2304, | |
| "token_acc": 0.7712227904219364, | |
| "train_speed(iter/s)": 0.133242 | |
| }, | |
| { | |
| "epoch": 0.248537656852989, | |
| "grad_norm": 1.5703125, | |
| "learning_rate": 8.961426450620912e-05, | |
| "loss": 0.920336127281189, | |
| "memory(GiB)": 138.17, | |
| "step": 2560, | |
| "token_acc": 0.7830755957422817, | |
| "train_speed(iter/s)": 0.133383 | |
| }, | |
| { | |
| "epoch": 0.2733914225382879, | |
| "grad_norm": 1.59375, | |
| "learning_rate": 8.69758821548079e-05, | |
| "loss": 0.8801365494728088, | |
| "memory(GiB)": 138.17, | |
| "step": 2816, | |
| "token_acc": 0.7913349866408025, | |
| "train_speed(iter/s)": 0.133433 | |
| }, | |
| { | |
| "epoch": 0.2982451882235868, | |
| "grad_norm": 1.1640625, | |
| "learning_rate": 8.408784952410122e-05, | |
| "loss": 0.8334779739379883, | |
| "memory(GiB)": 138.17, | |
| "step": 3072, | |
| "token_acc": 0.8006175937055493, | |
| "train_speed(iter/s)": 0.132889 | |
| }, | |
| { | |
| "epoch": 0.3230989539088857, | |
| "grad_norm": 1.6875, | |
| "learning_rate": 8.096966576085406e-05, | |
| "loss": 0.7884229421615601, | |
| "memory(GiB)": 138.17, | |
| "step": 3328, | |
| "token_acc": 0.8102322071595001, | |
| "train_speed(iter/s)": 0.133073 | |
| }, | |
| { | |
| "epoch": 0.3479527195941846, | |
| "grad_norm": 1.4921875, | |
| "learning_rate": 7.764238392457582e-05, | |
| "loss": 0.7397578954696655, | |
| "memory(GiB)": 138.17, | |
| "step": 3584, | |
| "token_acc": 0.8212528591555482, | |
| "train_speed(iter/s)": 0.133335 | |
| }, | |
| { | |
| "epoch": 0.3728064852794835, | |
| "grad_norm": 2.1875, | |
| "learning_rate": 7.412846884345582e-05, | |
| "loss": 0.7087571024894714, | |
| "memory(GiB)": 138.17, | |
| "step": 3840, | |
| "token_acc": 0.8286589691203703, | |
| "train_speed(iter/s)": 0.133468 | |
| }, | |
| { | |
| "epoch": 0.3976602509647824, | |
| "grad_norm": 1.1953125, | |
| "learning_rate": 7.045164543845158e-05, | |
| "loss": 0.6600534319877625, | |
| "memory(GiB)": 138.17, | |
| "step": 4096, | |
| "token_acc": 0.8389953998490116, | |
| "train_speed(iter/s)": 0.133269 | |
| }, | |
| { | |
| "epoch": 0.4225140166500813, | |
| "grad_norm": 1.984375, | |
| "learning_rate": 6.663673853960154e-05, | |
| "loss": 0.6196721196174622, | |
| "memory(GiB)": 138.17, | |
| "step": 4352, | |
| "token_acc": 0.8484769522886115, | |
| "train_speed(iter/s)": 0.133418 | |
| }, | |
| { | |
| "epoch": 0.4473677823353802, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 6.270950527607537e-05, | |
| "loss": 0.5864973068237305, | |
| "memory(GiB)": 138.17, | |
| "step": 4608, | |
| "token_acc": 0.8560292743162837, | |
| "train_speed(iter/s)": 0.133475 | |
| }, | |
| { | |
| "epoch": 0.4722215480206791, | |
| "grad_norm": 1.203125, | |
| "learning_rate": 5.86964611716145e-05, | |
| "loss": 0.5385364294052124, | |
| "memory(GiB)": 138.17, | |
| "step": 4864, | |
| "token_acc": 0.86720534525908, | |
| "train_speed(iter/s)": 0.133422 | |
| }, | |
| { | |
| "epoch": 0.497075313705978, | |
| "grad_norm": 1.359375, | |
| "learning_rate": 5.4624701119515856e-05, | |
| "loss": 0.49772173166275024, | |
| "memory(GiB)": 138.17, | |
| "step": 5120, | |
| "token_acc": 0.8767477774531491, | |
| "train_speed(iter/s)": 0.133344 | |
| }, | |
| { | |
| "epoch": 0.5219290793912769, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 5.0521716445882614e-05, | |
| "loss": 0.46582430601119995, | |
| "memory(GiB)": 138.17, | |
| "step": 5376, | |
| "token_acc": 0.8850724068459814, | |
| "train_speed(iter/s)": 0.13342 | |
| }, | |
| { | |
| "epoch": 0.5467828450765758, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 4.64152092962774e-05, | |
| "loss": 0.4441249668598175, | |
| "memory(GiB)": 138.17, | |
| "step": 5632, | |
| "token_acc": 0.8896486479315997, | |
| "train_speed(iter/s)": 0.133319 | |
| }, | |
| { | |
| "epoch": 0.5716366107618747, | |
| "grad_norm": 1.8203125, | |
| "learning_rate": 4.2332905598984413e-05, | |
| "loss": 0.40981537103652954, | |
| "memory(GiB)": 138.17, | |
| "step": 5888, | |
| "token_acc": 0.8990366693094052, | |
| "train_speed(iter/s)": 0.133434 | |
| }, | |
| { | |
| "epoch": 0.5964903764471736, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 3.830236786769761e-05, | |
| "loss": 0.3865773379802704, | |
| "memory(GiB)": 138.17, | |
| "step": 6144, | |
| "token_acc": 0.9034815882027802, | |
| "train_speed(iter/s)": 0.133189 | |
| }, | |
| { | |
| "epoch": 0.6213441421324725, | |
| "grad_norm": 1.03125, | |
| "learning_rate": 3.4350809107536214e-05, | |
| "loss": 0.36623093485832214, | |
| "memory(GiB)": 138.17, | |
| "step": 6400, | |
| "token_acc": 0.9082446782242596, | |
| "train_speed(iter/s)": 0.133233 | |
| }, | |
| { | |
| "epoch": 0.6461979078177714, | |
| "grad_norm": 2.203125, | |
| "learning_rate": 3.0504909080839294e-05, | |
| "loss": 0.34115397930145264, | |
| "memory(GiB)": 138.17, | |
| "step": 6656, | |
| "token_acc": 0.914435132291292, | |
| "train_speed(iter/s)": 0.133334 | |
| }, | |
| { | |
| "epoch": 0.6710516735030703, | |
| "grad_norm": 1.6015625, | |
| "learning_rate": 2.6790634173258577e-05, | |
| "loss": 0.3342404067516327, | |
| "memory(GiB)": 138.17, | |
| "step": 6912, | |
| "token_acc": 0.916860246202295, | |
| "train_speed(iter/s)": 0.133453 | |
| }, | |
| { | |
| "epoch": 0.6959054391883692, | |
| "grad_norm": 1.3515625, | |
| "learning_rate": 2.323306207636102e-05, | |
| "loss": 0.3142353296279907, | |
| "memory(GiB)": 138.17, | |
| "step": 7168, | |
| "token_acc": 0.9221213834353058, | |
| "train_speed(iter/s)": 0.133271 | |
| }, | |
| { | |
| "epoch": 0.7207592048736681, | |
| "grad_norm": 2.03125, | |
| "learning_rate": 1.9856212470432345e-05, | |
| "loss": 0.30621492862701416, | |
| "memory(GiB)": 138.17, | |
| "step": 7424, | |
| "token_acc": 0.9249788937888913, | |
| "train_speed(iter/s)": 0.133231 | |
| }, | |
| { | |
| "epoch": 0.745612970558967, | |
| "grad_norm": 1.5078125, | |
| "learning_rate": 1.6682884850661395e-05, | |
| "loss": 0.2921682596206665, | |
| "memory(GiB)": 138.17, | |
| "step": 7680, | |
| "token_acc": 0.9275989615640866, | |
| "train_speed(iter/s)": 0.133307 | |
| }, | |
| { | |
| "epoch": 0.7704667362442659, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 1.3734504591655495e-05, | |
| "loss": 0.2854159474372864, | |
| "memory(GiB)": 138.17, | |
| "step": 7936, | |
| "token_acc": 0.9285902741314146, | |
| "train_speed(iter/s)": 0.133282 | |
| }, | |
| { | |
| "epoch": 0.7953205019295648, | |
| "grad_norm": 2.0625, | |
| "learning_rate": 1.1030978289613726e-05, | |
| "loss": 0.28136610984802246, | |
| "memory(GiB)": 138.17, | |
| "step": 8192, | |
| "token_acc": 0.9303321847535716, | |
| "train_speed(iter/s)": 0.132989 | |
| }, | |
| { | |
| "epoch": 0.8201742676148637, | |
| "grad_norm": 1.28125, | |
| "learning_rate": 8.590559358845118e-06, | |
| "loss": 0.2735920548439026, | |
| "memory(GiB)": 138.17, | |
| "step": 8448, | |
| "token_acc": 0.931173780136838, | |
| "train_speed(iter/s)": 0.13298 | |
| }, | |
| { | |
| "epoch": 0.8450280333001626, | |
| "grad_norm": 1.3671875, | |
| "learning_rate": 6.4297247900848125e-06, | |
| "loss": 0.2691897451877594, | |
| "memory(GiB)": 138.17, | |
| "step": 8704, | |
| "token_acc": 0.9327428202220522, | |
| "train_speed(iter/s)": 0.132967 | |
| }, | |
| { | |
| "epoch": 0.8698817989854615, | |
| "grad_norm": 1.5234375, | |
| "learning_rate": 4.563063902699582e-06, | |
| "loss": 0.26750853657722473, | |
| "memory(GiB)": 138.17, | |
| "step": 8960, | |
| "token_acc": 0.9330804530345964, | |
| "train_speed(iter/s)": 0.132959 | |
| } | |
| ], | |
| "logging_steps": 256, | |
| "max_steps": 10301, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1037325459482546e+19, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |