{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8737651998737894, "eval_steps": 500, "global_step": 9000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 9.708502220819883e-05, "grad_norm": 96.5, "learning_rate": 1.9379844961240311e-07, "loss": 4.2495036125183105, "memory(GiB)": 112.92, "step": 1, "token_acc": 0.40126382306477093, "train_speed(iter/s)": 0.130363 }, { "epoch": 0.0248537656852989, "grad_norm": 6.84375, "learning_rate": 4.96124031007752e-05, "loss": 2.0199908088235294, "memory(GiB)": 138.16, "step": 256, "token_acc": 0.5958056756256362, "train_speed(iter/s)": 0.135217 }, { "epoch": 0.0497075313705978, "grad_norm": 2.6875, "learning_rate": 9.92248062015504e-05, "loss": 1.4577598571777344, "memory(GiB)": 138.17, "step": 512, "token_acc": 0.6785023086521644, "train_speed(iter/s)": 0.135385 }, { "epoch": 0.0745612970558967, "grad_norm": 2.296875, "learning_rate": 9.983643805989245e-05, "loss": 1.4217802286148071, "memory(GiB)": 138.17, "step": 768, "token_acc": 0.6849496734466487, "train_speed(iter/s)": 0.134581 }, { "epoch": 0.0994150627411956, "grad_norm": 1.8984375, "learning_rate": 9.933643638247476e-05, "loss": 1.3426355123519897, "memory(GiB)": 138.17, "step": 1024, "token_acc": 0.699673728686982, "train_speed(iter/s)": 0.132515 }, { "epoch": 0.1242688284264945, "grad_norm": 1.84375, "learning_rate": 9.850332959964666e-05, "loss": 1.2444982528686523, "memory(GiB)": 138.17, "step": 1280, "token_acc": 0.7187271993751019, "train_speed(iter/s)": 0.133056 }, { "epoch": 0.1491225941117934, "grad_norm": 1.859375, "learning_rate": 9.73427426033351e-05, "loss": 1.1624003648757935, "memory(GiB)": 138.17, "step": 1536, "token_acc": 0.7339836063834865, "train_speed(iter/s)": 0.133627 }, { "epoch": 0.1739763597970923, "grad_norm": 1.7421875, "learning_rate": 9.58625113355353e-05, "loss": 1.095345377922058, "memory(GiB)": 138.17, "step": 1792, "token_acc": 0.7477444378213578, "train_speed(iter/s)": 0.133066 }, { "epoch": 0.1988301254823912, "grad_norm": 1.796875, "learning_rate": 9.407262988233549e-05, "loss": 1.0396682024002075, "memory(GiB)": 138.17, "step": 2048, "token_acc": 0.7577293820771361, "train_speed(iter/s)": 0.132805 }, { "epoch": 0.2236838911676901, "grad_norm": 2.078125, "learning_rate": 9.19851829967875e-05, "loss": 0.9765125513076782, "memory(GiB)": 138.17, "step": 2304, "token_acc": 0.7712227904219364, "train_speed(iter/s)": 0.133242 }, { "epoch": 0.248537656852989, "grad_norm": 1.5703125, "learning_rate": 8.961426450620912e-05, "loss": 0.920336127281189, "memory(GiB)": 138.17, "step": 2560, "token_acc": 0.7830755957422817, "train_speed(iter/s)": 0.133383 }, { "epoch": 0.2733914225382879, "grad_norm": 1.59375, "learning_rate": 8.69758821548079e-05, "loss": 0.8801365494728088, "memory(GiB)": 138.17, "step": 2816, "token_acc": 0.7913349866408025, "train_speed(iter/s)": 0.133433 }, { "epoch": 0.2982451882235868, "grad_norm": 1.1640625, "learning_rate": 8.408784952410122e-05, "loss": 0.8334779739379883, "memory(GiB)": 138.17, "step": 3072, "token_acc": 0.8006175937055493, "train_speed(iter/s)": 0.132889 }, { "epoch": 0.3230989539088857, "grad_norm": 1.6875, "learning_rate": 8.096966576085406e-05, "loss": 0.7884229421615601, "memory(GiB)": 138.17, "step": 3328, "token_acc": 0.8102322071595001, "train_speed(iter/s)": 0.133073 }, { "epoch": 0.3479527195941846, "grad_norm": 1.4921875, "learning_rate": 7.764238392457582e-05, "loss": 0.7397578954696655, "memory(GiB)": 138.17, "step": 3584, "token_acc": 0.8212528591555482, "train_speed(iter/s)": 0.133335 }, { "epoch": 0.3728064852794835, "grad_norm": 2.1875, "learning_rate": 7.412846884345582e-05, "loss": 0.7087571024894714, "memory(GiB)": 138.17, "step": 3840, "token_acc": 0.8286589691203703, "train_speed(iter/s)": 0.133468 }, { "epoch": 0.3976602509647824, "grad_norm": 1.1953125, "learning_rate": 7.045164543845158e-05, "loss": 0.6600534319877625, "memory(GiB)": 138.17, "step": 4096, "token_acc": 0.8389953998490116, "train_speed(iter/s)": 0.133269 }, { "epoch": 0.4225140166500813, "grad_norm": 1.984375, "learning_rate": 6.663673853960154e-05, "loss": 0.6196721196174622, "memory(GiB)": 138.17, "step": 4352, "token_acc": 0.8484769522886115, "train_speed(iter/s)": 0.133418 }, { "epoch": 0.4473677823353802, "grad_norm": 1.203125, "learning_rate": 6.270950527607537e-05, "loss": 0.5864973068237305, "memory(GiB)": 138.17, "step": 4608, "token_acc": 0.8560292743162837, "train_speed(iter/s)": 0.133475 }, { "epoch": 0.4722215480206791, "grad_norm": 1.203125, "learning_rate": 5.86964611716145e-05, "loss": 0.5385364294052124, "memory(GiB)": 138.17, "step": 4864, "token_acc": 0.86720534525908, "train_speed(iter/s)": 0.133422 }, { "epoch": 0.497075313705978, "grad_norm": 1.359375, "learning_rate": 5.4624701119515856e-05, "loss": 0.49772173166275024, "memory(GiB)": 138.17, "step": 5120, "token_acc": 0.8767477774531491, "train_speed(iter/s)": 0.133344 }, { "epoch": 0.5219290793912769, "grad_norm": 1.40625, "learning_rate": 5.0521716445882614e-05, "loss": 0.46582430601119995, "memory(GiB)": 138.17, "step": 5376, "token_acc": 0.8850724068459814, "train_speed(iter/s)": 0.13342 }, { "epoch": 0.5467828450765758, "grad_norm": 1.4140625, "learning_rate": 4.64152092962774e-05, "loss": 0.4441249668598175, "memory(GiB)": 138.17, "step": 5632, "token_acc": 0.8896486479315997, "train_speed(iter/s)": 0.133319 }, { "epoch": 0.5716366107618747, "grad_norm": 1.8203125, "learning_rate": 4.2332905598984413e-05, "loss": 0.40981537103652954, "memory(GiB)": 138.17, "step": 5888, "token_acc": 0.8990366693094052, "train_speed(iter/s)": 0.133434 }, { "epoch": 0.5964903764471736, "grad_norm": 1.5234375, "learning_rate": 3.830236786769761e-05, "loss": 0.3865773379802704, "memory(GiB)": 138.17, "step": 6144, "token_acc": 0.9034815882027802, "train_speed(iter/s)": 0.133189 }, { "epoch": 0.6213441421324725, "grad_norm": 1.03125, "learning_rate": 3.4350809107536214e-05, "loss": 0.36623093485832214, "memory(GiB)": 138.17, "step": 6400, "token_acc": 0.9082446782242596, "train_speed(iter/s)": 0.133233 }, { "epoch": 0.6461979078177714, "grad_norm": 2.203125, "learning_rate": 3.0504909080839294e-05, "loss": 0.34115397930145264, "memory(GiB)": 138.17, "step": 6656, "token_acc": 0.914435132291292, "train_speed(iter/s)": 0.133334 }, { "epoch": 0.6710516735030703, "grad_norm": 1.6015625, "learning_rate": 2.6790634173258577e-05, "loss": 0.3342404067516327, "memory(GiB)": 138.17, "step": 6912, "token_acc": 0.916860246202295, "train_speed(iter/s)": 0.133453 }, { "epoch": 0.6959054391883692, "grad_norm": 1.3515625, "learning_rate": 2.323306207636102e-05, "loss": 0.3142353296279907, "memory(GiB)": 138.17, "step": 7168, "token_acc": 0.9221213834353058, "train_speed(iter/s)": 0.133271 }, { "epoch": 0.7207592048736681, "grad_norm": 2.03125, "learning_rate": 1.9856212470432345e-05, "loss": 0.30621492862701416, "memory(GiB)": 138.17, "step": 7424, "token_acc": 0.9249788937888913, "train_speed(iter/s)": 0.133231 }, { "epoch": 0.745612970558967, "grad_norm": 1.5078125, "learning_rate": 1.6682884850661395e-05, "loss": 0.2921682596206665, "memory(GiB)": 138.17, "step": 7680, "token_acc": 0.9275989615640866, "train_speed(iter/s)": 0.133307 }, { "epoch": 0.7704667362442659, "grad_norm": 1.3671875, "learning_rate": 1.3734504591655495e-05, "loss": 0.2854159474372864, "memory(GiB)": 138.17, "step": 7936, "token_acc": 0.9285902741314146, "train_speed(iter/s)": 0.133282 }, { "epoch": 0.7953205019295648, "grad_norm": 2.0625, "learning_rate": 1.1030978289613726e-05, "loss": 0.28136610984802246, "memory(GiB)": 138.17, "step": 8192, "token_acc": 0.9303321847535716, "train_speed(iter/s)": 0.132989 }, { "epoch": 0.8201742676148637, "grad_norm": 1.28125, "learning_rate": 8.590559358845118e-06, "loss": 0.2735920548439026, "memory(GiB)": 138.17, "step": 8448, "token_acc": 0.931173780136838, "train_speed(iter/s)": 0.13298 }, { "epoch": 0.8450280333001626, "grad_norm": 1.3671875, "learning_rate": 6.4297247900848125e-06, "loss": 0.2691897451877594, "memory(GiB)": 138.17, "step": 8704, "token_acc": 0.9327428202220522, "train_speed(iter/s)": 0.132967 }, { "epoch": 0.8698817989854615, "grad_norm": 1.5234375, "learning_rate": 4.563063902699582e-06, "loss": 0.26750853657722473, "memory(GiB)": 138.17, "step": 8960, "token_acc": 0.9330804530345964, "train_speed(iter/s)": 0.132959 } ], "logging_steps": 256, "max_steps": 10301, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.1037325459482546e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }