{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08, "grad_norm": 2.2252185344696045, "learning_rate": 1.0526315789473684e-05, "loss": 1.6436, "step": 5 }, { "epoch": 0.16, "grad_norm": 1.4878095388412476, "learning_rate": 2.368421052631579e-05, "loss": 0.772, "step": 10 }, { "epoch": 0.24, "grad_norm": 0.5770473480224609, "learning_rate": 3.6842105263157895e-05, "loss": 0.1927, "step": 15 }, { "epoch": 0.32, "grad_norm": 0.24875187873840332, "learning_rate": 5e-05, "loss": 0.08, "step": 20 }, { "epoch": 0.4, "grad_norm": 0.2741605043411255, "learning_rate": 4.989335440737586e-05, "loss": 0.0735, "step": 25 }, { "epoch": 0.48, "grad_norm": 0.10829973965883255, "learning_rate": 4.957432749209755e-05, "loss": 0.0719, "step": 30 }, { "epoch": 0.56, "grad_norm": 0.16560488939285278, "learning_rate": 4.9045641079320484e-05, "loss": 0.0702, "step": 35 }, { "epoch": 0.64, "grad_norm": 0.1393565684556961, "learning_rate": 4.8311805735108894e-05, "loss": 0.071, "step": 40 }, { "epoch": 0.72, "grad_norm": 0.1896258294582367, "learning_rate": 4.7379082283876566e-05, "loss": 0.0677, "step": 45 }, { "epoch": 0.8, "grad_norm": 0.1210947260260582, "learning_rate": 4.625542839324036e-05, "loss": 0.0599, "step": 50 }, { "epoch": 0.88, "grad_norm": 0.16256098449230194, "learning_rate": 4.4950430682006e-05, "loss": 0.0685, "step": 55 }, { "epoch": 0.96, "grad_norm": 0.10290851444005966, "learning_rate": 4.347522293051648e-05, "loss": 0.0653, "step": 60 }, { "epoch": 1.032, "grad_norm": 0.10546600073575974, "learning_rate": 4.184239109116393e-05, "loss": 0.0655, "step": 65 }, { "epoch": 1.112, "grad_norm": 0.09690473228693008, "learning_rate": 4.0065865909481417e-05, "loss": 0.0629, "step": 70 }, { "epoch": 1.192, "grad_norm": 0.11338075995445251, "learning_rate": 3.81608040719339e-05, "loss": 0.0594, "step": 75 }, { "epoch": 1.272, "grad_norm": 0.11339253932237625, "learning_rate": 3.6143458894413465e-05, "loss": 0.0594, "step": 80 }, { "epoch": 1.3519999999999999, "grad_norm": 0.18642409145832062, "learning_rate": 3.403104165467883e-05, "loss": 0.058, "step": 85 }, { "epoch": 1.432, "grad_norm": 0.16320252418518066, "learning_rate": 3.1841574751802076e-05, "loss": 0.0598, "step": 90 }, { "epoch": 1.512, "grad_norm": 0.1201169490814209, "learning_rate": 2.9593737945414264e-05, "loss": 0.064, "step": 95 }, { "epoch": 1.592, "grad_norm": 0.18611621856689453, "learning_rate": 2.7306708986582553e-05, "loss": 0.0629, "step": 100 }, { "epoch": 1.6720000000000002, "grad_norm": 0.09128480404615402, "learning_rate": 2.5e-05, "loss": 0.0609, "step": 105 }, { "epoch": 1.752, "grad_norm": 0.1125178411602974, "learning_rate": 2.2693291013417453e-05, "loss": 0.0641, "step": 110 }, { "epoch": 1.8319999999999999, "grad_norm": 0.1417843997478485, "learning_rate": 2.0406262054585738e-05, "loss": 0.0602, "step": 115 }, { "epoch": 1.912, "grad_norm": 0.10574869811534882, "learning_rate": 1.815842524819793e-05, "loss": 0.0604, "step": 120 }, { "epoch": 1.992, "grad_norm": 0.12096602469682693, "learning_rate": 1.5968958345321178e-05, "loss": 0.0557, "step": 125 }, { "epoch": 2.064, "grad_norm": 0.13963642716407776, "learning_rate": 1.3856541105586545e-05, "loss": 0.0545, "step": 130 }, { "epoch": 2.144, "grad_norm": 0.10345666855573654, "learning_rate": 1.1839195928066102e-05, "loss": 0.0503, "step": 135 }, { "epoch": 2.224, "grad_norm": 0.11484814435243607, "learning_rate": 9.934134090518593e-06, "loss": 0.0543, "step": 140 }, { "epoch": 2.304, "grad_norm": 0.1633157581090927, "learning_rate": 8.15760890883607e-06, "loss": 0.0538, "step": 145 }, { "epoch": 2.384, "grad_norm": 0.13634610176086426, "learning_rate": 6.524777069483526e-06, "loss": 0.0521, "step": 150 }, { "epoch": 2.464, "grad_norm": 0.12374045699834824, "learning_rate": 5.049569317994013e-06, "loss": 0.057, "step": 155 }, { "epoch": 2.544, "grad_norm": 0.11331922560930252, "learning_rate": 3.7445716067596503e-06, "loss": 0.0545, "step": 160 }, { "epoch": 2.624, "grad_norm": 0.12591035664081573, "learning_rate": 2.6209177161234445e-06, "loss": 0.0544, "step": 165 }, { "epoch": 2.7039999999999997, "grad_norm": 0.13460087776184082, "learning_rate": 1.6881942648911076e-06, "loss": 0.0533, "step": 170 }, { "epoch": 2.784, "grad_norm": 0.16594184935092926, "learning_rate": 9.54358920679524e-07, "loss": 0.054, "step": 175 }, { "epoch": 2.864, "grad_norm": 0.13098712265491486, "learning_rate": 4.256725079024554e-07, "loss": 0.0549, "step": 180 }, { "epoch": 2.944, "grad_norm": 0.10799074172973633, "learning_rate": 1.0664559262413831e-07, "loss": 0.0532, "step": 185 }, { "epoch": 3.0, "step": 189, "total_flos": 1422043529084928.0, "train_loss": 0.0, "train_runtime": 0.0314, "train_samples_per_second": 47715.321, "train_steps_per_second": 6012.131 } ], "logging_steps": 5, "max_steps": 189, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1422043529084928.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }