| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 3.0, | |
| "eval_steps": 500, | |
| "global_step": 189, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 2.2252185344696045, | |
| "learning_rate": 1.0526315789473684e-05, | |
| "loss": 1.6436, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 1.4878095388412476, | |
| "learning_rate": 2.368421052631579e-05, | |
| "loss": 0.772, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 0.5770473480224609, | |
| "learning_rate": 3.6842105263157895e-05, | |
| "loss": 0.1927, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 0.24875187873840332, | |
| "learning_rate": 5e-05, | |
| "loss": 0.08, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 0.2741605043411255, | |
| "learning_rate": 4.989335440737586e-05, | |
| "loss": 0.0735, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 0.10829973965883255, | |
| "learning_rate": 4.957432749209755e-05, | |
| "loss": 0.0719, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 0.16560488939285278, | |
| "learning_rate": 4.9045641079320484e-05, | |
| "loss": 0.0702, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 0.1393565684556961, | |
| "learning_rate": 4.8311805735108894e-05, | |
| "loss": 0.071, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 0.1896258294582367, | |
| "learning_rate": 4.7379082283876566e-05, | |
| "loss": 0.0677, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 0.1210947260260582, | |
| "learning_rate": 4.625542839324036e-05, | |
| "loss": 0.0599, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 0.16256098449230194, | |
| "learning_rate": 4.4950430682006e-05, | |
| "loss": 0.0685, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 0.10290851444005966, | |
| "learning_rate": 4.347522293051648e-05, | |
| "loss": 0.0653, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 1.032, | |
| "grad_norm": 0.10546600073575974, | |
| "learning_rate": 4.184239109116393e-05, | |
| "loss": 0.0655, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 1.112, | |
| "grad_norm": 0.09690473228693008, | |
| "learning_rate": 4.0065865909481417e-05, | |
| "loss": 0.0629, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.192, | |
| "grad_norm": 0.11338075995445251, | |
| "learning_rate": 3.81608040719339e-05, | |
| "loss": 0.0594, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 1.272, | |
| "grad_norm": 0.11339253932237625, | |
| "learning_rate": 3.6143458894413465e-05, | |
| "loss": 0.0594, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.3519999999999999, | |
| "grad_norm": 0.18642409145832062, | |
| "learning_rate": 3.403104165467883e-05, | |
| "loss": 0.058, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 1.432, | |
| "grad_norm": 0.16320252418518066, | |
| "learning_rate": 3.1841574751802076e-05, | |
| "loss": 0.0598, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.512, | |
| "grad_norm": 0.1201169490814209, | |
| "learning_rate": 2.9593737945414264e-05, | |
| "loss": 0.064, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 1.592, | |
| "grad_norm": 0.18611621856689453, | |
| "learning_rate": 2.7306708986582553e-05, | |
| "loss": 0.0629, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.6720000000000002, | |
| "grad_norm": 0.09128480404615402, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0609, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 1.752, | |
| "grad_norm": 0.1125178411602974, | |
| "learning_rate": 2.2693291013417453e-05, | |
| "loss": 0.0641, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.8319999999999999, | |
| "grad_norm": 0.1417843997478485, | |
| "learning_rate": 2.0406262054585738e-05, | |
| "loss": 0.0602, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 1.912, | |
| "grad_norm": 0.10574869811534882, | |
| "learning_rate": 1.815842524819793e-05, | |
| "loss": 0.0604, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.992, | |
| "grad_norm": 0.12096602469682693, | |
| "learning_rate": 1.5968958345321178e-05, | |
| "loss": 0.0557, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 2.064, | |
| "grad_norm": 0.13963642716407776, | |
| "learning_rate": 1.3856541105586545e-05, | |
| "loss": 0.0545, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 2.144, | |
| "grad_norm": 0.10345666855573654, | |
| "learning_rate": 1.1839195928066102e-05, | |
| "loss": 0.0503, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 2.224, | |
| "grad_norm": 0.11484814435243607, | |
| "learning_rate": 9.934134090518593e-06, | |
| "loss": 0.0543, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.304, | |
| "grad_norm": 0.1633157581090927, | |
| "learning_rate": 8.15760890883607e-06, | |
| "loss": 0.0538, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 2.384, | |
| "grad_norm": 0.13634610176086426, | |
| "learning_rate": 6.524777069483526e-06, | |
| "loss": 0.0521, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.464, | |
| "grad_norm": 0.12374045699834824, | |
| "learning_rate": 5.049569317994013e-06, | |
| "loss": 0.057, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 2.544, | |
| "grad_norm": 0.11331922560930252, | |
| "learning_rate": 3.7445716067596503e-06, | |
| "loss": 0.0545, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.624, | |
| "grad_norm": 0.12591035664081573, | |
| "learning_rate": 2.6209177161234445e-06, | |
| "loss": 0.0544, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 2.7039999999999997, | |
| "grad_norm": 0.13460087776184082, | |
| "learning_rate": 1.6881942648911076e-06, | |
| "loss": 0.0533, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.784, | |
| "grad_norm": 0.16594184935092926, | |
| "learning_rate": 9.54358920679524e-07, | |
| "loss": 0.054, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 2.864, | |
| "grad_norm": 0.13098712265491486, | |
| "learning_rate": 4.256725079024554e-07, | |
| "loss": 0.0549, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.944, | |
| "grad_norm": 0.10799074172973633, | |
| "learning_rate": 1.0664559262413831e-07, | |
| "loss": 0.0532, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 3.0, | |
| "step": 189, | |
| "total_flos": 1422043529084928.0, | |
| "train_loss": 0.0, | |
| "train_runtime": 0.0314, | |
| "train_samples_per_second": 47715.321, | |
| "train_steps_per_second": 6012.131 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 189, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1422043529084928.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |