{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9985652797704447,
  "eval_steps": 500,
  "global_step": 261,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 0.45229363441467285,
      "learning_rate": 9.990947518281311e-05,
      "loss": 1.2293,
      "step": 5
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.45895835757255554,
      "learning_rate": 9.963822852095345e-05,
      "loss": 1.0332,
      "step": 10
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.148818850517273,
      "learning_rate": 9.918724219660013e-05,
      "loss": 1.1688,
      "step": 15
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.6193984150886536,
      "learning_rate": 9.855814922793582e-05,
      "loss": 0.8491,
      "step": 20
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.5973692536354065,
      "learning_rate": 9.775322755599978e-05,
      "loss": 0.8356,
      "step": 25
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.471483439207077,
      "learning_rate": 9.677539179628005e-05,
      "loss": 0.782,
      "step": 30
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.5395711660385132,
      "learning_rate": 9.562818268491216e-05,
      "loss": 0.7464,
      "step": 35
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7068768739700317,
      "learning_rate": 9.431575425769938e-05,
      "loss": 0.6813,
      "step": 40
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.7348042130470276,
      "learning_rate": 9.284285880837946e-05,
      "loss": 0.6508,
      "step": 45
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.763695478439331,
      "learning_rate": 9.121482968060384e-05,
      "loss": 0.7296,
      "step": 50
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.7689581513404846,
      "learning_rate": 8.943756195593916e-05,
      "loss": 0.604,
      "step": 55
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.1951208114624023,
      "learning_rate": 8.751749110782012e-05,
      "loss": 0.594,
      "step": 60
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.9435452818870544,
      "learning_rate": 8.546156969874723e-05,
      "loss": 0.6778,
      "step": 65
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.8245531320571899,
      "learning_rate": 8.327724220510873e-05,
      "loss": 0.6173,
      "step": 70
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.6063089966773987,
      "learning_rate": 8.097241806078615e-05,
      "loss": 0.7239,
      "step": 75
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.4883978068828583,
      "learning_rate": 7.855544301715203e-05,
      "loss": 0.5158,
      "step": 80
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.6936870217323303,
      "learning_rate": 7.603506892316512e-05,
      "loss": 0.5011,
      "step": 85
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.553333282470703,
      "learning_rate": 7.342042203498951e-05,
      "loss": 0.6461,
      "step": 90
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.9251458644866943,
      "learning_rate": 7.07209699698876e-05,
      "loss": 0.7273,
      "step": 95
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.699193000793457,
      "learning_rate": 6.79464874240473e-05,
      "loss": 0.5878,
      "step": 100
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.661618173122406,
      "learning_rate": 6.510702077847863e-05,
      "loss": 0.4787,
      "step": 105
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.4883117079734802,
      "learning_rate": 6.221285172114157e-05,
      "loss": 0.5766,
      "step": 110
    },
    {
      "epoch": 0.44,
      "grad_norm": 2.715449810028076,
      "learning_rate": 5.927446001702899e-05,
      "loss": 0.5553,
      "step": 115
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.6433199644088745,
      "learning_rate": 5.6302485561014475e-05,
      "loss": 0.7007,
      "step": 120
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.7128715515136719,
      "learning_rate": 5.330768985087059e-05,
      "loss": 0.5773,
      "step": 125
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.7177631258964539,
      "learning_rate": 5.030091701996428e-05,
      "loss": 0.5787,
      "step": 130
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.6795344948768616,
      "learning_rate": 4.729305457072913e-05,
      "loss": 0.5013,
      "step": 135
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.1840591430664062,
      "learning_rate": 4.429499395109877e-05,
      "loss": 0.6419,
      "step": 140
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.7961875200271606,
      "learning_rate": 4.131759111665349e-05,
      "loss": 0.4529,
      "step": 145
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.7156918048858643,
      "learning_rate": 3.8371627221284495e-05,
      "loss": 0.5831,
      "step": 150
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.8424770832061768,
      "learning_rate": 3.546776957871445e-05,
      "loss": 0.6044,
      "step": 155
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.9914979338645935,
      "learning_rate": 3.261653303623263e-05,
      "loss": 0.5824,
      "step": 160
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.0051041841506958,
      "learning_rate": 2.982824190050958e-05,
      "loss": 0.4595,
      "step": 165
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.2977936267852783,
      "learning_rate": 2.711299255335833e-05,
      "loss": 0.544,
      "step": 170
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.5247394442558289,
      "learning_rate": 2.4480616892809594e-05,
      "loss": 0.5458,
      "step": 175
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.9446833729743958,
      "learning_rate": 2.194064673188089e-05,
      "loss": 0.5625,
      "step": 180
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.9111084938049316,
      "learning_rate": 1.9502279283951364e-05,
      "loss": 0.5163,
      "step": 185
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.9301843643188477,
      "learning_rate": 1.7174343859719333e-05,
      "loss": 0.5594,
      "step": 190
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.5420268774032593,
      "learning_rate": 1.4965269896332885e-05,
      "loss": 0.4737,
      "step": 195
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.866782546043396,
      "learning_rate": 1.2883056434459506e-05,
      "loss": 0.5539,
      "step": 200
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.928578495979309,
      "learning_rate": 1.0935243153818436e-05,
      "loss": 0.5426,
      "step": 205
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.5703736543655396,
      "learning_rate": 9.12888307205541e-06,
      "loss": 0.5781,
      "step": 210
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.3220263719558716,
      "learning_rate": 7.470517005817474e-06,
      "loss": 0.6523,
      "step": 215
    },
    {
      "epoch": 0.84,
      "grad_norm": 4.24189567565918,
      "learning_rate": 5.966149886503614e-06,
      "loss": 0.5199,
      "step": 220
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.606140673160553,
      "learning_rate": 4.621229016452156e-06,
      "loss": 0.487,
      "step": 225
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.7255881428718567,
      "learning_rate": 3.4406243442987764e-06,
      "loss": 0.5654,
      "step": 230
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.7342286705970764,
      "learning_rate": 2.428610830928152e-06,
      "loss": 0.5211,
      "step": 235
    },
    {
      "epoch": 0.92,
      "grad_norm": 1.0628710985183716,
      "learning_rate": 1.5888529698718346e-06,
      "loss": 0.5965,
      "step": 240
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.6802533268928528,
      "learning_rate": 9.243915182039431e-07,
      "loss": 0.542,
      "step": 245
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.282472610473633,
      "learning_rate": 4.376324859820924e-07,
      "loss": 0.5677,
      "step": 250
    },
    {
      "epoch": 0.98,
      "grad_norm": 2.717949628829956,
      "learning_rate": 1.3033842410251075e-07,
      "loss": 0.4775,
      "step": 255
    },
    {
      "epoch": 0.99,
      "grad_norm": 1.769280195236206,
      "learning_rate": 3.622042116169233e-09,
      "loss": 0.6862,
      "step": 260
    },
    {
      "epoch": 1.0,
      "step": 261,
      "total_flos": 3.809983450683802e+16,
      "train_loss": 0.6279083029063726,
      "train_runtime": 203702.1714,
      "train_samples_per_second": 0.01,
      "train_steps_per_second": 0.001
    }
  ],
  "logging_steps": 5,
  "max_steps": 261,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "total_flos": 3.809983450683802e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}