| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 6.0, | |
| "eval_steps": 500, | |
| "global_step": 444, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.13513513513513514, | |
| "grad_norm": 0.7316311597824097, | |
| "learning_rate": 5.617977528089888e-06, | |
| "loss": 1.5387, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.2702702702702703, | |
| "grad_norm": 1.1696785688400269, | |
| "learning_rate": 1.1235955056179776e-05, | |
| "loss": 1.4734, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.40540540540540543, | |
| "grad_norm": 0.8717026114463806, | |
| "learning_rate": 1.6853932584269665e-05, | |
| "loss": 1.3024, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.5405405405405406, | |
| "grad_norm": 0.5852603316307068, | |
| "learning_rate": 2.2471910112359552e-05, | |
| "loss": 1.0762, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.6756756756756757, | |
| "grad_norm": 1.0755618810653687, | |
| "learning_rate": 2.752808988764045e-05, | |
| "loss": 1.0119, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.8108108108108109, | |
| "grad_norm": 0.751964807510376, | |
| "learning_rate": 3.314606741573034e-05, | |
| "loss": 0.9915, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.9459459459459459, | |
| "grad_norm": 0.802929699420929, | |
| "learning_rate": 3.876404494382023e-05, | |
| "loss": 1.0209, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 1.0810810810810811, | |
| "grad_norm": 1.2205873727798462, | |
| "learning_rate": 4.438202247191011e-05, | |
| "loss": 0.8858, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 1.2162162162162162, | |
| "grad_norm": 0.42553073167800903, | |
| "learning_rate": 5e-05, | |
| "loss": 0.7881, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 1.3513513513513513, | |
| "grad_norm": 0.6855661273002625, | |
| "learning_rate": 4.990217055187362e-05, | |
| "loss": 0.8729, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 1.4864864864864864, | |
| "grad_norm": 1.399084210395813, | |
| "learning_rate": 4.960944785556814e-05, | |
| "loss": 0.914, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 1.6216216216216215, | |
| "grad_norm": 0.7945070266723633, | |
| "learning_rate": 4.9124122863070255e-05, | |
| "loss": 0.8129, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 1.7567567567567568, | |
| "grad_norm": 1.7592657804489136, | |
| "learning_rate": 4.8449993900474187e-05, | |
| "loss": 0.7831, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 1.8918918918918919, | |
| "grad_norm": 0.5450412631034851, | |
| "learning_rate": 4.75923369409301e-05, | |
| "loss": 0.8187, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 2.027027027027027, | |
| "grad_norm": 0.6182835698127747, | |
| "learning_rate": 4.6557864313000695e-05, | |
| "loss": 0.6924, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 2.1621621621621623, | |
| "grad_norm": 1.4527682065963745, | |
| "learning_rate": 4.5354672167589356e-05, | |
| "loss": 0.5337, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 2.2972972972972974, | |
| "grad_norm": 0.8686306476593018, | |
| "learning_rate": 4.3992177114582124e-05, | |
| "loss": 0.5554, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 2.4324324324324325, | |
| "grad_norm": 0.9178914427757263, | |
| "learning_rate": 4.2481042525107854e-05, | |
| "loss": 0.5543, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 2.5675675675675675, | |
| "grad_norm": 1.9653586149215698, | |
| "learning_rate": 4.083309507620118e-05, | |
| "loss": 0.5286, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 2.7027027027027026, | |
| "grad_norm": 0.8671745657920837, | |
| "learning_rate": 3.906123219101952e-05, | |
| "loss": 0.464, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 2.8378378378378377, | |
| "grad_norm": 1.1303811073303223, | |
| "learning_rate": 3.7179321099019916e-05, | |
| "loss": 0.4896, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 2.972972972972973, | |
| "grad_norm": 1.381509780883789, | |
| "learning_rate": 3.520209030608662e-05, | |
| "loss": 0.5281, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 3.108108108108108, | |
| "grad_norm": 1.1887260675430298, | |
| "learning_rate": 3.3145014324002944e-05, | |
| "loss": 0.3098, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 3.2432432432432434, | |
| "grad_norm": 1.386113166809082, | |
| "learning_rate": 3.102419256141536e-05, | |
| "loss": 0.2799, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 3.3783783783783785, | |
| "grad_norm": 0.8863763213157654, | |
| "learning_rate": 2.885622332413256e-05, | |
| "loss": 0.319, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 3.5135135135135136, | |
| "grad_norm": 0.7007670402526855, | |
| "learning_rate": 2.6658073910877603e-05, | |
| "loss": 0.2532, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 3.6486486486486487, | |
| "grad_norm": 0.9560308456420898, | |
| "learning_rate": 2.444694782117033e-05, | |
| "loss": 0.2739, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 3.7837837837837838, | |
| "grad_norm": 0.7131654620170593, | |
| "learning_rate": 2.224015011461826e-05, | |
| "loss": 0.3163, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 3.918918918918919, | |
| "grad_norm": 0.7580399513244629, | |
| "learning_rate": 2.0054951975362067e-05, | |
| "loss": 0.2788, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 4.054054054054054, | |
| "grad_norm": 1.0264191627502441, | |
| "learning_rate": 1.7908455541642584e-05, | |
| "loss": 0.2263, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4.1891891891891895, | |
| "grad_norm": 0.6259729862213135, | |
| "learning_rate": 1.5817460058381088e-05, | |
| "loss": 0.0985, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 4.324324324324325, | |
| "grad_norm": 0.6307621002197266, | |
| "learning_rate": 1.3798330400310539e-05, | |
| "loss": 0.1152, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 4.45945945945946, | |
| "grad_norm": 0.781829833984375, | |
| "learning_rate": 1.1866868994642535e-05, | |
| "loss": 0.1294, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 4.594594594594595, | |
| "grad_norm": 0.2717326581478119, | |
| "learning_rate": 1.0038192145648567e-05, | |
| "loss": 0.1243, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 4.72972972972973, | |
| "grad_norm": 0.44046151638031006, | |
| "learning_rate": 8.32661172908373e-06, | |
| "loss": 0.1214, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 4.864864864864865, | |
| "grad_norm": 0.7771790027618408, | |
| "learning_rate": 6.745523182354147e-06, | |
| "loss": 0.1531, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 5.0, | |
| "grad_norm": 0.5140301585197449, | |
| "learning_rate": 5.307300667057049e-06, | |
| "loss": 0.1355, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 5.135135135135135, | |
| "grad_norm": 0.792259156703949, | |
| "learning_rate": 4.023200224388787e-06, | |
| "loss": 0.0637, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 5.27027027027027, | |
| "grad_norm": 0.6544216871261597, | |
| "learning_rate": 2.9032716813609723e-06, | |
| "loss": 0.0764, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 5.405405405405405, | |
| "grad_norm": 0.15133225917816162, | |
| "learning_rate": 1.956279997278043e-06, | |
| "loss": 0.0471, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5.54054054054054, | |
| "grad_norm": 0.508730411529541, | |
| "learning_rate": 1.1896366660467173e-06, | |
| "loss": 0.0418, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 5.675675675675675, | |
| "grad_norm": 0.5652210712432861, | |
| "learning_rate": 6.093417111873306e-07, | |
| "loss": 0.049, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 5.8108108108108105, | |
| "grad_norm": 0.5781662464141846, | |
| "learning_rate": 2.1993672751463579e-07, | |
| "loss": 0.0657, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 5.945945945945946, | |
| "grad_norm": 0.36368322372436523, | |
| "learning_rate": 2.44693370006599e-08, | |
| "loss": 0.0487, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 6.0, | |
| "step": 444, | |
| "total_flos": 3.5286579164676096e+16, | |
| "train_loss": 0.5000483628597345, | |
| "train_runtime": 2552.3298, | |
| "train_samples_per_second": 1.389, | |
| "train_steps_per_second": 0.174 | |
| } | |
| ], | |
| "logging_steps": 10, | |
| "max_steps": 444, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 1000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.5286579164676096e+16, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |