{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.2058706862356208,
  "eval_steps": 500,
  "global_step": 190,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0317334391114637,
      "grad_norm": 0.060546875,
      "learning_rate": 0.0001,
      "loss": 0.6421,
      "step": 5
    },
    {
      "epoch": 0.0634668782229274,
      "grad_norm": 0.11572265625,
      "learning_rate": 0.0001,
      "loss": 0.5213,
      "step": 10
    },
    {
      "epoch": 0.09520031733439112,
      "grad_norm": 0.08251953125,
      "learning_rate": 0.0001,
      "loss": 0.2925,
      "step": 15
    },
    {
      "epoch": 0.1269337564458548,
      "grad_norm": 0.0634765625,
      "learning_rate": 0.0001,
      "loss": 0.1978,
      "step": 20
    },
    {
      "epoch": 0.15866719555731854,
      "grad_norm": 0.08251953125,
      "learning_rate": 0.0001,
      "loss": 0.1538,
      "step": 25
    },
    {
      "epoch": 0.19040063466878224,
      "grad_norm": 0.10888671875,
      "learning_rate": 0.0001,
      "loss": 0.106,
      "step": 30
    },
    {
      "epoch": 0.22213407378024594,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.0001,
      "loss": 0.0454,
      "step": 35
    },
    {
      "epoch": 0.2538675128917096,
      "grad_norm": 0.310546875,
      "learning_rate": 0.0001,
      "loss": 0.1215,
      "step": 40
    },
    {
      "epoch": 0.28560095200317337,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.0001,
      "loss": 0.2476,
      "step": 45
    },
    {
      "epoch": 0.31733439111463707,
      "grad_norm": 0.40234375,
      "learning_rate": 0.0001,
      "loss": 0.1073,
      "step": 50
    },
    {
      "epoch": 0.3490678302261008,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.0001,
      "loss": 0.0863,
      "step": 55
    },
    {
      "epoch": 0.3808012693375645,
      "grad_norm": 0.03369140625,
      "learning_rate": 0.0001,
      "loss": 0.0671,
      "step": 60
    },
    {
      "epoch": 0.4125347084490282,
      "grad_norm": 0.0274658203125,
      "learning_rate": 0.0001,
      "loss": 0.0493,
      "step": 65
    },
    {
      "epoch": 0.4442681475604919,
      "grad_norm": 0.0277099609375,
      "learning_rate": 0.0001,
      "loss": 0.0311,
      "step": 70
    },
    {
      "epoch": 0.4760015866719556,
      "grad_norm": 0.01275634765625,
      "learning_rate": 0.0001,
      "loss": 0.0125,
      "step": 75
    },
    {
      "epoch": 0.5077350257834192,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.0001,
      "loss": 0.1307,
      "step": 80
    },
    {
      "epoch": 0.539468464894883,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.0001,
      "loss": 0.171,
      "step": 85
    },
    {
      "epoch": 0.5712019040063467,
      "grad_norm": 0.060791015625,
      "learning_rate": 0.0001,
      "loss": 0.0818,
      "step": 90
    },
    {
      "epoch": 0.6029353431178104,
      "grad_norm": 0.033203125,
      "learning_rate": 0.0001,
      "loss": 0.0658,
      "step": 95
    },
    {
      "epoch": 0.6346687822292741,
      "grad_norm": 0.0235595703125,
      "learning_rate": 0.0001,
      "loss": 0.046,
      "step": 100
    },
    {
      "epoch": 0.6664022213407378,
      "grad_norm": 0.0299072265625,
      "learning_rate": 0.0001,
      "loss": 0.0384,
      "step": 105
    },
    {
      "epoch": 0.6981356604522015,
      "grad_norm": 0.0181884765625,
      "learning_rate": 0.0001,
      "loss": 0.0187,
      "step": 110
    },
    {
      "epoch": 0.7298690995636652,
      "grad_norm": 0.019775390625,
      "learning_rate": 0.0001,
      "loss": 0.0095,
      "step": 115
    },
    {
      "epoch": 0.761602538675129,
      "grad_norm": 0.060791015625,
      "learning_rate": 0.0001,
      "loss": 0.1381,
      "step": 120
    },
    {
      "epoch": 0.7933359777865926,
      "grad_norm": 0.038818359375,
      "learning_rate": 0.0001,
      "loss": 0.1125,
      "step": 125
    },
    {
      "epoch": 0.8250694168980564,
      "grad_norm": 0.032958984375,
      "learning_rate": 0.0001,
      "loss": 0.062,
      "step": 130
    },
    {
      "epoch": 0.85680285600952,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.0001,
      "loss": 0.0526,
      "step": 135
    },
    {
      "epoch": 0.8885362951209838,
      "grad_norm": 0.02392578125,
      "learning_rate": 0.0001,
      "loss": 0.0382,
      "step": 140
    },
    {
      "epoch": 0.9202697342324474,
      "grad_norm": 0.027099609375,
      "learning_rate": 0.0001,
      "loss": 0.027,
      "step": 145
    },
    {
      "epoch": 0.9520031733439112,
      "grad_norm": 0.02294921875,
      "learning_rate": 0.0001,
      "loss": 0.0115,
      "step": 150
    },
    {
      "epoch": 0.9837366124553748,
      "grad_norm": 0.02099609375,
      "learning_rate": 0.0001,
      "loss": 0.005,
      "step": 155
    },
    {
      "epoch": 1.0154700515668384,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0001,
      "loss": 0.1291,
      "step": 160
    },
    {
      "epoch": 1.0472034906783023,
      "grad_norm": 0.04052734375,
      "learning_rate": 0.0001,
      "loss": 0.1033,
      "step": 165
    },
    {
      "epoch": 1.078936929789766,
      "grad_norm": 0.03173828125,
      "learning_rate": 0.0001,
      "loss": 0.0539,
      "step": 170
    },
    {
      "epoch": 1.1106703689012296,
      "grad_norm": 0.0299072265625,
      "learning_rate": 0.0001,
      "loss": 0.043,
      "step": 175
    },
    {
      "epoch": 1.1424038080126935,
      "grad_norm": 0.0262451171875,
      "learning_rate": 0.0001,
      "loss": 0.0303,
      "step": 180
    },
    {
      "epoch": 1.1741372471241571,
      "grad_norm": 0.060791015625,
      "learning_rate": 0.0001,
      "loss": 0.0239,
      "step": 185
    },
    {
      "epoch": 1.2058706862356208,
      "grad_norm": 0.015625,
      "learning_rate": 0.0001,
      "loss": 0.0095,
      "step": 190
    },
    {
      "epoch": 1.2058706862356208,
      "step": 190,
      "total_flos": 1.216645538039931e+18,
      "train_loss": 0.10745409297707834,
      "train_runtime": 37043.3755,
      "train_samples_per_second": 0.657,
      "train_steps_per_second": 0.005
    }
  ],
  "logging_steps": 5,
  "max_steps": 190,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 90,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.216645538039931e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}