{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 325,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.015384615384615385,
      "grad_norm": 30.625,
      "learning_rate": 1.4705882352941177e-05,
      "loss": 1.2726,
      "step": 5
    },
    {
      "epoch": 0.03076923076923077,
      "grad_norm": 3.921875,
      "learning_rate": 2.9411764705882354e-05,
      "loss": 0.7669,
      "step": 10
    },
    {
      "epoch": 0.046153846153846156,
      "grad_norm": 4.5625,
      "learning_rate": 4.411764705882353e-05,
      "loss": 0.5372,
      "step": 15
    },
    {
      "epoch": 0.06153846153846154,
      "grad_norm": 6.0,
      "learning_rate": 4.998946682120479e-05,
      "loss": 0.4509,
      "step": 20
    },
    {
      "epoch": 0.07692307692307693,
      "grad_norm": 3.03125,
      "learning_rate": 4.992513310611935e-05,
      "loss": 0.3986,
      "step": 25
    },
    {
      "epoch": 0.09230769230769231,
      "grad_norm": 5.6875,
      "learning_rate": 4.9802484533945446e-05,
      "loss": 0.3708,
      "step": 30
    },
    {
      "epoch": 0.1076923076923077,
      "grad_norm": 2.75,
      "learning_rate": 4.9621840042988815e-05,
      "loss": 0.3834,
      "step": 35
    },
    {
      "epoch": 0.12307692307692308,
      "grad_norm": 2.5625,
      "learning_rate": 4.9383669385541406e-05,
      "loss": 0.3746,
      "step": 40
    },
    {
      "epoch": 0.13846153846153847,
      "grad_norm": 1.9453125,
      "learning_rate": 4.90885919063262e-05,
      "loss": 0.3346,
      "step": 45
    },
    {
      "epoch": 0.15384615384615385,
      "grad_norm": 2.015625,
      "learning_rate": 4.873737493193827e-05,
      "loss": 0.3152,
      "step": 50
    },
    {
      "epoch": 0.16923076923076924,
      "grad_norm": 2.25,
      "learning_rate": 4.83309317754702e-05,
      "loss": 0.3072,
      "step": 55
    },
    {
      "epoch": 0.18461538461538463,
      "grad_norm": 2.09375,
      "learning_rate": 4.7870319361510674e-05,
      "loss": 0.3333,
      "step": 60
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.8984375,
      "learning_rate": 4.735673547769223e-05,
      "loss": 0.3045,
      "step": 65
    },
    {
      "epoch": 0.2153846153846154,
      "grad_norm": 2.671875,
      "learning_rate": 4.67915156599355e-05,
      "loss": 0.3211,
      "step": 70
    },
    {
      "epoch": 0.23076923076923078,
      "grad_norm": 1.8125,
      "learning_rate": 4.6176129719489314e-05,
      "loss": 0.297,
      "step": 75
    },
    {
      "epoch": 0.24615384615384617,
      "grad_norm": 1.703125,
      "learning_rate": 4.551217792079811e-05,
      "loss": 0.2793,
      "step": 80
    },
    {
      "epoch": 0.26153846153846155,
      "grad_norm": 1.65625,
      "learning_rate": 4.480138682013565e-05,
      "loss": 0.2818,
      "step": 85
    },
    {
      "epoch": 0.27692307692307694,
      "grad_norm": 1.46875,
      "learning_rate": 4.404560477582644e-05,
      "loss": 0.2751,
      "step": 90
    },
    {
      "epoch": 0.2923076923076923,
      "grad_norm": 1.453125,
      "learning_rate": 4.3246797141730146e-05,
      "loss": 0.2925,
      "step": 95
    },
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 1.515625,
      "learning_rate": 4.2407041156488044e-05,
      "loss": 0.2849,
      "step": 100
    },
    {
      "epoch": 0.3230769230769231,
      "grad_norm": 1.4765625,
      "learning_rate": 4.1528520541821506e-05,
      "loss": 0.2806,
      "step": 105
    },
    {
      "epoch": 0.3384615384615385,
      "grad_norm": 1.640625,
      "learning_rate": 4.061351982392938e-05,
      "loss": 0.2968,
      "step": 110
    },
    {
      "epoch": 0.35384615384615387,
      "grad_norm": 1.796875,
      "learning_rate": 3.966441839275095e-05,
      "loss": 0.2754,
      "step": 115
    },
    {
      "epoch": 0.36923076923076925,
      "grad_norm": 1.53125,
      "learning_rate": 3.868368431454292e-05,
      "loss": 0.2815,
      "step": 120
    },
    {
      "epoch": 0.38461538461538464,
      "grad_norm": 1.453125,
      "learning_rate": 3.767386791386043e-05,
      "loss": 0.266,
      "step": 125
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.375,
      "learning_rate": 3.6637595141631514e-05,
      "loss": 0.2688,
      "step": 130
    },
    {
      "epoch": 0.4153846153846154,
      "grad_norm": 1.5703125,
      "learning_rate": 3.557756074657097e-05,
      "loss": 0.2654,
      "step": 135
    },
    {
      "epoch": 0.4307692307692308,
      "grad_norm": 1.484375,
      "learning_rate": 3.449652126769084e-05,
      "loss": 0.2659,
      "step": 140
    },
    {
      "epoch": 0.4461538461538462,
      "grad_norm": 1.25,
      "learning_rate": 3.3397287866129804e-05,
      "loss": 0.2627,
      "step": 145
    },
    {
      "epoch": 0.46153846153846156,
      "grad_norm": 1.4921875,
      "learning_rate": 3.228271901494197e-05,
      "loss": 0.2671,
      "step": 150
    },
    {
      "epoch": 0.47692307692307695,
      "grad_norm": 1.265625,
      "learning_rate": 3.1155713065854636e-05,
      "loss": 0.2595,
      "step": 155
    },
    {
      "epoch": 0.49230769230769234,
      "grad_norm": 1.1875,
      "learning_rate": 3.001920071232443e-05,
      "loss": 0.2674,
      "step": 160
    },
    {
      "epoch": 0.5076923076923077,
      "grad_norm": 1.15625,
      "learning_rate": 2.887613736849147e-05,
      "loss": 0.2303,
      "step": 165
    },
    {
      "epoch": 0.5230769230769231,
      "grad_norm": 1.015625,
      "learning_rate": 2.7729495483849044e-05,
      "loss": 0.2434,
      "step": 170
    },
    {
      "epoch": 0.5384615384615384,
      "grad_norm": 1.2578125,
      "learning_rate": 2.6582256813614327e-05,
      "loss": 0.2536,
      "step": 175
    },
    {
      "epoch": 0.5538461538461539,
      "grad_norm": 1.171875,
      "learning_rate": 2.543740466490011e-05,
      "loss": 0.2373,
      "step": 180
    },
    {
      "epoch": 0.5692307692307692,
      "grad_norm": 1.265625,
      "learning_rate": 2.429791613885109e-05,
      "loss": 0.2659,
      "step": 185
    },
    {
      "epoch": 0.5846153846153846,
      "grad_norm": 1.1015625,
      "learning_rate": 2.3166754388918143e-05,
      "loss": 0.24,
      "step": 190
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.203125,
      "learning_rate": 2.2046860915402724e-05,
      "loss": 0.2584,
      "step": 195
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 1.234375,
      "learning_rate": 2.094114791630844e-05,
      "loss": 0.261,
      "step": 200
    },
    {
      "epoch": 0.6307692307692307,
      "grad_norm": 1.1015625,
      "learning_rate": 1.985249071439113e-05,
      "loss": 0.246,
      "step": 205
    },
    {
      "epoch": 0.6461538461538462,
      "grad_norm": 1.2890625,
      "learning_rate": 1.8783720280100235e-05,
      "loss": 0.2553,
      "step": 210
    },
    {
      "epoch": 0.6615384615384615,
      "grad_norm": 1.2265625,
      "learning_rate": 1.7737615869854944e-05,
      "loss": 0.2493,
      "step": 215
    },
    {
      "epoch": 0.676923076923077,
      "grad_norm": 1.3984375,
      "learning_rate": 1.671689779879873e-05,
      "loss": 0.2542,
      "step": 220
    },
    {
      "epoch": 0.6923076923076923,
      "grad_norm": 1.578125,
      "learning_rate": 1.5724220366826235e-05,
      "loss": 0.2289,
      "step": 225
    },
    {
      "epoch": 0.7076923076923077,
      "grad_norm": 1.1328125,
      "learning_rate": 1.4762164956277833e-05,
      "loss": 0.237,
      "step": 230
    },
    {
      "epoch": 0.7230769230769231,
      "grad_norm": 0.9140625,
      "learning_rate": 1.3833233319250687e-05,
      "loss": 0.2602,
      "step": 235
    },
    {
      "epoch": 0.7384615384615385,
      "grad_norm": 1.0078125,
      "learning_rate": 1.2939841071982239e-05,
      "loss": 0.2388,
      "step": 240
    },
    {
      "epoch": 0.7538461538461538,
      "grad_norm": 1.125,
      "learning_rate": 1.2084311413223508e-05,
      "loss": 0.2565,
      "step": 245
    },
    {
      "epoch": 0.7692307692307693,
      "grad_norm": 1.171875,
      "learning_rate": 1.126886908293699e-05,
      "loss": 0.2452,
      "step": 250
    },
    {
      "epoch": 0.7846153846153846,
      "grad_norm": 0.984375,
      "learning_rate": 1.0495634577029192e-05,
      "loss": 0.2394,
      "step": 255
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9765625,
      "learning_rate": 9.766618633161939e-06,
      "loss": 0.2414,
      "step": 260
    },
    {
      "epoch": 0.8153846153846154,
      "grad_norm": 1.1875,
      "learning_rate": 9.083717001981646e-06,
      "loss": 0.2546,
      "step": 265
    },
    {
      "epoch": 0.8307692307692308,
      "grad_norm": 1.28125,
      "learning_rate": 8.448705517363609e-06,
      "loss": 0.2629,
      "step": 270
    },
    {
      "epoch": 0.8461538461538461,
      "grad_norm": 1.0,
      "learning_rate": 7.863235478490735e-06,
      "loss": 0.2536,
      "step": 275
    },
    {
      "epoch": 0.8615384615384616,
      "grad_norm": 0.91015625,
      "learning_rate": 7.32882935577536e-06,
      "loss": 0.2367,
      "step": 280
    },
    {
      "epoch": 0.8769230769230769,
      "grad_norm": 1.3125,
      "learning_rate": 6.846876831790416e-06,
      "loss": 0.2527,
      "step": 285
    },
    {
      "epoch": 0.8923076923076924,
      "grad_norm": 0.99609375,
      "learning_rate": 6.418631187505366e-06,
      "loss": 0.2333,
      "step": 290
    },
    {
      "epoch": 0.9076923076923077,
      "grad_norm": 0.83203125,
      "learning_rate": 6.045206043224129e-06,
      "loss": 0.2288,
      "step": 295
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 1.1875,
      "learning_rate": 5.727572462700006e-06,
      "loss": 0.257,
      "step": 300
    },
    {
      "epoch": 0.9384615384615385,
      "grad_norm": 0.95703125,
      "learning_rate": 5.466556427958028e-06,
      "loss": 0.2438,
      "step": 305
    },
    {
      "epoch": 0.9538461538461539,
      "grad_norm": 0.9140625,
      "learning_rate": 5.262836691391401e-06,
      "loss": 0.2336,
      "step": 310
    },
    {
      "epoch": 0.9692307692307692,
      "grad_norm": 1.1640625,
      "learning_rate": 5.116943010717409e-06,
      "loss": 0.2557,
      "step": 315
    },
    {
      "epoch": 0.9846153846153847,
      "grad_norm": 1.0390625,
      "learning_rate": 5.029254771382656e-06,
      "loss": 0.2571,
      "step": 320
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.984375,
      "learning_rate": 5e-06,
      "loss": 0.2485,
      "step": 325
    },
    {
      "epoch": 1.0,
      "step": 325,
      "total_flos": 4.404961824525517e+16,
      "train_loss": 0.30152319064507116,
      "train_runtime": 336.0969,
      "train_samples_per_second": 3.868,
      "train_steps_per_second": 0.967
    }
  ],
  "logging_steps": 5,
  "max_steps": 325,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 4.404961824525517e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}