{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 325, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015384615384615385, "grad_norm": 30.625, "learning_rate": 1.4705882352941177e-05, "loss": 1.2726, "step": 5 }, { "epoch": 0.03076923076923077, "grad_norm": 3.921875, "learning_rate": 2.9411764705882354e-05, "loss": 0.7669, "step": 10 }, { "epoch": 0.046153846153846156, "grad_norm": 4.5625, "learning_rate": 4.411764705882353e-05, "loss": 0.5372, "step": 15 }, { "epoch": 0.06153846153846154, "grad_norm": 6.0, "learning_rate": 4.998946682120479e-05, "loss": 0.4509, "step": 20 }, { "epoch": 0.07692307692307693, "grad_norm": 3.03125, "learning_rate": 4.992513310611935e-05, "loss": 0.3986, "step": 25 }, { "epoch": 0.09230769230769231, "grad_norm": 5.6875, "learning_rate": 4.9802484533945446e-05, "loss": 0.3708, "step": 30 }, { "epoch": 0.1076923076923077, "grad_norm": 2.75, "learning_rate": 4.9621840042988815e-05, "loss": 0.3834, "step": 35 }, { "epoch": 0.12307692307692308, "grad_norm": 2.5625, "learning_rate": 4.9383669385541406e-05, "loss": 0.3746, "step": 40 }, { "epoch": 0.13846153846153847, "grad_norm": 1.9453125, "learning_rate": 4.90885919063262e-05, "loss": 0.3346, "step": 45 }, { "epoch": 0.15384615384615385, "grad_norm": 2.015625, "learning_rate": 4.873737493193827e-05, "loss": 0.3152, "step": 50 }, { "epoch": 0.16923076923076924, "grad_norm": 2.25, "learning_rate": 4.83309317754702e-05, "loss": 0.3072, "step": 55 }, { "epoch": 0.18461538461538463, "grad_norm": 2.09375, "learning_rate": 4.7870319361510674e-05, "loss": 0.3333, "step": 60 }, { "epoch": 0.2, "grad_norm": 1.8984375, "learning_rate": 4.735673547769223e-05, "loss": 0.3045, "step": 65 }, { "epoch": 0.2153846153846154, "grad_norm": 2.671875, "learning_rate": 4.67915156599355e-05, "loss": 0.3211, "step": 70 }, { "epoch": 0.23076923076923078, "grad_norm": 1.8125, "learning_rate": 4.6176129719489314e-05, "loss": 0.297, "step": 75 }, { "epoch": 0.24615384615384617, "grad_norm": 1.703125, "learning_rate": 4.551217792079811e-05, "loss": 0.2793, "step": 80 }, { "epoch": 0.26153846153846155, "grad_norm": 1.65625, "learning_rate": 4.480138682013565e-05, "loss": 0.2818, "step": 85 }, { "epoch": 0.27692307692307694, "grad_norm": 1.46875, "learning_rate": 4.404560477582644e-05, "loss": 0.2751, "step": 90 }, { "epoch": 0.2923076923076923, "grad_norm": 1.453125, "learning_rate": 4.3246797141730146e-05, "loss": 0.2925, "step": 95 }, { "epoch": 0.3076923076923077, "grad_norm": 1.515625, "learning_rate": 4.2407041156488044e-05, "loss": 0.2849, "step": 100 }, { "epoch": 0.3230769230769231, "grad_norm": 1.4765625, "learning_rate": 4.1528520541821506e-05, "loss": 0.2806, "step": 105 }, { "epoch": 0.3384615384615385, "grad_norm": 1.640625, "learning_rate": 4.061351982392938e-05, "loss": 0.2968, "step": 110 }, { "epoch": 0.35384615384615387, "grad_norm": 1.796875, "learning_rate": 3.966441839275095e-05, "loss": 0.2754, "step": 115 }, { "epoch": 0.36923076923076925, "grad_norm": 1.53125, "learning_rate": 3.868368431454292e-05, "loss": 0.2815, "step": 120 }, { "epoch": 0.38461538461538464, "grad_norm": 1.453125, "learning_rate": 3.767386791386043e-05, "loss": 0.266, "step": 125 }, { "epoch": 0.4, "grad_norm": 1.375, "learning_rate": 3.6637595141631514e-05, "loss": 0.2688, "step": 130 }, { "epoch": 0.4153846153846154, "grad_norm": 1.5703125, "learning_rate": 3.557756074657097e-05, "loss": 0.2654, "step": 135 }, { "epoch": 0.4307692307692308, "grad_norm": 1.484375, "learning_rate": 3.449652126769084e-05, "loss": 0.2659, "step": 140 }, { "epoch": 0.4461538461538462, "grad_norm": 1.25, "learning_rate": 3.3397287866129804e-05, "loss": 0.2627, "step": 145 }, { "epoch": 0.46153846153846156, "grad_norm": 1.4921875, "learning_rate": 3.228271901494197e-05, "loss": 0.2671, "step": 150 }, { "epoch": 0.47692307692307695, "grad_norm": 1.265625, "learning_rate": 3.1155713065854636e-05, "loss": 0.2595, "step": 155 }, { "epoch": 0.49230769230769234, "grad_norm": 1.1875, "learning_rate": 3.001920071232443e-05, "loss": 0.2674, "step": 160 }, { "epoch": 0.5076923076923077, "grad_norm": 1.15625, "learning_rate": 2.887613736849147e-05, "loss": 0.2303, "step": 165 }, { "epoch": 0.5230769230769231, "grad_norm": 1.015625, "learning_rate": 2.7729495483849044e-05, "loss": 0.2434, "step": 170 }, { "epoch": 0.5384615384615384, "grad_norm": 1.2578125, "learning_rate": 2.6582256813614327e-05, "loss": 0.2536, "step": 175 }, { "epoch": 0.5538461538461539, "grad_norm": 1.171875, "learning_rate": 2.543740466490011e-05, "loss": 0.2373, "step": 180 }, { "epoch": 0.5692307692307692, "grad_norm": 1.265625, "learning_rate": 2.429791613885109e-05, "loss": 0.2659, "step": 185 }, { "epoch": 0.5846153846153846, "grad_norm": 1.1015625, "learning_rate": 2.3166754388918143e-05, "loss": 0.24, "step": 190 }, { "epoch": 0.6, "grad_norm": 1.203125, "learning_rate": 2.2046860915402724e-05, "loss": 0.2584, "step": 195 }, { "epoch": 0.6153846153846154, "grad_norm": 1.234375, "learning_rate": 2.094114791630844e-05, "loss": 0.261, "step": 200 }, { "epoch": 0.6307692307692307, "grad_norm": 1.1015625, "learning_rate": 1.985249071439113e-05, "loss": 0.246, "step": 205 }, { "epoch": 0.6461538461538462, "grad_norm": 1.2890625, "learning_rate": 1.8783720280100235e-05, "loss": 0.2553, "step": 210 }, { "epoch": 0.6615384615384615, "grad_norm": 1.2265625, "learning_rate": 1.7737615869854944e-05, "loss": 0.2493, "step": 215 }, { "epoch": 0.676923076923077, "grad_norm": 1.3984375, "learning_rate": 1.671689779879873e-05, "loss": 0.2542, "step": 220 }, { "epoch": 0.6923076923076923, "grad_norm": 1.578125, "learning_rate": 1.5724220366826235e-05, "loss": 0.2289, "step": 225 }, { "epoch": 0.7076923076923077, "grad_norm": 1.1328125, "learning_rate": 1.4762164956277833e-05, "loss": 0.237, "step": 230 }, { "epoch": 0.7230769230769231, "grad_norm": 0.9140625, "learning_rate": 1.3833233319250687e-05, "loss": 0.2602, "step": 235 }, { "epoch": 0.7384615384615385, "grad_norm": 1.0078125, "learning_rate": 1.2939841071982239e-05, "loss": 0.2388, "step": 240 }, { "epoch": 0.7538461538461538, "grad_norm": 1.125, "learning_rate": 1.2084311413223508e-05, "loss": 0.2565, "step": 245 }, { "epoch": 0.7692307692307693, "grad_norm": 1.171875, "learning_rate": 1.126886908293699e-05, "loss": 0.2452, "step": 250 }, { "epoch": 0.7846153846153846, "grad_norm": 0.984375, "learning_rate": 1.0495634577029192e-05, "loss": 0.2394, "step": 255 }, { "epoch": 0.8, "grad_norm": 0.9765625, "learning_rate": 9.766618633161939e-06, "loss": 0.2414, "step": 260 }, { "epoch": 0.8153846153846154, "grad_norm": 1.1875, "learning_rate": 9.083717001981646e-06, "loss": 0.2546, "step": 265 }, { "epoch": 0.8307692307692308, "grad_norm": 1.28125, "learning_rate": 8.448705517363609e-06, "loss": 0.2629, "step": 270 }, { "epoch": 0.8461538461538461, "grad_norm": 1.0, "learning_rate": 7.863235478490735e-06, "loss": 0.2536, "step": 275 }, { "epoch": 0.8615384615384616, "grad_norm": 0.91015625, "learning_rate": 7.32882935577536e-06, "loss": 0.2367, "step": 280 }, { "epoch": 0.8769230769230769, "grad_norm": 1.3125, "learning_rate": 6.846876831790416e-06, "loss": 0.2527, "step": 285 }, { "epoch": 0.8923076923076924, "grad_norm": 0.99609375, "learning_rate": 6.418631187505366e-06, "loss": 0.2333, "step": 290 }, { "epoch": 0.9076923076923077, "grad_norm": 0.83203125, "learning_rate": 6.045206043224129e-06, "loss": 0.2288, "step": 295 }, { "epoch": 0.9230769230769231, "grad_norm": 1.1875, "learning_rate": 5.727572462700006e-06, "loss": 0.257, "step": 300 }, { "epoch": 0.9384615384615385, "grad_norm": 0.95703125, "learning_rate": 5.466556427958028e-06, "loss": 0.2438, "step": 305 }, { "epoch": 0.9538461538461539, "grad_norm": 0.9140625, "learning_rate": 5.262836691391401e-06, "loss": 0.2336, "step": 310 }, { "epoch": 0.9692307692307692, "grad_norm": 1.1640625, "learning_rate": 5.116943010717409e-06, "loss": 0.2557, "step": 315 }, { "epoch": 0.9846153846153847, "grad_norm": 1.0390625, "learning_rate": 5.029254771382656e-06, "loss": 0.2571, "step": 320 }, { "epoch": 1.0, "grad_norm": 0.984375, "learning_rate": 5e-06, "loss": 0.2485, "step": 325 }, { "epoch": 1.0, "step": 325, "total_flos": 4.404961824525517e+16, "train_loss": 0.30152319064507116, "train_runtime": 336.0969, "train_samples_per_second": 3.868, "train_steps_per_second": 0.967 } ], "logging_steps": 5, "max_steps": 325, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.404961824525517e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }