{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.721170395869191, "eval_steps": 2000, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 2.5756406784057617, "learning_rate": 4.000000000000001e-06, "loss": 0.5053, "step": 100 }, { "epoch": 0.04, "grad_norm": 2.708136796951294, "learning_rate": 8.000000000000001e-06, "loss": 0.4834, "step": 200 }, { "epoch": 0.06, "grad_norm": 2.605242967605591, "learning_rate": 1.2e-05, "loss": 0.4346, "step": 300 }, { "epoch": 0.09, "grad_norm": 3.3453047275543213, "learning_rate": 1.6000000000000003e-05, "loss": 0.4465, "step": 400 }, { "epoch": 0.11, "grad_norm": 1.8422973155975342, "learning_rate": 2e-05, "loss": 0.4542, "step": 500 }, { "epoch": 0.13, "grad_norm": 1.822622299194336, "learning_rate": 1.98974358974359e-05, "loss": 0.4507, "step": 600 }, { "epoch": 0.15, "grad_norm": 2.805553674697876, "learning_rate": 1.9794871794871798e-05, "loss": 0.4556, "step": 700 }, { "epoch": 0.17, "grad_norm": 2.9561212062835693, "learning_rate": 1.9692307692307696e-05, "loss": 0.452, "step": 800 }, { "epoch": 0.19, "grad_norm": 4.11840295791626, "learning_rate": 1.958974358974359e-05, "loss": 0.4535, "step": 900 }, { "epoch": 0.22, "grad_norm": 1.7729780673980713, "learning_rate": 1.9487179487179488e-05, "loss": 0.4735, "step": 1000 }, { "epoch": 0.24, "grad_norm": 1.458228588104248, "learning_rate": 1.9384615384615386e-05, "loss": 0.3946, "step": 1100 }, { "epoch": 0.26, "grad_norm": 2.6751554012298584, "learning_rate": 1.9282051282051284e-05, "loss": 0.4231, "step": 1200 }, { "epoch": 0.28, "grad_norm": 2.862396240234375, "learning_rate": 1.9179487179487182e-05, "loss": 0.3933, "step": 1300 }, { "epoch": 0.3, "grad_norm": 2.2546944618225098, "learning_rate": 1.907692307692308e-05, "loss": 0.4084, "step": 1400 }, { "epoch": 0.32, "grad_norm": 1.8640025854110718, "learning_rate": 1.8974358974358975e-05, "loss": 0.4353, "step": 1500 }, { "epoch": 0.34, "grad_norm": 2.273890972137451, "learning_rate": 1.8871794871794873e-05, "loss": 0.4283, "step": 1600 }, { "epoch": 0.37, "grad_norm": 2.463329315185547, "learning_rate": 1.876923076923077e-05, "loss": 0.4219, "step": 1700 }, { "epoch": 0.39, "grad_norm": 2.782228946685791, "learning_rate": 1.866666666666667e-05, "loss": 0.4186, "step": 1800 }, { "epoch": 0.41, "grad_norm": 2.084019422531128, "learning_rate": 1.8564102564102567e-05, "loss": 0.4552, "step": 1900 }, { "epoch": 0.43, "grad_norm": 2.8400542736053467, "learning_rate": 1.8461538461538465e-05, "loss": 0.3935, "step": 2000 }, { "epoch": 0.43, "eval_loss": 0.3142565190792084, "eval_runtime": 65.2006, "eval_samples_per_second": 15.337, "eval_steps_per_second": 3.834, "step": 2000 }, { "epoch": 0.45, "grad_norm": 2.180964946746826, "learning_rate": 1.835897435897436e-05, "loss": 0.3924, "step": 2100 }, { "epoch": 0.47, "grad_norm": 2.9051389694213867, "learning_rate": 1.8256410256410257e-05, "loss": 0.3901, "step": 2200 }, { "epoch": 0.49, "grad_norm": 1.5432188510894775, "learning_rate": 1.8153846153846155e-05, "loss": 0.4155, "step": 2300 }, { "epoch": 0.52, "grad_norm": 2.0124869346618652, "learning_rate": 1.8051282051282053e-05, "loss": 0.4078, "step": 2400 }, { "epoch": 0.54, "grad_norm": 1.4548277854919434, "learning_rate": 1.794871794871795e-05, "loss": 0.4033, "step": 2500 }, { "epoch": 0.56, "grad_norm": 1.5465322732925415, "learning_rate": 1.784615384615385e-05, "loss": 0.4038, "step": 2600 }, { "epoch": 0.58, "grad_norm": 2.648297071456909, "learning_rate": 1.7743589743589744e-05, "loss": 0.3933, "step": 2700 }, { "epoch": 0.6, "grad_norm": 4.687922477722168, "learning_rate": 1.7641025641025642e-05, "loss": 0.4, "step": 2800 }, { "epoch": 0.62, "grad_norm": 1.872753620147705, "learning_rate": 1.753846153846154e-05, "loss": 0.3588, "step": 2900 }, { "epoch": 0.65, "grad_norm": 1.814994215965271, "learning_rate": 1.7435897435897438e-05, "loss": 0.4426, "step": 3000 }, { "epoch": 0.67, "grad_norm": 3.421119451522827, "learning_rate": 1.7333333333333336e-05, "loss": 0.3985, "step": 3100 }, { "epoch": 0.69, "grad_norm": 1.6136844158172607, "learning_rate": 1.7230769230769234e-05, "loss": 0.3522, "step": 3200 }, { "epoch": 0.71, "grad_norm": 2.4265120029449463, "learning_rate": 1.7128205128205128e-05, "loss": 0.3473, "step": 3300 }, { "epoch": 0.73, "grad_norm": 2.5966241359710693, "learning_rate": 1.7025641025641026e-05, "loss": 0.3807, "step": 3400 }, { "epoch": 0.75, "grad_norm": 1.95503568649292, "learning_rate": 1.6923076923076924e-05, "loss": 0.3731, "step": 3500 }, { "epoch": 0.77, "grad_norm": 1.2861559391021729, "learning_rate": 1.6820512820512822e-05, "loss": 0.3952, "step": 3600 }, { "epoch": 0.8, "grad_norm": 1.7286114692687988, "learning_rate": 1.671794871794872e-05, "loss": 0.376, "step": 3700 }, { "epoch": 0.82, "grad_norm": 1.7622095346450806, "learning_rate": 1.6615384615384618e-05, "loss": 0.3517, "step": 3800 }, { "epoch": 0.84, "grad_norm": 1.7638356685638428, "learning_rate": 1.6512820512820513e-05, "loss": 0.3529, "step": 3900 }, { "epoch": 0.86, "grad_norm": 2.3966591358184814, "learning_rate": 1.641025641025641e-05, "loss": 0.3614, "step": 4000 }, { "epoch": 0.86, "eval_loss": 0.2630089223384857, "eval_runtime": 65.7983, "eval_samples_per_second": 15.198, "eval_steps_per_second": 3.799, "step": 4000 }, { "epoch": 0.88, "grad_norm": 2.331758499145508, "learning_rate": 1.630769230769231e-05, "loss": 0.3815, "step": 4100 }, { "epoch": 0.9, "grad_norm": 1.6653661727905273, "learning_rate": 1.6205128205128207e-05, "loss": 0.3701, "step": 4200 }, { "epoch": 0.93, "grad_norm": 1.6307231187820435, "learning_rate": 1.6102564102564105e-05, "loss": 0.3811, "step": 4300 }, { "epoch": 0.95, "grad_norm": 0.9386046528816223, "learning_rate": 1.6000000000000003e-05, "loss": 0.3408, "step": 4400 }, { "epoch": 0.97, "grad_norm": 1.8476628065109253, "learning_rate": 1.5897435897435897e-05, "loss": 0.3515, "step": 4500 }, { "epoch": 0.99, "grad_norm": 1.7403932809829712, "learning_rate": 1.5794871794871795e-05, "loss": 0.366, "step": 4600 }, { "epoch": 1.01, "grad_norm": 1.0889652967453003, "learning_rate": 1.5692307692307693e-05, "loss": 0.3001, "step": 4700 }, { "epoch": 1.03, "grad_norm": 1.335119605064392, "learning_rate": 1.558974358974359e-05, "loss": 0.218, "step": 4800 }, { "epoch": 1.05, "grad_norm": 0.9958767890930176, "learning_rate": 1.548717948717949e-05, "loss": 0.2403, "step": 4900 }, { "epoch": 1.08, "grad_norm": 2.5801336765289307, "learning_rate": 1.5384615384615387e-05, "loss": 0.228, "step": 5000 }, { "epoch": 1.1, "grad_norm": 2.4592084884643555, "learning_rate": 1.5282051282051282e-05, "loss": 0.2946, "step": 5100 }, { "epoch": 1.12, "grad_norm": 0.4749181568622589, "learning_rate": 1.517948717948718e-05, "loss": 0.2307, "step": 5200 }, { "epoch": 1.14, "grad_norm": 2.7117793560028076, "learning_rate": 1.5076923076923078e-05, "loss": 0.2213, "step": 5300 }, { "epoch": 1.16, "grad_norm": 1.6409398317337036, "learning_rate": 1.4974358974358976e-05, "loss": 0.2448, "step": 5400 }, { "epoch": 1.18, "grad_norm": 2.0051097869873047, "learning_rate": 1.4871794871794874e-05, "loss": 0.23, "step": 5500 }, { "epoch": 1.2, "grad_norm": 1.724003791809082, "learning_rate": 1.4769230769230772e-05, "loss": 0.2273, "step": 5600 }, { "epoch": 1.23, "grad_norm": 1.6625137329101562, "learning_rate": 1.4666666666666666e-05, "loss": 0.231, "step": 5700 }, { "epoch": 1.25, "grad_norm": 2.87290358543396, "learning_rate": 1.4564102564102564e-05, "loss": 0.2426, "step": 5800 }, { "epoch": 1.27, "grad_norm": 1.323164701461792, "learning_rate": 1.4461538461538462e-05, "loss": 0.2498, "step": 5900 }, { "epoch": 1.29, "grad_norm": 1.9449268579483032, "learning_rate": 1.435897435897436e-05, "loss": 0.2241, "step": 6000 }, { "epoch": 1.29, "eval_loss": 0.24455103278160095, "eval_runtime": 65.1007, "eval_samples_per_second": 15.361, "eval_steps_per_second": 3.84, "step": 6000 }, { "epoch": 1.31, "grad_norm": 3.3016514778137207, "learning_rate": 1.4256410256410258e-05, "loss": 0.2341, "step": 6100 }, { "epoch": 1.33, "grad_norm": 1.8060470819473267, "learning_rate": 1.4153846153846156e-05, "loss": 0.2389, "step": 6200 }, { "epoch": 1.36, "grad_norm": 1.6190078258514404, "learning_rate": 1.405128205128205e-05, "loss": 0.2288, "step": 6300 }, { "epoch": 1.38, "grad_norm": 2.1221323013305664, "learning_rate": 1.3948717948717949e-05, "loss": 0.2344, "step": 6400 }, { "epoch": 1.4, "grad_norm": 1.716636300086975, "learning_rate": 1.3846153846153847e-05, "loss": 0.2553, "step": 6500 }, { "epoch": 1.42, "grad_norm": 2.039508581161499, "learning_rate": 1.3743589743589745e-05, "loss": 0.2504, "step": 6600 }, { "epoch": 1.44, "grad_norm": 2.224332809448242, "learning_rate": 1.3641025641025643e-05, "loss": 0.2327, "step": 6700 }, { "epoch": 1.46, "grad_norm": 2.1847877502441406, "learning_rate": 1.353846153846154e-05, "loss": 0.2363, "step": 6800 }, { "epoch": 1.48, "grad_norm": 1.154945731163025, "learning_rate": 1.3435897435897435e-05, "loss": 0.2528, "step": 6900 }, { "epoch": 1.51, "grad_norm": 2.06807279586792, "learning_rate": 1.3333333333333333e-05, "loss": 0.2465, "step": 7000 }, { "epoch": 1.53, "grad_norm": 3.2610838413238525, "learning_rate": 1.3231794871794872e-05, "loss": 0.248, "step": 7100 }, { "epoch": 1.55, "grad_norm": 1.7421317100524902, "learning_rate": 1.312923076923077e-05, "loss": 0.2221, "step": 7200 }, { "epoch": 1.57, "grad_norm": 1.0780401229858398, "learning_rate": 1.3026666666666667e-05, "loss": 0.2245, "step": 7300 }, { "epoch": 1.59, "grad_norm": 2.143017292022705, "learning_rate": 1.2924102564102565e-05, "loss": 0.218, "step": 7400 }, { "epoch": 1.61, "grad_norm": 1.5114802122116089, "learning_rate": 1.2821538461538463e-05, "loss": 0.2222, "step": 7500 }, { "epoch": 1.64, "grad_norm": 1.4918652772903442, "learning_rate": 1.271897435897436e-05, "loss": 0.1958, "step": 7600 }, { "epoch": 1.66, "grad_norm": 1.8960070610046387, "learning_rate": 1.2616410256410257e-05, "loss": 0.2351, "step": 7700 }, { "epoch": 1.68, "grad_norm": 1.8487619161605835, "learning_rate": 1.2513846153846155e-05, "loss": 0.2293, "step": 7800 }, { "epoch": 1.7, "grad_norm": 2.432880163192749, "learning_rate": 1.2411282051282051e-05, "loss": 0.2344, "step": 7900 }, { "epoch": 1.72, "grad_norm": 2.5734400749206543, "learning_rate": 1.230871794871795e-05, "loss": 0.245, "step": 8000 }, { "epoch": 1.72, "eval_loss": 0.22669802606105804, "eval_runtime": 65.0878, "eval_samples_per_second": 15.364, "eval_steps_per_second": 3.841, "step": 8000 } ], "logging_steps": 100, "max_steps": 20000, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2000, "total_flos": 2.5172343959165338e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }