{ "best_metric": null, "best_model_checkpoint": null, "epoch": 8.605851979345955, "eval_steps": 500, "global_step": 5000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.17, "grad_norm": 1.3079030513763428, "learning_rate": 4.000000000000001e-06, "loss": 0.4896, "step": 100 }, { "epoch": 0.34, "grad_norm": 1.4786765575408936, "learning_rate": 8.000000000000001e-06, "loss": 0.4113, "step": 200 }, { "epoch": 0.52, "grad_norm": 1.2072057723999023, "learning_rate": 1.2e-05, "loss": 0.3924, "step": 300 }, { "epoch": 0.69, "grad_norm": 1.5262311697006226, "learning_rate": 1.6000000000000003e-05, "loss": 0.3768, "step": 400 }, { "epoch": 0.86, "grad_norm": 1.114892840385437, "learning_rate": 2e-05, "loss": 0.353, "step": 500 }, { "epoch": 0.86, "eval_loss": 0.2806718051433563, "eval_runtime": 64.9966, "eval_samples_per_second": 15.385, "eval_steps_per_second": 3.846, "step": 500 }, { "epoch": 1.03, "grad_norm": 1.1068037748336792, "learning_rate": 1.9555555555555557e-05, "loss": 0.3373, "step": 600 }, { "epoch": 1.2, "grad_norm": 1.0337824821472168, "learning_rate": 1.9111111111111113e-05, "loss": 0.2773, "step": 700 }, { "epoch": 1.38, "grad_norm": 1.1779000759124756, "learning_rate": 1.866666666666667e-05, "loss": 0.2757, "step": 800 }, { "epoch": 1.55, "grad_norm": 1.2178924083709717, "learning_rate": 1.8222222222222224e-05, "loss": 0.2891, "step": 900 }, { "epoch": 1.72, "grad_norm": 1.1651198863983154, "learning_rate": 1.7777777777777777e-05, "loss": 0.2648, "step": 1000 }, { "epoch": 1.72, "eval_loss": 0.23123475909233093, "eval_runtime": 64.9606, "eval_samples_per_second": 15.394, "eval_steps_per_second": 3.848, "step": 1000 }, { "epoch": 1.89, "grad_norm": 0.941031813621521, "learning_rate": 1.7333333333333336e-05, "loss": 0.2556, "step": 1100 }, { "epoch": 2.07, "grad_norm": 1.5538479089736938, "learning_rate": 1.688888888888889e-05, "loss": 0.2188, "step": 1200 }, { "epoch": 2.24, "grad_norm": 1.7369763851165771, "learning_rate": 1.6444444444444444e-05, "loss": 0.1589, "step": 1300 }, { "epoch": 2.41, "grad_norm": 1.8593116998672485, "learning_rate": 1.6000000000000003e-05, "loss": 0.1526, "step": 1400 }, { "epoch": 2.58, "grad_norm": 0.9025155305862427, "learning_rate": 1.555555555555556e-05, "loss": 0.1512, "step": 1500 }, { "epoch": 2.58, "eval_loss": 0.23454324901103973, "eval_runtime": 64.9882, "eval_samples_per_second": 15.387, "eval_steps_per_second": 3.847, "step": 1500 }, { "epoch": 2.75, "grad_norm": 1.2440764904022217, "learning_rate": 1.5111111111111112e-05, "loss": 0.1596, "step": 1600 }, { "epoch": 2.93, "grad_norm": 0.9365782141685486, "learning_rate": 1.4666666666666666e-05, "loss": 0.1603, "step": 1700 }, { "epoch": 3.1, "grad_norm": 0.977448582649231, "learning_rate": 1.4222222222222224e-05, "loss": 0.1086, "step": 1800 }, { "epoch": 3.27, "grad_norm": 0.9493745565414429, "learning_rate": 1.377777777777778e-05, "loss": 0.0822, "step": 1900 }, { "epoch": 3.44, "grad_norm": 1.437554955482483, "learning_rate": 1.3333333333333333e-05, "loss": 0.0838, "step": 2000 }, { "epoch": 3.44, "eval_loss": 0.258994460105896, "eval_runtime": 65.0096, "eval_samples_per_second": 15.382, "eval_steps_per_second": 3.846, "step": 2000 }, { "epoch": 3.61, "grad_norm": 1.1978514194488525, "learning_rate": 1.288888888888889e-05, "loss": 0.0849, "step": 2100 }, { "epoch": 3.79, "grad_norm": 1.1100072860717773, "learning_rate": 1.2444444444444446e-05, "loss": 0.0822, "step": 2200 }, { "epoch": 3.96, "grad_norm": 1.0157575607299805, "learning_rate": 1.2e-05, "loss": 0.0799, "step": 2300 }, { "epoch": 4.13, "grad_norm": 1.1923385858535767, "learning_rate": 1.1555555555555556e-05, "loss": 0.05, "step": 2400 }, { "epoch": 4.3, "grad_norm": 1.1894739866256714, "learning_rate": 1.1111111111111113e-05, "loss": 0.0401, "step": 2500 }, { "epoch": 4.3, "eval_loss": 0.3230312466621399, "eval_runtime": 65.0356, "eval_samples_per_second": 15.376, "eval_steps_per_second": 3.844, "step": 2500 }, { "epoch": 4.48, "grad_norm": 1.5902525186538696, "learning_rate": 1.0666666666666667e-05, "loss": 0.0404, "step": 2600 }, { "epoch": 4.65, "grad_norm": 1.3404467105865479, "learning_rate": 1.0222222222222223e-05, "loss": 0.0418, "step": 2700 }, { "epoch": 4.82, "grad_norm": 1.4325445890426636, "learning_rate": 9.777777777777779e-06, "loss": 0.0441, "step": 2800 }, { "epoch": 4.99, "grad_norm": 1.3849271535873413, "learning_rate": 9.333333333333334e-06, "loss": 0.0429, "step": 2900 }, { "epoch": 5.16, "grad_norm": 1.0262218713760376, "learning_rate": 8.888888888888888e-06, "loss": 0.0196, "step": 3000 }, { "epoch": 5.16, "eval_loss": 0.38104376196861267, "eval_runtime": 64.9932, "eval_samples_per_second": 15.386, "eval_steps_per_second": 3.847, "step": 3000 }, { "epoch": 5.34, "grad_norm": 1.4700788259506226, "learning_rate": 8.444444444444446e-06, "loss": 0.0205, "step": 3100 }, { "epoch": 5.51, "grad_norm": 0.9892378449440002, "learning_rate": 8.000000000000001e-06, "loss": 0.0207, "step": 3200 }, { "epoch": 5.68, "grad_norm": 0.4820214509963989, "learning_rate": 7.555555555555556e-06, "loss": 0.0204, "step": 3300 }, { "epoch": 5.85, "grad_norm": 1.2511119842529297, "learning_rate": 7.111111111111112e-06, "loss": 0.0228, "step": 3400 }, { "epoch": 6.02, "grad_norm": 0.32089540362358093, "learning_rate": 6.666666666666667e-06, "loss": 0.0197, "step": 3500 }, { "epoch": 6.02, "eval_loss": 0.3612504303455353, "eval_runtime": 65.1059, "eval_samples_per_second": 15.36, "eval_steps_per_second": 3.84, "step": 3500 }, { "epoch": 6.2, "grad_norm": 0.4687187671661377, "learning_rate": 6.222222222222223e-06, "loss": 0.0085, "step": 3600 }, { "epoch": 6.37, "grad_norm": 0.696907103061676, "learning_rate": 5.777777777777778e-06, "loss": 0.0096, "step": 3700 }, { "epoch": 6.54, "grad_norm": 0.5151516795158386, "learning_rate": 5.333333333333334e-06, "loss": 0.009, "step": 3800 }, { "epoch": 6.71, "grad_norm": 0.8852140307426453, "learning_rate": 4.888888888888889e-06, "loss": 0.012, "step": 3900 }, { "epoch": 6.88, "grad_norm": 0.8389159440994263, "learning_rate": 4.444444444444444e-06, "loss": 0.0093, "step": 4000 }, { "epoch": 6.88, "eval_loss": 0.3735957741737366, "eval_runtime": 65.1742, "eval_samples_per_second": 15.343, "eval_steps_per_second": 3.836, "step": 4000 }, { "epoch": 7.06, "grad_norm": 0.158839613199234, "learning_rate": 4.000000000000001e-06, "loss": 0.0071, "step": 4100 }, { "epoch": 7.23, "grad_norm": 0.4010579586029053, "learning_rate": 3.555555555555556e-06, "loss": 0.0047, "step": 4200 }, { "epoch": 7.4, "grad_norm": 0.16058029234409332, "learning_rate": 3.1111111111111116e-06, "loss": 0.0037, "step": 4300 }, { "epoch": 7.57, "grad_norm": 0.1508949100971222, "learning_rate": 2.666666666666667e-06, "loss": 0.0039, "step": 4400 }, { "epoch": 7.75, "grad_norm": 0.3169790804386139, "learning_rate": 2.222222222222222e-06, "loss": 0.0037, "step": 4500 }, { "epoch": 7.75, "eval_loss": 0.38906845450401306, "eval_runtime": 65.0854, "eval_samples_per_second": 15.364, "eval_steps_per_second": 3.841, "step": 4500 }, { "epoch": 7.92, "grad_norm": 0.46604883670806885, "learning_rate": 1.777777777777778e-06, "loss": 0.0043, "step": 4600 }, { "epoch": 8.09, "grad_norm": 0.08523246645927429, "learning_rate": 1.3333333333333334e-06, "loss": 0.0042, "step": 4700 }, { "epoch": 8.26, "grad_norm": 0.2368686944246292, "learning_rate": 8.88888888888889e-07, "loss": 0.0032, "step": 4800 }, { "epoch": 8.43, "grad_norm": 0.11481987684965134, "learning_rate": 4.444444444444445e-07, "loss": 0.0019, "step": 4900 }, { "epoch": 8.61, "grad_norm": 0.13208821415901184, "learning_rate": 0.0, "loss": 0.002, "step": 5000 }, { "epoch": 8.61, "eval_loss": 0.4305289387702942, "eval_runtime": 65.0773, "eval_samples_per_second": 15.366, "eval_steps_per_second": 3.842, "step": 5000 } ], "logging_steps": 100, "max_steps": 5000, "num_input_tokens_seen": 0, "num_train_epochs": 9, "save_steps": 500, "total_flos": 1.258569996863275e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }