{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.929233772571987, "eval_steps": 50, "global_step": 1280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3904343582235237, "grad_norm": 0.5675944685935974, "learning_rate": 9.609375e-05, "loss": 1.5678, "step": 50 }, { "epoch": 0.3904343582235237, "eval_loss": 1.53541898727417, "eval_runtime": 11.6265, "eval_samples_per_second": 37.156, "eval_steps_per_second": 18.578, "step": 50 }, { "epoch": 0.7808687164470474, "grad_norm": 0.5330150127410889, "learning_rate": 9.21875e-05, "loss": 1.5019, "step": 100 }, { "epoch": 0.7808687164470474, "eval_loss": 1.4973269701004028, "eval_runtime": 11.5507, "eval_samples_per_second": 37.4, "eval_steps_per_second": 18.7, "step": 100 }, { "epoch": 1.16398243045388, "grad_norm": 0.5410779714584351, "learning_rate": 8.828125000000001e-05, "loss": 1.4244, "step": 150 }, { "epoch": 1.16398243045388, "eval_loss": 1.43968665599823, "eval_runtime": 11.5205, "eval_samples_per_second": 37.498, "eval_steps_per_second": 18.749, "step": 150 }, { "epoch": 1.5544167886774036, "grad_norm": 0.5698382258415222, "learning_rate": 8.4375e-05, "loss": 1.4023, "step": 200 }, { "epoch": 1.5544167886774036, "eval_loss": 1.4115360975265503, "eval_runtime": 11.5294, "eval_samples_per_second": 37.469, "eval_steps_per_second": 18.735, "step": 200 }, { "epoch": 1.9448511469009273, "grad_norm": 0.5761227607727051, "learning_rate": 8.046875e-05, "loss": 1.3693, "step": 250 }, { "epoch": 1.9448511469009273, "eval_loss": 1.3649152517318726, "eval_runtime": 11.5198, "eval_samples_per_second": 37.501, "eval_steps_per_second": 18.75, "step": 250 }, { "epoch": 2.32796486090776, "grad_norm": 0.5856680274009705, "learning_rate": 7.65625e-05, "loss": 1.302, "step": 300 }, { "epoch": 2.32796486090776, "eval_loss": 1.3940138816833496, "eval_runtime": 11.5409, "eval_samples_per_second": 37.432, "eval_steps_per_second": 18.716, "step": 300 }, { "epoch": 2.7183992191312836, "grad_norm": 0.5735114216804504, "learning_rate": 7.265625000000001e-05, "loss": 1.3063, "step": 350 }, { "epoch": 2.7183992191312836, "eval_loss": 1.3489629030227661, "eval_runtime": 11.5502, "eval_samples_per_second": 37.402, "eval_steps_per_second": 18.701, "step": 350 }, { "epoch": 3.101512933138116, "grad_norm": 0.6013683676719666, "learning_rate": 6.875e-05, "loss": 1.2507, "step": 400 }, { "epoch": 3.101512933138116, "eval_loss": 1.3298077583312988, "eval_runtime": 11.5601, "eval_samples_per_second": 37.37, "eval_steps_per_second": 18.685, "step": 400 }, { "epoch": 3.49194729136164, "grad_norm": 0.6187678575515747, "learning_rate": 6.484375e-05, "loss": 1.2463, "step": 450 }, { "epoch": 3.49194729136164, "eval_loss": 1.2986701726913452, "eval_runtime": 11.5395, "eval_samples_per_second": 37.437, "eval_steps_per_second": 18.718, "step": 450 }, { "epoch": 3.8823816495851635, "grad_norm": 0.5973629951477051, "learning_rate": 6.0937500000000004e-05, "loss": 1.2315, "step": 500 }, { "epoch": 3.8823816495851635, "eval_loss": 1.2973381280899048, "eval_runtime": 11.5491, "eval_samples_per_second": 37.405, "eval_steps_per_second": 18.703, "step": 500 }, { "epoch": 4.265495363591996, "grad_norm": 0.6226805448532104, "learning_rate": 5.703125e-05, "loss": 1.1778, "step": 550 }, { "epoch": 4.265495363591996, "eval_loss": 1.2510361671447754, "eval_runtime": 11.5281, "eval_samples_per_second": 37.474, "eval_steps_per_second": 18.737, "step": 550 }, { "epoch": 4.65592972181552, "grad_norm": 0.64255690574646, "learning_rate": 5.3125000000000004e-05, "loss": 1.1819, "step": 600 }, { "epoch": 4.65592972181552, "eval_loss": 1.2529098987579346, "eval_runtime": 11.5409, "eval_samples_per_second": 37.432, "eval_steps_per_second": 18.716, "step": 600 }, { "epoch": 5.039043435822352, "grad_norm": 0.6386131048202515, "learning_rate": 4.921875e-05, "loss": 1.1507, "step": 650 }, { "epoch": 5.039043435822352, "eval_loss": 1.2271380424499512, "eval_runtime": 11.5699, "eval_samples_per_second": 37.338, "eval_steps_per_second": 18.669, "step": 650 }, { "epoch": 5.4294777940458765, "grad_norm": 0.6771230101585388, "learning_rate": 4.5312500000000004e-05, "loss": 1.134, "step": 700 }, { "epoch": 5.4294777940458765, "eval_loss": 1.2191808223724365, "eval_runtime": 11.5238, "eval_samples_per_second": 37.488, "eval_steps_per_second": 18.744, "step": 700 }, { "epoch": 5.819912152269399, "grad_norm": 0.6427966952323914, "learning_rate": 4.140625e-05, "loss": 1.1258, "step": 750 }, { "epoch": 5.819912152269399, "eval_loss": 1.2103700637817383, "eval_runtime": 11.5062, "eval_samples_per_second": 37.545, "eval_steps_per_second": 18.772, "step": 750 }, { "epoch": 6.203025866276232, "grad_norm": 0.6937867403030396, "learning_rate": 3.7500000000000003e-05, "loss": 1.0941, "step": 800 }, { "epoch": 6.203025866276232, "eval_loss": 1.2105975151062012, "eval_runtime": 11.55, "eval_samples_per_second": 37.403, "eval_steps_per_second": 18.701, "step": 800 }, { "epoch": 6.593460224499756, "grad_norm": 0.719428300857544, "learning_rate": 3.359375e-05, "loss": 1.0923, "step": 850 }, { "epoch": 6.593460224499756, "eval_loss": 1.1799763441085815, "eval_runtime": 11.536, "eval_samples_per_second": 37.448, "eval_steps_per_second": 18.724, "step": 850 }, { "epoch": 6.98389458272328, "grad_norm": 0.65595942735672, "learning_rate": 2.96875e-05, "loss": 1.0796, "step": 900 }, { "epoch": 6.98389458272328, "eval_loss": 1.1729077100753784, "eval_runtime": 11.5363, "eval_samples_per_second": 37.447, "eval_steps_per_second": 18.724, "step": 900 }, { "epoch": 7.367008296730113, "grad_norm": 0.7260088920593262, "learning_rate": 2.578125e-05, "loss": 1.0371, "step": 950 }, { "epoch": 7.367008296730113, "eval_loss": 1.1634416580200195, "eval_runtime": 11.5405, "eval_samples_per_second": 37.433, "eval_steps_per_second": 18.717, "step": 950 }, { "epoch": 7.7574426549536355, "grad_norm": 0.6944181323051453, "learning_rate": 2.1875e-05, "loss": 1.0466, "step": 1000 }, { "epoch": 7.7574426549536355, "eval_loss": 1.154969573020935, "eval_runtime": 11.532, "eval_samples_per_second": 37.461, "eval_steps_per_second": 18.731, "step": 1000 }, { "epoch": 8.140556368960468, "grad_norm": 0.7572025060653687, "learning_rate": 1.796875e-05, "loss": 1.0228, "step": 1050 }, { "epoch": 8.140556368960468, "eval_loss": 1.1517966985702515, "eval_runtime": 11.5429, "eval_samples_per_second": 37.426, "eval_steps_per_second": 18.713, "step": 1050 }, { "epoch": 8.530990727183992, "grad_norm": 0.6960224509239197, "learning_rate": 1.4062500000000001e-05, "loss": 1.0231, "step": 1100 }, { "epoch": 8.530990727183992, "eval_loss": 1.108694314956665, "eval_runtime": 11.5162, "eval_samples_per_second": 37.512, "eval_steps_per_second": 18.756, "step": 1100 }, { "epoch": 8.921425085407517, "grad_norm": 0.6743898391723633, "learning_rate": 1.0156250000000001e-05, "loss": 1.0164, "step": 1150 }, { "epoch": 8.921425085407517, "eval_loss": 1.1212413311004639, "eval_runtime": 11.5308, "eval_samples_per_second": 37.465, "eval_steps_per_second": 18.732, "step": 1150 }, { "epoch": 9.304538799414349, "grad_norm": 0.7794139385223389, "learning_rate": 6.25e-06, "loss": 0.9863, "step": 1200 }, { "epoch": 9.304538799414349, "eval_loss": 1.1227957010269165, "eval_runtime": 11.5133, "eval_samples_per_second": 37.522, "eval_steps_per_second": 18.761, "step": 1200 }, { "epoch": 9.694973157637872, "grad_norm": 0.7152210474014282, "learning_rate": 2.3437500000000002e-06, "loss": 1.005, "step": 1250 }, { "epoch": 9.694973157637872, "eval_loss": 1.1129647493362427, "eval_runtime": 11.5181, "eval_samples_per_second": 37.506, "eval_steps_per_second": 18.753, "step": 1250 } ], "logging_steps": 50, "max_steps": 1280, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.541080433502454e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }