{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9975369458128078, "eval_steps": 500, "global_step": 270, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003694581280788177, "grad_norm": 1.7654963384889737, "learning_rate": 3.7037037037037036e-07, "loss": 0.7143, "step": 1 }, { "epoch": 0.01847290640394089, "grad_norm": 1.5199857516271311, "learning_rate": 1.8518518518518519e-06, "loss": 0.6647, "step": 5 }, { "epoch": 0.03694581280788178, "grad_norm": 0.49041355078257287, "learning_rate": 3.7037037037037037e-06, "loss": 0.5847, "step": 10 }, { "epoch": 0.05541871921182266, "grad_norm": 0.27640312040532583, "learning_rate": 5.555555555555557e-06, "loss": 0.3425, "step": 15 }, { "epoch": 0.07389162561576355, "grad_norm": 0.18417484630910924, "learning_rate": 7.4074074074074075e-06, "loss": 0.1877, "step": 20 }, { "epoch": 0.09236453201970443, "grad_norm": 0.10832669450973324, "learning_rate": 9.25925925925926e-06, "loss": 0.1471, "step": 25 }, { "epoch": 0.11083743842364532, "grad_norm": 0.08199058472718684, "learning_rate": 9.996239762521152e-06, "loss": 0.1209, "step": 30 }, { "epoch": 0.12931034482758622, "grad_norm": 0.06662942983937706, "learning_rate": 9.973281012033009e-06, "loss": 0.1046, "step": 35 }, { "epoch": 0.1477832512315271, "grad_norm": 0.06811085294724732, "learning_rate": 9.929548316723983e-06, "loss": 0.0958, "step": 40 }, { "epoch": 0.16625615763546797, "grad_norm": 0.055261792291083314, "learning_rate": 9.86522435289912e-06, "loss": 0.0804, "step": 45 }, { "epoch": 0.18472906403940886, "grad_norm": 0.05912952493839196, "learning_rate": 9.7805778088694e-06, "loss": 0.072, "step": 50 }, { "epoch": 0.20320197044334976, "grad_norm": 0.05664454056150425, "learning_rate": 9.67596226261095e-06, "loss": 0.0657, "step": 55 }, { "epoch": 0.22167487684729065, "grad_norm": 0.05397727582384683, "learning_rate": 9.551814704830734e-06, "loss": 0.0626, "step": 60 }, { "epoch": 0.24014778325123154, "grad_norm": 0.054354437756367974, "learning_rate": 9.40865371360804e-06, "loss": 0.0567, "step": 65 }, { "epoch": 0.25862068965517243, "grad_norm": 0.052656359957651395, "learning_rate": 9.247077288236488e-06, "loss": 0.0506, "step": 70 }, { "epoch": 0.2770935960591133, "grad_norm": 0.06099527619651085, "learning_rate": 9.067760351314838e-06, "loss": 0.0447, "step": 75 }, { "epoch": 0.2955665024630542, "grad_norm": 0.05601267986585614, "learning_rate": 8.871451929520662e-06, "loss": 0.046, "step": 80 }, { "epoch": 0.31403940886699505, "grad_norm": 0.05031077501489638, "learning_rate": 8.658972024843063e-06, "loss": 0.0434, "step": 85 }, { "epoch": 0.33251231527093594, "grad_norm": 0.05165558400160997, "learning_rate": 8.43120818934367e-06, "loss": 0.0388, "step": 90 }, { "epoch": 0.35098522167487683, "grad_norm": 0.05542969825706018, "learning_rate": 8.18911181775353e-06, "loss": 0.0474, "step": 95 }, { "epoch": 0.3694581280788177, "grad_norm": 0.054028908120351174, "learning_rate": 7.93369417339209e-06, "loss": 0.0426, "step": 100 }, { "epoch": 0.3879310344827586, "grad_norm": 0.043910381575552423, "learning_rate": 7.666022164008458e-06, "loss": 0.0402, "step": 105 }, { "epoch": 0.4064039408866995, "grad_norm": 0.04420065737962207, "learning_rate": 7.387213885189746e-06, "loss": 0.0441, "step": 110 }, { "epoch": 0.4248768472906404, "grad_norm": 0.042881775226418783, "learning_rate": 7.098433949952146e-06, "loss": 0.0422, "step": 115 }, { "epoch": 0.4433497536945813, "grad_norm": 0.04261745777882021, "learning_rate": 6.800888624023552e-06, "loss": 0.0414, "step": 120 }, { "epoch": 0.4618226600985222, "grad_norm": 0.03486953918711132, "learning_rate": 6.495820787138209e-06, "loss": 0.0345, "step": 125 }, { "epoch": 0.4802955665024631, "grad_norm": 0.04758579863549666, "learning_rate": 6.184504741390596e-06, "loss": 0.0393, "step": 130 }, { "epoch": 0.4987684729064039, "grad_norm": 0.04528381812617609, "learning_rate": 5.8682408883346535e-06, "loss": 0.0355, "step": 135 }, { "epoch": 0.5172413793103449, "grad_norm": 0.05087543134069284, "learning_rate": 5.548350297062659e-06, "loss": 0.0376, "step": 140 }, { "epoch": 0.5357142857142857, "grad_norm": 0.03521666590779419, "learning_rate": 5.2261691859535325e-06, "loss": 0.0316, "step": 145 }, { "epoch": 0.5541871921182266, "grad_norm": 0.0517150609216818, "learning_rate": 4.903043341140879e-06, "loss": 0.032, "step": 150 }, { "epoch": 0.5726600985221675, "grad_norm": 0.03346412011949477, "learning_rate": 4.580322495015466e-06, "loss": 0.0303, "step": 155 }, { "epoch": 0.5911330049261084, "grad_norm": 0.040115617332954906, "learning_rate": 4.259354688243758e-06, "loss": 0.0382, "step": 160 }, { "epoch": 0.6096059113300493, "grad_norm": 0.04277501887386235, "learning_rate": 3.941480638852948e-06, "loss": 0.0291, "step": 165 }, { "epoch": 0.6280788177339901, "grad_norm": 0.03898041246450539, "learning_rate": 3.6280281419034934e-06, "loss": 0.0317, "step": 170 }, { "epoch": 0.646551724137931, "grad_norm": 0.04371627160570444, "learning_rate": 3.3203065231422904e-06, "loss": 0.0301, "step": 175 }, { "epoch": 0.6650246305418719, "grad_norm": 0.03238864584138372, "learning_rate": 3.019601169804216e-06, "loss": 0.0354, "step": 180 }, { "epoch": 0.6834975369458128, "grad_norm": 0.041127251144739585, "learning_rate": 2.7271681614074973e-06, "loss": 0.0294, "step": 185 }, { "epoch": 0.7019704433497537, "grad_norm": 0.045180481360547094, "learning_rate": 2.4442290229706344e-06, "loss": 0.0358, "step": 190 }, { "epoch": 0.7204433497536946, "grad_norm": 0.045021953447442344, "learning_rate": 2.171965622567308e-06, "loss": 0.0306, "step": 195 }, { "epoch": 0.7389162561576355, "grad_norm": 0.050026098917487306, "learning_rate": 1.9115152345327154e-06, "loss": 0.0418, "step": 200 }, { "epoch": 0.7573891625615764, "grad_norm": 0.03656415909500236, "learning_rate": 1.6639657889429017e-06, "loss": 0.0286, "step": 205 }, { "epoch": 0.7758620689655172, "grad_norm": 0.045197818476724314, "learning_rate": 1.4303513272105057e-06, "loss": 0.0317, "step": 210 }, { "epoch": 0.7943349753694581, "grad_norm": 0.041762867738677684, "learning_rate": 1.2116476827794104e-06, "loss": 0.0355, "step": 215 }, { "epoch": 0.812807881773399, "grad_norm": 0.03975638695681742, "learning_rate": 1.008768404960535e-06, "loss": 0.034, "step": 220 }, { "epoch": 0.8312807881773399, "grad_norm": 0.03688322160939588, "learning_rate": 8.225609429353187e-07, "loss": 0.0306, "step": 225 }, { "epoch": 0.8497536945812808, "grad_norm": 0.045910201896259065, "learning_rate": 6.53803105866761e-07, "loss": 0.032, "step": 230 }, { "epoch": 0.8682266009852216, "grad_norm": 0.03466115718136847, "learning_rate": 5.031998139045352e-07, "loss": 0.03, "step": 235 }, { "epoch": 0.8866995073891626, "grad_norm": 0.04041208746429919, "learning_rate": 3.7138015365554834e-07, "loss": 0.033, "step": 240 }, { "epoch": 0.9051724137931034, "grad_norm": 0.030110072192059505, "learning_rate": 2.5889475041961767e-07, "loss": 0.0316, "step": 245 }, { "epoch": 0.9236453201970444, "grad_norm": 0.04208800780561471, "learning_rate": 1.6621346816668993e-07, "loss": 0.0317, "step": 250 }, { "epoch": 0.9421182266009852, "grad_norm": 0.040891559299692404, "learning_rate": 9.372344686307655e-08, "loss": 0.0365, "step": 255 }, { "epoch": 0.9605911330049262, "grad_norm": 0.03589408474423174, "learning_rate": 4.172748534499449e-08, "loss": 0.0288, "step": 260 }, { "epoch": 0.979064039408867, "grad_norm": 0.03745965637860769, "learning_rate": 1.044277649433989e-08, "loss": 0.0307, "step": 265 }, { "epoch": 0.9975369458128078, "grad_norm": 0.031721423195282906, "learning_rate": 0.0, "loss": 0.033, "step": 270 }, { "epoch": 0.9975369458128078, "step": 270, "total_flos": 6.535464838821315e+17, "train_loss": 0.07531778989014802, "train_runtime": 2373.3356, "train_samples_per_second": 2.736, "train_steps_per_second": 0.114 } ], "logging_steps": 5, "max_steps": 270, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.535464838821315e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }