{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0583941605839415, "eval_steps": 500, "global_step": 55, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 2.1336327642202377, "epoch": 0.058394160583941604, "grad_norm": 16.25, "learning_rate": 0.0, "loss": 2.4507, "mean_token_accuracy": 0.4276521671563387, "num_tokens": 3890.0, "step": 1 }, { "entropy": 2.222498059272766, "epoch": 0.11678832116788321, "grad_norm": 17.875, "learning_rate": 2e-06, "loss": 2.6879, "mean_token_accuracy": 0.4140724800527096, "num_tokens": 6825.0, "step": 2 }, { "entropy": 2.262238770723343, "epoch": 0.17518248175182483, "grad_norm": 14.25, "learning_rate": 4e-06, "loss": 2.4973, "mean_token_accuracy": 0.42698577605187893, "num_tokens": 10473.0, "step": 3 }, { "entropy": 2.1316296458244324, "epoch": 0.23357664233576642, "grad_norm": 10.0, "learning_rate": 6e-06, "loss": 2.0863, "mean_token_accuracy": 0.4788516294211149, "num_tokens": 15657.0, "step": 4 }, { "entropy": 2.3611446991562843, "epoch": 0.291970802919708, "grad_norm": 10.1875, "learning_rate": 8e-06, "loss": 2.1751, "mean_token_accuracy": 0.4456866979598999, "num_tokens": 20159.0, "step": 5 }, { "entropy": 2.460206426680088, "epoch": 0.35036496350364965, "grad_norm": 8.875, "learning_rate": 9.999999999999999e-06, "loss": 2.2655, "mean_token_accuracy": 0.4509607646614313, "num_tokens": 23949.0, "step": 6 }, { "entropy": 2.321817234158516, "epoch": 0.40875912408759124, "grad_norm": 7.125, "learning_rate": 1.2e-05, "loss": 2.0123, "mean_token_accuracy": 0.5055391453206539, "num_tokens": 27703.0, "step": 7 }, { "entropy": 2.2407592684030533, "epoch": 0.46715328467153283, "grad_norm": 5.4375, "learning_rate": 1.4e-05, "loss": 1.8516, "mean_token_accuracy": 0.5130146574229002, "num_tokens": 32243.0, "step": 8 }, { "entropy": 2.46332585811615, "epoch": 0.5255474452554745, "grad_norm": 7.09375, "learning_rate": 1.6e-05, "loss": 2.0974, "mean_token_accuracy": 0.5035313870757818, "num_tokens": 35222.0, "step": 9 }, { "entropy": 2.237804166972637, "epoch": 0.583941605839416, "grad_norm": 5.65625, "learning_rate": 1.8e-05, "loss": 1.7838, "mean_token_accuracy": 0.5259560514241457, "num_tokens": 39208.0, "step": 10 }, { "entropy": 2.352365091443062, "epoch": 0.6423357664233577, "grad_norm": 5.84375, "learning_rate": 1.9999999999999998e-05, "loss": 2.0078, "mean_token_accuracy": 0.5078456345945597, "num_tokens": 42447.0, "step": 11 }, { "entropy": 2.1229992732405663, "epoch": 0.7007299270072993, "grad_norm": 4.5625, "learning_rate": 2.2e-05, "loss": 1.7155, "mean_token_accuracy": 0.5374241229146719, "num_tokens": 47138.0, "step": 12 }, { "entropy": 2.121931955218315, "epoch": 0.7591240875912408, "grad_norm": 4.625, "learning_rate": 2.4e-05, "loss": 1.7379, "mean_token_accuracy": 0.5694513749331236, "num_tokens": 51009.0, "step": 13 }, { "entropy": 2.085137240588665, "epoch": 0.8175182481751825, "grad_norm": 4.25, "learning_rate": 2.6000000000000002e-05, "loss": 1.6524, "mean_token_accuracy": 0.5468352809548378, "num_tokens": 55235.0, "step": 14 }, { "entropy": 2.1976606771349907, "epoch": 0.8759124087591241, "grad_norm": 5.625, "learning_rate": 2.8e-05, "loss": 1.8096, "mean_token_accuracy": 0.5231170020997524, "num_tokens": 58219.0, "step": 15 }, { "entropy": 1.9179195016622543, "epoch": 0.9343065693430657, "grad_norm": 3.84375, "learning_rate": 3e-05, "loss": 1.5974, "mean_token_accuracy": 0.5759452320635319, "num_tokens": 63057.0, "step": 16 }, { "entropy": 2.0428223088383675, "epoch": 0.9927007299270073, "grad_norm": 4.53125, "learning_rate": 2.9986842451482876e-05, "loss": 1.7372, "mean_token_accuracy": 0.5385774970054626, "num_tokens": 66564.0, "step": 17 }, { "entropy": 1.9300671219825745, "epoch": 1.0, "grad_norm": 12.9375, "learning_rate": 2.9947392888742566e-05, "loss": 1.7476, "mean_token_accuracy": 0.5453733801841736, "num_tokens": 66897.0, "step": 18 }, { "entropy": 1.935000792145729, "epoch": 1.0583941605839415, "grad_norm": 3.484375, "learning_rate": 2.988172051971717e-05, "loss": 1.4249, "mean_token_accuracy": 0.6101336404681206, "num_tokens": 71403.0, "step": 19 }, { "entropy": 2.0335680916905403, "epoch": 1.1167883211678833, "grad_norm": 3.84375, "learning_rate": 2.9789940556057574e-05, "loss": 1.5345, "mean_token_accuracy": 0.5629026051610708, "num_tokens": 75484.0, "step": 20 }, { "entropy": 2.10165449231863, "epoch": 1.1751824817518248, "grad_norm": 4.0625, "learning_rate": 2.9672214011007087e-05, "loss": 1.4949, "mean_token_accuracy": 0.5799959097057581, "num_tokens": 79113.0, "step": 21 }, { "entropy": 1.992341309785843, "epoch": 1.2335766423357664, "grad_norm": 3.59375, "learning_rate": 2.9528747416929467e-05, "loss": 1.4678, "mean_token_accuracy": 0.5918100215494633, "num_tokens": 83095.0, "step": 22 }, { "entropy": 1.9140778183937073, "epoch": 1.2919708029197081, "grad_norm": 3.375, "learning_rate": 2.9359792462981007e-05, "loss": 1.4038, "mean_token_accuracy": 0.6022733096033335, "num_tokens": 87754.0, "step": 23 }, { "entropy": 1.8838416188955307, "epoch": 1.3503649635036497, "grad_norm": 3.8125, "learning_rate": 2.9165645553562215e-05, "loss": 1.4554, "mean_token_accuracy": 0.6133127138018608, "num_tokens": 91666.0, "step": 24 }, { "entropy": 1.816191054880619, "epoch": 1.4087591240875912, "grad_norm": 3.859375, "learning_rate": 2.894664728832377e-05, "loss": 1.3643, "mean_token_accuracy": 0.6147295907139778, "num_tokens": 95819.0, "step": 25 }, { "entropy": 1.7681904509663582, "epoch": 1.4671532846715327, "grad_norm": 3.609375, "learning_rate": 2.8703181864639013e-05, "loss": 1.3711, "mean_token_accuracy": 0.6297403201460838, "num_tokens": 99865.0, "step": 26 }, { "entropy": 1.7096636295318604, "epoch": 1.5255474452554745, "grad_norm": 3.390625, "learning_rate": 2.8435676403591193e-05, "loss": 1.3362, "mean_token_accuracy": 0.6145001202821732, "num_tokens": 104135.0, "step": 27 }, { "entropy": 1.828701414167881, "epoch": 1.583941605839416, "grad_norm": 4.3125, "learning_rate": 2.8144600200657953e-05, "loss": 1.4266, "mean_token_accuracy": 0.5893764644861221, "num_tokens": 107324.0, "step": 28 }, { "entropy": 1.8768919259309769, "epoch": 1.6423357664233578, "grad_norm": 4.65625, "learning_rate": 2.78304639024076e-05, "loss": 1.5031, "mean_token_accuracy": 0.5983850117772818, "num_tokens": 110263.0, "step": 29 }, { "entropy": 1.7338064908981323, "epoch": 1.7007299270072993, "grad_norm": 4.34375, "learning_rate": 2.7493818610651493e-05, "loss": 1.4431, "mean_token_accuracy": 0.5914898477494717, "num_tokens": 113911.0, "step": 30 }, { "entropy": 1.7540361359715462, "epoch": 1.7591240875912408, "grad_norm": 3.734375, "learning_rate": 2.7135254915624213e-05, "loss": 1.3489, "mean_token_accuracy": 0.6010549142956734, "num_tokens": 118007.0, "step": 31 }, { "entropy": 1.8890240713953972, "epoch": 1.8175182481751824, "grad_norm": 4.65625, "learning_rate": 2.6755401859887598e-05, "loss": 1.4448, "mean_token_accuracy": 0.6083299573510885, "num_tokens": 120725.0, "step": 32 }, { "entropy": 1.850830078125, "epoch": 1.8759124087591241, "grad_norm": 4.28125, "learning_rate": 2.6354925834776346e-05, "loss": 1.502, "mean_token_accuracy": 0.6061263754963875, "num_tokens": 124333.0, "step": 33 }, { "entropy": 1.7397000417113304, "epoch": 1.9343065693430657, "grad_norm": 3.671875, "learning_rate": 2.5934529411321174e-05, "loss": 1.2539, "mean_token_accuracy": 0.6317082159221172, "num_tokens": 128615.0, "step": 34 }, { "entropy": 1.813131682574749, "epoch": 1.9927007299270074, "grad_norm": 3.5625, "learning_rate": 2.5494950107700482e-05, "loss": 1.3284, "mean_token_accuracy": 0.6140319798141718, "num_tokens": 132847.0, "step": 35 }, { "entropy": 1.5973615646362305, "epoch": 2.0, "grad_norm": 7.46875, "learning_rate": 2.5036959095382875e-05, "loss": 1.2697, "mean_token_accuracy": 0.6285321712493896, "num_tokens": 133794.0, "step": 36 }, { "entropy": 1.7645720839500427, "epoch": 2.0583941605839415, "grad_norm": 3.859375, "learning_rate": 2.4561359846230346e-05, "loss": 1.0785, "mean_token_accuracy": 0.6664150357246399, "num_tokens": 137554.0, "step": 37 }, { "entropy": 1.753688521683216, "epoch": 2.116788321167883, "grad_norm": 3.3125, "learning_rate": 2.4068986722935625e-05, "loss": 1.0716, "mean_token_accuracy": 0.6744864694774151, "num_tokens": 141721.0, "step": 38 }, { "entropy": 1.6263050064444542, "epoch": 2.1751824817518246, "grad_norm": 4.3125, "learning_rate": 2.356070351526648e-05, "loss": 1.0687, "mean_token_accuracy": 0.6837072521448135, "num_tokens": 146069.0, "step": 39 }, { "entropy": 1.8026663437485695, "epoch": 2.2335766423357666, "grad_norm": 3.84375, "learning_rate": 2.303740192468495e-05, "loss": 1.1566, "mean_token_accuracy": 0.6734990328550339, "num_tokens": 149664.0, "step": 40 }, { "entropy": 1.5874073877930641, "epoch": 2.291970802919708, "grad_norm": 3.53125, "learning_rate": 2.25e-05, "loss": 1.0591, "mean_token_accuracy": 0.6754884608089924, "num_tokens": 153330.0, "step": 41 }, { "entropy": 1.4746350944042206, "epoch": 2.3503649635036497, "grad_norm": 3.453125, "learning_rate": 2.1949440526797928e-05, "loss": 0.9312, "mean_token_accuracy": 0.7215368486940861, "num_tokens": 157396.0, "step": 42 }, { "entropy": 1.4334233030676842, "epoch": 2.408759124087591, "grad_norm": 6.96875, "learning_rate": 2.138668937347609e-05, "loss": 0.9952, "mean_token_accuracy": 0.7047883793711662, "num_tokens": 162386.0, "step": 43 }, { "entropy": 1.4815244674682617, "epoch": 2.4671532846715327, "grad_norm": 3.9375, "learning_rate": 2.0812733796781544e-05, "loss": 1.0847, "mean_token_accuracy": 0.680337205529213, "num_tokens": 166308.0, "step": 44 }, { "entropy": 1.4337811917066574, "epoch": 2.5255474452554747, "grad_norm": 4.21875, "learning_rate": 2.022858070982723e-05, "loss": 1.0594, "mean_token_accuracy": 0.686751551926136, "num_tokens": 169979.0, "step": 45 }, { "entropy": 1.380111612379551, "epoch": 2.5839416058394162, "grad_norm": 3.984375, "learning_rate": 1.963525491562421e-05, "loss": 0.9718, "mean_token_accuracy": 0.7241853773593903, "num_tokens": 174586.0, "step": 46 }, { "entropy": 1.339597962796688, "epoch": 2.6423357664233578, "grad_norm": 4.0625, "learning_rate": 1.9033797309228984e-05, "loss": 0.9445, "mean_token_accuracy": 0.7082682773470879, "num_tokens": 178535.0, "step": 47 }, { "entropy": 1.293665699660778, "epoch": 2.7007299270072993, "grad_norm": 3.765625, "learning_rate": 1.8425263051659838e-05, "loss": 0.9213, "mean_token_accuracy": 0.7238599583506584, "num_tokens": 183350.0, "step": 48 }, { "entropy": 1.3446906879544258, "epoch": 2.759124087591241, "grad_norm": 4.46875, "learning_rate": 1.781071971878587e-05, "loss": 0.9652, "mean_token_accuracy": 0.6951282061636448, "num_tokens": 187493.0, "step": 49 }, { "entropy": 1.3415213227272034, "epoch": 2.8175182481751824, "grad_norm": 4.8125, "learning_rate": 1.7191245428436175e-05, "loss": 1.0102, "mean_token_accuracy": 0.7021605856716633, "num_tokens": 190843.0, "step": 50 }, { "entropy": 1.4499380737543106, "epoch": 2.875912408759124, "grad_norm": 5.71875, "learning_rate": 1.6567926949014805e-05, "loss": 1.0649, "mean_token_accuracy": 0.7037234976887703, "num_tokens": 193518.0, "step": 51 }, { "entropy": 1.3929353207349777, "epoch": 2.9343065693430654, "grad_norm": 4.75, "learning_rate": 1.5941857792939702e-05, "loss": 1.0284, "mean_token_accuracy": 0.6902767680585384, "num_tokens": 196895.0, "step": 52 }, { "entropy": 1.4459699764847755, "epoch": 2.9927007299270074, "grad_norm": 4.75, "learning_rate": 1.5314136298250355e-05, "loss": 1.013, "mean_token_accuracy": 0.6965249925851822, "num_tokens": 200296.0, "step": 53 }, { "entropy": 1.399910032749176, "epoch": 3.0, "grad_norm": 13.0625, "learning_rate": 1.4685863701749648e-05, "loss": 1.0552, "mean_token_accuracy": 0.6890038251876831, "num_tokens": 200691.0, "step": 54 }, { "entropy": 1.3579635098576546, "epoch": 3.0583941605839415, "grad_norm": 4.28125, "learning_rate": 1.40581422070603e-05, "loss": 0.7865, "mean_token_accuracy": 0.765391580760479, "num_tokens": 204197.0, "step": 55 } ], "logging_steps": 1, "max_steps": 90, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 5, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5469020090400768.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }