{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.0583941605839415,
  "eval_steps": 500,
  "global_step": 55,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "entropy": 2.1336327642202377,
      "epoch": 0.058394160583941604,
      "grad_norm": 16.25,
      "learning_rate": 0.0,
      "loss": 2.4507,
      "mean_token_accuracy": 0.4276521671563387,
      "num_tokens": 3890.0,
      "step": 1
    },
    {
      "entropy": 2.222498059272766,
      "epoch": 0.11678832116788321,
      "grad_norm": 17.875,
      "learning_rate": 2e-06,
      "loss": 2.6879,
      "mean_token_accuracy": 0.4140724800527096,
      "num_tokens": 6825.0,
      "step": 2
    },
    {
      "entropy": 2.262238770723343,
      "epoch": 0.17518248175182483,
      "grad_norm": 14.25,
      "learning_rate": 4e-06,
      "loss": 2.4973,
      "mean_token_accuracy": 0.42698577605187893,
      "num_tokens": 10473.0,
      "step": 3
    },
    {
      "entropy": 2.1316296458244324,
      "epoch": 0.23357664233576642,
      "grad_norm": 10.0,
      "learning_rate": 6e-06,
      "loss": 2.0863,
      "mean_token_accuracy": 0.4788516294211149,
      "num_tokens": 15657.0,
      "step": 4
    },
    {
      "entropy": 2.3611446991562843,
      "epoch": 0.291970802919708,
      "grad_norm": 10.1875,
      "learning_rate": 8e-06,
      "loss": 2.1751,
      "mean_token_accuracy": 0.4456866979598999,
      "num_tokens": 20159.0,
      "step": 5
    },
    {
      "entropy": 2.460206426680088,
      "epoch": 0.35036496350364965,
      "grad_norm": 8.875,
      "learning_rate": 9.999999999999999e-06,
      "loss": 2.2655,
      "mean_token_accuracy": 0.4509607646614313,
      "num_tokens": 23949.0,
      "step": 6
    },
    {
      "entropy": 2.321817234158516,
      "epoch": 0.40875912408759124,
      "grad_norm": 7.125,
      "learning_rate": 1.2e-05,
      "loss": 2.0123,
      "mean_token_accuracy": 0.5055391453206539,
      "num_tokens": 27703.0,
      "step": 7
    },
    {
      "entropy": 2.2407592684030533,
      "epoch": 0.46715328467153283,
      "grad_norm": 5.4375,
      "learning_rate": 1.4e-05,
      "loss": 1.8516,
      "mean_token_accuracy": 0.5130146574229002,
      "num_tokens": 32243.0,
      "step": 8
    },
    {
      "entropy": 2.46332585811615,
      "epoch": 0.5255474452554745,
      "grad_norm": 7.09375,
      "learning_rate": 1.6e-05,
      "loss": 2.0974,
      "mean_token_accuracy": 0.5035313870757818,
      "num_tokens": 35222.0,
      "step": 9
    },
    {
      "entropy": 2.237804166972637,
      "epoch": 0.583941605839416,
      "grad_norm": 5.65625,
      "learning_rate": 1.8e-05,
      "loss": 1.7838,
      "mean_token_accuracy": 0.5259560514241457,
      "num_tokens": 39208.0,
      "step": 10
    },
    {
      "entropy": 2.352365091443062,
      "epoch": 0.6423357664233577,
      "grad_norm": 5.84375,
      "learning_rate": 1.9999999999999998e-05,
      "loss": 2.0078,
      "mean_token_accuracy": 0.5078456345945597,
      "num_tokens": 42447.0,
      "step": 11
    },
    {
      "entropy": 2.1229992732405663,
      "epoch": 0.7007299270072993,
      "grad_norm": 4.5625,
      "learning_rate": 2.2e-05,
      "loss": 1.7155,
      "mean_token_accuracy": 0.5374241229146719,
      "num_tokens": 47138.0,
      "step": 12
    },
    {
      "entropy": 2.121931955218315,
      "epoch": 0.7591240875912408,
      "grad_norm": 4.625,
      "learning_rate": 2.4e-05,
      "loss": 1.7379,
      "mean_token_accuracy": 0.5694513749331236,
      "num_tokens": 51009.0,
      "step": 13
    },
    {
      "entropy": 2.085137240588665,
      "epoch": 0.8175182481751825,
      "grad_norm": 4.25,
      "learning_rate": 2.6000000000000002e-05,
      "loss": 1.6524,
      "mean_token_accuracy": 0.5468352809548378,
      "num_tokens": 55235.0,
      "step": 14
    },
    {
      "entropy": 2.1976606771349907,
      "epoch": 0.8759124087591241,
      "grad_norm": 5.625,
      "learning_rate": 2.8e-05,
      "loss": 1.8096,
      "mean_token_accuracy": 0.5231170020997524,
      "num_tokens": 58219.0,
      "step": 15
    },
    {
      "entropy": 1.9179195016622543,
      "epoch": 0.9343065693430657,
      "grad_norm": 3.84375,
      "learning_rate": 3e-05,
      "loss": 1.5974,
      "mean_token_accuracy": 0.5759452320635319,
      "num_tokens": 63057.0,
      "step": 16
    },
    {
      "entropy": 2.0428223088383675,
      "epoch": 0.9927007299270073,
      "grad_norm": 4.53125,
      "learning_rate": 2.9986842451482876e-05,
      "loss": 1.7372,
      "mean_token_accuracy": 0.5385774970054626,
      "num_tokens": 66564.0,
      "step": 17
    },
    {
      "entropy": 1.9300671219825745,
      "epoch": 1.0,
      "grad_norm": 12.9375,
      "learning_rate": 2.9947392888742566e-05,
      "loss": 1.7476,
      "mean_token_accuracy": 0.5453733801841736,
      "num_tokens": 66897.0,
      "step": 18
    },
    {
      "entropy": 1.935000792145729,
      "epoch": 1.0583941605839415,
      "grad_norm": 3.484375,
      "learning_rate": 2.988172051971717e-05,
      "loss": 1.4249,
      "mean_token_accuracy": 0.6101336404681206,
      "num_tokens": 71403.0,
      "step": 19
    },
    {
      "entropy": 2.0335680916905403,
      "epoch": 1.1167883211678833,
      "grad_norm": 3.84375,
      "learning_rate": 2.9789940556057574e-05,
      "loss": 1.5345,
      "mean_token_accuracy": 0.5629026051610708,
      "num_tokens": 75484.0,
      "step": 20
    },
    {
      "entropy": 2.10165449231863,
      "epoch": 1.1751824817518248,
      "grad_norm": 4.0625,
      "learning_rate": 2.9672214011007087e-05,
      "loss": 1.4949,
      "mean_token_accuracy": 0.5799959097057581,
      "num_tokens": 79113.0,
      "step": 21
    },
    {
      "entropy": 1.992341309785843,
      "epoch": 1.2335766423357664,
      "grad_norm": 3.59375,
      "learning_rate": 2.9528747416929467e-05,
      "loss": 1.4678,
      "mean_token_accuracy": 0.5918100215494633,
      "num_tokens": 83095.0,
      "step": 22
    },
    {
      "entropy": 1.9140778183937073,
      "epoch": 1.2919708029197081,
      "grad_norm": 3.375,
      "learning_rate": 2.9359792462981007e-05,
      "loss": 1.4038,
      "mean_token_accuracy": 0.6022733096033335,
      "num_tokens": 87754.0,
      "step": 23
    },
    {
      "entropy": 1.8838416188955307,
      "epoch": 1.3503649635036497,
      "grad_norm": 3.8125,
      "learning_rate": 2.9165645553562215e-05,
      "loss": 1.4554,
      "mean_token_accuracy": 0.6133127138018608,
      "num_tokens": 91666.0,
      "step": 24
    },
    {
      "entropy": 1.816191054880619,
      "epoch": 1.4087591240875912,
      "grad_norm": 3.859375,
      "learning_rate": 2.894664728832377e-05,
      "loss": 1.3643,
      "mean_token_accuracy": 0.6147295907139778,
      "num_tokens": 95819.0,
      "step": 25
    },
    {
      "entropy": 1.7681904509663582,
      "epoch": 1.4671532846715327,
      "grad_norm": 3.609375,
      "learning_rate": 2.8703181864639013e-05,
      "loss": 1.3711,
      "mean_token_accuracy": 0.6297403201460838,
      "num_tokens": 99865.0,
      "step": 26
    },
    {
      "entropy": 1.7096636295318604,
      "epoch": 1.5255474452554745,
      "grad_norm": 3.390625,
      "learning_rate": 2.8435676403591193e-05,
      "loss": 1.3362,
      "mean_token_accuracy": 0.6145001202821732,
      "num_tokens": 104135.0,
      "step": 27
    },
    {
      "entropy": 1.828701414167881,
      "epoch": 1.583941605839416,
      "grad_norm": 4.3125,
      "learning_rate": 2.8144600200657953e-05,
      "loss": 1.4266,
      "mean_token_accuracy": 0.5893764644861221,
      "num_tokens": 107324.0,
      "step": 28
    },
    {
      "entropy": 1.8768919259309769,
      "epoch": 1.6423357664233578,
      "grad_norm": 4.65625,
      "learning_rate": 2.78304639024076e-05,
      "loss": 1.5031,
      "mean_token_accuracy": 0.5983850117772818,
      "num_tokens": 110263.0,
      "step": 29
    },
    {
      "entropy": 1.7338064908981323,
      "epoch": 1.7007299270072993,
      "grad_norm": 4.34375,
      "learning_rate": 2.7493818610651493e-05,
      "loss": 1.4431,
      "mean_token_accuracy": 0.5914898477494717,
      "num_tokens": 113911.0,
      "step": 30
    },
    {
      "entropy": 1.7540361359715462,
      "epoch": 1.7591240875912408,
      "grad_norm": 3.734375,
      "learning_rate": 2.7135254915624213e-05,
      "loss": 1.3489,
      "mean_token_accuracy": 0.6010549142956734,
      "num_tokens": 118007.0,
      "step": 31
    },
    {
      "entropy": 1.8890240713953972,
      "epoch": 1.8175182481751824,
      "grad_norm": 4.65625,
      "learning_rate": 2.6755401859887598e-05,
      "loss": 1.4448,
      "mean_token_accuracy": 0.6083299573510885,
      "num_tokens": 120725.0,
      "step": 32
    },
    {
      "entropy": 1.850830078125,
      "epoch": 1.8759124087591241,
      "grad_norm": 4.28125,
      "learning_rate": 2.6354925834776346e-05,
      "loss": 1.502,
      "mean_token_accuracy": 0.6061263754963875,
      "num_tokens": 124333.0,
      "step": 33
    },
    {
      "entropy": 1.7397000417113304,
      "epoch": 1.9343065693430657,
      "grad_norm": 3.671875,
      "learning_rate": 2.5934529411321174e-05,
      "loss": 1.2539,
      "mean_token_accuracy": 0.6317082159221172,
      "num_tokens": 128615.0,
      "step": 34
    },
    {
      "entropy": 1.813131682574749,
      "epoch": 1.9927007299270074,
      "grad_norm": 3.5625,
      "learning_rate": 2.5494950107700482e-05,
      "loss": 1.3284,
      "mean_token_accuracy": 0.6140319798141718,
      "num_tokens": 132847.0,
      "step": 35
    },
    {
      "entropy": 1.5973615646362305,
      "epoch": 2.0,
      "grad_norm": 7.46875,
      "learning_rate": 2.5036959095382875e-05,
      "loss": 1.2697,
      "mean_token_accuracy": 0.6285321712493896,
      "num_tokens": 133794.0,
      "step": 36
    },
    {
      "entropy": 1.7645720839500427,
      "epoch": 2.0583941605839415,
      "grad_norm": 3.859375,
      "learning_rate": 2.4561359846230346e-05,
      "loss": 1.0785,
      "mean_token_accuracy": 0.6664150357246399,
      "num_tokens": 137554.0,
      "step": 37
    },
    {
      "entropy": 1.753688521683216,
      "epoch": 2.116788321167883,
      "grad_norm": 3.3125,
      "learning_rate": 2.4068986722935625e-05,
      "loss": 1.0716,
      "mean_token_accuracy": 0.6744864694774151,
      "num_tokens": 141721.0,
      "step": 38
    },
    {
      "entropy": 1.6263050064444542,
      "epoch": 2.1751824817518246,
      "grad_norm": 4.3125,
      "learning_rate": 2.356070351526648e-05,
      "loss": 1.0687,
      "mean_token_accuracy": 0.6837072521448135,
      "num_tokens": 146069.0,
      "step": 39
    },
    {
      "entropy": 1.8026663437485695,
      "epoch": 2.2335766423357666,
      "grad_norm": 3.84375,
      "learning_rate": 2.303740192468495e-05,
      "loss": 1.1566,
      "mean_token_accuracy": 0.6734990328550339,
      "num_tokens": 149664.0,
      "step": 40
    },
    {
      "entropy": 1.5874073877930641,
      "epoch": 2.291970802919708,
      "grad_norm": 3.53125,
      "learning_rate": 2.25e-05,
      "loss": 1.0591,
      "mean_token_accuracy": 0.6754884608089924,
      "num_tokens": 153330.0,
      "step": 41
    },
    {
      "entropy": 1.4746350944042206,
      "epoch": 2.3503649635036497,
      "grad_norm": 3.453125,
      "learning_rate": 2.1949440526797928e-05,
      "loss": 0.9312,
      "mean_token_accuracy": 0.7215368486940861,
      "num_tokens": 157396.0,
      "step": 42
    },
    {
      "entropy": 1.4334233030676842,
      "epoch": 2.408759124087591,
      "grad_norm": 6.96875,
      "learning_rate": 2.138668937347609e-05,
      "loss": 0.9952,
      "mean_token_accuracy": 0.7047883793711662,
      "num_tokens": 162386.0,
      "step": 43
    },
    {
      "entropy": 1.4815244674682617,
      "epoch": 2.4671532846715327,
      "grad_norm": 3.9375,
      "learning_rate": 2.0812733796781544e-05,
      "loss": 1.0847,
      "mean_token_accuracy": 0.680337205529213,
      "num_tokens": 166308.0,
      "step": 44
    },
    {
      "entropy": 1.4337811917066574,
      "epoch": 2.5255474452554747,
      "grad_norm": 4.21875,
      "learning_rate": 2.022858070982723e-05,
      "loss": 1.0594,
      "mean_token_accuracy": 0.686751551926136,
      "num_tokens": 169979.0,
      "step": 45
    },
    {
      "entropy": 1.380111612379551,
      "epoch": 2.5839416058394162,
      "grad_norm": 3.984375,
      "learning_rate": 1.963525491562421e-05,
      "loss": 0.9718,
      "mean_token_accuracy": 0.7241853773593903,
      "num_tokens": 174586.0,
      "step": 46
    },
    {
      "entropy": 1.339597962796688,
      "epoch": 2.6423357664233578,
      "grad_norm": 4.0625,
      "learning_rate": 1.9033797309228984e-05,
      "loss": 0.9445,
      "mean_token_accuracy": 0.7082682773470879,
      "num_tokens": 178535.0,
      "step": 47
    },
    {
      "entropy": 1.293665699660778,
      "epoch": 2.7007299270072993,
      "grad_norm": 3.765625,
      "learning_rate": 1.8425263051659838e-05,
      "loss": 0.9213,
      "mean_token_accuracy": 0.7238599583506584,
      "num_tokens": 183350.0,
      "step": 48
    },
    {
      "entropy": 1.3446906879544258,
      "epoch": 2.759124087591241,
      "grad_norm": 4.46875,
      "learning_rate": 1.781071971878587e-05,
      "loss": 0.9652,
      "mean_token_accuracy": 0.6951282061636448,
      "num_tokens": 187493.0,
      "step": 49
    },
    {
      "entropy": 1.3415213227272034,
      "epoch": 2.8175182481751824,
      "grad_norm": 4.8125,
      "learning_rate": 1.7191245428436175e-05,
      "loss": 1.0102,
      "mean_token_accuracy": 0.7021605856716633,
      "num_tokens": 190843.0,
      "step": 50
    },
    {
      "entropy": 1.4499380737543106,
      "epoch": 2.875912408759124,
      "grad_norm": 5.71875,
      "learning_rate": 1.6567926949014805e-05,
      "loss": 1.0649,
      "mean_token_accuracy": 0.7037234976887703,
      "num_tokens": 193518.0,
      "step": 51
    },
    {
      "entropy": 1.3929353207349777,
      "epoch": 2.9343065693430654,
      "grad_norm": 4.75,
      "learning_rate": 1.5941857792939702e-05,
      "loss": 1.0284,
      "mean_token_accuracy": 0.6902767680585384,
      "num_tokens": 196895.0,
      "step": 52
    },
    {
      "entropy": 1.4459699764847755,
      "epoch": 2.9927007299270074,
      "grad_norm": 4.75,
      "learning_rate": 1.5314136298250355e-05,
      "loss": 1.013,
      "mean_token_accuracy": 0.6965249925851822,
      "num_tokens": 200296.0,
      "step": 53
    },
    {
      "entropy": 1.399910032749176,
      "epoch": 3.0,
      "grad_norm": 13.0625,
      "learning_rate": 1.4685863701749648e-05,
      "loss": 1.0552,
      "mean_token_accuracy": 0.6890038251876831,
      "num_tokens": 200691.0,
      "step": 54
    },
    {
      "entropy": 1.3579635098576546,
      "epoch": 3.0583941605839415,
      "grad_norm": 4.28125,
      "learning_rate": 1.40581422070603e-05,
      "loss": 0.7865,
      "mean_token_accuracy": 0.765391580760479,
      "num_tokens": 204197.0,
      "step": 55
    }
  ],
  "logging_steps": 1,
  "max_steps": 90,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 5,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5469020090400768.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}