{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9997268505872713,
  "eval_steps": 500,
  "global_step": 915,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0010925976509150504,
      "grad_norm": 22.730928776382456,
      "learning_rate": 1.0869565217391305e-07,
      "loss": 1.3223,
      "step": 1
    },
    {
      "epoch": 0.005462988254575253,
      "grad_norm": 20.891256691927374,
      "learning_rate": 5.434782608695653e-07,
      "loss": 1.3066,
      "step": 5
    },
    {
      "epoch": 0.010925976509150505,
      "grad_norm": 8.917157931216027,
      "learning_rate": 1.0869565217391306e-06,
      "loss": 1.1943,
      "step": 10
    },
    {
      "epoch": 0.01638896476372576,
      "grad_norm": 7.290952602788098,
      "learning_rate": 1.6304347826086957e-06,
      "loss": 1.0279,
      "step": 15
    },
    {
      "epoch": 0.02185195301830101,
      "grad_norm": 3.006256144496407,
      "learning_rate": 2.173913043478261e-06,
      "loss": 0.9049,
      "step": 20
    },
    {
      "epoch": 0.027314941272876262,
      "grad_norm": 2.371134339375521,
      "learning_rate": 2.7173913043478263e-06,
      "loss": 0.8647,
      "step": 25
    },
    {
      "epoch": 0.03277792952745152,
      "grad_norm": 2.1504800702490865,
      "learning_rate": 3.2608695652173914e-06,
      "loss": 0.833,
      "step": 30
    },
    {
      "epoch": 0.03824091778202677,
      "grad_norm": 2.2317280108941,
      "learning_rate": 3.804347826086957e-06,
      "loss": 0.807,
      "step": 35
    },
    {
      "epoch": 0.04370390603660202,
      "grad_norm": 2.2488850758737255,
      "learning_rate": 4.347826086956522e-06,
      "loss": 0.7919,
      "step": 40
    },
    {
      "epoch": 0.04916689429117727,
      "grad_norm": 2.3580705798318484,
      "learning_rate": 4.891304347826087e-06,
      "loss": 0.7761,
      "step": 45
    },
    {
      "epoch": 0.054629882545752524,
      "grad_norm": 2.4373495899440414,
      "learning_rate": 5.4347826086956525e-06,
      "loss": 0.7614,
      "step": 50
    },
    {
      "epoch": 0.060092870800327776,
      "grad_norm": 2.4135433324014763,
      "learning_rate": 5.978260869565218e-06,
      "loss": 0.7458,
      "step": 55
    },
    {
      "epoch": 0.06555585905490303,
      "grad_norm": 2.81358704818424,
      "learning_rate": 6.521739130434783e-06,
      "loss": 0.7291,
      "step": 60
    },
    {
      "epoch": 0.07101884730947829,
      "grad_norm": 2.438627231166693,
      "learning_rate": 7.065217391304349e-06,
      "loss": 0.72,
      "step": 65
    },
    {
      "epoch": 0.07648183556405354,
      "grad_norm": 2.1869040708927177,
      "learning_rate": 7.608695652173914e-06,
      "loss": 0.7141,
      "step": 70
    },
    {
      "epoch": 0.08194482381862879,
      "grad_norm": 2.184483160699592,
      "learning_rate": 8.15217391304348e-06,
      "loss": 0.6994,
      "step": 75
    },
    {
      "epoch": 0.08740781207320404,
      "grad_norm": 2.3203787824757507,
      "learning_rate": 8.695652173913044e-06,
      "loss": 0.7019,
      "step": 80
    },
    {
      "epoch": 0.0928708003277793,
      "grad_norm": 2.3068506646214915,
      "learning_rate": 9.23913043478261e-06,
      "loss": 0.6958,
      "step": 85
    },
    {
      "epoch": 0.09833378858235454,
      "grad_norm": 2.257062695334498,
      "learning_rate": 9.782608695652175e-06,
      "loss": 0.6842,
      "step": 90
    },
    {
      "epoch": 0.1037967768369298,
      "grad_norm": 2.400983485932064,
      "learning_rate": 9.999672147992806e-06,
      "loss": 0.6914,
      "step": 95
    },
    {
      "epoch": 0.10925976509150505,
      "grad_norm": 2.324571446077097,
      "learning_rate": 9.997668763647963e-06,
      "loss": 0.6867,
      "step": 100
    },
    {
      "epoch": 0.1147227533460803,
      "grad_norm": 2.137443230070172,
      "learning_rate": 9.993844863856281e-06,
      "loss": 0.689,
      "step": 105
    },
    {
      "epoch": 0.12018574160065555,
      "grad_norm": 2.0564600115869585,
      "learning_rate": 9.988201841560945e-06,
      "loss": 0.6688,
      "step": 110
    },
    {
      "epoch": 0.12564872985523082,
      "grad_norm": 2.139005852684594,
      "learning_rate": 9.980741752362222e-06,
      "loss": 0.6711,
      "step": 115
    },
    {
      "epoch": 0.13111171810980607,
      "grad_norm": 2.530899194975221,
      "learning_rate": 9.971467313768668e-06,
      "loss": 0.6681,
      "step": 120
    },
    {
      "epoch": 0.13657470636438132,
      "grad_norm": 2.1541912419217466,
      "learning_rate": 9.96038190420721e-06,
      "loss": 0.6624,
      "step": 125
    },
    {
      "epoch": 0.14203769461895657,
      "grad_norm": 2.2266155964762646,
      "learning_rate": 9.947489561792475e-06,
      "loss": 0.6678,
      "step": 130
    },
    {
      "epoch": 0.14750068287353182,
      "grad_norm": 2.0151113143541215,
      "learning_rate": 9.932794982855818e-06,
      "loss": 0.6576,
      "step": 135
    },
    {
      "epoch": 0.15296367112810708,
      "grad_norm": 2.122951721690571,
      "learning_rate": 9.916303520234573e-06,
      "loss": 0.6518,
      "step": 140
    },
    {
      "epoch": 0.15842665938268233,
      "grad_norm": 2.073005839186561,
      "learning_rate": 9.898021181322158e-06,
      "loss": 0.6467,
      "step": 145
    },
    {
      "epoch": 0.16388964763725758,
      "grad_norm": 2.122513038916697,
      "learning_rate": 9.877954625879746e-06,
      "loss": 0.6422,
      "step": 150
    },
    {
      "epoch": 0.16935263589183283,
      "grad_norm": 1.938211750086781,
      "learning_rate": 9.8561111636103e-06,
      "loss": 0.6313,
      "step": 155
    },
    {
      "epoch": 0.17481562414640808,
      "grad_norm": 2.1510416920205997,
      "learning_rate": 9.832498751495832e-06,
      "loss": 0.637,
      "step": 160
    },
    {
      "epoch": 0.18027861240098333,
      "grad_norm": 2.368980517738678,
      "learning_rate": 9.807125990898905e-06,
      "loss": 0.636,
      "step": 165
    },
    {
      "epoch": 0.1857416006555586,
      "grad_norm": 2.2294247264281553,
      "learning_rate": 9.780002124429377e-06,
      "loss": 0.6378,
      "step": 170
    },
    {
      "epoch": 0.19120458891013384,
      "grad_norm": 2.044434482857163,
      "learning_rate": 9.75113703257758e-06,
      "loss": 0.6299,
      "step": 175
    },
    {
      "epoch": 0.1966675771647091,
      "grad_norm": 2.273754929604351,
      "learning_rate": 9.720541230115113e-06,
      "loss": 0.6278,
      "step": 180
    },
    {
      "epoch": 0.20213056541928434,
      "grad_norm": 1.9049721088945737,
      "learning_rate": 9.688225862264604e-06,
      "loss": 0.6215,
      "step": 185
    },
    {
      "epoch": 0.2075935536738596,
      "grad_norm": 2.2956911916513936,
      "learning_rate": 9.654202700639805e-06,
      "loss": 0.6114,
      "step": 190
    },
    {
      "epoch": 0.21305654192843484,
      "grad_norm": 2.047625547879147,
      "learning_rate": 9.61848413895751e-06,
      "loss": 0.6272,
      "step": 195
    },
    {
      "epoch": 0.2185195301830101,
      "grad_norm": 2.144882455415688,
      "learning_rate": 9.581083188522862e-06,
      "loss": 0.6146,
      "step": 200
    },
    {
      "epoch": 0.22398251843758535,
      "grad_norm": 1.882314204150727,
      "learning_rate": 9.542013473489683e-06,
      "loss": 0.6115,
      "step": 205
    },
    {
      "epoch": 0.2294455066921606,
      "grad_norm": 2.0782768044314746,
      "learning_rate": 9.501289225897566e-06,
      "loss": 0.5965,
      "step": 210
    },
    {
      "epoch": 0.23490849494673585,
      "grad_norm": 1.9514474259034185,
      "learning_rate": 9.458925280487532e-06,
      "loss": 0.6176,
      "step": 215
    },
    {
      "epoch": 0.2403714832013111,
      "grad_norm": 1.8865949054260884,
      "learning_rate": 9.414937069298125e-06,
      "loss": 0.6018,
      "step": 220
    },
    {
      "epoch": 0.24583447145588638,
      "grad_norm": 2.0683179915172962,
      "learning_rate": 9.369340616043948e-06,
      "loss": 0.6105,
      "step": 225
    },
    {
      "epoch": 0.25129745971046163,
      "grad_norm": 2.004631261000376,
      "learning_rate": 9.322152530278658e-06,
      "loss": 0.5931,
      "step": 230
    },
    {
      "epoch": 0.2567604479650369,
      "grad_norm": 1.9653795566834225,
      "learning_rate": 9.273390001344544e-06,
      "loss": 0.5878,
      "step": 235
    },
    {
      "epoch": 0.26222343621961214,
      "grad_norm": 2.0291055623314715,
      "learning_rate": 9.223070792110927e-06,
      "loss": 0.5827,
      "step": 240
    },
    {
      "epoch": 0.2676864244741874,
      "grad_norm": 1.9570477475533896,
      "learning_rate": 9.17121323250362e-06,
      "loss": 0.5943,
      "step": 245
    },
    {
      "epoch": 0.27314941272876264,
      "grad_norm": 2.1357120267635463,
      "learning_rate": 9.117836212827839e-06,
      "loss": 0.571,
      "step": 250
    },
    {
      "epoch": 0.2786124009833379,
      "grad_norm": 1.9807416332273953,
      "learning_rate": 9.062959176886967e-06,
      "loss": 0.5895,
      "step": 255
    },
    {
      "epoch": 0.28407538923791315,
      "grad_norm": 1.8679007980966873,
      "learning_rate": 9.006602114899711e-06,
      "loss": 0.5837,
      "step": 260
    },
    {
      "epoch": 0.2895383774924884,
      "grad_norm": 2.159508129093504,
      "learning_rate": 8.948785556218202e-06,
      "loss": 0.5872,
      "step": 265
    },
    {
      "epoch": 0.29500136574706365,
      "grad_norm": 2.011039338549636,
      "learning_rate": 8.88953056184971e-06,
      "loss": 0.5656,
      "step": 270
    },
    {
      "epoch": 0.3004643540016389,
      "grad_norm": 1.8497602272423903,
      "learning_rate": 8.828858716784692e-06,
      "loss": 0.5607,
      "step": 275
    },
    {
      "epoch": 0.30592734225621415,
      "grad_norm": 1.8697548121419354,
      "learning_rate": 8.766792122133949e-06,
      "loss": 0.569,
      "step": 280
    },
    {
      "epoch": 0.3113903305107894,
      "grad_norm": 2.023364352460863,
      "learning_rate": 8.703353387077813e-06,
      "loss": 0.5694,
      "step": 285
    },
    {
      "epoch": 0.31685331876536466,
      "grad_norm": 1.9966954532417194,
      "learning_rate": 8.638565620630219e-06,
      "loss": 0.5485,
      "step": 290
    },
    {
      "epoch": 0.3223163070199399,
      "grad_norm": 2.118048147240604,
      "learning_rate": 8.572452423220717e-06,
      "loss": 0.5544,
      "step": 295
    },
    {
      "epoch": 0.32777929527451516,
      "grad_norm": 1.9646992858208574,
      "learning_rate": 8.505037878097481e-06,
      "loss": 0.5585,
      "step": 300
    },
    {
      "epoch": 0.3332422835290904,
      "grad_norm": 1.842699727967433,
      "learning_rate": 8.436346542554432e-06,
      "loss": 0.5525,
      "step": 305
    },
    {
      "epoch": 0.33870527178366566,
      "grad_norm": 2.1516397434897,
      "learning_rate": 8.366403438985675e-06,
      "loss": 0.5598,
      "step": 310
    },
    {
      "epoch": 0.3441682600382409,
      "grad_norm": 1.935829862534685,
      "learning_rate": 8.295234045770524e-06,
      "loss": 0.5474,
      "step": 315
    },
    {
      "epoch": 0.34963124829281617,
      "grad_norm": 2.0947580703115,
      "learning_rate": 8.222864287992419e-06,
      "loss": 0.5444,
      "step": 320
    },
    {
      "epoch": 0.3550942365473914,
      "grad_norm": 2.0230909691945462,
      "learning_rate": 8.149320527995111e-06,
      "loss": 0.5367,
      "step": 325
    },
    {
      "epoch": 0.36055722480196667,
      "grad_norm": 1.9422498738110598,
      "learning_rate": 8.074629555779586e-06,
      "loss": 0.5374,
      "step": 330
    },
    {
      "epoch": 0.3660202130565419,
      "grad_norm": 2.017157014853384,
      "learning_rate": 7.998818579245183e-06,
      "loss": 0.538,
      "step": 335
    },
    {
      "epoch": 0.3714832013111172,
      "grad_norm": 2.0200129213489952,
      "learning_rate": 7.9219152142785e-06,
      "loss": 0.5261,
      "step": 340
    },
    {
      "epoch": 0.3769461895656924,
      "grad_norm": 1.9025863914724077,
      "learning_rate": 7.843947474693665e-06,
      "loss": 0.5272,
      "step": 345
    },
    {
      "epoch": 0.3824091778202677,
      "grad_norm": 1.98859186872293,
      "learning_rate": 7.764943762027675e-06,
      "loss": 0.5271,
      "step": 350
    },
    {
      "epoch": 0.38787216607484293,
      "grad_norm": 1.9641160396193444,
      "learning_rate": 7.68493285519447e-06,
      "loss": 0.5156,
      "step": 355
    },
    {
      "epoch": 0.3933351543294182,
      "grad_norm": 1.9337347711635438,
      "learning_rate": 7.603943900001567e-06,
      "loss": 0.5152,
      "step": 360
    },
    {
      "epoch": 0.39879814258399343,
      "grad_norm": 1.9734924840604742,
      "learning_rate": 7.522006398533022e-06,
      "loss": 0.5157,
      "step": 365
    },
    {
      "epoch": 0.4042611308385687,
      "grad_norm": 2.3339176407449904,
      "learning_rate": 7.439150198402619e-06,
      "loss": 0.5162,
      "step": 370
    },
    {
      "epoch": 0.40972411909314393,
      "grad_norm": 2.2975439038971732,
      "learning_rate": 7.355405481881207e-06,
      "loss": 0.5157,
      "step": 375
    },
    {
      "epoch": 0.4151871073477192,
      "grad_norm": 1.971313224728527,
      "learning_rate": 7.270802754902092e-06,
      "loss": 0.5014,
      "step": 380
    },
    {
      "epoch": 0.42065009560229444,
      "grad_norm": 1.8926014206142687,
      "learning_rate": 7.185372835948574e-06,
      "loss": 0.5082,
      "step": 385
    },
    {
      "epoch": 0.4261130838568697,
      "grad_norm": 2.0424297512212677,
      "learning_rate": 7.0991468448275914e-06,
      "loss": 0.4969,
      "step": 390
    },
    {
      "epoch": 0.43157607211144494,
      "grad_norm": 1.9075652902136018,
      "learning_rate": 7.0121561913336255e-06,
      "loss": 0.5025,
      "step": 395
    },
    {
      "epoch": 0.4370390603660202,
      "grad_norm": 1.972187929750843,
      "learning_rate": 6.924432563806962e-06,
      "loss": 0.4805,
      "step": 400
    },
    {
      "epoch": 0.44250204862059545,
      "grad_norm": 2.242916059633561,
      "learning_rate": 6.836007917590487e-06,
      "loss": 0.503,
      "step": 405
    },
    {
      "epoch": 0.4479650368751707,
      "grad_norm": 2.0649875803912274,
      "learning_rate": 6.746914463389216e-06,
      "loss": 0.4901,
      "step": 410
    },
    {
      "epoch": 0.45342802512974595,
      "grad_norm": 1.8063638082708948,
      "learning_rate": 6.65718465553681e-06,
      "loss": 0.4755,
      "step": 415
    },
    {
      "epoch": 0.4588910133843212,
      "grad_norm": 1.969494410176558,
      "learning_rate": 6.566851180173344e-06,
      "loss": 0.4874,
      "step": 420
    },
    {
      "epoch": 0.46435400163889645,
      "grad_norm": 1.8970828426082733,
      "learning_rate": 6.475946943338616e-06,
      "loss": 0.4766,
      "step": 425
    },
    {
      "epoch": 0.4698169898934717,
      "grad_norm": 1.9623505460334127,
      "learning_rate": 6.384505058985388e-06,
      "loss": 0.487,
      "step": 430
    },
    {
      "epoch": 0.47527997814804696,
      "grad_norm": 2.016298870850614,
      "learning_rate": 6.292558836916856e-06,
      "loss": 0.4684,
      "step": 435
    },
    {
      "epoch": 0.4807429664026222,
      "grad_norm": 1.9782532484829365,
      "learning_rate": 6.200141770652791e-06,
      "loss": 0.4655,
      "step": 440
    },
    {
      "epoch": 0.4862059546571975,
      "grad_norm": 1.9875880346692325,
      "learning_rate": 6.107287525228764e-06,
      "loss": 0.4829,
      "step": 445
    },
    {
      "epoch": 0.49166894291177277,
      "grad_norm": 2.0320364233161836,
      "learning_rate": 6.014029924932874e-06,
      "loss": 0.4753,
      "step": 450
    },
    {
      "epoch": 0.497131931166348,
      "grad_norm": 2.025675106544592,
      "learning_rate": 5.920402940984483e-06,
      "loss": 0.4673,
      "step": 455
    },
    {
      "epoch": 0.5025949194209233,
      "grad_norm": 1.921954531735398,
      "learning_rate": 5.826440679159424e-06,
      "loss": 0.4532,
      "step": 460
    },
    {
      "epoch": 0.5080579076754985,
      "grad_norm": 2.01066093050374,
      "learning_rate": 5.732177367366176e-06,
      "loss": 0.4658,
      "step": 465
    },
    {
      "epoch": 0.5135208959300738,
      "grad_norm": 1.8422653149296644,
      "learning_rate": 5.63764734317758e-06,
      "loss": 0.4529,
      "step": 470
    },
    {
      "epoch": 0.518983884184649,
      "grad_norm": 1.9255817468742156,
      "learning_rate": 5.542885041322577e-06,
      "loss": 0.4561,
      "step": 475
    },
    {
      "epoch": 0.5244468724392243,
      "grad_norm": 1.951977869616497,
      "learning_rate": 5.447924981142578e-06,
      "loss": 0.4565,
      "step": 480
    },
    {
      "epoch": 0.5299098606937995,
      "grad_norm": 1.9554258723769773,
      "learning_rate": 5.3528017540169974e-06,
      "loss": 0.4475,
      "step": 485
    },
    {
      "epoch": 0.5353728489483748,
      "grad_norm": 1.9102850022081725,
      "learning_rate": 5.25755001076255e-06,
      "loss": 0.4497,
      "step": 490
    },
    {
      "epoch": 0.54083583720295,
      "grad_norm": 2.239195466962009,
      "learning_rate": 5.1622044490108994e-06,
      "loss": 0.4454,
      "step": 495
    },
    {
      "epoch": 0.5462988254575253,
      "grad_norm": 1.8827718246138567,
      "learning_rate": 5.066799800569247e-06,
      "loss": 0.4404,
      "step": 500
    },
    {
      "epoch": 0.5517618137121005,
      "grad_norm": 1.9069058786965245,
      "learning_rate": 4.971370818768475e-06,
      "loss": 0.4236,
      "step": 505
    },
    {
      "epoch": 0.5572248019666758,
      "grad_norm": 1.855279389571921,
      "learning_rate": 4.875952265803452e-06,
      "loss": 0.4403,
      "step": 510
    },
    {
      "epoch": 0.562687790221251,
      "grad_norm": 1.8972542181259846,
      "learning_rate": 4.780578900070104e-06,
      "loss": 0.4372,
      "step": 515
    },
    {
      "epoch": 0.5681507784758263,
      "grad_norm": 1.844147614606819,
      "learning_rate": 4.685285463503867e-06,
      "loss": 0.4331,
      "step": 520
    },
    {
      "epoch": 0.5736137667304015,
      "grad_norm": 1.9625504078338127,
      "learning_rate": 4.5901066689241514e-06,
      "loss": 0.4292,
      "step": 525
    },
    {
      "epoch": 0.5790767549849768,
      "grad_norm": 2.2025005649073433,
      "learning_rate": 4.49507718738939e-06,
      "loss": 0.4293,
      "step": 530
    },
    {
      "epoch": 0.584539743239552,
      "grad_norm": 1.9450789672739341,
      "learning_rate": 4.400231635567319e-06,
      "loss": 0.4192,
      "step": 535
    },
    {
      "epoch": 0.5900027314941273,
      "grad_norm": 1.9764711012724976,
      "learning_rate": 4.305604563125061e-06,
      "loss": 0.4182,
      "step": 540
    },
    {
      "epoch": 0.5954657197487025,
      "grad_norm": 2.1198793543903194,
      "learning_rate": 4.211230440143604e-06,
      "loss": 0.4114,
      "step": 545
    },
    {
      "epoch": 0.6009287080032778,
      "grad_norm": 3.1276187198521463,
      "learning_rate": 4.117143644561306e-06,
      "loss": 0.4166,
      "step": 550
    },
    {
      "epoch": 0.606391696257853,
      "grad_norm": 1.8794840574959752,
      "learning_rate": 4.023378449650928e-06,
      "loss": 0.4076,
      "step": 555
    },
    {
      "epoch": 0.6118546845124283,
      "grad_norm": 1.9139438651443652,
      "learning_rate": 3.929969011534811e-06,
      "loss": 0.4218,
      "step": 560
    },
    {
      "epoch": 0.6173176727670036,
      "grad_norm": 1.861741261098825,
      "learning_rate": 3.83694935674272e-06,
      "loss": 0.4169,
      "step": 565
    },
    {
      "epoch": 0.6227806610215788,
      "grad_norm": 1.8247668127301933,
      "learning_rate": 3.7443533698169186e-06,
      "loss": 0.4132,
      "step": 570
    },
    {
      "epoch": 0.6282436492761541,
      "grad_norm": 1.9796553518989775,
      "learning_rate": 3.652214780968926e-06,
      "loss": 0.3996,
      "step": 575
    },
    {
      "epoch": 0.6337066375307293,
      "grad_norm": 1.823241119094308,
      "learning_rate": 3.5605671537925264e-06,
      "loss": 0.4088,
      "step": 580
    },
    {
      "epoch": 0.6391696257853046,
      "grad_norm": 1.881819370184891,
      "learning_rate": 3.469443873037457e-06,
      "loss": 0.4073,
      "step": 585
    },
    {
      "epoch": 0.6446326140398798,
      "grad_norm": 1.947962800229108,
      "learning_rate": 3.3788781324482446e-06,
      "loss": 0.4067,
      "step": 590
    },
    {
      "epoch": 0.6500956022944551,
      "grad_norm": 1.9214089256240032,
      "learning_rate": 3.2889029226726286e-06,
      "loss": 0.3996,
      "step": 595
    },
    {
      "epoch": 0.6555585905490303,
      "grad_norm": 1.9543022070758702,
      "learning_rate": 3.1995510192439587e-06,
      "loss": 0.3982,
      "step": 600
    },
    {
      "epoch": 0.6610215788036056,
      "grad_norm": 1.9173316158851936,
      "learning_rate": 3.1108549706419553e-06,
      "loss": 0.4,
      "step": 605
    },
    {
      "epoch": 0.6664845670581808,
      "grad_norm": 1.8159793606682484,
      "learning_rate": 3.022847086436176e-06,
      "loss": 0.3988,
      "step": 610
    },
    {
      "epoch": 0.6719475553127561,
      "grad_norm": 1.9089073637989584,
      "learning_rate": 2.9355594255165186e-06,
      "loss": 0.3871,
      "step": 615
    },
    {
      "epoch": 0.6774105435673313,
      "grad_norm": 1.8914866485656172,
      "learning_rate": 2.8490237844150335e-06,
      "loss": 0.3885,
      "step": 620
    },
    {
      "epoch": 0.6828735318219066,
      "grad_norm": 1.9221955744591699,
      "learning_rate": 2.763271685723311e-06,
      "loss": 0.3818,
      "step": 625
    },
    {
      "epoch": 0.6883365200764818,
      "grad_norm": 1.8733166396673193,
      "learning_rate": 2.6783343666096445e-06,
      "loss": 0.3874,
      "step": 630
    },
    {
      "epoch": 0.6937995083310571,
      "grad_norm": 2.134783287581667,
      "learning_rate": 2.5942427674401936e-06,
      "loss": 0.3899,
      "step": 635
    },
    {
      "epoch": 0.6992624965856323,
      "grad_norm": 1.8202198608461309,
      "learning_rate": 2.5110275205082224e-06,
      "loss": 0.3839,
      "step": 640
    },
    {
      "epoch": 0.7047254848402076,
      "grad_norm": 1.9005906531792638,
      "learning_rate": 2.428718938875607e-06,
      "loss": 0.3812,
      "step": 645
    },
    {
      "epoch": 0.7101884730947828,
      "grad_norm": 1.8656177124830933,
      "learning_rate": 2.3473470053305954e-06,
      "loss": 0.3805,
      "step": 650
    },
    {
      "epoch": 0.7156514613493581,
      "grad_norm": 1.9350473980336609,
      "learning_rate": 2.266941361465886e-06,
      "loss": 0.381,
      "step": 655
    },
    {
      "epoch": 0.7211144496039333,
      "grad_norm": 1.8146852411399312,
      "learning_rate": 2.187531296881017e-06,
      "loss": 0.3751,
      "step": 660
    },
    {
      "epoch": 0.7265774378585086,
      "grad_norm": 1.8246829733561232,
      "learning_rate": 2.1091457385129267e-06,
      "loss": 0.3752,
      "step": 665
    },
    {
      "epoch": 0.7320404261130838,
      "grad_norm": 1.8550222595976351,
      "learning_rate": 2.031813240098686e-06,
      "loss": 0.3721,
      "step": 670
    },
    {
      "epoch": 0.7375034143676591,
      "grad_norm": 1.8354415599143141,
      "learning_rate": 1.9555619717741248e-06,
      "loss": 0.3733,
      "step": 675
    },
    {
      "epoch": 0.7429664026222343,
      "grad_norm": 1.9427997534753285,
      "learning_rate": 1.8804197098122168e-06,
      "loss": 0.3745,
      "step": 680
    },
    {
      "epoch": 0.7484293908768096,
      "grad_norm": 1.9154466908271706,
      "learning_rate": 1.8064138265049457e-06,
      "loss": 0.3743,
      "step": 685
    },
    {
      "epoch": 0.7538923791313848,
      "grad_norm": 1.8117953280714802,
      "learning_rate": 1.7335712801923015e-06,
      "loss": 0.3707,
      "step": 690
    },
    {
      "epoch": 0.7593553673859601,
      "grad_norm": 1.894844068057027,
      "learning_rate": 1.6619186054421088e-06,
      "loss": 0.3677,
      "step": 695
    },
    {
      "epoch": 0.7648183556405354,
      "grad_norm": 1.8949515530814003,
      "learning_rate": 1.5914819033841843e-06,
      "loss": 0.3639,
      "step": 700
    },
    {
      "epoch": 0.7702813438951106,
      "grad_norm": 1.7568307263853036,
      "learning_rate": 1.5222868322024092e-06,
      "loss": 0.3586,
      "step": 705
    },
    {
      "epoch": 0.7757443321496859,
      "grad_norm": 1.759178792643852,
      "learning_rate": 1.4543585977881513e-06,
      "loss": 0.35,
      "step": 710
    },
    {
      "epoch": 0.7812073204042611,
      "grad_norm": 1.8973466785062572,
      "learning_rate": 1.3877219445584283e-06,
      "loss": 0.3593,
      "step": 715
    },
    {
      "epoch": 0.7866703086588364,
      "grad_norm": 1.9605293056514521,
      "learning_rate": 1.3224011464422032e-06,
      "loss": 0.3621,
      "step": 720
    },
    {
      "epoch": 0.7921332969134116,
      "grad_norm": 1.812660380797101,
      "learning_rate": 1.2584199980380358e-06,
      "loss": 0.3599,
      "step": 725
    },
    {
      "epoch": 0.7975962851679869,
      "grad_norm": 1.8356228496720945,
      "learning_rate": 1.1958018059463577e-06,
      "loss": 0.3605,
      "step": 730
    },
    {
      "epoch": 0.8030592734225621,
      "grad_norm": 1.691309969533953,
      "learning_rate": 1.1345693802795177e-06,
      "loss": 0.3583,
      "step": 735
    },
    {
      "epoch": 0.8085222616771374,
      "grad_norm": 1.8242600919765277,
      "learning_rate": 1.0747450263526576e-06,
      "loss": 0.3598,
      "step": 740
    },
    {
      "epoch": 0.8139852499317126,
      "grad_norm": 1.7765876082507979,
      "learning_rate": 1.0163505365585086e-06,
      "loss": 0.3493,
      "step": 745
    },
    {
      "epoch": 0.8194482381862879,
      "grad_norm": 1.8095739075310635,
      "learning_rate": 9.594071824289986e-07,
      "loss": 0.3571,
      "step": 750
    },
    {
      "epoch": 0.8249112264408631,
      "grad_norm": 1.7811537133839481,
      "learning_rate": 9.039357068866178e-07,
      "loss": 0.3518,
      "step": 755
    },
    {
      "epoch": 0.8303742146954384,
      "grad_norm": 1.7625847558197685,
      "learning_rate": 8.49956316688329e-07,
      "loss": 0.3518,
      "step": 760
    },
    {
      "epoch": 0.8358372029500136,
      "grad_norm": 1.8189872840713601,
      "learning_rate": 7.974886750647887e-07,
      "loss": 0.3381,
      "step": 765
    },
    {
      "epoch": 0.8413001912045889,
      "grad_norm": 1.8024026803759359,
      "learning_rate": 7.465518945575789e-07,
      "loss": 0.345,
      "step": 770
    },
    {
      "epoch": 0.8467631794591641,
      "grad_norm": 1.895342687896103,
      "learning_rate": 6.971645300570084e-07,
      "loss": 0.3464,
      "step": 775
    },
    {
      "epoch": 0.8522261677137394,
      "grad_norm": 1.796319045444646,
      "learning_rate": 6.49344572043083e-07,
      "loss": 0.3477,
      "step": 780
    },
    {
      "epoch": 0.8576891559683146,
      "grad_norm": 1.8845046027721593,
      "learning_rate": 6.031094400320498e-07,
      "loss": 0.3443,
      "step": 785
    },
    {
      "epoch": 0.8631521442228899,
      "grad_norm": 1.724637234062916,
      "learning_rate": 5.584759762309422e-07,
      "loss": 0.3367,
      "step": 790
    },
    {
      "epoch": 0.8686151324774651,
      "grad_norm": 1.8964510379949302,
      "learning_rate": 5.154604394024254e-07,
      "loss": 0.3396,
      "step": 795
    },
    {
      "epoch": 0.8740781207320404,
      "grad_norm": 1.6763296815328177,
      "learning_rate": 4.7407849894216097e-07,
      "loss": 0.3348,
      "step": 800
    },
    {
      "epoch": 0.8795411089866156,
      "grad_norm": 1.7128909727748394,
      "learning_rate": 4.343452291708783e-07,
      "loss": 0.343,
      "step": 805
    },
    {
      "epoch": 0.8850040972411909,
      "grad_norm": 1.8153223119786095,
      "learning_rate": 3.962751038431961e-07,
      "loss": 0.3405,
      "step": 810
    },
    {
      "epoch": 0.8904670854957661,
      "grad_norm": 1.7625733448640701,
      "learning_rate": 3.5988199087523987e-07,
      "loss": 0.3396,
      "step": 815
    },
    {
      "epoch": 0.8959300737503414,
      "grad_norm": 1.8570474171994868,
      "learning_rate": 3.2517914729292446e-07,
      "loss": 0.3391,
      "step": 820
    },
    {
      "epoch": 0.9013930620049166,
      "grad_norm": 1.8466215461741358,
      "learning_rate": 2.921792144027813e-07,
      "loss": 0.3407,
      "step": 825
    },
    {
      "epoch": 0.9068560502594919,
      "grad_norm": 1.7424828950437026,
      "learning_rate": 2.6089421318707785e-07,
      "loss": 0.3365,
      "step": 830
    },
    {
      "epoch": 0.9123190385140671,
      "grad_norm": 1.8348358804511151,
      "learning_rate": 2.3133553992488845e-07,
      "loss": 0.3453,
      "step": 835
    },
    {
      "epoch": 0.9177820267686424,
      "grad_norm": 1.8267323618432934,
      "learning_rate": 2.0351396204074946e-07,
      "loss": 0.337,
      "step": 840
    },
    {
      "epoch": 0.9232450150232177,
      "grad_norm": 1.6632279811187767,
      "learning_rate": 1.7743961418237144e-07,
      "loss": 0.3341,
      "step": 845
    },
    {
      "epoch": 0.9287080032777929,
      "grad_norm": 1.8291284979726596,
      "learning_rate": 1.5312199452886145e-07,
      "loss": 0.332,
      "step": 850
    },
    {
      "epoch": 0.9341709915323682,
      "grad_norm": 1.7047344118755032,
      "learning_rate": 1.3056996133079924e-07,
      "loss": 0.3375,
      "step": 855
    },
    {
      "epoch": 0.9396339797869434,
      "grad_norm": 1.745785425026002,
      "learning_rate": 1.0979172968340667e-07,
      "loss": 0.3321,
      "step": 860
    },
    {
      "epoch": 0.9450969680415187,
      "grad_norm": 1.8776860033466447,
      "learning_rate": 9.079486853402097e-08,
      "loss": 0.34,
      "step": 865
    },
    {
      "epoch": 0.9505599562960939,
      "grad_norm": 1.7269678812008018,
      "learning_rate": 7.358629792492522e-08,
      "loss": 0.342,
      "step": 870
    },
    {
      "epoch": 0.9560229445506692,
      "grad_norm": 1.7727161663929172,
      "learning_rate": 5.8172286472566456e-08,
      "loss": 0.3391,
      "step": 875
    },
    {
      "epoch": 0.9614859328052444,
      "grad_norm": 1.7673439074550763,
      "learning_rate": 4.455844908407059e-08,
      "loss": 0.3416,
      "step": 880
    },
    {
      "epoch": 0.9669489210598197,
      "grad_norm": 1.7746641186916818,
      "learning_rate": 3.27497449118791e-08,
      "loss": 0.3268,
      "step": 885
    },
    {
      "epoch": 0.972411909314395,
      "grad_norm": 1.858186055296109,
      "learning_rate": 2.275047554726795e-08,
      "loss": 0.3371,
      "step": 890
    },
    {
      "epoch": 0.9778748975689703,
      "grad_norm": 1.6930712261364957,
      "learning_rate": 1.4564283453392669e-08,
      "loss": 0.3425,
      "step": 895
    },
    {
      "epoch": 0.9833378858235455,
      "grad_norm": 1.778838215839912,
      "learning_rate": 8.194150638438093e-09,
      "loss": 0.3315,
      "step": 900
    },
    {
      "epoch": 0.9888008740781208,
      "grad_norm": 1.8180837953030582,
      "learning_rate": 3.6423975693533618e-09,
      "loss": 0.3404,
      "step": 905
    },
    {
      "epoch": 0.994263862332696,
      "grad_norm": 1.7628508134053706,
      "learning_rate": 9.106823265681019e-10,
      "loss": 0.335,
      "step": 910
    },
    {
      "epoch": 0.9997268505872713,
      "grad_norm": 1.7810702627412893,
      "learning_rate": 0.0,
      "loss": 0.3383,
      "step": 915
    },
    {
      "epoch": 0.9997268505872713,
      "eval_loss": 0.2970966100692749,
      "eval_runtime": 1.1854,
      "eval_samples_per_second": 1.687,
      "eval_steps_per_second": 0.844,
      "step": 915
    },
    {
      "epoch": 0.9997268505872713,
      "step": 915,
      "total_flos": 191530040033280.0,
      "train_loss": 0.4996221627042593,
      "train_runtime": 20787.7504,
      "train_samples_per_second": 1.409,
      "train_steps_per_second": 0.044
    }
  ],
  "logging_steps": 5,
  "max_steps": 915,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 191530040033280.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}