| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.8921103986618344, | |
| "eval_steps": 500, | |
| "global_step": 800, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.005575689991636465, | |
| "grad_norm": 0.16284966468811035, | |
| "learning_rate": 7.142857142857143e-06, | |
| "loss": 0.0626, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.01115137998327293, | |
| "grad_norm": 0.10163773596286774, | |
| "learning_rate": 1.6071428571428572e-05, | |
| "loss": 0.051, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.016727069974909393, | |
| "grad_norm": 0.12601175904273987, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.041, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.02230275996654586, | |
| "grad_norm": 0.04396357387304306, | |
| "learning_rate": 3.392857142857143e-05, | |
| "loss": 0.0336, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.027878449958182325, | |
| "grad_norm": 0.04541896656155586, | |
| "learning_rate": 4.2857142857142856e-05, | |
| "loss": 0.03, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.03345413994981879, | |
| "grad_norm": 0.035561174154281616, | |
| "learning_rate": 5.1785714285714296e-05, | |
| "loss": 0.026, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.039029829941455256, | |
| "grad_norm": 0.04301896691322327, | |
| "learning_rate": 6.0714285714285715e-05, | |
| "loss": 0.0232, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.04460551993309172, | |
| "grad_norm": 0.04101714491844177, | |
| "learning_rate": 6.964285714285715e-05, | |
| "loss": 0.0234, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.05018120992472819, | |
| "grad_norm": 0.03225620836019516, | |
| "learning_rate": 7.857142857142858e-05, | |
| "loss": 0.0199, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.05575689991636465, | |
| "grad_norm": 0.03156769648194313, | |
| "learning_rate": 8.75e-05, | |
| "loss": 0.02, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.06133258990800112, | |
| "grad_norm": 0.03408223018050194, | |
| "learning_rate": 9.642857142857143e-05, | |
| "loss": 0.0227, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.06690827989963757, | |
| "grad_norm": 0.031089385971426964, | |
| "learning_rate": 9.999803846452024e-05, | |
| "loss": 0.02, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.07248396989127405, | |
| "grad_norm": 0.033274080604314804, | |
| "learning_rate": 9.998605186060137e-05, | |
| "loss": 0.014, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.07805965988291051, | |
| "grad_norm": 0.02827683836221695, | |
| "learning_rate": 9.996317100396068e-05, | |
| "loss": 0.0202, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.08363534987454697, | |
| "grad_norm": 0.037661969661712646, | |
| "learning_rate": 9.992940088138597e-05, | |
| "loss": 0.0222, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.08921103986618344, | |
| "grad_norm": 0.04279659315943718, | |
| "learning_rate": 9.988474885293544e-05, | |
| "loss": 0.0186, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.0947867298578199, | |
| "grad_norm": 0.026112260296940804, | |
| "learning_rate": 9.98292246503335e-05, | |
| "loss": 0.02, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.10036241984945637, | |
| "grad_norm": 0.029179614037275314, | |
| "learning_rate": 9.976284037484988e-05, | |
| "loss": 0.0175, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.10593810984109284, | |
| "grad_norm": 0.035265687853097916, | |
| "learning_rate": 9.968561049466214e-05, | |
| "loss": 0.0163, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.1115137998327293, | |
| "grad_norm": 0.02659149281680584, | |
| "learning_rate": 9.95975518417024e-05, | |
| "loss": 0.0134, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.11708948982436576, | |
| "grad_norm": 0.03535303473472595, | |
| "learning_rate": 9.949868360798893e-05, | |
| "loss": 0.0174, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.12266517981600224, | |
| "grad_norm": 0.028047222644090652, | |
| "learning_rate": 9.938902734144326e-05, | |
| "loss": 0.014, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.12824086980763869, | |
| "grad_norm": 0.025049524381756783, | |
| "learning_rate": 9.926860694119398e-05, | |
| "loss": 0.014, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.13381655979927515, | |
| "grad_norm": 0.03536759689450264, | |
| "learning_rate": 9.913744865236798e-05, | |
| "loss": 0.0161, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.13939224979091164, | |
| "grad_norm": 0.03142830356955528, | |
| "learning_rate": 9.899558106037039e-05, | |
| "loss": 0.016, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.1449679397825481, | |
| "grad_norm": 0.029837962239980698, | |
| "learning_rate": 9.884303508465463e-05, | |
| "loss": 0.017, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.15054362977418456, | |
| "grad_norm": 0.050843242555856705, | |
| "learning_rate": 9.867984397198348e-05, | |
| "loss": 0.0209, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.15611931976582102, | |
| "grad_norm": 0.022020680829882622, | |
| "learning_rate": 9.85060432891833e-05, | |
| "loss": 0.013, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.1616950097574575, | |
| "grad_norm": 0.03178994357585907, | |
| "learning_rate": 9.832167091539214e-05, | |
| "loss": 0.0144, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.16727069974909395, | |
| "grad_norm": 0.030585451051592827, | |
| "learning_rate": 9.812676703380433e-05, | |
| "loss": 0.0144, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.1728463897407304, | |
| "grad_norm": 0.02348562888801098, | |
| "learning_rate": 9.792137412291265e-05, | |
| "loss": 0.0137, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.17842207973236687, | |
| "grad_norm": 0.028541121631860733, | |
| "learning_rate": 9.770553694725028e-05, | |
| "loss": 0.0169, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.18399776972400333, | |
| "grad_norm": 0.02496708557009697, | |
| "learning_rate": 9.747930254763467e-05, | |
| "loss": 0.013, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.1895734597156398, | |
| "grad_norm": 0.03572074696421623, | |
| "learning_rate": 9.724272023091503e-05, | |
| "loss": 0.0164, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1951491497072763, | |
| "grad_norm": 0.02730608731508255, | |
| "learning_rate": 9.699584155922625e-05, | |
| "loss": 0.0135, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.20072483969891275, | |
| "grad_norm": 0.03099130466580391, | |
| "learning_rate": 9.673872033875109e-05, | |
| "loss": 0.0157, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.2063005296905492, | |
| "grad_norm": 0.031458914279937744, | |
| "learning_rate": 9.64714126079933e-05, | |
| "loss": 0.0138, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.21187621968218567, | |
| "grad_norm": 0.03125375509262085, | |
| "learning_rate": 9.619397662556435e-05, | |
| "loss": 0.0114, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.21745190967382214, | |
| "grad_norm": 0.031778380274772644, | |
| "learning_rate": 9.590647285748613e-05, | |
| "loss": 0.0117, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.2230275996654586, | |
| "grad_norm": 0.019305897876620293, | |
| "learning_rate": 9.56089639640127e-05, | |
| "loss": 0.0143, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.22860328965709506, | |
| "grad_norm": 0.02124331146478653, | |
| "learning_rate": 9.530151478597366e-05, | |
| "loss": 0.0135, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.23417897964873152, | |
| "grad_norm": 0.033200375735759735, | |
| "learning_rate": 9.498419233064246e-05, | |
| "loss": 0.0143, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.23975466964036798, | |
| "grad_norm": 0.03514528647065163, | |
| "learning_rate": 9.465706575713236e-05, | |
| "loss": 0.0204, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.24533035963200447, | |
| "grad_norm": 0.029311848804354668, | |
| "learning_rate": 9.432020636132354e-05, | |
| "loss": 0.0176, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.25090604962364094, | |
| "grad_norm": 0.030977483838796616, | |
| "learning_rate": 9.397368756032445e-05, | |
| "loss": 0.0146, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.25648173961527737, | |
| "grad_norm": 0.025566186755895615, | |
| "learning_rate": 9.361758487647082e-05, | |
| "loss": 0.0136, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.26205742960691386, | |
| "grad_norm": 0.02457290142774582, | |
| "learning_rate": 9.32519759208659e-05, | |
| "loss": 0.0152, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.2676331195985503, | |
| "grad_norm": 0.023812102153897285, | |
| "learning_rate": 9.287694037646548e-05, | |
| "loss": 0.0148, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.2732088095901868, | |
| "grad_norm": 0.023294365033507347, | |
| "learning_rate": 9.249255998071126e-05, | |
| "loss": 0.0123, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.2787844995818233, | |
| "grad_norm": 0.018993759527802467, | |
| "learning_rate": 9.209891850771657e-05, | |
| "loss": 0.0099, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.2843601895734597, | |
| "grad_norm": 0.034646522253751755, | |
| "learning_rate": 9.169610175000812e-05, | |
| "loss": 0.0139, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.2899358795650962, | |
| "grad_norm": 0.029509609565138817, | |
| "learning_rate": 9.12841974998278e-05, | |
| "loss": 0.0117, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.29551156955673263, | |
| "grad_norm": 0.024864595383405685, | |
| "learning_rate": 9.086329552999891e-05, | |
| "loss": 0.0146, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.3010872595483691, | |
| "grad_norm": 0.023953670635819435, | |
| "learning_rate": 9.043348757436037e-05, | |
| "loss": 0.0131, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.30666294954000556, | |
| "grad_norm": 0.019653376191854477, | |
| "learning_rate": 8.99948673077738e-05, | |
| "loss": 0.0107, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.31223863953164205, | |
| "grad_norm": 0.03767814487218857, | |
| "learning_rate": 8.954753032570742e-05, | |
| "loss": 0.0143, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.3178143295232785, | |
| "grad_norm": 0.021987926214933395, | |
| "learning_rate": 8.90915741234015e-05, | |
| "loss": 0.0098, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.323390019514915, | |
| "grad_norm": 0.021244384348392487, | |
| "learning_rate": 8.862709807461956e-05, | |
| "loss": 0.0106, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.32896570950655146, | |
| "grad_norm": 0.02844163216650486, | |
| "learning_rate": 8.815420340999033e-05, | |
| "loss": 0.0162, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.3345413994981879, | |
| "grad_norm": 0.02485722117125988, | |
| "learning_rate": 8.767299319494503e-05, | |
| "loss": 0.0164, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.3401170894898244, | |
| "grad_norm": 0.019627615809440613, | |
| "learning_rate": 8.718357230725449e-05, | |
| "loss": 0.0139, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.3456927794814608, | |
| "grad_norm": 0.02517726831138134, | |
| "learning_rate": 8.668604741417171e-05, | |
| "loss": 0.0128, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.3512684694730973, | |
| "grad_norm": 0.02175074815750122, | |
| "learning_rate": 8.618052694918399e-05, | |
| "loss": 0.0111, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.35684415946473375, | |
| "grad_norm": 0.021222930401563644, | |
| "learning_rate": 8.566712108838042e-05, | |
| "loss": 0.0111, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.36241984945637024, | |
| "grad_norm": 0.024494808167219162, | |
| "learning_rate": 8.514594172643934e-05, | |
| "loss": 0.0138, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.36799553944800667, | |
| "grad_norm": 0.022174010053277016, | |
| "learning_rate": 8.461710245224148e-05, | |
| "loss": 0.0134, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.37357122943964316, | |
| "grad_norm": 0.01959528774023056, | |
| "learning_rate": 8.40807185241137e-05, | |
| "loss": 0.0102, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.3791469194312796, | |
| "grad_norm": 0.017945902422070503, | |
| "learning_rate": 8.353690684470884e-05, | |
| "loss": 0.0143, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.3847226094229161, | |
| "grad_norm": 0.020864926278591156, | |
| "learning_rate": 8.298578593552737e-05, | |
| "loss": 0.0179, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.3902982994145526, | |
| "grad_norm": 0.027325566858053207, | |
| "learning_rate": 8.242747591108605e-05, | |
| "loss": 0.0133, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.395873989406189, | |
| "grad_norm": 0.019658569246530533, | |
| "learning_rate": 8.186209845273954e-05, | |
| "loss": 0.0139, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.4014496793978255, | |
| "grad_norm": 0.02014886401593685, | |
| "learning_rate": 8.128977678216039e-05, | |
| "loss": 0.009, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.40702536938946193, | |
| "grad_norm": 0.02425803802907467, | |
| "learning_rate": 8.07106356344834e-05, | |
| "loss": 0.0125, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.4126010593810984, | |
| "grad_norm": 0.030235106125473976, | |
| "learning_rate": 8.012480123112014e-05, | |
| "loss": 0.0171, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.41817674937273486, | |
| "grad_norm": 0.022229960188269615, | |
| "learning_rate": 7.953240125224948e-05, | |
| "loss": 0.0116, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.42375243936437135, | |
| "grad_norm": 0.025448938831686974, | |
| "learning_rate": 7.89335648089903e-05, | |
| "loss": 0.0142, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.4293281293560078, | |
| "grad_norm": 0.023552658036351204, | |
| "learning_rate": 7.832842241526212e-05, | |
| "loss": 0.0147, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.43490381934764427, | |
| "grad_norm": 0.019487692043185234, | |
| "learning_rate": 7.77171059593403e-05, | |
| "loss": 0.0115, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.44047950933928076, | |
| "grad_norm": 0.021791953593492508, | |
| "learning_rate": 7.709974867511138e-05, | |
| "loss": 0.012, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.4460551993309172, | |
| "grad_norm": 0.02281327173113823, | |
| "learning_rate": 7.647648511303544e-05, | |
| "loss": 0.0126, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.4516308893225537, | |
| "grad_norm": 0.02623576670885086, | |
| "learning_rate": 7.584745111082127e-05, | |
| "loss": 0.0128, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.4572065793141901, | |
| "grad_norm": 0.019894316792488098, | |
| "learning_rate": 7.521278376382123e-05, | |
| "loss": 0.0092, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.4627822693058266, | |
| "grad_norm": 0.022427916526794434, | |
| "learning_rate": 7.457262139515171e-05, | |
| "loss": 0.0111, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.46835795929746304, | |
| "grad_norm": 0.021067511290311813, | |
| "learning_rate": 7.392710352554641e-05, | |
| "loss": 0.0099, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.47393364928909953, | |
| "grad_norm": 0.019623806700110435, | |
| "learning_rate": 7.327637084294817e-05, | |
| "loss": 0.012, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.47950933928073597, | |
| "grad_norm": 0.02039971947669983, | |
| "learning_rate": 7.262056517184669e-05, | |
| "loss": 0.0138, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.48508502927237246, | |
| "grad_norm": 0.021388281136751175, | |
| "learning_rate": 7.195982944236851e-05, | |
| "loss": 0.0123, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.49066071926400895, | |
| "grad_norm": 0.022272834554314613, | |
| "learning_rate": 7.1294307659126e-05, | |
| "loss": 0.0126, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.4962364092556454, | |
| "grad_norm": 0.02803129144012928, | |
| "learning_rate": 7.062414486983197e-05, | |
| "loss": 0.0118, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.5018120992472819, | |
| "grad_norm": 0.025339094921946526, | |
| "learning_rate": 6.994948713368737e-05, | |
| "loss": 0.0147, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.5073877892389184, | |
| "grad_norm": 0.024465398862957954, | |
| "learning_rate": 6.927048148954812e-05, | |
| "loss": 0.0118, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.5129634792305547, | |
| "grad_norm": 0.025315098464488983, | |
| "learning_rate": 6.858727592387867e-05, | |
| "loss": 0.0165, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.5185391692221912, | |
| "grad_norm": 0.020109234377741814, | |
| "learning_rate": 6.790001933849899e-05, | |
| "loss": 0.0108, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.5241148592138277, | |
| "grad_norm": 0.01888495869934559, | |
| "learning_rate": 6.720886151813194e-05, | |
| "loss": 0.0097, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.5296905492054642, | |
| "grad_norm": 0.023433005437254906, | |
| "learning_rate": 6.651395309775837e-05, | |
| "loss": 0.0122, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.5352662391971006, | |
| "grad_norm": 0.019298607483506203, | |
| "learning_rate": 6.581544552978687e-05, | |
| "loss": 0.0134, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.5408419291887371, | |
| "grad_norm": 0.025588713586330414, | |
| "learning_rate": 6.511349105104534e-05, | |
| "loss": 0.0108, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.5464176191803736, | |
| "grad_norm": 0.03148540109395981, | |
| "learning_rate": 6.440824264960157e-05, | |
| "loss": 0.0115, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.5519933091720101, | |
| "grad_norm": 0.01748904027044773, | |
| "learning_rate": 6.369985403142014e-05, | |
| "loss": 0.0112, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.5575689991636466, | |
| "grad_norm": 0.027883267030119896, | |
| "learning_rate": 6.298847958686283e-05, | |
| "loss": 0.0125, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.5631446891552829, | |
| "grad_norm": 0.02129966951906681, | |
| "learning_rate": 6.227427435703997e-05, | |
| "loss": 0.0149, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 0.5687203791469194, | |
| "grad_norm": 0.02562125027179718, | |
| "learning_rate": 6.15573940000197e-05, | |
| "loss": 0.0136, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 0.5742960691385559, | |
| "grad_norm": 0.02446940541267395, | |
| "learning_rate": 6.083799475690309e-05, | |
| "loss": 0.0112, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 0.5798717591301924, | |
| "grad_norm": 0.024061646312475204, | |
| "learning_rate": 6.0116233417771994e-05, | |
| "loss": 0.0115, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 0.5854474491218288, | |
| "grad_norm": 0.01640748232603073, | |
| "learning_rate": 5.9392267287517325e-05, | |
| "loss": 0.0103, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 0.5910231391134653, | |
| "grad_norm": 0.023191062733530998, | |
| "learning_rate": 5.8666254151554976e-05, | |
| "loss": 0.0113, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 0.5965988291051018, | |
| "grad_norm": 0.017516452819108963, | |
| "learning_rate": 5.7938352241437366e-05, | |
| "loss": 0.0093, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 0.6021745190967382, | |
| "grad_norm": 0.019351812079548836, | |
| "learning_rate": 5.720872020036734e-05, | |
| "loss": 0.0125, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 0.6077502090883747, | |
| "grad_norm": 0.029706666246056557, | |
| "learning_rate": 5.647751704862263e-05, | |
| "loss": 0.008, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 0.6133258990800111, | |
| "grad_norm": 0.016750017181038857, | |
| "learning_rate": 5.5744902148898005e-05, | |
| "loss": 0.0118, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 0.6189015890716476, | |
| "grad_norm": 0.022833596915006638, | |
| "learning_rate": 5.501103517157288e-05, | |
| "loss": 0.0088, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 0.6244772790632841, | |
| "grad_norm": 0.03475171700119972, | |
| "learning_rate": 5.427607605991176e-05, | |
| "loss": 0.0136, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 0.6300529690549206, | |
| "grad_norm": 0.021340183913707733, | |
| "learning_rate": 5.354018499520536e-05, | |
| "loss": 0.0103, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 0.635628659046557, | |
| "grad_norm": 0.024497641250491142, | |
| "learning_rate": 5.2803522361859594e-05, | |
| "loss": 0.011, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 0.6412043490381935, | |
| "grad_norm": 0.01924068294465542, | |
| "learning_rate": 5.2066248712440656e-05, | |
| "loss": 0.0125, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 0.64678003902983, | |
| "grad_norm": 0.017638731747865677, | |
| "learning_rate": 5.1328524732683134e-05, | |
| "loss": 0.0104, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 0.6523557290214664, | |
| "grad_norm": 0.022175751626491547, | |
| "learning_rate": 5.059051120646924e-05, | |
| "loss": 0.0128, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 0.6579314190131029, | |
| "grad_norm": 0.022414250299334526, | |
| "learning_rate": 4.985236898078658e-05, | |
| "loss": 0.0128, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 0.6635071090047393, | |
| "grad_norm": 0.020940134301781654, | |
| "learning_rate": 4.911425893067239e-05, | |
| "loss": 0.0124, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 0.6690827989963758, | |
| "grad_norm": 0.021777737885713577, | |
| "learning_rate": 4.837634192415128e-05, | |
| "loss": 0.0126, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.6746584889880123, | |
| "grad_norm": 0.01768389716744423, | |
| "learning_rate": 4.763877878717484e-05, | |
| "loss": 0.0095, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 0.6802341789796488, | |
| "grad_norm": 0.02011968567967415, | |
| "learning_rate": 4.6901730268570275e-05, | |
| "loss": 0.0093, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 0.6858098689712852, | |
| "grad_norm": 0.02239886298775673, | |
| "learning_rate": 4.616535700500583e-05, | |
| "loss": 0.0126, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 0.6913855589629216, | |
| "grad_norm": 0.022233402356505394, | |
| "learning_rate": 4.542981948598071e-05, | |
| "loss": 0.0107, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 0.6969612489545581, | |
| "grad_norm": 0.027380308136343956, | |
| "learning_rate": 4.4695278018847105e-05, | |
| "loss": 0.0142, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 0.7025369389461946, | |
| "grad_norm": 0.025442643091082573, | |
| "learning_rate": 4.396189269387176e-05, | |
| "loss": 0.0153, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 0.708112628937831, | |
| "grad_norm": 0.01852184161543846, | |
| "learning_rate": 4.322982334934509e-05, | |
| "loss": 0.0102, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 0.7136883189294675, | |
| "grad_norm": 0.01528929267078638, | |
| "learning_rate": 4.2499229536744986e-05, | |
| "loss": 0.0097, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 0.719264008921104, | |
| "grad_norm": 0.026008352637290955, | |
| "learning_rate": 4.17702704859633e-05, | |
| "loss": 0.0159, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 0.7248396989127405, | |
| "grad_norm": 0.018146734684705734, | |
| "learning_rate": 4.104310507060234e-05, | |
| "loss": 0.0095, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 0.730415388904377, | |
| "grad_norm": 0.022718293592333794, | |
| "learning_rate": 4.0317891773348946e-05, | |
| "loss": 0.0095, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 0.7359910788960133, | |
| "grad_norm": 0.02410387434065342, | |
| "learning_rate": 3.959478865143397e-05, | |
| "loss": 0.0109, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 0.7415667688876498, | |
| "grad_norm": 0.017437651753425598, | |
| "learning_rate": 3.887395330218429e-05, | |
| "loss": 0.0107, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 0.7471424588792863, | |
| "grad_norm": 0.020500419661402702, | |
| "learning_rate": 3.815554282867513e-05, | |
| "loss": 0.0107, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 0.7527181488709228, | |
| "grad_norm": 0.01553898025304079, | |
| "learning_rate": 3.743971380549008e-05, | |
| "loss": 0.0083, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 0.7582938388625592, | |
| "grad_norm": 0.020166153088212013, | |
| "learning_rate": 3.67266222445964e-05, | |
| "loss": 0.0111, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 0.7638695288541957, | |
| "grad_norm": 0.023076798766851425, | |
| "learning_rate": 3.6016423561342706e-05, | |
| "loss": 0.0128, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 0.7694452188458322, | |
| "grad_norm": 0.02140919119119644, | |
| "learning_rate": 3.5309272540587e-05, | |
| "loss": 0.0104, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 0.7750209088374687, | |
| "grad_norm": 0.016010567545890808, | |
| "learning_rate": 3.4605323302961854e-05, | |
| "loss": 0.0135, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 0.7805965988291051, | |
| "grad_norm": 0.017510782927274704, | |
| "learning_rate": 3.3904729271284473e-05, | |
| "loss": 0.0115, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.7861722888207415, | |
| "grad_norm": 0.024468230083584785, | |
| "learning_rate": 3.3207643137118874e-05, | |
| "loss": 0.01, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 0.791747978812378, | |
| "grad_norm": 0.020976202562451363, | |
| "learning_rate": 3.251421682749732e-05, | |
| "loss": 0.0114, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 0.7973236688040145, | |
| "grad_norm": 0.02256660722196102, | |
| "learning_rate": 3.18246014718085e-05, | |
| "loss": 0.0108, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 0.802899358795651, | |
| "grad_norm": 0.030024701729416847, | |
| "learning_rate": 3.113894736885953e-05, | |
| "loss": 0.0104, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 0.8084750487872874, | |
| "grad_norm": 0.019312310963869095, | |
| "learning_rate": 3.0457403954118856e-05, | |
| "loss": 0.0082, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 0.8140507387789239, | |
| "grad_norm": 0.02008041925728321, | |
| "learning_rate": 2.978011976714753e-05, | |
| "loss": 0.0099, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 0.8196264287705604, | |
| "grad_norm": 0.021896323189139366, | |
| "learning_rate": 2.9107242419225577e-05, | |
| "loss": 0.0143, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 0.8252021187621968, | |
| "grad_norm": 0.019579166546463966, | |
| "learning_rate": 2.8438918561180634e-05, | |
| "loss": 0.0106, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 0.8307778087538333, | |
| "grad_norm": 0.02049921080470085, | |
| "learning_rate": 2.7775293851426232e-05, | |
| "loss": 0.0115, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 0.8363534987454697, | |
| "grad_norm": 0.014969157055020332, | |
| "learning_rate": 2.711651292421593e-05, | |
| "loss": 0.0101, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 0.8419291887371062, | |
| "grad_norm": 0.020416075363755226, | |
| "learning_rate": 2.646271935812098e-05, | |
| "loss": 0.0098, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 0.8475048787287427, | |
| "grad_norm": 0.018367785960435867, | |
| "learning_rate": 2.581405564473801e-05, | |
| "loss": 0.0165, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 0.8530805687203792, | |
| "grad_norm": 0.0190111193805933, | |
| "learning_rate": 2.5170663157633477e-05, | |
| "loss": 0.0135, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 0.8586562587120156, | |
| "grad_norm": 0.024806899949908257, | |
| "learning_rate": 2.45326821215319e-05, | |
| "loss": 0.0116, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 0.864231948703652, | |
| "grad_norm": 0.02073819749057293, | |
| "learning_rate": 2.390025158175458e-05, | |
| "loss": 0.0129, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 0.8698076386952885, | |
| "grad_norm": 0.02042596973478794, | |
| "learning_rate": 2.3273509373915093e-05, | |
| "loss": 0.0088, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 0.875383328686925, | |
| "grad_norm": 0.015911240130662918, | |
| "learning_rate": 2.2652592093878666e-05, | |
| "loss": 0.01, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 0.8809590186785615, | |
| "grad_norm": 0.023589760065078735, | |
| "learning_rate": 2.2037635067991663e-05, | |
| "loss": 0.0107, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 0.8865347086701979, | |
| "grad_norm": 0.017796384170651436, | |
| "learning_rate": 2.1428772323587827e-05, | |
| "loss": 0.0103, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 0.8921103986618344, | |
| "grad_norm": 0.018958481028676033, | |
| "learning_rate": 2.082613655977745e-05, | |
| "loss": 0.0079, | |
| "step": 800 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 1120, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 2, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 2.728813755050754e+18, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |