{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9990766389658357,
  "eval_steps": 500,
  "global_step": 541,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0018467220683287165,
      "grad_norm": 0.13621516525745392,
      "learning_rate": 3.636363636363636e-06,
      "loss": 0.7363,
      "step": 1
    },
    {
      "epoch": 0.009233610341643583,
      "grad_norm": 0.13796468079090118,
      "learning_rate": 1.8181818181818182e-05,
      "loss": 0.7817,
      "step": 5
    },
    {
      "epoch": 0.018467220683287166,
      "grad_norm": 0.12183558940887451,
      "learning_rate": 3.6363636363636364e-05,
      "loss": 0.7779,
      "step": 10
    },
    {
      "epoch": 0.027700831024930747,
      "grad_norm": 0.16904406249523163,
      "learning_rate": 5.4545454545454546e-05,
      "loss": 0.7869,
      "step": 15
    },
    {
      "epoch": 0.03693444136657433,
      "grad_norm": 0.1359681636095047,
      "learning_rate": 7.272727272727273e-05,
      "loss": 0.777,
      "step": 20
    },
    {
      "epoch": 0.046168051708217916,
      "grad_norm": 0.11777453124523163,
      "learning_rate": 9.090909090909092e-05,
      "loss": 0.7807,
      "step": 25
    },
    {
      "epoch": 0.055401662049861494,
      "grad_norm": 0.12384524196386337,
      "learning_rate": 0.00010909090909090909,
      "loss": 0.7494,
      "step": 30
    },
    {
      "epoch": 0.06463527239150507,
      "grad_norm": 0.11038655042648315,
      "learning_rate": 0.00012727272727272728,
      "loss": 0.7475,
      "step": 35
    },
    {
      "epoch": 0.07386888273314866,
      "grad_norm": 0.12827804684638977,
      "learning_rate": 0.00014545454545454546,
      "loss": 0.7571,
      "step": 40
    },
    {
      "epoch": 0.08310249307479224,
      "grad_norm": 0.1219051405787468,
      "learning_rate": 0.00016363636363636366,
      "loss": 0.7498,
      "step": 45
    },
    {
      "epoch": 0.09233610341643583,
      "grad_norm": 0.114303357899189,
      "learning_rate": 0.00018181818181818183,
      "loss": 0.7459,
      "step": 50
    },
    {
      "epoch": 0.10156971375807941,
      "grad_norm": 0.12260652333498001,
      "learning_rate": 0.0002,
      "loss": 0.7285,
      "step": 55
    },
    {
      "epoch": 0.11080332409972299,
      "grad_norm": 0.1140127182006836,
      "learning_rate": 0.00019994777247895855,
      "loss": 0.7382,
      "step": 60
    },
    {
      "epoch": 0.12003693444136658,
      "grad_norm": 0.11311746388673782,
      "learning_rate": 0.00019979114447011323,
      "loss": 0.7759,
      "step": 65
    },
    {
      "epoch": 0.12927054478301014,
      "grad_norm": 0.1240229681134224,
      "learning_rate": 0.00019953027957931658,
      "loss": 0.7489,
      "step": 70
    },
    {
      "epoch": 0.13850415512465375,
      "grad_norm": 0.10431641340255737,
      "learning_rate": 0.00019916545029310012,
      "loss": 0.7401,
      "step": 75
    },
    {
      "epoch": 0.14773776546629733,
      "grad_norm": 0.10815250873565674,
      "learning_rate": 0.00019869703769404828,
      "loss": 0.7429,
      "step": 80
    },
    {
      "epoch": 0.1569713758079409,
      "grad_norm": 0.10247638076543808,
      "learning_rate": 0.00019812553106273847,
      "loss": 0.7496,
      "step": 85
    },
    {
      "epoch": 0.16620498614958448,
      "grad_norm": 0.1005801111459732,
      "learning_rate": 0.00019745152736666302,
      "loss": 0.7354,
      "step": 90
    },
    {
      "epoch": 0.17543859649122806,
      "grad_norm": 0.11037640273571014,
      "learning_rate": 0.0001966757306366662,
      "loss": 0.765,
      "step": 95
    },
    {
      "epoch": 0.18467220683287167,
      "grad_norm": 0.1104218140244484,
      "learning_rate": 0.0001957989512315489,
      "loss": 0.7395,
      "step": 100
    },
    {
      "epoch": 0.19390581717451524,
      "grad_norm": 0.10845394432544708,
      "learning_rate": 0.00019482210499160765,
      "loss": 0.7378,
      "step": 105
    },
    {
      "epoch": 0.20313942751615882,
      "grad_norm": 0.103615403175354,
      "learning_rate": 0.0001937462122819935,
      "loss": 0.7344,
      "step": 110
    },
    {
      "epoch": 0.2123730378578024,
      "grad_norm": 0.10229409486055374,
      "learning_rate": 0.00019257239692688907,
      "loss": 0.7396,
      "step": 115
    },
    {
      "epoch": 0.22160664819944598,
      "grad_norm": 0.11397738009691238,
      "learning_rate": 0.00019130188503561741,
      "loss": 0.7587,
      "step": 120
    },
    {
      "epoch": 0.23084025854108955,
      "grad_norm": 0.1019892767071724,
      "learning_rate": 0.00018993600372190932,
      "loss": 0.7524,
      "step": 125
    },
    {
      "epoch": 0.24007386888273316,
      "grad_norm": 0.11047018319368362,
      "learning_rate": 0.00018847617971766577,
      "loss": 0.7447,
      "step": 130
    },
    {
      "epoch": 0.24930747922437674,
      "grad_norm": 0.10999737679958344,
      "learning_rate": 0.00018692393788266479,
      "loss": 0.7598,
      "step": 135
    },
    {
      "epoch": 0.2585410895660203,
      "grad_norm": 0.10835966467857361,
      "learning_rate": 0.0001852808996117683,
      "loss": 0.7502,
      "step": 140
    },
    {
      "epoch": 0.2677746999076639,
      "grad_norm": 0.1028084084391594,
      "learning_rate": 0.00018354878114129367,
      "loss": 0.7594,
      "step": 145
    },
    {
      "epoch": 0.2770083102493075,
      "grad_norm": 0.10460478812456131,
      "learning_rate": 0.00018172939175631808,
      "loss": 0.7532,
      "step": 150
    },
    {
      "epoch": 0.28624192059095105,
      "grad_norm": 0.10527301579713821,
      "learning_rate": 0.0001798246319007893,
      "loss": 0.7564,
      "step": 155
    },
    {
      "epoch": 0.29547553093259465,
      "grad_norm": 0.11887970566749573,
      "learning_rate": 0.00017783649119241602,
      "loss": 0.7409,
      "step": 160
    },
    {
      "epoch": 0.3047091412742382,
      "grad_norm": 0.10338354855775833,
      "learning_rate": 0.0001757670463444118,
      "loss": 0.7439,
      "step": 165
    },
    {
      "epoch": 0.3139427516158818,
      "grad_norm": 0.1198032796382904,
      "learning_rate": 0.00017361845899626355,
      "loss": 0.727,
      "step": 170
    },
    {
      "epoch": 0.3231763619575254,
      "grad_norm": 0.10385128110647202,
      "learning_rate": 0.00017139297345578994,
      "loss": 0.7352,
      "step": 175
    },
    {
      "epoch": 0.33240997229916897,
      "grad_norm": 0.11350072920322418,
      "learning_rate": 0.0001690929143548488,
      "loss": 0.7421,
      "step": 180
    },
    {
      "epoch": 0.34164358264081257,
      "grad_norm": 0.10703606903553009,
      "learning_rate": 0.00016672068422114196,
      "loss": 0.7344,
      "step": 185
    },
    {
      "epoch": 0.3508771929824561,
      "grad_norm": 0.10089576244354248,
      "learning_rate": 0.00016427876096865394,
      "loss": 0.743,
      "step": 190
    },
    {
      "epoch": 0.3601108033240997,
      "grad_norm": 0.10272319614887238,
      "learning_rate": 0.00016176969530934572,
      "loss": 0.7419,
      "step": 195
    },
    {
      "epoch": 0.36934441366574333,
      "grad_norm": 0.10321817547082901,
      "learning_rate": 0.0001591961080888076,
      "loss": 0.7502,
      "step": 200
    },
    {
      "epoch": 0.3785780240073869,
      "grad_norm": 0.10779888927936554,
      "learning_rate": 0.00015656068754865387,
      "loss": 0.7417,
      "step": 205
    },
    {
      "epoch": 0.3878116343490305,
      "grad_norm": 0.10841017961502075,
      "learning_rate": 0.0001538661865185188,
      "loss": 0.7356,
      "step": 210
    },
    {
      "epoch": 0.39704524469067404,
      "grad_norm": 0.1070384755730629,
      "learning_rate": 0.00015111541954058734,
      "loss": 0.7553,
      "step": 215
    },
    {
      "epoch": 0.40627885503231764,
      "grad_norm": 0.09864313900470734,
      "learning_rate": 0.00014831125992966385,
      "loss": 0.75,
      "step": 220
    },
    {
      "epoch": 0.4155124653739612,
      "grad_norm": 0.10148273408412933,
      "learning_rate": 0.00014545663677185006,
      "loss": 0.738,
      "step": 225
    },
    {
      "epoch": 0.4247460757156048,
      "grad_norm": 0.10425528883934021,
      "learning_rate": 0.00014255453186496673,
      "loss": 0.7385,
      "step": 230
    },
    {
      "epoch": 0.4339796860572484,
      "grad_norm": 0.09917706996202469,
      "learning_rate": 0.0001396079766039157,
      "loss": 0.7261,
      "step": 235
    },
    {
      "epoch": 0.44321329639889195,
      "grad_norm": 0.11001438647508621,
      "learning_rate": 0.0001366200488142348,
      "loss": 0.7349,
      "step": 240
    },
    {
      "epoch": 0.45244690674053556,
      "grad_norm": 0.10055994987487793,
      "learning_rate": 0.00013359386953715421,
      "loss": 0.7318,
      "step": 245
    },
    {
      "epoch": 0.4616805170821791,
      "grad_norm": 0.10037145018577576,
      "learning_rate": 0.00013053259976951133,
      "loss": 0.7312,
      "step": 250
    },
    {
      "epoch": 0.4709141274238227,
      "grad_norm": 0.11422228813171387,
      "learning_rate": 0.00012743943716193016,
      "loss": 0.7426,
      "step": 255
    },
    {
      "epoch": 0.4801477377654663,
      "grad_norm": 0.10170543938875198,
      "learning_rate": 0.00012431761267871417,
      "loss": 0.7212,
      "step": 260
    },
    {
      "epoch": 0.48938134810710987,
      "grad_norm": 0.1071636751294136,
      "learning_rate": 0.0001211703872229411,
      "loss": 0.7518,
      "step": 265
    },
    {
      "epoch": 0.4986149584487535,
      "grad_norm": 0.11060360074043274,
      "learning_rate": 0.00011800104823028515,
      "loss": 0.7526,
      "step": 270
    },
    {
      "epoch": 0.5078485687903971,
      "grad_norm": 0.10358710587024689,
      "learning_rate": 0.0001148129062351249,
      "loss": 0.7198,
      "step": 275
    },
    {
      "epoch": 0.5170821791320406,
      "grad_norm": 0.10385197401046753,
      "learning_rate": 0.00011160929141252303,
      "loss": 0.7221,
      "step": 280
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.10536545515060425,
      "learning_rate": 0.00010839355009969068,
      "loss": 0.7248,
      "step": 285
    },
    {
      "epoch": 0.5355493998153278,
      "grad_norm": 0.10879741609096527,
      "learning_rate": 0.00010516904130056946,
      "loss": 0.7398,
      "step": 290
    },
    {
      "epoch": 0.5447830101569714,
      "grad_norm": 0.10238393396139145,
      "learning_rate": 0.00010193913317718244,
      "loss": 0.7545,
      "step": 295
    },
    {
      "epoch": 0.554016620498615,
      "grad_norm": 0.11232389509677887,
      "learning_rate": 9.870719953141917e-05,
      "loss": 0.7324,
      "step": 300
    },
    {
      "epoch": 0.5632502308402585,
      "grad_norm": 0.09757301956415176,
      "learning_rate": 9.547661628092937e-05,
      "loss": 0.7303,
      "step": 305
    },
    {
      "epoch": 0.5724838411819021,
      "grad_norm": 0.10265171527862549,
      "learning_rate": 9.225075793280692e-05,
      "loss": 0.7366,
      "step": 310
    },
    {
      "epoch": 0.5817174515235457,
      "grad_norm": 0.10101115703582764,
      "learning_rate": 8.903299405874684e-05,
      "loss": 0.7156,
      "step": 315
    },
    {
      "epoch": 0.5909510618651893,
      "grad_norm": 0.10101660341024399,
      "learning_rate": 8.582668577535797e-05,
      "loss": 0.7329,
      "step": 320
    },
    {
      "epoch": 0.6001846722068329,
      "grad_norm": 0.10654629021883011,
      "learning_rate": 8.263518223330697e-05,
      "loss": 0.76,
      "step": 325
    },
    {
      "epoch": 0.6094182825484764,
      "grad_norm": 0.10585548728704453,
      "learning_rate": 7.94618171189618e-05,
      "loss": 0.706,
      "step": 330
    },
    {
      "epoch": 0.61865189289012,
      "grad_norm": 0.10175392776727676,
      "learning_rate": 7.630990517218808e-05,
      "loss": 0.7423,
      "step": 335
    },
    {
      "epoch": 0.6278855032317636,
      "grad_norm": 0.10401345789432526,
      "learning_rate": 7.318273872393625e-05,
      "loss": 0.7446,
      "step": 340
    },
    {
      "epoch": 0.6371191135734072,
      "grad_norm": 0.10385286062955856,
      "learning_rate": 7.008358425723585e-05,
      "loss": 0.757,
      "step": 345
    },
    {
      "epoch": 0.6463527239150508,
      "grad_norm": 0.10788547992706299,
      "learning_rate": 6.701567899518924e-05,
      "loss": 0.7305,
      "step": 350
    },
    {
      "epoch": 0.6555863342566943,
      "grad_norm": 0.10447971522808075,
      "learning_rate": 6.398222751952899e-05,
      "loss": 0.7178,
      "step": 355
    },
    {
      "epoch": 0.6648199445983379,
      "grad_norm": 0.10098852962255478,
      "learning_rate": 6.098639842327052e-05,
      "loss": 0.742,
      "step": 360
    },
    {
      "epoch": 0.6740535549399815,
      "grad_norm": 0.10096322745084763,
      "learning_rate": 5.80313210009571e-05,
      "loss": 0.7331,
      "step": 365
    },
    {
      "epoch": 0.6832871652816251,
      "grad_norm": 0.10325752198696136,
      "learning_rate": 5.5120081979953785e-05,
      "loss": 0.738,
      "step": 370
    },
    {
      "epoch": 0.6925207756232687,
      "grad_norm": 0.10698148608207703,
      "learning_rate": 5.22557222962051e-05,
      "loss": 0.7532,
      "step": 375
    },
    {
      "epoch": 0.7017543859649122,
      "grad_norm": 0.10292143374681473,
      "learning_rate": 4.9441233917824106e-05,
      "loss": 0.7339,
      "step": 380
    },
    {
      "epoch": 0.7109879963065558,
      "grad_norm": 0.10220400989055634,
      "learning_rate": 4.66795567198309e-05,
      "loss": 0.7346,
      "step": 385
    },
    {
      "epoch": 0.7202216066481995,
      "grad_norm": 0.10404420644044876,
      "learning_rate": 4.397357541330476e-05,
      "loss": 0.7575,
      "step": 390
    },
    {
      "epoch": 0.7294552169898431,
      "grad_norm": 0.10498429834842682,
      "learning_rate": 4.132611653215822e-05,
      "loss": 0.737,
      "step": 395
    },
    {
      "epoch": 0.7386888273314867,
      "grad_norm": 0.10577254742383957,
      "learning_rate": 3.873994548067972e-05,
      "loss": 0.7277,
      "step": 400
    },
    {
      "epoch": 0.7479224376731302,
      "grad_norm": 0.11322159320116043,
      "learning_rate": 3.621776364492939e-05,
      "loss": 0.7447,
      "step": 405
    },
    {
      "epoch": 0.7571560480147738,
      "grad_norm": 0.1056286096572876,
      "learning_rate": 3.376220557100523e-05,
      "loss": 0.7347,
      "step": 410
    },
    {
      "epoch": 0.7663896583564174,
      "grad_norm": 0.10042563825845718,
      "learning_rate": 3.137583621312665e-05,
      "loss": 0.7374,
      "step": 415
    },
    {
      "epoch": 0.775623268698061,
      "grad_norm": 0.10892172902822495,
      "learning_rate": 2.906114825441072e-05,
      "loss": 0.7288,
      "step": 420
    },
    {
      "epoch": 0.7848568790397045,
      "grad_norm": 0.1068541631102562,
      "learning_rate": 2.6820559503138797e-05,
      "loss": 0.7474,
      "step": 425
    },
    {
      "epoch": 0.7940904893813481,
      "grad_norm": 0.10999409854412079,
      "learning_rate": 2.465641036723393e-05,
      "loss": 0.7451,
      "step": 430
    },
    {
      "epoch": 0.8033240997229917,
      "grad_norm": 0.10453728586435318,
      "learning_rate": 2.2570961409586754e-05,
      "loss": 0.7427,
      "step": 435
    },
    {
      "epoch": 0.8125577100646353,
      "grad_norm": 0.10310923308134079,
      "learning_rate": 2.0566390986783646e-05,
      "loss": 0.7474,
      "step": 440
    },
    {
      "epoch": 0.8217913204062789,
      "grad_norm": 0.10431590676307678,
      "learning_rate": 1.864479297370325e-05,
      "loss": 0.7319,
      "step": 445
    },
    {
      "epoch": 0.8310249307479224,
      "grad_norm": 0.10921698808670044,
      "learning_rate": 1.6808174576358848e-05,
      "loss": 0.7478,
      "step": 450
    },
    {
      "epoch": 0.840258541089566,
      "grad_norm": 0.11102822422981262,
      "learning_rate": 1.505845423527027e-05,
      "loss": 0.7585,
      "step": 455
    },
    {
      "epoch": 0.8494921514312096,
      "grad_norm": 0.10043223947286606,
      "learning_rate": 1.339745962155613e-05,
      "loss": 0.7337,
      "step": 460
    },
    {
      "epoch": 0.8587257617728532,
      "grad_norm": 0.10496434569358826,
      "learning_rate": 1.18269257278392e-05,
      "loss": 0.7407,
      "step": 465
    },
    {
      "epoch": 0.8679593721144968,
      "grad_norm": 0.10377249866724014,
      "learning_rate": 1.0348493055959062e-05,
      "loss": 0.7198,
      "step": 470
    },
    {
      "epoch": 0.8771929824561403,
      "grad_norm": 0.10728956758975983,
      "learning_rate": 8.963705903385345e-06,
      "loss": 0.7176,
      "step": 475
    },
    {
      "epoch": 0.8864265927977839,
      "grad_norm": 0.10525339096784592,
      "learning_rate": 7.674010750120964e-06,
      "loss": 0.7326,
      "step": 480
    },
    {
      "epoch": 0.8956602031394275,
      "grad_norm": 0.10717398673295975,
      "learning_rate": 6.480754747781037e-06,
      "loss": 0.7298,
      "step": 485
    },
    {
      "epoch": 0.9048938134810711,
      "grad_norm": 0.10121981054544449,
      "learning_rate": 5.385184312424974e-06,
      "loss": 0.7187,
      "step": 490
    },
    {
      "epoch": 0.9141274238227147,
      "grad_norm": 0.10384524613618851,
      "learning_rate": 4.3884438226120424e-06,
      "loss": 0.7348,
      "step": 495
    },
    {
      "epoch": 0.9233610341643582,
      "grad_norm": 0.09785287827253342,
      "learning_rate": 3.4915744240403558e-06,
      "loss": 0.7339,
      "step": 500
    },
    {
      "epoch": 0.9325946445060018,
      "grad_norm": 0.1057659238576889,
      "learning_rate": 2.6955129420176196e-06,
      "loss": 0.7295,
      "step": 505
    },
    {
      "epoch": 0.9418282548476454,
      "grad_norm": 0.1022268533706665,
      "learning_rate": 2.0010909028998827e-06,
      "loss": 0.7254,
      "step": 510
    },
    {
      "epoch": 0.951061865189289,
      "grad_norm": 0.10650717467069626,
      "learning_rate": 1.409033665520354e-06,
      "loss": 0.7362,
      "step": 515
    },
    {
      "epoch": 0.9602954755309326,
      "grad_norm": 0.10881408303976059,
      "learning_rate": 9.199596635154683e-07,
      "loss": 0.733,
      "step": 520
    },
    {
      "epoch": 0.9695290858725761,
      "grad_norm": 0.10430116206407547,
      "learning_rate": 5.343797593398536e-07,
      "loss": 0.7487,
      "step": 525
    },
    {
      "epoch": 0.9787626962142197,
      "grad_norm": 0.1005534678697586,
      "learning_rate": 2.5269671064467313e-07,
      "loss": 0.7265,
      "step": 530
    },
    {
      "epoch": 0.9879963065558633,
      "grad_norm": 0.10172731429338455,
      "learning_rate": 7.520474957699586e-08,
      "loss": 0.7208,
      "step": 535
    },
    {
      "epoch": 0.997229916897507,
      "grad_norm": 0.10914121568202972,
      "learning_rate": 2.0892754394208346e-09,
      "loss": 0.7491,
      "step": 540
    },
    {
      "epoch": 0.9990766389658357,
      "eval_loss": 0.9509617686271667,
      "eval_runtime": 131.3926,
      "eval_samples_per_second": 8.79,
      "eval_steps_per_second": 0.556,
      "step": 541
    },
    {
      "epoch": 0.9990766389658357,
      "step": 541,
      "total_flos": 2.228078290818564e+18,
      "train_loss": 0.7416185885392363,
      "train_runtime": 26509.2456,
      "train_samples_per_second": 3.921,
      "train_steps_per_second": 0.02
    }
  ],
  "logging_steps": 5,
  "max_steps": 541,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 2.228078290818564e+18,
  "train_batch_size": 24,
  "trial_name": null,
  "trial_params": null
}