{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1142, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017513134851138354, "grad_norm": 1.6189321527508724, "learning_rate": 9.99998108060379e-06, "loss": 0.2516, "step": 1 }, { "epoch": 0.0035026269702276708, "grad_norm": 3.2205596273078165, "learning_rate": 9.999924322558328e-06, "loss": 0.4272, "step": 2 }, { "epoch": 0.005253940455341506, "grad_norm": 1.9512359773972658, "learning_rate": 9.99982972629315e-06, "loss": 0.3537, "step": 3 }, { "epoch": 0.0070052539404553416, "grad_norm": 2.303082988314436, "learning_rate": 9.99969729252414e-06, "loss": 0.4041, "step": 4 }, { "epoch": 0.008756567425569177, "grad_norm": 1.7602753076895619, "learning_rate": 9.999527022253521e-06, "loss": 0.282, "step": 5 }, { "epoch": 0.010507880910683012, "grad_norm": 1.472660482986906, "learning_rate": 9.999318916769858e-06, "loss": 0.3178, "step": 6 }, { "epoch": 0.012259194395796848, "grad_norm": 1.5120175005465801, "learning_rate": 9.999072977648042e-06, "loss": 0.2982, "step": 7 }, { "epoch": 0.014010507880910683, "grad_norm": 1.552294241363048, "learning_rate": 9.998789206749284e-06, "loss": 0.2406, "step": 8 }, { "epoch": 0.01576182136602452, "grad_norm": 1.4355383392258896, "learning_rate": 9.998467606221091e-06, "loss": 0.3063, "step": 9 }, { "epoch": 0.017513134851138354, "grad_norm": 1.1505639079486283, "learning_rate": 9.998108178497259e-06, "loss": 0.2002, "step": 10 }, { "epoch": 0.01926444833625219, "grad_norm": 1.478144193504059, "learning_rate": 9.99771092629785e-06, "loss": 0.249, "step": 11 }, { "epoch": 0.021015761821366025, "grad_norm": 1.2422830596341097, "learning_rate": 9.997275852629172e-06, "loss": 0.2275, "step": 12 }, { "epoch": 0.02276707530647986, "grad_norm": 1.176007388166311, "learning_rate": 9.99680296078376e-06, "loss": 0.1875, "step": 13 }, { "epoch": 0.024518388791593695, "grad_norm": 1.5860889058713947, "learning_rate": 9.996292254340342e-06, "loss": 0.2827, "step": 14 }, { "epoch": 0.02626970227670753, "grad_norm": 1.708745706115137, "learning_rate": 9.995743737163823e-06, "loss": 0.2909, "step": 15 }, { "epoch": 0.028021015761821366, "grad_norm": 1.8765102069789172, "learning_rate": 9.99515741340525e-06, "loss": 0.1961, "step": 16 }, { "epoch": 0.0297723292469352, "grad_norm": 1.4334195277716413, "learning_rate": 9.994533287501775e-06, "loss": 0.2652, "step": 17 }, { "epoch": 0.03152364273204904, "grad_norm": 1.140465695934814, "learning_rate": 9.993871364176637e-06, "loss": 0.2149, "step": 18 }, { "epoch": 0.03327495621716287, "grad_norm": 1.1401794918473909, "learning_rate": 9.993171648439109e-06, "loss": 0.1913, "step": 19 }, { "epoch": 0.03502626970227671, "grad_norm": 1.3063665613307132, "learning_rate": 9.992434145584471e-06, "loss": 0.2375, "step": 20 }, { "epoch": 0.03677758318739054, "grad_norm": 1.1007379852192687, "learning_rate": 9.991658861193966e-06, "loss": 0.2276, "step": 21 }, { "epoch": 0.03852889667250438, "grad_norm": 1.3635848002372042, "learning_rate": 9.99084580113476e-06, "loss": 0.2432, "step": 22 }, { "epoch": 0.040280210157618214, "grad_norm": 1.437328015329854, "learning_rate": 9.989994971559897e-06, "loss": 0.2815, "step": 23 }, { "epoch": 0.04203152364273205, "grad_norm": 1.2255764233144246, "learning_rate": 9.989106378908246e-06, "loss": 0.2028, "step": 24 }, { "epoch": 0.043782837127845885, "grad_norm": 0.934261492585796, "learning_rate": 9.988180029904463e-06, "loss": 0.1948, "step": 25 }, { "epoch": 0.04553415061295972, "grad_norm": 1.3236962758603368, "learning_rate": 9.987215931558935e-06, "loss": 0.2235, "step": 26 }, { "epoch": 0.047285464098073555, "grad_norm": 1.389095598869151, "learning_rate": 9.986214091167726e-06, "loss": 0.2542, "step": 27 }, { "epoch": 0.04903677758318739, "grad_norm": 1.2087489144763743, "learning_rate": 9.985174516312519e-06, "loss": 0.2045, "step": 28 }, { "epoch": 0.050788091068301226, "grad_norm": 1.0708229969119525, "learning_rate": 9.984097214860566e-06, "loss": 0.2343, "step": 29 }, { "epoch": 0.05253940455341506, "grad_norm": 1.0781105753548819, "learning_rate": 9.982982194964626e-06, "loss": 0.1994, "step": 30 }, { "epoch": 0.0542907180385289, "grad_norm": 1.2715702567474565, "learning_rate": 9.981829465062898e-06, "loss": 0.2553, "step": 31 }, { "epoch": 0.05604203152364273, "grad_norm": 0.9520552503997382, "learning_rate": 9.980639033878965e-06, "loss": 0.1844, "step": 32 }, { "epoch": 0.05779334500875657, "grad_norm": 1.2877099584096807, "learning_rate": 9.979410910421724e-06, "loss": 0.232, "step": 33 }, { "epoch": 0.0595446584938704, "grad_norm": 1.364716904790534, "learning_rate": 9.978145103985315e-06, "loss": 0.1983, "step": 34 }, { "epoch": 0.06129597197898424, "grad_norm": 1.0906698260991894, "learning_rate": 9.976841624149054e-06, "loss": 0.1903, "step": 35 }, { "epoch": 0.06304728546409807, "grad_norm": 1.09412669488779, "learning_rate": 9.975500480777364e-06, "loss": 0.1919, "step": 36 }, { "epoch": 0.0647985989492119, "grad_norm": 1.2010020335474978, "learning_rate": 9.974121684019695e-06, "loss": 0.1608, "step": 37 }, { "epoch": 0.06654991243432574, "grad_norm": 0.8480134607436435, "learning_rate": 9.972705244310445e-06, "loss": 0.1524, "step": 38 }, { "epoch": 0.06830122591943957, "grad_norm": 1.2269951958743044, "learning_rate": 9.97125117236889e-06, "loss": 0.2164, "step": 39 }, { "epoch": 0.07005253940455342, "grad_norm": 1.5214916016344373, "learning_rate": 9.969759479199093e-06, "loss": 0.2573, "step": 40 }, { "epoch": 0.07180385288966724, "grad_norm": 1.3616317930694422, "learning_rate": 9.968230176089828e-06, "loss": 0.2272, "step": 41 }, { "epoch": 0.07355516637478109, "grad_norm": 1.1932422020281335, "learning_rate": 9.966663274614495e-06, "loss": 0.1567, "step": 42 }, { "epoch": 0.07530647985989491, "grad_norm": 0.8716927387571308, "learning_rate": 9.965058786631022e-06, "loss": 0.2293, "step": 43 }, { "epoch": 0.07705779334500876, "grad_norm": 1.3932388141794017, "learning_rate": 9.963416724281787e-06, "loss": 0.2286, "step": 44 }, { "epoch": 0.07880910683012259, "grad_norm": 1.2309788186288007, "learning_rate": 9.961737099993523e-06, "loss": 0.2633, "step": 45 }, { "epoch": 0.08056042031523643, "grad_norm": 1.0195652741277668, "learning_rate": 9.960019926477218e-06, "loss": 0.1664, "step": 46 }, { "epoch": 0.08231173380035026, "grad_norm": 0.805855610816426, "learning_rate": 9.958265216728032e-06, "loss": 0.1538, "step": 47 }, { "epoch": 0.0840630472854641, "grad_norm": 0.9295478292913552, "learning_rate": 9.956472984025179e-06, "loss": 0.154, "step": 48 }, { "epoch": 0.08581436077057793, "grad_norm": 1.1182679627713559, "learning_rate": 9.954643241931845e-06, "loss": 0.1946, "step": 49 }, { "epoch": 0.08756567425569177, "grad_norm": 0.9976680326317856, "learning_rate": 9.952776004295077e-06, "loss": 0.1705, "step": 50 }, { "epoch": 0.0893169877408056, "grad_norm": 0.9178402665093803, "learning_rate": 9.95087128524568e-06, "loss": 0.1583, "step": 51 }, { "epoch": 0.09106830122591944, "grad_norm": 0.9494960096058251, "learning_rate": 9.948929099198104e-06, "loss": 0.1779, "step": 52 }, { "epoch": 0.09281961471103327, "grad_norm": 1.546996916824689, "learning_rate": 9.946949460850346e-06, "loss": 0.2514, "step": 53 }, { "epoch": 0.09457092819614711, "grad_norm": 0.9923825116181026, "learning_rate": 9.944932385183831e-06, "loss": 0.1989, "step": 54 }, { "epoch": 0.09632224168126094, "grad_norm": 1.0402809673797797, "learning_rate": 9.9428778874633e-06, "loss": 0.2107, "step": 55 }, { "epoch": 0.09807355516637478, "grad_norm": 1.0825018590876776, "learning_rate": 9.940785983236696e-06, "loss": 0.1676, "step": 56 }, { "epoch": 0.09982486865148861, "grad_norm": 1.379406625027056, "learning_rate": 9.938656688335045e-06, "loss": 0.1991, "step": 57 }, { "epoch": 0.10157618213660245, "grad_norm": 1.0500448305369041, "learning_rate": 9.936490018872336e-06, "loss": 0.1695, "step": 58 }, { "epoch": 0.10332749562171628, "grad_norm": 1.1035051286979576, "learning_rate": 9.934285991245399e-06, "loss": 0.1939, "step": 59 }, { "epoch": 0.10507880910683012, "grad_norm": 0.9812842055285581, "learning_rate": 9.932044622133785e-06, "loss": 0.1937, "step": 60 }, { "epoch": 0.10683012259194395, "grad_norm": 0.9527544704706223, "learning_rate": 9.929765928499635e-06, "loss": 0.171, "step": 61 }, { "epoch": 0.1085814360770578, "grad_norm": 1.302925637235369, "learning_rate": 9.927449927587549e-06, "loss": 0.278, "step": 62 }, { "epoch": 0.11033274956217162, "grad_norm": 0.917614726426997, "learning_rate": 9.925096636924465e-06, "loss": 0.2583, "step": 63 }, { "epoch": 0.11208406304728546, "grad_norm": 1.1260675596481124, "learning_rate": 9.922706074319517e-06, "loss": 0.1954, "step": 64 }, { "epoch": 0.1138353765323993, "grad_norm": 0.9735158179996255, "learning_rate": 9.920278257863904e-06, "loss": 0.1688, "step": 65 }, { "epoch": 0.11558669001751314, "grad_norm": 1.2032784643650782, "learning_rate": 9.917813205930758e-06, "loss": 0.2189, "step": 66 }, { "epoch": 0.11733800350262696, "grad_norm": 1.1710648620955906, "learning_rate": 9.915310937174995e-06, "loss": 0.2043, "step": 67 }, { "epoch": 0.1190893169877408, "grad_norm": 1.0468308734796317, "learning_rate": 9.91277147053318e-06, "loss": 0.1912, "step": 68 }, { "epoch": 0.12084063047285463, "grad_norm": 1.0767180014359876, "learning_rate": 9.910194825223384e-06, "loss": 0.1916, "step": 69 }, { "epoch": 0.12259194395796848, "grad_norm": 1.157850250531908, "learning_rate": 9.90758102074504e-06, "loss": 0.1692, "step": 70 }, { "epoch": 0.1243432574430823, "grad_norm": 1.1978899308041546, "learning_rate": 9.90493007687878e-06, "loss": 0.1904, "step": 71 }, { "epoch": 0.12609457092819615, "grad_norm": 1.3308506120726233, "learning_rate": 9.902242013686316e-06, "loss": 0.2215, "step": 72 }, { "epoch": 0.12784588441331, "grad_norm": 1.186862457320643, "learning_rate": 9.899516851510256e-06, "loss": 0.2086, "step": 73 }, { "epoch": 0.1295971978984238, "grad_norm": 0.9921699627050491, "learning_rate": 9.89675461097397e-06, "loss": 0.1942, "step": 74 }, { "epoch": 0.13134851138353765, "grad_norm": 1.1944882918690307, "learning_rate": 9.893955312981428e-06, "loss": 0.1996, "step": 75 }, { "epoch": 0.1330998248686515, "grad_norm": 1.3595222631043964, "learning_rate": 9.89111897871704e-06, "loss": 0.2493, "step": 76 }, { "epoch": 0.13485113835376533, "grad_norm": 1.311168457583758, "learning_rate": 9.888245629645502e-06, "loss": 0.3023, "step": 77 }, { "epoch": 0.13660245183887915, "grad_norm": 1.2264268547734622, "learning_rate": 9.885335287511621e-06, "loss": 0.2375, "step": 78 }, { "epoch": 0.138353765323993, "grad_norm": 1.0615586907581638, "learning_rate": 9.882387974340166e-06, "loss": 0.2196, "step": 79 }, { "epoch": 0.14010507880910683, "grad_norm": 1.2113698183185242, "learning_rate": 9.879403712435692e-06, "loss": 0.2094, "step": 80 }, { "epoch": 0.14185639229422067, "grad_norm": 1.6599951150659369, "learning_rate": 9.876382524382372e-06, "loss": 0.2155, "step": 81 }, { "epoch": 0.1436077057793345, "grad_norm": 1.1421227311156412, "learning_rate": 9.873324433043825e-06, "loss": 0.2082, "step": 82 }, { "epoch": 0.14535901926444833, "grad_norm": 1.3566945133013182, "learning_rate": 9.87022946156295e-06, "loss": 0.221, "step": 83 }, { "epoch": 0.14711033274956217, "grad_norm": 1.3051526600631753, "learning_rate": 9.867097633361745e-06, "loss": 0.1775, "step": 84 }, { "epoch": 0.14886164623467601, "grad_norm": 0.8957821215184415, "learning_rate": 9.863928972141127e-06, "loss": 0.1868, "step": 85 }, { "epoch": 0.15061295971978983, "grad_norm": 0.8391246575100022, "learning_rate": 9.860723501880758e-06, "loss": 0.1299, "step": 86 }, { "epoch": 0.15236427320490367, "grad_norm": 1.0429245203527981, "learning_rate": 9.857481246838867e-06, "loss": 0.1631, "step": 87 }, { "epoch": 0.15411558669001751, "grad_norm": 0.9670342133099205, "learning_rate": 9.854202231552052e-06, "loss": 0.186, "step": 88 }, { "epoch": 0.15586690017513136, "grad_norm": 1.0108020188058704, "learning_rate": 9.850886480835113e-06, "loss": 0.2, "step": 89 }, { "epoch": 0.15761821366024517, "grad_norm": 1.1527622631935206, "learning_rate": 9.847534019780848e-06, "loss": 0.2098, "step": 90 }, { "epoch": 0.159369527145359, "grad_norm": 1.044844838210246, "learning_rate": 9.844144873759874e-06, "loss": 0.1915, "step": 91 }, { "epoch": 0.16112084063047286, "grad_norm": 1.4573741072487383, "learning_rate": 9.840719068420427e-06, "loss": 0.2642, "step": 92 }, { "epoch": 0.1628721541155867, "grad_norm": 1.091447061849621, "learning_rate": 9.837256629688177e-06, "loss": 0.1863, "step": 93 }, { "epoch": 0.1646234676007005, "grad_norm": 1.252517589764872, "learning_rate": 9.833757583766025e-06, "loss": 0.2034, "step": 94 }, { "epoch": 0.16637478108581435, "grad_norm": 1.1107281185356483, "learning_rate": 9.830221957133903e-06, "loss": 0.1832, "step": 95 }, { "epoch": 0.1681260945709282, "grad_norm": 0.976784716781447, "learning_rate": 9.82664977654858e-06, "loss": 0.1326, "step": 96 }, { "epoch": 0.16987740805604204, "grad_norm": 1.19891512446026, "learning_rate": 9.823041069043457e-06, "loss": 0.191, "step": 97 }, { "epoch": 0.17162872154115585, "grad_norm": 1.1984485781115666, "learning_rate": 9.81939586192836e-06, "loss": 0.2456, "step": 98 }, { "epoch": 0.1733800350262697, "grad_norm": 1.978454735318979, "learning_rate": 9.815714182789335e-06, "loss": 0.3018, "step": 99 }, { "epoch": 0.17513134851138354, "grad_norm": 1.016573147746523, "learning_rate": 9.811996059488445e-06, "loss": 0.2071, "step": 100 }, { "epoch": 0.17688266199649738, "grad_norm": 1.1362427606530137, "learning_rate": 9.808241520163542e-06, "loss": 0.1999, "step": 101 }, { "epoch": 0.1786339754816112, "grad_norm": 1.230279571459468, "learning_rate": 9.804450593228079e-06, "loss": 0.1463, "step": 102 }, { "epoch": 0.18038528896672504, "grad_norm": 0.868837318204682, "learning_rate": 9.800623307370874e-06, "loss": 0.1376, "step": 103 }, { "epoch": 0.18213660245183888, "grad_norm": 0.972725973754719, "learning_rate": 9.7967596915559e-06, "loss": 0.1922, "step": 104 }, { "epoch": 0.18388791593695272, "grad_norm": 0.8173644404176668, "learning_rate": 9.792859775022069e-06, "loss": 0.1299, "step": 105 }, { "epoch": 0.18563922942206654, "grad_norm": 1.0879870179146585, "learning_rate": 9.788923587283008e-06, "loss": 0.2063, "step": 106 }, { "epoch": 0.18739054290718038, "grad_norm": 0.952468881220675, "learning_rate": 9.784951158126836e-06, "loss": 0.1661, "step": 107 }, { "epoch": 0.18914185639229422, "grad_norm": 1.0914420674457646, "learning_rate": 9.780942517615937e-06, "loss": 0.1622, "step": 108 }, { "epoch": 0.19089316987740806, "grad_norm": 1.4948764206782315, "learning_rate": 9.776897696086734e-06, "loss": 0.2464, "step": 109 }, { "epoch": 0.19264448336252188, "grad_norm": 1.1347837540296297, "learning_rate": 9.772816724149459e-06, "loss": 0.159, "step": 110 }, { "epoch": 0.19439579684763572, "grad_norm": 1.3238181457613138, "learning_rate": 9.768699632687922e-06, "loss": 0.2241, "step": 111 }, { "epoch": 0.19614711033274956, "grad_norm": 1.042910339183979, "learning_rate": 9.764546452859277e-06, "loss": 0.1967, "step": 112 }, { "epoch": 0.1978984238178634, "grad_norm": 1.1785097761069256, "learning_rate": 9.760357216093788e-06, "loss": 0.1961, "step": 113 }, { "epoch": 0.19964973730297722, "grad_norm": 1.1403104414823952, "learning_rate": 9.756131954094582e-06, "loss": 0.2042, "step": 114 }, { "epoch": 0.20140105078809106, "grad_norm": 1.4022044273328718, "learning_rate": 9.751870698837428e-06, "loss": 0.258, "step": 115 }, { "epoch": 0.2031523642732049, "grad_norm": 1.1023466504712938, "learning_rate": 9.747573482570471e-06, "loss": 0.1867, "step": 116 }, { "epoch": 0.20490367775831875, "grad_norm": 1.013716455534731, "learning_rate": 9.74324033781401e-06, "loss": 0.235, "step": 117 }, { "epoch": 0.20665499124343256, "grad_norm": 1.1766050779187667, "learning_rate": 9.738871297360233e-06, "loss": 0.2042, "step": 118 }, { "epoch": 0.2084063047285464, "grad_norm": 1.1003072535572958, "learning_rate": 9.734466394272988e-06, "loss": 0.1945, "step": 119 }, { "epoch": 0.21015761821366025, "grad_norm": 1.4838997871202069, "learning_rate": 9.730025661887517e-06, "loss": 0.1961, "step": 120 }, { "epoch": 0.2119089316987741, "grad_norm": 1.2740923322292086, "learning_rate": 9.725549133810205e-06, "loss": 0.1872, "step": 121 }, { "epoch": 0.2136602451838879, "grad_norm": 1.0849396699876135, "learning_rate": 9.721036843918343e-06, "loss": 0.1887, "step": 122 }, { "epoch": 0.21541155866900175, "grad_norm": 1.171434657200328, "learning_rate": 9.716488826359848e-06, "loss": 0.1719, "step": 123 }, { "epoch": 0.2171628721541156, "grad_norm": 0.8144293180316944, "learning_rate": 9.711905115553014e-06, "loss": 0.1217, "step": 124 }, { "epoch": 0.21891418563922943, "grad_norm": 0.8287059128714208, "learning_rate": 9.707285746186262e-06, "loss": 0.126, "step": 125 }, { "epoch": 0.22066549912434325, "grad_norm": 1.0397181324925568, "learning_rate": 9.702630753217865e-06, "loss": 0.2018, "step": 126 }, { "epoch": 0.2224168126094571, "grad_norm": 1.2093454647956197, "learning_rate": 9.697940171875682e-06, "loss": 0.1819, "step": 127 }, { "epoch": 0.22416812609457093, "grad_norm": 0.971954663646255, "learning_rate": 9.6932140376569e-06, "loss": 0.1459, "step": 128 }, { "epoch": 0.22591943957968477, "grad_norm": 0.9871770951957114, "learning_rate": 9.688452386327764e-06, "loss": 0.1597, "step": 129 }, { "epoch": 0.2276707530647986, "grad_norm": 1.1914487524596076, "learning_rate": 9.6836552539233e-06, "loss": 0.1894, "step": 130 }, { "epoch": 0.22942206654991243, "grad_norm": 1.3299445049184178, "learning_rate": 9.678822676747048e-06, "loss": 0.21, "step": 131 }, { "epoch": 0.23117338003502627, "grad_norm": 0.9944443296898016, "learning_rate": 9.673954691370782e-06, "loss": 0.1933, "step": 132 }, { "epoch": 0.2329246935201401, "grad_norm": 1.2879765089518327, "learning_rate": 9.669051334634243e-06, "loss": 0.2187, "step": 133 }, { "epoch": 0.23467600700525393, "grad_norm": 1.1579504079908498, "learning_rate": 9.66411264364485e-06, "loss": 0.2593, "step": 134 }, { "epoch": 0.23642732049036777, "grad_norm": 0.889979983641812, "learning_rate": 9.659138655777422e-06, "loss": 0.1599, "step": 135 }, { "epoch": 0.2381786339754816, "grad_norm": 1.1392333425773118, "learning_rate": 9.654129408673897e-06, "loss": 0.2232, "step": 136 }, { "epoch": 0.23992994746059546, "grad_norm": 0.927182815890979, "learning_rate": 9.649084940243052e-06, "loss": 0.1547, "step": 137 }, { "epoch": 0.24168126094570927, "grad_norm": 1.0230754932044808, "learning_rate": 9.644005288660204e-06, "loss": 0.1552, "step": 138 }, { "epoch": 0.2434325744308231, "grad_norm": 1.0117195711045635, "learning_rate": 9.638890492366924e-06, "loss": 0.1563, "step": 139 }, { "epoch": 0.24518388791593695, "grad_norm": 1.4066019466468673, "learning_rate": 9.633740590070763e-06, "loss": 0.3144, "step": 140 }, { "epoch": 0.2469352014010508, "grad_norm": 1.1513768439474406, "learning_rate": 9.628555620744932e-06, "loss": 0.1434, "step": 141 }, { "epoch": 0.2486865148861646, "grad_norm": 1.0700849134487496, "learning_rate": 9.62333562362803e-06, "loss": 0.2161, "step": 142 }, { "epoch": 0.2504378283712785, "grad_norm": 1.5059188434701325, "learning_rate": 9.618080638223732e-06, "loss": 0.2005, "step": 143 }, { "epoch": 0.2521891418563923, "grad_norm": 0.9665925005797491, "learning_rate": 9.612790704300501e-06, "loss": 0.1828, "step": 144 }, { "epoch": 0.2539404553415061, "grad_norm": 0.8229122412140533, "learning_rate": 9.607465861891276e-06, "loss": 0.1208, "step": 145 }, { "epoch": 0.25569176882662, "grad_norm": 1.0488909683370946, "learning_rate": 9.602106151293182e-06, "loss": 0.2138, "step": 146 }, { "epoch": 0.2574430823117338, "grad_norm": 1.2990902146116425, "learning_rate": 9.596711613067212e-06, "loss": 0.2018, "step": 147 }, { "epoch": 0.2591943957968476, "grad_norm": 0.8880761437071694, "learning_rate": 9.59128228803793e-06, "loss": 0.1679, "step": 148 }, { "epoch": 0.2609457092819615, "grad_norm": 0.8863876920056281, "learning_rate": 9.585818217293155e-06, "loss": 0.1488, "step": 149 }, { "epoch": 0.2626970227670753, "grad_norm": 1.252288840375211, "learning_rate": 9.580319442183654e-06, "loss": 0.2294, "step": 150 }, { "epoch": 0.26444833625218916, "grad_norm": 1.1271115490692163, "learning_rate": 9.574786004322831e-06, "loss": 0.2005, "step": 151 }, { "epoch": 0.266199649737303, "grad_norm": 0.8958292187859824, "learning_rate": 9.569217945586406e-06, "loss": 0.1515, "step": 152 }, { "epoch": 0.2679509632224168, "grad_norm": 1.1432682883821648, "learning_rate": 9.563615308112106e-06, "loss": 0.1727, "step": 153 }, { "epoch": 0.26970227670753066, "grad_norm": 0.8860825973578063, "learning_rate": 9.557978134299332e-06, "loss": 0.1561, "step": 154 }, { "epoch": 0.2714535901926445, "grad_norm": 0.9458112069640355, "learning_rate": 9.552306466808861e-06, "loss": 0.173, "step": 155 }, { "epoch": 0.2732049036777583, "grad_norm": 1.1059388701307742, "learning_rate": 9.546600348562499e-06, "loss": 0.1939, "step": 156 }, { "epoch": 0.27495621716287216, "grad_norm": 1.3621793677790732, "learning_rate": 9.54085982274277e-06, "loss": 0.2033, "step": 157 }, { "epoch": 0.276707530647986, "grad_norm": 1.049168730239092, "learning_rate": 9.535084932792588e-06, "loss": 0.2193, "step": 158 }, { "epoch": 0.27845884413309985, "grad_norm": 0.8987325354112385, "learning_rate": 9.529275722414926e-06, "loss": 0.149, "step": 159 }, { "epoch": 0.28021015761821366, "grad_norm": 1.1205614005555482, "learning_rate": 9.523432235572485e-06, "loss": 0.1715, "step": 160 }, { "epoch": 0.2819614711033275, "grad_norm": 1.1119065744262058, "learning_rate": 9.517554516487361e-06, "loss": 0.2139, "step": 161 }, { "epoch": 0.28371278458844135, "grad_norm": 1.2120128153983245, "learning_rate": 9.511642609640714e-06, "loss": 0.2055, "step": 162 }, { "epoch": 0.28546409807355516, "grad_norm": 1.0381552882652774, "learning_rate": 9.505696559772427e-06, "loss": 0.1521, "step": 163 }, { "epoch": 0.287215411558669, "grad_norm": 1.0616048587072129, "learning_rate": 9.499716411880767e-06, "loss": 0.1438, "step": 164 }, { "epoch": 0.28896672504378285, "grad_norm": 1.465227437163341, "learning_rate": 9.493702211222052e-06, "loss": 0.1939, "step": 165 }, { "epoch": 0.29071803852889666, "grad_norm": 1.2171201787031805, "learning_rate": 9.4876540033103e-06, "loss": 0.1542, "step": 166 }, { "epoch": 0.29246935201401053, "grad_norm": 0.9937562205583209, "learning_rate": 9.481571833916884e-06, "loss": 0.1822, "step": 167 }, { "epoch": 0.29422066549912435, "grad_norm": 1.139810203249971, "learning_rate": 9.475455749070198e-06, "loss": 0.2018, "step": 168 }, { "epoch": 0.29597197898423816, "grad_norm": 1.2507741492130755, "learning_rate": 9.469305795055292e-06, "loss": 0.2314, "step": 169 }, { "epoch": 0.29772329246935203, "grad_norm": 1.6644506023322219, "learning_rate": 9.463122018413533e-06, "loss": 0.1912, "step": 170 }, { "epoch": 0.29947460595446584, "grad_norm": 0.8574109893402403, "learning_rate": 9.45690446594225e-06, "loss": 0.1236, "step": 171 }, { "epoch": 0.30122591943957966, "grad_norm": 0.9262386331879862, "learning_rate": 9.450653184694378e-06, "loss": 0.2005, "step": 172 }, { "epoch": 0.30297723292469353, "grad_norm": 0.7994796498712383, "learning_rate": 9.444368221978102e-06, "loss": 0.1488, "step": 173 }, { "epoch": 0.30472854640980734, "grad_norm": 0.9739129710543231, "learning_rate": 9.438049625356506e-06, "loss": 0.2011, "step": 174 }, { "epoch": 0.3064798598949212, "grad_norm": 1.7022881013682905, "learning_rate": 9.431697442647199e-06, "loss": 0.286, "step": 175 }, { "epoch": 0.30823117338003503, "grad_norm": 0.9765370631504982, "learning_rate": 9.425311721921967e-06, "loss": 0.1455, "step": 176 }, { "epoch": 0.30998248686514884, "grad_norm": 1.1358714034120214, "learning_rate": 9.418892511506404e-06, "loss": 0.1664, "step": 177 }, { "epoch": 0.3117338003502627, "grad_norm": 1.0393192528807746, "learning_rate": 9.412439859979543e-06, "loss": 0.162, "step": 178 }, { "epoch": 0.3134851138353765, "grad_norm": 1.096045737373684, "learning_rate": 9.405953816173491e-06, "loss": 0.1431, "step": 179 }, { "epoch": 0.31523642732049034, "grad_norm": 0.7637588357172228, "learning_rate": 9.399434429173063e-06, "loss": 0.1522, "step": 180 }, { "epoch": 0.3169877408056042, "grad_norm": 1.1511627286236419, "learning_rate": 9.392881748315403e-06, "loss": 0.23, "step": 181 }, { "epoch": 0.318739054290718, "grad_norm": 1.296531852821544, "learning_rate": 9.38629582318962e-06, "loss": 0.1559, "step": 182 }, { "epoch": 0.3204903677758319, "grad_norm": 1.0245849374018412, "learning_rate": 9.379676703636402e-06, "loss": 0.2058, "step": 183 }, { "epoch": 0.3222416812609457, "grad_norm": 1.1325995765106882, "learning_rate": 9.373024439747648e-06, "loss": 0.1798, "step": 184 }, { "epoch": 0.3239929947460595, "grad_norm": 0.955274718027506, "learning_rate": 9.366339081866085e-06, "loss": 0.1318, "step": 185 }, { "epoch": 0.3257443082311734, "grad_norm": 1.0960146910727295, "learning_rate": 9.359620680584889e-06, "loss": 0.2125, "step": 186 }, { "epoch": 0.3274956217162872, "grad_norm": 0.9976010826462164, "learning_rate": 9.352869286747295e-06, "loss": 0.1744, "step": 187 }, { "epoch": 0.329246935201401, "grad_norm": 1.0033232475788938, "learning_rate": 9.34608495144622e-06, "loss": 0.1712, "step": 188 }, { "epoch": 0.3309982486865149, "grad_norm": 1.2252146471565943, "learning_rate": 9.33926772602388e-06, "loss": 0.1983, "step": 189 }, { "epoch": 0.3327495621716287, "grad_norm": 1.1760605780656463, "learning_rate": 9.332417662071386e-06, "loss": 0.1666, "step": 190 }, { "epoch": 0.3345008756567426, "grad_norm": 1.3456892597616057, "learning_rate": 9.32553481142837e-06, "loss": 0.1829, "step": 191 }, { "epoch": 0.3362521891418564, "grad_norm": 1.2660460244764533, "learning_rate": 9.31861922618258e-06, "loss": 0.2458, "step": 192 }, { "epoch": 0.3380035026269702, "grad_norm": 1.0534767359485842, "learning_rate": 9.311670958669502e-06, "loss": 0.1874, "step": 193 }, { "epoch": 0.3397548161120841, "grad_norm": 0.8458278928313304, "learning_rate": 9.304690061471937e-06, "loss": 0.1667, "step": 194 }, { "epoch": 0.3415061295971979, "grad_norm": 0.9903682726455234, "learning_rate": 9.297676587419638e-06, "loss": 0.2062, "step": 195 }, { "epoch": 0.3432574430823117, "grad_norm": 1.1314930714580482, "learning_rate": 9.290630589588876e-06, "loss": 0.1794, "step": 196 }, { "epoch": 0.3450087565674256, "grad_norm": 1.463207590968482, "learning_rate": 9.283552121302064e-06, "loss": 0.2053, "step": 197 }, { "epoch": 0.3467600700525394, "grad_norm": 1.0264128857179728, "learning_rate": 9.276441236127343e-06, "loss": 0.1463, "step": 198 }, { "epoch": 0.34851138353765326, "grad_norm": 1.1530074129076198, "learning_rate": 9.269297987878168e-06, "loss": 0.1918, "step": 199 }, { "epoch": 0.3502626970227671, "grad_norm": 0.9958162716820419, "learning_rate": 9.262122430612922e-06, "loss": 0.1474, "step": 200 }, { "epoch": 0.3520140105078809, "grad_norm": 1.0886838150262181, "learning_rate": 9.254914618634487e-06, "loss": 0.2175, "step": 201 }, { "epoch": 0.35376532399299476, "grad_norm": 1.1036143709701502, "learning_rate": 9.247674606489843e-06, "loss": 0.141, "step": 202 }, { "epoch": 0.3555166374781086, "grad_norm": 1.0910090805563288, "learning_rate": 9.240402448969655e-06, "loss": 0.1638, "step": 203 }, { "epoch": 0.3572679509632224, "grad_norm": 1.0496526329879359, "learning_rate": 9.233098201107854e-06, "loss": 0.1745, "step": 204 }, { "epoch": 0.35901926444833626, "grad_norm": 1.0259634242913862, "learning_rate": 9.225761918181224e-06, "loss": 0.1554, "step": 205 }, { "epoch": 0.3607705779334501, "grad_norm": 0.9318692908968823, "learning_rate": 9.218393655708981e-06, "loss": 0.1598, "step": 206 }, { "epoch": 0.36252189141856395, "grad_norm": 1.0675242528447997, "learning_rate": 9.210993469452357e-06, "loss": 0.2542, "step": 207 }, { "epoch": 0.36427320490367776, "grad_norm": 1.0231337468264674, "learning_rate": 9.203561415414174e-06, "loss": 0.1377, "step": 208 }, { "epoch": 0.3660245183887916, "grad_norm": 0.8172962309380866, "learning_rate": 9.196097549838422e-06, "loss": 0.1337, "step": 209 }, { "epoch": 0.36777583187390545, "grad_norm": 1.3437004411676805, "learning_rate": 9.188601929209836e-06, "loss": 0.2003, "step": 210 }, { "epoch": 0.36952714535901926, "grad_norm": 0.9859223201620534, "learning_rate": 9.181074610253457e-06, "loss": 0.1246, "step": 211 }, { "epoch": 0.3712784588441331, "grad_norm": 1.1821125005665047, "learning_rate": 9.173515649934222e-06, "loss": 0.1918, "step": 212 }, { "epoch": 0.37302977232924694, "grad_norm": 1.3158062961704573, "learning_rate": 9.165925105456513e-06, "loss": 0.2639, "step": 213 }, { "epoch": 0.37478108581436076, "grad_norm": 1.3388147444695622, "learning_rate": 9.15830303426374e-06, "loss": 0.2246, "step": 214 }, { "epoch": 0.37653239929947463, "grad_norm": 0.8947375382446747, "learning_rate": 9.150649494037895e-06, "loss": 0.1225, "step": 215 }, { "epoch": 0.37828371278458844, "grad_norm": 1.103951490161312, "learning_rate": 9.142964542699124e-06, "loss": 0.2026, "step": 216 }, { "epoch": 0.38003502626970226, "grad_norm": 1.233924530688903, "learning_rate": 9.135248238405282e-06, "loss": 0.2086, "step": 217 }, { "epoch": 0.38178633975481613, "grad_norm": 1.1719877250784767, "learning_rate": 9.127500639551497e-06, "loss": 0.1795, "step": 218 }, { "epoch": 0.38353765323992994, "grad_norm": 1.255963695109249, "learning_rate": 9.119721804769723e-06, "loss": 0.1862, "step": 219 }, { "epoch": 0.38528896672504376, "grad_norm": 1.1459951961009553, "learning_rate": 9.111911792928308e-06, "loss": 0.1966, "step": 220 }, { "epoch": 0.38704028021015763, "grad_norm": 1.1761484428825753, "learning_rate": 9.10407066313153e-06, "loss": 0.1868, "step": 221 }, { "epoch": 0.38879159369527144, "grad_norm": 1.118245234998823, "learning_rate": 9.096198474719169e-06, "loss": 0.189, "step": 222 }, { "epoch": 0.3905429071803853, "grad_norm": 1.1403077273341966, "learning_rate": 9.088295287266042e-06, "loss": 0.1752, "step": 223 }, { "epoch": 0.3922942206654991, "grad_norm": 1.1241899523938508, "learning_rate": 9.080361160581569e-06, "loss": 0.163, "step": 224 }, { "epoch": 0.39404553415061294, "grad_norm": 1.1244116274618798, "learning_rate": 9.0723961547093e-06, "loss": 0.1644, "step": 225 }, { "epoch": 0.3957968476357268, "grad_norm": 1.1452039316203921, "learning_rate": 9.064400329926476e-06, "loss": 0.1677, "step": 226 }, { "epoch": 0.3975481611208406, "grad_norm": 0.9169411289803756, "learning_rate": 9.05637374674357e-06, "loss": 0.1499, "step": 227 }, { "epoch": 0.39929947460595444, "grad_norm": 1.1320705178616042, "learning_rate": 9.048316465903823e-06, "loss": 0.1532, "step": 228 }, { "epoch": 0.4010507880910683, "grad_norm": 1.1148728898419542, "learning_rate": 9.04022854838279e-06, "loss": 0.1745, "step": 229 }, { "epoch": 0.4028021015761821, "grad_norm": 0.9892668796106193, "learning_rate": 9.032110055387881e-06, "loss": 0.192, "step": 230 }, { "epoch": 0.404553415061296, "grad_norm": 1.2041757680734857, "learning_rate": 9.023961048357885e-06, "loss": 0.2152, "step": 231 }, { "epoch": 0.4063047285464098, "grad_norm": 1.1128606421374905, "learning_rate": 9.015781588962524e-06, "loss": 0.273, "step": 232 }, { "epoch": 0.4080560420315236, "grad_norm": 1.0110467377360022, "learning_rate": 9.007571739101968e-06, "loss": 0.1625, "step": 233 }, { "epoch": 0.4098073555166375, "grad_norm": 2.1812012536277385, "learning_rate": 8.999331560906382e-06, "loss": 0.437, "step": 234 }, { "epoch": 0.4115586690017513, "grad_norm": 1.2026156956732903, "learning_rate": 8.991061116735437e-06, "loss": 0.1843, "step": 235 }, { "epoch": 0.4133099824868651, "grad_norm": 0.9335774970781566, "learning_rate": 8.982760469177865e-06, "loss": 0.1648, "step": 236 }, { "epoch": 0.415061295971979, "grad_norm": 0.9632646131071537, "learning_rate": 8.974429681050957e-06, "loss": 0.234, "step": 237 }, { "epoch": 0.4168126094570928, "grad_norm": 1.1628290027919184, "learning_rate": 8.966068815400108e-06, "loss": 0.243, "step": 238 }, { "epoch": 0.4185639229422067, "grad_norm": 0.9926771048841662, "learning_rate": 8.957677935498328e-06, "loss": 0.2181, "step": 239 }, { "epoch": 0.4203152364273205, "grad_norm": 1.2100489148340519, "learning_rate": 8.949257104845772e-06, "loss": 0.1799, "step": 240 }, { "epoch": 0.4220665499124343, "grad_norm": 0.9686376813257808, "learning_rate": 8.94080638716925e-06, "loss": 0.1412, "step": 241 }, { "epoch": 0.4238178633975482, "grad_norm": 1.0304072209793849, "learning_rate": 8.932325846421755e-06, "loss": 0.1608, "step": 242 }, { "epoch": 0.425569176882662, "grad_norm": 1.1068555551888453, "learning_rate": 8.923815546781968e-06, "loss": 0.1736, "step": 243 }, { "epoch": 0.4273204903677758, "grad_norm": 1.1406345353775849, "learning_rate": 8.915275552653786e-06, "loss": 0.1856, "step": 244 }, { "epoch": 0.4290718038528897, "grad_norm": 0.824906662216677, "learning_rate": 8.906705928665818e-06, "loss": 0.1241, "step": 245 }, { "epoch": 0.4308231173380035, "grad_norm": 0.9817782743515178, "learning_rate": 8.898106739670908e-06, "loss": 0.1391, "step": 246 }, { "epoch": 0.43257443082311736, "grad_norm": 1.0061540033779128, "learning_rate": 8.889478050745646e-06, "loss": 0.1487, "step": 247 }, { "epoch": 0.4343257443082312, "grad_norm": 1.319181249331738, "learning_rate": 8.88081992718986e-06, "loss": 0.2516, "step": 248 }, { "epoch": 0.436077057793345, "grad_norm": 1.0168251468902272, "learning_rate": 8.872132434526144e-06, "loss": 0.1601, "step": 249 }, { "epoch": 0.43782837127845886, "grad_norm": 0.903893586373318, "learning_rate": 8.863415638499341e-06, "loss": 0.1675, "step": 250 }, { "epoch": 0.4395796847635727, "grad_norm": 1.5466100134300305, "learning_rate": 8.854669605076058e-06, "loss": 0.2004, "step": 251 }, { "epoch": 0.4413309982486865, "grad_norm": 0.8990563567570562, "learning_rate": 8.845894400444163e-06, "loss": 0.1505, "step": 252 }, { "epoch": 0.44308231173380036, "grad_norm": 1.2358463965854751, "learning_rate": 8.837090091012289e-06, "loss": 0.188, "step": 253 }, { "epoch": 0.4448336252189142, "grad_norm": 0.8631731423566544, "learning_rate": 8.82825674340932e-06, "loss": 0.1085, "step": 254 }, { "epoch": 0.44658493870402804, "grad_norm": 0.851054134724772, "learning_rate": 8.819394424483898e-06, "loss": 0.1553, "step": 255 }, { "epoch": 0.44833625218914186, "grad_norm": 0.9747188919128223, "learning_rate": 8.810503201303914e-06, "loss": 0.1429, "step": 256 }, { "epoch": 0.4500875656742557, "grad_norm": 1.1817384021065651, "learning_rate": 8.801583141155993e-06, "loss": 0.1714, "step": 257 }, { "epoch": 0.45183887915936954, "grad_norm": 1.1497461858404032, "learning_rate": 8.792634311545002e-06, "loss": 0.1654, "step": 258 }, { "epoch": 0.45359019264448336, "grad_norm": 1.0653546734905857, "learning_rate": 8.78365678019352e-06, "loss": 0.1561, "step": 259 }, { "epoch": 0.4553415061295972, "grad_norm": 0.9098419050905309, "learning_rate": 8.774650615041332e-06, "loss": 0.1446, "step": 260 }, { "epoch": 0.45709281961471104, "grad_norm": 1.3562833349770014, "learning_rate": 8.765615884244925e-06, "loss": 0.1887, "step": 261 }, { "epoch": 0.45884413309982486, "grad_norm": 1.584928142177982, "learning_rate": 8.75655265617696e-06, "loss": 0.2126, "step": 262 }, { "epoch": 0.46059544658493873, "grad_norm": 1.25237114313114, "learning_rate": 8.747460999425755e-06, "loss": 0.1999, "step": 263 }, { "epoch": 0.46234676007005254, "grad_norm": 1.0137290535234078, "learning_rate": 8.738340982794775e-06, "loss": 0.1567, "step": 264 }, { "epoch": 0.46409807355516636, "grad_norm": 0.9990455785944722, "learning_rate": 8.729192675302104e-06, "loss": 0.1817, "step": 265 }, { "epoch": 0.4658493870402802, "grad_norm": 0.9098201735226615, "learning_rate": 8.720016146179921e-06, "loss": 0.181, "step": 266 }, { "epoch": 0.46760070052539404, "grad_norm": 0.8339374481166864, "learning_rate": 8.710811464873984e-06, "loss": 0.13, "step": 267 }, { "epoch": 0.46935201401050786, "grad_norm": 0.8247199593689756, "learning_rate": 8.701578701043097e-06, "loss": 0.116, "step": 268 }, { "epoch": 0.4711033274956217, "grad_norm": 1.2251926045855088, "learning_rate": 8.692317924558586e-06, "loss": 0.2267, "step": 269 }, { "epoch": 0.47285464098073554, "grad_norm": 1.323949068367743, "learning_rate": 8.683029205503772e-06, "loss": 0.2413, "step": 270 }, { "epoch": 0.4746059544658494, "grad_norm": 0.8694038746666335, "learning_rate": 8.67371261417344e-06, "loss": 0.1314, "step": 271 }, { "epoch": 0.4763572679509632, "grad_norm": 1.4214200720496517, "learning_rate": 8.664368221073297e-06, "loss": 0.2074, "step": 272 }, { "epoch": 0.47810858143607704, "grad_norm": 0.8164989303563484, "learning_rate": 8.65499609691946e-06, "loss": 0.1142, "step": 273 }, { "epoch": 0.4798598949211909, "grad_norm": 0.9950353997188668, "learning_rate": 8.645596312637895e-06, "loss": 0.2059, "step": 274 }, { "epoch": 0.4816112084063047, "grad_norm": 1.1741807018487027, "learning_rate": 8.636168939363905e-06, "loss": 0.1762, "step": 275 }, { "epoch": 0.48336252189141854, "grad_norm": 1.1639496884105414, "learning_rate": 8.62671404844157e-06, "loss": 0.1946, "step": 276 }, { "epoch": 0.4851138353765324, "grad_norm": 1.24592549170423, "learning_rate": 8.617231711423222e-06, "loss": 0.2481, "step": 277 }, { "epoch": 0.4868651488616462, "grad_norm": 1.6788228325648948, "learning_rate": 8.607722000068898e-06, "loss": 0.2368, "step": 278 }, { "epoch": 0.4886164623467601, "grad_norm": 0.9120565830493937, "learning_rate": 8.598184986345797e-06, "loss": 0.1528, "step": 279 }, { "epoch": 0.4903677758318739, "grad_norm": 1.0407774323531322, "learning_rate": 8.588620742427733e-06, "loss": 0.1481, "step": 280 }, { "epoch": 0.4921190893169877, "grad_norm": 0.8774818082676015, "learning_rate": 8.579029340694596e-06, "loss": 0.167, "step": 281 }, { "epoch": 0.4938704028021016, "grad_norm": 0.7302486166777703, "learning_rate": 8.569410853731799e-06, "loss": 0.1339, "step": 282 }, { "epoch": 0.4956217162872154, "grad_norm": 1.4977744846377492, "learning_rate": 8.559765354329728e-06, "loss": 0.2384, "step": 283 }, { "epoch": 0.4973730297723292, "grad_norm": 1.3333579162401141, "learning_rate": 8.55009291548319e-06, "loss": 0.2047, "step": 284 }, { "epoch": 0.4991243432574431, "grad_norm": 1.0371512553867137, "learning_rate": 8.540393610390871e-06, "loss": 0.2014, "step": 285 }, { "epoch": 0.500875656742557, "grad_norm": 5.507551141823795, "learning_rate": 8.530667512454765e-06, "loss": 0.2963, "step": 286 }, { "epoch": 0.5026269702276708, "grad_norm": 1.0814488316677124, "learning_rate": 8.520914695279632e-06, "loss": 0.1799, "step": 287 }, { "epoch": 0.5043782837127846, "grad_norm": 1.0877989498247866, "learning_rate": 8.511135232672442e-06, "loss": 0.2273, "step": 288 }, { "epoch": 0.5061295971978984, "grad_norm": 1.1546522986634278, "learning_rate": 8.501329198641802e-06, "loss": 0.1699, "step": 289 }, { "epoch": 0.5078809106830122, "grad_norm": 0.9955478078874541, "learning_rate": 8.491496667397409e-06, "loss": 0.1616, "step": 290 }, { "epoch": 0.5096322241681261, "grad_norm": 1.0749336259458449, "learning_rate": 8.481637713349486e-06, "loss": 0.2121, "step": 291 }, { "epoch": 0.51138353765324, "grad_norm": 1.2722939684715049, "learning_rate": 8.471752411108216e-06, "loss": 0.1619, "step": 292 }, { "epoch": 0.5131348511383538, "grad_norm": 1.31720477548223, "learning_rate": 8.461840835483179e-06, "loss": 0.2357, "step": 293 }, { "epoch": 0.5148861646234676, "grad_norm": 1.0512404750610869, "learning_rate": 8.451903061482787e-06, "loss": 0.2039, "step": 294 }, { "epoch": 0.5166374781085814, "grad_norm": 1.095035695042105, "learning_rate": 8.44193916431371e-06, "loss": 0.1386, "step": 295 }, { "epoch": 0.5183887915936952, "grad_norm": 1.2777241490218374, "learning_rate": 8.431949219380319e-06, "loss": 0.2109, "step": 296 }, { "epoch": 0.5201401050788091, "grad_norm": 1.0873909511926902, "learning_rate": 8.421933302284102e-06, "loss": 0.1584, "step": 297 }, { "epoch": 0.521891418563923, "grad_norm": 0.9773041895311751, "learning_rate": 8.411891488823102e-06, "loss": 0.1512, "step": 298 }, { "epoch": 0.5236427320490368, "grad_norm": 1.1584559931137173, "learning_rate": 8.401823854991338e-06, "loss": 0.1974, "step": 299 }, { "epoch": 0.5253940455341506, "grad_norm": 0.9617777428924835, "learning_rate": 8.391730476978229e-06, "loss": 0.1536, "step": 300 }, { "epoch": 0.5271453590192644, "grad_norm": 1.1847455857829146, "learning_rate": 8.381611431168027e-06, "loss": 0.1949, "step": 301 }, { "epoch": 0.5288966725043783, "grad_norm": 1.0558725080734255, "learning_rate": 8.37146679413922e-06, "loss": 0.1873, "step": 302 }, { "epoch": 0.5306479859894921, "grad_norm": 0.9708895644033165, "learning_rate": 8.361296642663977e-06, "loss": 0.1851, "step": 303 }, { "epoch": 0.532399299474606, "grad_norm": 1.0628725551453226, "learning_rate": 8.351101053707545e-06, "loss": 0.169, "step": 304 }, { "epoch": 0.5341506129597198, "grad_norm": 0.9196106999391176, "learning_rate": 8.34088010442768e-06, "loss": 0.159, "step": 305 }, { "epoch": 0.5359019264448336, "grad_norm": 1.0255932075584107, "learning_rate": 8.330633872174057e-06, "loss": 0.1701, "step": 306 }, { "epoch": 0.5376532399299475, "grad_norm": 1.0762624625214643, "learning_rate": 8.320362434487688e-06, "loss": 0.1644, "step": 307 }, { "epoch": 0.5394045534150613, "grad_norm": 0.7633404300233255, "learning_rate": 8.310065869100332e-06, "loss": 0.123, "step": 308 }, { "epoch": 0.5411558669001751, "grad_norm": 0.8699176281503174, "learning_rate": 8.299744253933908e-06, "loss": 0.1351, "step": 309 }, { "epoch": 0.542907180385289, "grad_norm": 1.2275049904001125, "learning_rate": 8.289397667099909e-06, "loss": 0.1675, "step": 310 }, { "epoch": 0.5446584938704028, "grad_norm": 1.2101635020956292, "learning_rate": 8.279026186898805e-06, "loss": 0.1738, "step": 311 }, { "epoch": 0.5464098073555166, "grad_norm": 0.9106260951604767, "learning_rate": 8.26862989181945e-06, "loss": 0.1338, "step": 312 }, { "epoch": 0.5481611208406305, "grad_norm": 1.1480139615147442, "learning_rate": 8.258208860538498e-06, "loss": 0.1958, "step": 313 }, { "epoch": 0.5499124343257443, "grad_norm": 1.543005528446332, "learning_rate": 8.247763171919795e-06, "loss": 0.1611, "step": 314 }, { "epoch": 0.5516637478108581, "grad_norm": 1.201047458056005, "learning_rate": 8.237292905013792e-06, "loss": 0.2219, "step": 315 }, { "epoch": 0.553415061295972, "grad_norm": 0.9962169779650556, "learning_rate": 8.226798139056938e-06, "loss": 0.1751, "step": 316 }, { "epoch": 0.5551663747810858, "grad_norm": 1.0391141659299086, "learning_rate": 8.216278953471088e-06, "loss": 0.1907, "step": 317 }, { "epoch": 0.5569176882661997, "grad_norm": 0.8878841272733307, "learning_rate": 8.205735427862897e-06, "loss": 0.1392, "step": 318 }, { "epoch": 0.5586690017513135, "grad_norm": 0.8815771076681298, "learning_rate": 8.19516764202322e-06, "loss": 0.2054, "step": 319 }, { "epoch": 0.5604203152364273, "grad_norm": 1.3561357204987174, "learning_rate": 8.184575675926511e-06, "loss": 0.162, "step": 320 }, { "epoch": 0.5621716287215411, "grad_norm": 1.1436127474045497, "learning_rate": 8.173959609730209e-06, "loss": 0.1553, "step": 321 }, { "epoch": 0.563922942206655, "grad_norm": 0.8696831532462574, "learning_rate": 8.16331952377414e-06, "loss": 0.1614, "step": 322 }, { "epoch": 0.5656742556917689, "grad_norm": 1.243551733126445, "learning_rate": 8.152655498579903e-06, "loss": 0.241, "step": 323 }, { "epoch": 0.5674255691768827, "grad_norm": 1.0649177969550163, "learning_rate": 8.141967614850265e-06, "loss": 0.1368, "step": 324 }, { "epoch": 0.5691768826619965, "grad_norm": 1.1762554635110793, "learning_rate": 8.131255953468553e-06, "loss": 0.2207, "step": 325 }, { "epoch": 0.5709281961471103, "grad_norm": 1.2132153902490277, "learning_rate": 8.120520595498029e-06, "loss": 0.1887, "step": 326 }, { "epoch": 0.5726795096322241, "grad_norm": 0.7399066544679602, "learning_rate": 8.10976162218129e-06, "loss": 0.1267, "step": 327 }, { "epoch": 0.574430823117338, "grad_norm": 0.9899573210939664, "learning_rate": 8.09897911493965e-06, "loss": 0.1685, "step": 328 }, { "epoch": 0.5761821366024519, "grad_norm": 1.3337051985288606, "learning_rate": 8.088173155372517e-06, "loss": 0.3282, "step": 329 }, { "epoch": 0.5779334500875657, "grad_norm": 1.005083830754978, "learning_rate": 8.077343825256783e-06, "loss": 0.2075, "step": 330 }, { "epoch": 0.5796847635726795, "grad_norm": 0.8802289529518278, "learning_rate": 8.066491206546206e-06, "loss": 0.1254, "step": 331 }, { "epoch": 0.5814360770577933, "grad_norm": 1.2400889336140388, "learning_rate": 8.055615381370781e-06, "loss": 0.185, "step": 332 }, { "epoch": 0.5831873905429071, "grad_norm": 0.8545769679622639, "learning_rate": 8.044716432036126e-06, "loss": 0.1352, "step": 333 }, { "epoch": 0.5849387040280211, "grad_norm": 1.2907830562873122, "learning_rate": 8.033794441022857e-06, "loss": 0.2143, "step": 334 }, { "epoch": 0.5866900175131349, "grad_norm": 1.2934031358525784, "learning_rate": 8.022849490985966e-06, "loss": 0.2373, "step": 335 }, { "epoch": 0.5884413309982487, "grad_norm": 1.0494212000075114, "learning_rate": 8.011881664754193e-06, "loss": 0.21, "step": 336 }, { "epoch": 0.5901926444833625, "grad_norm": 1.3713545247159777, "learning_rate": 8.000891045329394e-06, "loss": 0.1956, "step": 337 }, { "epoch": 0.5919439579684763, "grad_norm": 1.0044217822469004, "learning_rate": 7.989877715885925e-06, "loss": 0.1455, "step": 338 }, { "epoch": 0.5936952714535902, "grad_norm": 0.969157755217139, "learning_rate": 7.97884175977e-06, "loss": 0.1858, "step": 339 }, { "epoch": 0.5954465849387041, "grad_norm": 0.9043027733845134, "learning_rate": 7.967783260499073e-06, "loss": 0.1307, "step": 340 }, { "epoch": 0.5971978984238179, "grad_norm": 0.7943124415991508, "learning_rate": 7.956702301761195e-06, "loss": 0.1142, "step": 341 }, { "epoch": 0.5989492119089317, "grad_norm": 0.9004231757301827, "learning_rate": 7.945598967414386e-06, "loss": 0.1908, "step": 342 }, { "epoch": 0.6007005253940455, "grad_norm": 1.1105437759236843, "learning_rate": 7.934473341485998e-06, "loss": 0.2115, "step": 343 }, { "epoch": 0.6024518388791593, "grad_norm": 0.9301936250430058, "learning_rate": 7.92332550817208e-06, "loss": 0.1647, "step": 344 }, { "epoch": 0.6042031523642732, "grad_norm": 1.047767148226068, "learning_rate": 7.912155551836743e-06, "loss": 0.2355, "step": 345 }, { "epoch": 0.6059544658493871, "grad_norm": 0.9613875662244317, "learning_rate": 7.900963557011519e-06, "loss": 0.171, "step": 346 }, { "epoch": 0.6077057793345009, "grad_norm": 1.2164758034474923, "learning_rate": 7.88974960839472e-06, "loss": 0.155, "step": 347 }, { "epoch": 0.6094570928196147, "grad_norm": 1.056374791415413, "learning_rate": 7.878513790850805e-06, "loss": 0.1732, "step": 348 }, { "epoch": 0.6112084063047285, "grad_norm": 1.2496494090012524, "learning_rate": 7.867256189409724e-06, "loss": 0.1835, "step": 349 }, { "epoch": 0.6129597197898424, "grad_norm": 0.6631622478113742, "learning_rate": 7.855976889266288e-06, "loss": 0.118, "step": 350 }, { "epoch": 0.6147110332749562, "grad_norm": 0.834904245415055, "learning_rate": 7.844675975779514e-06, "loss": 0.1363, "step": 351 }, { "epoch": 0.6164623467600701, "grad_norm": 1.0623504091270612, "learning_rate": 7.833353534471988e-06, "loss": 0.1341, "step": 352 }, { "epoch": 0.6182136602451839, "grad_norm": 1.002067391498751, "learning_rate": 7.82200965102921e-06, "loss": 0.1388, "step": 353 }, { "epoch": 0.6199649737302977, "grad_norm": 1.2517283119177405, "learning_rate": 7.810644411298951e-06, "loss": 0.2027, "step": 354 }, { "epoch": 0.6217162872154116, "grad_norm": 0.9691375625821814, "learning_rate": 7.799257901290597e-06, "loss": 0.1918, "step": 355 }, { "epoch": 0.6234676007005254, "grad_norm": 1.257025304991851, "learning_rate": 7.787850207174512e-06, "loss": 0.1984, "step": 356 }, { "epoch": 0.6252189141856392, "grad_norm": 1.2989884834762626, "learning_rate": 7.776421415281368e-06, "loss": 0.2251, "step": 357 }, { "epoch": 0.626970227670753, "grad_norm": 0.9608457991453937, "learning_rate": 7.764971612101497e-06, "loss": 0.1598, "step": 358 }, { "epoch": 0.6287215411558669, "grad_norm": 1.055564436388202, "learning_rate": 7.753500884284251e-06, "loss": 0.1506, "step": 359 }, { "epoch": 0.6304728546409807, "grad_norm": 0.8536479488591544, "learning_rate": 7.742009318637323e-06, "loss": 0.1023, "step": 360 }, { "epoch": 0.6322241681260946, "grad_norm": 1.1845728329626128, "learning_rate": 7.730497002126105e-06, "loss": 0.1584, "step": 361 }, { "epoch": 0.6339754816112084, "grad_norm": 0.8928523838563781, "learning_rate": 7.718964021873035e-06, "loss": 0.1084, "step": 362 }, { "epoch": 0.6357267950963222, "grad_norm": 0.9835590981476029, "learning_rate": 7.707410465156916e-06, "loss": 0.1638, "step": 363 }, { "epoch": 0.637478108581436, "grad_norm": 0.9201578437929335, "learning_rate": 7.695836419412277e-06, "loss": 0.1239, "step": 364 }, { "epoch": 0.6392294220665499, "grad_norm": 0.728467005507231, "learning_rate": 7.684241972228702e-06, "loss": 0.1244, "step": 365 }, { "epoch": 0.6409807355516638, "grad_norm": 0.918030927901545, "learning_rate": 7.672627211350164e-06, "loss": 0.1466, "step": 366 }, { "epoch": 0.6427320490367776, "grad_norm": 1.2979769917204351, "learning_rate": 7.660992224674371e-06, "loss": 0.2255, "step": 367 }, { "epoch": 0.6444833625218914, "grad_norm": 0.771343645637159, "learning_rate": 7.649337100252091e-06, "loss": 0.1293, "step": 368 }, { "epoch": 0.6462346760070052, "grad_norm": 1.1242763102043734, "learning_rate": 7.637661926286493e-06, "loss": 0.2268, "step": 369 }, { "epoch": 0.647985989492119, "grad_norm": 1.024519873593415, "learning_rate": 7.625966791132469e-06, "loss": 0.1664, "step": 370 }, { "epoch": 0.649737302977233, "grad_norm": 0.9346453233331181, "learning_rate": 7.614251783295981e-06, "loss": 0.1493, "step": 371 }, { "epoch": 0.6514886164623468, "grad_norm": 1.032430577115044, "learning_rate": 7.602516991433376e-06, "loss": 0.168, "step": 372 }, { "epoch": 0.6532399299474606, "grad_norm": 1.2756452983998474, "learning_rate": 7.590762504350729e-06, "loss": 0.2004, "step": 373 }, { "epoch": 0.6549912434325744, "grad_norm": 1.0952734861105042, "learning_rate": 7.578988411003156e-06, "loss": 0.2038, "step": 374 }, { "epoch": 0.6567425569176882, "grad_norm": 0.9982958673516226, "learning_rate": 7.567194800494154e-06, "loss": 0.1722, "step": 375 }, { "epoch": 0.658493870402802, "grad_norm": 1.2747360607076716, "learning_rate": 7.555381762074918e-06, "loss": 0.1977, "step": 376 }, { "epoch": 0.660245183887916, "grad_norm": 1.48637993886011, "learning_rate": 7.543549385143667e-06, "loss": 0.2751, "step": 377 }, { "epoch": 0.6619964973730298, "grad_norm": 1.0127675566111833, "learning_rate": 7.531697759244978e-06, "loss": 0.1556, "step": 378 }, { "epoch": 0.6637478108581436, "grad_norm": 1.0835339789681326, "learning_rate": 7.519826974069088e-06, "loss": 0.1884, "step": 379 }, { "epoch": 0.6654991243432574, "grad_norm": 0.9837633602612155, "learning_rate": 7.507937119451234e-06, "loss": 0.1823, "step": 380 }, { "epoch": 0.6672504378283712, "grad_norm": 1.392684179495506, "learning_rate": 7.496028285370966e-06, "loss": 0.1912, "step": 381 }, { "epoch": 0.6690017513134852, "grad_norm": 1.0916850999558239, "learning_rate": 7.484100561951459e-06, "loss": 0.2066, "step": 382 }, { "epoch": 0.670753064798599, "grad_norm": 0.8680971975778679, "learning_rate": 7.472154039458851e-06, "loss": 0.1754, "step": 383 }, { "epoch": 0.6725043782837128, "grad_norm": 1.0435891343464805, "learning_rate": 7.460188808301532e-06, "loss": 0.1318, "step": 384 }, { "epoch": 0.6742556917688266, "grad_norm": 1.042398115064006, "learning_rate": 7.448204959029484e-06, "loss": 0.2022, "step": 385 }, { "epoch": 0.6760070052539404, "grad_norm": 0.9731481469841672, "learning_rate": 7.436202582333587e-06, "loss": 0.13, "step": 386 }, { "epoch": 0.6777583187390543, "grad_norm": 1.386034337965727, "learning_rate": 7.4241817690449235e-06, "loss": 0.2216, "step": 387 }, { "epoch": 0.6795096322241682, "grad_norm": 1.2608682646343083, "learning_rate": 7.41214261013411e-06, "loss": 0.1966, "step": 388 }, { "epoch": 0.681260945709282, "grad_norm": 1.1020654673893056, "learning_rate": 7.40008519671059e-06, "loss": 0.1762, "step": 389 }, { "epoch": 0.6830122591943958, "grad_norm": 1.0019379870434075, "learning_rate": 7.3880096200219585e-06, "loss": 0.1436, "step": 390 }, { "epoch": 0.6847635726795096, "grad_norm": 0.9922002827434665, "learning_rate": 7.375915971453264e-06, "loss": 0.159, "step": 391 }, { "epoch": 0.6865148861646234, "grad_norm": 1.2392846396033714, "learning_rate": 7.363804342526315e-06, "loss": 0.1972, "step": 392 }, { "epoch": 0.6882661996497373, "grad_norm": 0.8963807218487707, "learning_rate": 7.3516748248989955e-06, "loss": 0.1921, "step": 393 }, { "epoch": 0.6900175131348512, "grad_norm": 1.1130741139227058, "learning_rate": 7.339527510364567e-06, "loss": 0.1459, "step": 394 }, { "epoch": 0.691768826619965, "grad_norm": 0.8075594361508924, "learning_rate": 7.327362490850971e-06, "loss": 0.1379, "step": 395 }, { "epoch": 0.6935201401050788, "grad_norm": 1.0062548542126621, "learning_rate": 7.315179858420138e-06, "loss": 0.1758, "step": 396 }, { "epoch": 0.6952714535901926, "grad_norm": 0.9717163669556567, "learning_rate": 7.302979705267286e-06, "loss": 0.1538, "step": 397 }, { "epoch": 0.6970227670753065, "grad_norm": 1.0413579747819772, "learning_rate": 7.2907621237202275e-06, "loss": 0.1535, "step": 398 }, { "epoch": 0.6987740805604203, "grad_norm": 0.9776202668633781, "learning_rate": 7.2785272062386715e-06, "loss": 0.2237, "step": 399 }, { "epoch": 0.7005253940455342, "grad_norm": 0.9919980255734399, "learning_rate": 7.266275045413517e-06, "loss": 0.1759, "step": 400 }, { "epoch": 0.702276707530648, "grad_norm": 1.1333937112664494, "learning_rate": 7.254005733966159e-06, "loss": 0.26, "step": 401 }, { "epoch": 0.7040280210157618, "grad_norm": 0.9978679010728084, "learning_rate": 7.241719364747781e-06, "loss": 0.146, "step": 402 }, { "epoch": 0.7057793345008757, "grad_norm": 1.0310148677742963, "learning_rate": 7.229416030738661e-06, "loss": 0.1358, "step": 403 }, { "epoch": 0.7075306479859895, "grad_norm": 0.9210280630631851, "learning_rate": 7.217095825047455e-06, "loss": 0.1368, "step": 404 }, { "epoch": 0.7092819614711033, "grad_norm": 0.8458172902391173, "learning_rate": 7.204758840910509e-06, "loss": 0.1548, "step": 405 }, { "epoch": 0.7110332749562172, "grad_norm": 1.0007666191428417, "learning_rate": 7.192405171691138e-06, "loss": 0.1358, "step": 406 }, { "epoch": 0.712784588441331, "grad_norm": 1.1791422782221483, "learning_rate": 7.180034910878926e-06, "loss": 0.2027, "step": 407 }, { "epoch": 0.7145359019264448, "grad_norm": 1.3622226936019624, "learning_rate": 7.167648152089017e-06, "loss": 0.1446, "step": 408 }, { "epoch": 0.7162872154115587, "grad_norm": 1.260377975347423, "learning_rate": 7.155244989061415e-06, "loss": 0.1394, "step": 409 }, { "epoch": 0.7180385288966725, "grad_norm": 0.7647098429722478, "learning_rate": 7.142825515660259e-06, "loss": 0.1436, "step": 410 }, { "epoch": 0.7197898423817863, "grad_norm": 1.0806203976791042, "learning_rate": 7.130389825873125e-06, "loss": 0.1472, "step": 411 }, { "epoch": 0.7215411558669002, "grad_norm": 1.160580416868669, "learning_rate": 7.1179380138103105e-06, "loss": 0.1709, "step": 412 }, { "epoch": 0.723292469352014, "grad_norm": 1.2669865482460725, "learning_rate": 7.105470173704121e-06, "loss": 0.1692, "step": 413 }, { "epoch": 0.7250437828371279, "grad_norm": 0.8979827897826866, "learning_rate": 7.092986399908158e-06, "loss": 0.1538, "step": 414 }, { "epoch": 0.7267950963222417, "grad_norm": 1.1613876219039976, "learning_rate": 7.08048678689661e-06, "loss": 0.1627, "step": 415 }, { "epoch": 0.7285464098073555, "grad_norm": 0.7088309545286363, "learning_rate": 7.067971429263527e-06, "loss": 0.0933, "step": 416 }, { "epoch": 0.7302977232924693, "grad_norm": 0.9953579341793583, "learning_rate": 7.055440421722113e-06, "loss": 0.1546, "step": 417 }, { "epoch": 0.7320490367775832, "grad_norm": 0.893498402926208, "learning_rate": 7.042893859104008e-06, "loss": 0.1647, "step": 418 }, { "epoch": 0.7338003502626971, "grad_norm": 0.9606538728371828, "learning_rate": 7.030331836358565e-06, "loss": 0.1584, "step": 419 }, { "epoch": 0.7355516637478109, "grad_norm": 0.9087697947323283, "learning_rate": 7.017754448552141e-06, "loss": 0.1489, "step": 420 }, { "epoch": 0.7373029772329247, "grad_norm": 1.0980936242324373, "learning_rate": 7.0051617908673685e-06, "loss": 0.1909, "step": 421 }, { "epoch": 0.7390542907180385, "grad_norm": 1.0679212943502743, "learning_rate": 6.992553958602439e-06, "loss": 0.1933, "step": 422 }, { "epoch": 0.7408056042031523, "grad_norm": 1.0201469343275764, "learning_rate": 6.979931047170382e-06, "loss": 0.19, "step": 423 }, { "epoch": 0.7425569176882661, "grad_norm": 1.153726254550419, "learning_rate": 6.967293152098345e-06, "loss": 0.2164, "step": 424 }, { "epoch": 0.7443082311733801, "grad_norm": 0.9986655789682201, "learning_rate": 6.954640369026861e-06, "loss": 0.1681, "step": 425 }, { "epoch": 0.7460595446584939, "grad_norm": 0.7808715269507883, "learning_rate": 6.941972793709141e-06, "loss": 0.1427, "step": 426 }, { "epoch": 0.7478108581436077, "grad_norm": 0.9806851736858181, "learning_rate": 6.929290522010332e-06, "loss": 0.1558, "step": 427 }, { "epoch": 0.7495621716287215, "grad_norm": 1.1151483469529613, "learning_rate": 6.9165936499068065e-06, "loss": 0.1851, "step": 428 }, { "epoch": 0.7513134851138353, "grad_norm": 1.0226642337566068, "learning_rate": 6.903882273485425e-06, "loss": 0.1406, "step": 429 }, { "epoch": 0.7530647985989493, "grad_norm": 1.2154853234808298, "learning_rate": 6.891156488942812e-06, "loss": 0.2281, "step": 430 }, { "epoch": 0.7548161120840631, "grad_norm": 0.9124967173578454, "learning_rate": 6.878416392584635e-06, "loss": 0.1502, "step": 431 }, { "epoch": 0.7565674255691769, "grad_norm": 1.4028442036532214, "learning_rate": 6.865662080824864e-06, "loss": 0.2161, "step": 432 }, { "epoch": 0.7583187390542907, "grad_norm": 0.975649535916596, "learning_rate": 6.852893650185051e-06, "loss": 0.1655, "step": 433 }, { "epoch": 0.7600700525394045, "grad_norm": 1.1154571470663182, "learning_rate": 6.840111197293594e-06, "loss": 0.2768, "step": 434 }, { "epoch": 0.7618213660245184, "grad_norm": 1.2620416059011632, "learning_rate": 6.8273148188850105e-06, "loss": 0.2549, "step": 435 }, { "epoch": 0.7635726795096323, "grad_norm": 0.9710561696847746, "learning_rate": 6.814504611799202e-06, "loss": 0.1068, "step": 436 }, { "epoch": 0.7653239929947461, "grad_norm": 0.8701266642614385, "learning_rate": 6.801680672980722e-06, "loss": 0.1272, "step": 437 }, { "epoch": 0.7670753064798599, "grad_norm": 1.1987353253288306, "learning_rate": 6.788843099478041e-06, "loss": 0.2027, "step": 438 }, { "epoch": 0.7688266199649737, "grad_norm": 1.1590697629080275, "learning_rate": 6.775991988442816e-06, "loss": 0.2143, "step": 439 }, { "epoch": 0.7705779334500875, "grad_norm": 1.0846847919101628, "learning_rate": 6.763127437129151e-06, "loss": 0.1705, "step": 440 }, { "epoch": 0.7723292469352014, "grad_norm": 0.8580193306506241, "learning_rate": 6.750249542892863e-06, "loss": 0.175, "step": 441 }, { "epoch": 0.7740805604203153, "grad_norm": 1.5775617568317908, "learning_rate": 6.737358403190746e-06, "loss": 0.2081, "step": 442 }, { "epoch": 0.7758318739054291, "grad_norm": 0.9624673340593134, "learning_rate": 6.724454115579832e-06, "loss": 0.1098, "step": 443 }, { "epoch": 0.7775831873905429, "grad_norm": 1.0607534980440978, "learning_rate": 6.711536777716654e-06, "loss": 0.1803, "step": 444 }, { "epoch": 0.7793345008756567, "grad_norm": 1.1255975448505335, "learning_rate": 6.698606487356503e-06, "loss": 0.1872, "step": 445 }, { "epoch": 0.7810858143607706, "grad_norm": 0.8928366518808328, "learning_rate": 6.685663342352693e-06, "loss": 0.1466, "step": 446 }, { "epoch": 0.7828371278458844, "grad_norm": 1.0817838600370688, "learning_rate": 6.6727074406558224e-06, "loss": 0.1663, "step": 447 }, { "epoch": 0.7845884413309983, "grad_norm": 1.1115633011949881, "learning_rate": 6.659738880313025e-06, "loss": 0.1598, "step": 448 }, { "epoch": 0.7863397548161121, "grad_norm": 1.0231529929725236, "learning_rate": 6.646757759467234e-06, "loss": 0.155, "step": 449 }, { "epoch": 0.7880910683012259, "grad_norm": 1.2411539450968938, "learning_rate": 6.633764176356434e-06, "loss": 0.1915, "step": 450 }, { "epoch": 0.7898423817863398, "grad_norm": 1.0708800647966268, "learning_rate": 6.620758229312927e-06, "loss": 0.1385, "step": 451 }, { "epoch": 0.7915936952714536, "grad_norm": 1.0683250200759222, "learning_rate": 6.6077400167625784e-06, "loss": 0.1663, "step": 452 }, { "epoch": 0.7933450087565674, "grad_norm": 1.250391733683125, "learning_rate": 6.594709637224075e-06, "loss": 0.1996, "step": 453 }, { "epoch": 0.7950963222416813, "grad_norm": 1.1798794300569202, "learning_rate": 6.581667189308185e-06, "loss": 0.146, "step": 454 }, { "epoch": 0.7968476357267951, "grad_norm": 1.4398604943836388, "learning_rate": 6.5686127717170015e-06, "loss": 0.3225, "step": 455 }, { "epoch": 0.7985989492119089, "grad_norm": 0.9313043884553712, "learning_rate": 6.555546483243205e-06, "loss": 0.1389, "step": 456 }, { "epoch": 0.8003502626970228, "grad_norm": 1.1833167749805182, "learning_rate": 6.542468422769311e-06, "loss": 0.2136, "step": 457 }, { "epoch": 0.8021015761821366, "grad_norm": 0.8648334244503824, "learning_rate": 6.529378689266923e-06, "loss": 0.1878, "step": 458 }, { "epoch": 0.8038528896672504, "grad_norm": 0.9949458854079241, "learning_rate": 6.516277381795984e-06, "loss": 0.1497, "step": 459 }, { "epoch": 0.8056042031523643, "grad_norm": 1.1191303302941829, "learning_rate": 6.503164599504022e-06, "loss": 0.1566, "step": 460 }, { "epoch": 0.8073555166374781, "grad_norm": 1.1298654961289165, "learning_rate": 6.490040441625407e-06, "loss": 0.2017, "step": 461 }, { "epoch": 0.809106830122592, "grad_norm": 1.0947278566055827, "learning_rate": 6.476905007480597e-06, "loss": 0.1525, "step": 462 }, { "epoch": 0.8108581436077058, "grad_norm": 1.1323194203448552, "learning_rate": 6.4637583964753855e-06, "loss": 0.2241, "step": 463 }, { "epoch": 0.8126094570928196, "grad_norm": 1.0459409005734945, "learning_rate": 6.45060070810015e-06, "loss": 0.1296, "step": 464 }, { "epoch": 0.8143607705779334, "grad_norm": 1.1316198669484385, "learning_rate": 6.437432041929097e-06, "loss": 0.1621, "step": 465 }, { "epoch": 0.8161120840630472, "grad_norm": 0.9491729705590622, "learning_rate": 6.424252497619511e-06, "loss": 0.1547, "step": 466 }, { "epoch": 0.8178633975481612, "grad_norm": 1.0195018137674068, "learning_rate": 6.4110621749110014e-06, "loss": 0.1424, "step": 467 }, { "epoch": 0.819614711033275, "grad_norm": 1.2170884375327042, "learning_rate": 6.397861173624745e-06, "loss": 0.2018, "step": 468 }, { "epoch": 0.8213660245183888, "grad_norm": 1.5918684168233677, "learning_rate": 6.384649593662733e-06, "loss": 0.1759, "step": 469 }, { "epoch": 0.8231173380035026, "grad_norm": 1.1612346799052706, "learning_rate": 6.371427535007008e-06, "loss": 0.1944, "step": 470 }, { "epoch": 0.8248686514886164, "grad_norm": 1.01990361540596, "learning_rate": 6.358195097718917e-06, "loss": 0.2028, "step": 471 }, { "epoch": 0.8266199649737302, "grad_norm": 0.9233804242151922, "learning_rate": 6.344952381938354e-06, "loss": 0.1768, "step": 472 }, { "epoch": 0.8283712784588442, "grad_norm": 1.1968348714557342, "learning_rate": 6.331699487882987e-06, "loss": 0.1657, "step": 473 }, { "epoch": 0.830122591943958, "grad_norm": 1.4115850068938127, "learning_rate": 6.318436515847525e-06, "loss": 0.2006, "step": 474 }, { "epoch": 0.8318739054290718, "grad_norm": 0.9507911762810964, "learning_rate": 6.30516356620293e-06, "loss": 0.1495, "step": 475 }, { "epoch": 0.8336252189141856, "grad_norm": 1.1430728607474907, "learning_rate": 6.291880739395683e-06, "loss": 0.1722, "step": 476 }, { "epoch": 0.8353765323992994, "grad_norm": 1.343500691590876, "learning_rate": 6.278588135947011e-06, "loss": 0.2047, "step": 477 }, { "epoch": 0.8371278458844134, "grad_norm": 1.2595243490259276, "learning_rate": 6.265285856452123e-06, "loss": 0.2214, "step": 478 }, { "epoch": 0.8388791593695272, "grad_norm": 1.1669616297664058, "learning_rate": 6.251974001579459e-06, "loss": 0.1724, "step": 479 }, { "epoch": 0.840630472854641, "grad_norm": 1.0844521569107943, "learning_rate": 6.238652672069921e-06, "loss": 0.1961, "step": 480 }, { "epoch": 0.8423817863397548, "grad_norm": 0.8709327722146551, "learning_rate": 6.225321968736114e-06, "loss": 0.1118, "step": 481 }, { "epoch": 0.8441330998248686, "grad_norm": 1.0680058091791873, "learning_rate": 6.211981992461583e-06, "loss": 0.1579, "step": 482 }, { "epoch": 0.8458844133099825, "grad_norm": 1.0598391865687347, "learning_rate": 6.1986328442000425e-06, "loss": 0.2064, "step": 483 }, { "epoch": 0.8476357267950964, "grad_norm": 0.8766034007901955, "learning_rate": 6.185274624974627e-06, "loss": 0.1729, "step": 484 }, { "epoch": 0.8493870402802102, "grad_norm": 1.0572455452106584, "learning_rate": 6.1719074358771095e-06, "loss": 0.1506, "step": 485 }, { "epoch": 0.851138353765324, "grad_norm": 1.191222790209331, "learning_rate": 6.158531378067151e-06, "loss": 0.2654, "step": 486 }, { "epoch": 0.8528896672504378, "grad_norm": 1.4428690121550316, "learning_rate": 6.145146552771526e-06, "loss": 0.1961, "step": 487 }, { "epoch": 0.8546409807355516, "grad_norm": 1.4737339805651568, "learning_rate": 6.13175306128336e-06, "loss": 0.2061, "step": 488 }, { "epoch": 0.8563922942206655, "grad_norm": 0.8182297459502176, "learning_rate": 6.118351004961361e-06, "loss": 0.1507, "step": 489 }, { "epoch": 0.8581436077057794, "grad_norm": 0.8288780085158164, "learning_rate": 6.104940485229055e-06, "loss": 0.13, "step": 490 }, { "epoch": 0.8598949211908932, "grad_norm": 1.1292187916757546, "learning_rate": 6.091521603574016e-06, "loss": 0.1324, "step": 491 }, { "epoch": 0.861646234676007, "grad_norm": 1.0078907033539166, "learning_rate": 6.0780944615471016e-06, "loss": 0.1468, "step": 492 }, { "epoch": 0.8633975481611208, "grad_norm": 1.182790806253496, "learning_rate": 6.064659160761676e-06, "loss": 0.1444, "step": 493 }, { "epoch": 0.8651488616462347, "grad_norm": 1.2426011389183171, "learning_rate": 6.051215802892855e-06, "loss": 0.1864, "step": 494 }, { "epoch": 0.8669001751313485, "grad_norm": 0.9102336236721412, "learning_rate": 6.03776448967672e-06, "loss": 0.1639, "step": 495 }, { "epoch": 0.8686514886164624, "grad_norm": 0.8523410941646513, "learning_rate": 6.024305322909565e-06, "loss": 0.1501, "step": 496 }, { "epoch": 0.8704028021015762, "grad_norm": 0.8389091979384604, "learning_rate": 6.0108384044471115e-06, "loss": 0.1543, "step": 497 }, { "epoch": 0.87215411558669, "grad_norm": 0.9283364061815252, "learning_rate": 5.997363836203744e-06, "loss": 0.179, "step": 498 }, { "epoch": 0.8739054290718039, "grad_norm": 1.610431915265918, "learning_rate": 5.983881720151743e-06, "loss": 0.211, "step": 499 }, { "epoch": 0.8756567425569177, "grad_norm": 0.9193134177440736, "learning_rate": 5.970392158320505e-06, "loss": 0.171, "step": 500 }, { "epoch": 0.8756567425569177, "eval_loss": 0.18447460234165192, "eval_runtime": 1.9261, "eval_samples_per_second": 24.401, "eval_steps_per_second": 6.23, "step": 500 }, { "epoch": 0.8774080560420315, "grad_norm": 1.5249299930362135, "learning_rate": 5.956895252795778e-06, "loss": 0.2216, "step": 501 }, { "epoch": 0.8791593695271454, "grad_norm": 0.9421038979843945, "learning_rate": 5.943391105718883e-06, "loss": 0.172, "step": 502 }, { "epoch": 0.8809106830122592, "grad_norm": 1.0939585065581912, "learning_rate": 5.9298798192859434e-06, "loss": 0.1562, "step": 503 }, { "epoch": 0.882661996497373, "grad_norm": 1.3902162886272154, "learning_rate": 5.91636149574711e-06, "loss": 0.2069, "step": 504 }, { "epoch": 0.8844133099824869, "grad_norm": 0.9650701747967442, "learning_rate": 5.902836237405791e-06, "loss": 0.1716, "step": 505 }, { "epoch": 0.8861646234676007, "grad_norm": 1.083475666382535, "learning_rate": 5.889304146617878e-06, "loss": 0.1473, "step": 506 }, { "epoch": 0.8879159369527145, "grad_norm": 0.832482252729812, "learning_rate": 5.875765325790963e-06, "loss": 0.1003, "step": 507 }, { "epoch": 0.8896672504378283, "grad_norm": 1.096304873507722, "learning_rate": 5.8622198773835725e-06, "loss": 0.1779, "step": 508 }, { "epoch": 0.8914185639229422, "grad_norm": 0.6978941340044295, "learning_rate": 5.8486679039043895e-06, "loss": 0.0972, "step": 509 }, { "epoch": 0.8931698774080561, "grad_norm": 0.9622846696182903, "learning_rate": 5.835109507911475e-06, "loss": 0.1651, "step": 510 }, { "epoch": 0.8949211908931699, "grad_norm": 1.2738814460850292, "learning_rate": 5.821544792011495e-06, "loss": 0.1897, "step": 511 }, { "epoch": 0.8966725043782837, "grad_norm": 1.1576090771526317, "learning_rate": 5.807973858858947e-06, "loss": 0.1617, "step": 512 }, { "epoch": 0.8984238178633975, "grad_norm": 1.0945933042426927, "learning_rate": 5.794396811155372e-06, "loss": 0.2411, "step": 513 }, { "epoch": 0.9001751313485113, "grad_norm": 1.0071156666849748, "learning_rate": 5.780813751648589e-06, "loss": 0.145, "step": 514 }, { "epoch": 0.9019264448336253, "grad_norm": 1.0608443685972735, "learning_rate": 5.76722478313191e-06, "loss": 0.1602, "step": 515 }, { "epoch": 0.9036777583187391, "grad_norm": 0.8351677418619291, "learning_rate": 5.753630008443371e-06, "loss": 0.1649, "step": 516 }, { "epoch": 0.9054290718038529, "grad_norm": 1.0252931582190568, "learning_rate": 5.740029530464941e-06, "loss": 0.1208, "step": 517 }, { "epoch": 0.9071803852889667, "grad_norm": 1.0954354367499803, "learning_rate": 5.726423452121751e-06, "loss": 0.1731, "step": 518 }, { "epoch": 0.9089316987740805, "grad_norm": 0.9910985812759849, "learning_rate": 5.712811876381318e-06, "loss": 0.185, "step": 519 }, { "epoch": 0.9106830122591943, "grad_norm": 0.9431196895717147, "learning_rate": 5.699194906252761e-06, "loss": 0.203, "step": 520 }, { "epoch": 0.9124343257443083, "grad_norm": 0.7613500880928905, "learning_rate": 5.685572644786016e-06, "loss": 0.1142, "step": 521 }, { "epoch": 0.9141856392294221, "grad_norm": 0.9561738575523392, "learning_rate": 5.671945195071075e-06, "loss": 0.1474, "step": 522 }, { "epoch": 0.9159369527145359, "grad_norm": 1.0351076685823428, "learning_rate": 5.65831266023718e-06, "loss": 0.1973, "step": 523 }, { "epoch": 0.9176882661996497, "grad_norm": 1.2139682431989942, "learning_rate": 5.644675143452065e-06, "loss": 0.2251, "step": 524 }, { "epoch": 0.9194395796847635, "grad_norm": 1.1432766151339988, "learning_rate": 5.631032747921165e-06, "loss": 0.2148, "step": 525 }, { "epoch": 0.9211908931698775, "grad_norm": 0.8384192320224637, "learning_rate": 5.617385576886829e-06, "loss": 0.124, "step": 526 }, { "epoch": 0.9229422066549913, "grad_norm": 0.7477784220288675, "learning_rate": 5.603733733627559e-06, "loss": 0.1244, "step": 527 }, { "epoch": 0.9246935201401051, "grad_norm": 0.8136618533131901, "learning_rate": 5.5900773214572016e-06, "loss": 0.1652, "step": 528 }, { "epoch": 0.9264448336252189, "grad_norm": 0.9923459165132333, "learning_rate": 5.576416443724187e-06, "loss": 0.1719, "step": 529 }, { "epoch": 0.9281961471103327, "grad_norm": 1.2728780007916458, "learning_rate": 5.562751203810742e-06, "loss": 0.1844, "step": 530 }, { "epoch": 0.9299474605954466, "grad_norm": 0.9234166515823709, "learning_rate": 5.5490817051320964e-06, "loss": 0.1612, "step": 531 }, { "epoch": 0.9316987740805605, "grad_norm": 1.0361866511885336, "learning_rate": 5.535408051135721e-06, "loss": 0.1428, "step": 532 }, { "epoch": 0.9334500875656743, "grad_norm": 0.9374383845356417, "learning_rate": 5.5217303453005225e-06, "loss": 0.1787, "step": 533 }, { "epoch": 0.9352014010507881, "grad_norm": 1.0824506841717698, "learning_rate": 5.508048691136075e-06, "loss": 0.1846, "step": 534 }, { "epoch": 0.9369527145359019, "grad_norm": 1.2681101493420375, "learning_rate": 5.4943631921818365e-06, "loss": 0.1857, "step": 535 }, { "epoch": 0.9387040280210157, "grad_norm": 0.8870751536018538, "learning_rate": 5.480673952006355e-06, "loss": 0.1893, "step": 536 }, { "epoch": 0.9404553415061296, "grad_norm": 0.8698308010341032, "learning_rate": 5.466981074206493e-06, "loss": 0.1576, "step": 537 }, { "epoch": 0.9422066549912435, "grad_norm": 1.1493313359852635, "learning_rate": 5.453284662406646e-06, "loss": 0.1915, "step": 538 }, { "epoch": 0.9439579684763573, "grad_norm": 0.9597895796258253, "learning_rate": 5.439584820257949e-06, "loss": 0.1799, "step": 539 }, { "epoch": 0.9457092819614711, "grad_norm": 0.9439514805423415, "learning_rate": 5.425881651437499e-06, "loss": 0.1466, "step": 540 }, { "epoch": 0.9474605954465849, "grad_norm": 0.9974103027552716, "learning_rate": 5.412175259647567e-06, "loss": 0.1623, "step": 541 }, { "epoch": 0.9492119089316988, "grad_norm": 0.8671577413147288, "learning_rate": 5.398465748614815e-06, "loss": 0.1989, "step": 542 }, { "epoch": 0.9509632224168126, "grad_norm": 0.8733255413441741, "learning_rate": 5.384753222089515e-06, "loss": 0.1228, "step": 543 }, { "epoch": 0.9527145359019265, "grad_norm": 1.3146328909933664, "learning_rate": 5.371037783844752e-06, "loss": 0.2122, "step": 544 }, { "epoch": 0.9544658493870403, "grad_norm": 1.1770390858844189, "learning_rate": 5.357319537675655e-06, "loss": 0.2062, "step": 545 }, { "epoch": 0.9562171628721541, "grad_norm": 1.0962088909147447, "learning_rate": 5.3435985873985926e-06, "loss": 0.188, "step": 546 }, { "epoch": 0.957968476357268, "grad_norm": 1.2179282929078772, "learning_rate": 5.329875036850406e-06, "loss": 0.1765, "step": 547 }, { "epoch": 0.9597197898423818, "grad_norm": 1.069605096067716, "learning_rate": 5.31614898988761e-06, "loss": 0.1565, "step": 548 }, { "epoch": 0.9614711033274956, "grad_norm": 1.3962825793212799, "learning_rate": 5.302420550385612e-06, "loss": 0.2066, "step": 549 }, { "epoch": 0.9632224168126094, "grad_norm": 1.008584413162853, "learning_rate": 5.28868982223793e-06, "loss": 0.1698, "step": 550 }, { "epoch": 0.9649737302977233, "grad_norm": 0.998912078130381, "learning_rate": 5.274956909355395e-06, "loss": 0.179, "step": 551 }, { "epoch": 0.9667250437828371, "grad_norm": 0.9297277664472026, "learning_rate": 5.261221915665375e-06, "loss": 0.1184, "step": 552 }, { "epoch": 0.968476357267951, "grad_norm": 1.185642958138451, "learning_rate": 5.247484945110988e-06, "loss": 0.1932, "step": 553 }, { "epoch": 0.9702276707530648, "grad_norm": 1.088829726983837, "learning_rate": 5.233746101650308e-06, "loss": 0.2206, "step": 554 }, { "epoch": 0.9719789842381786, "grad_norm": 1.0987062412828756, "learning_rate": 5.220005489255583e-06, "loss": 0.1554, "step": 555 }, { "epoch": 0.9737302977232924, "grad_norm": 1.0323763199957168, "learning_rate": 5.20626321191245e-06, "loss": 0.1546, "step": 556 }, { "epoch": 0.9754816112084063, "grad_norm": 0.9972604317206961, "learning_rate": 5.192519373619145e-06, "loss": 0.1742, "step": 557 }, { "epoch": 0.9772329246935202, "grad_norm": 1.1779226698001648, "learning_rate": 5.1787740783857164e-06, "loss": 0.1969, "step": 558 }, { "epoch": 0.978984238178634, "grad_norm": 1.1733643102354534, "learning_rate": 5.165027430233239e-06, "loss": 0.138, "step": 559 }, { "epoch": 0.9807355516637478, "grad_norm": 1.0272420360834542, "learning_rate": 5.151279533193027e-06, "loss": 0.1705, "step": 560 }, { "epoch": 0.9824868651488616, "grad_norm": 0.904519502078042, "learning_rate": 5.137530491305844e-06, "loss": 0.1255, "step": 561 }, { "epoch": 0.9842381786339754, "grad_norm": 0.9629383999654443, "learning_rate": 5.123780408621118e-06, "loss": 0.1659, "step": 562 }, { "epoch": 0.9859894921190894, "grad_norm": 1.3124846848756935, "learning_rate": 5.110029389196155e-06, "loss": 0.1844, "step": 563 }, { "epoch": 0.9877408056042032, "grad_norm": 1.0676108653291219, "learning_rate": 5.096277537095348e-06, "loss": 0.2078, "step": 564 }, { "epoch": 0.989492119089317, "grad_norm": 0.9382113481780886, "learning_rate": 5.082524956389394e-06, "loss": 0.1409, "step": 565 }, { "epoch": 0.9912434325744308, "grad_norm": 0.9936394728663424, "learning_rate": 5.0687717511545e-06, "loss": 0.2063, "step": 566 }, { "epoch": 0.9929947460595446, "grad_norm": 1.031862421429508, "learning_rate": 5.055018025471602e-06, "loss": 0.1595, "step": 567 }, { "epoch": 0.9947460595446584, "grad_norm": 1.3217031605637353, "learning_rate": 5.0412638834255755e-06, "loss": 0.1276, "step": 568 }, { "epoch": 0.9964973730297724, "grad_norm": 1.1933242590091766, "learning_rate": 5.027509429104443e-06, "loss": 0.1923, "step": 569 }, { "epoch": 0.9982486865148862, "grad_norm": 0.8661254447213783, "learning_rate": 5.013754766598599e-06, "loss": 0.1724, "step": 570 }, { "epoch": 1.0, "grad_norm": 1.1986284951562434, "learning_rate": 5e-06, "loss": 0.1998, "step": 571 }, { "epoch": 1.001751313485114, "grad_norm": 0.7771391696807222, "learning_rate": 4.986245233401403e-06, "loss": 0.1012, "step": 572 }, { "epoch": 1.0035026269702276, "grad_norm": 0.6737848363676279, "learning_rate": 4.9724905708955575e-06, "loss": 0.0784, "step": 573 }, { "epoch": 1.0052539404553416, "grad_norm": 0.6454263064830977, "learning_rate": 4.958736116574426e-06, "loss": 0.0818, "step": 574 }, { "epoch": 1.0070052539404553, "grad_norm": 0.6725211290025607, "learning_rate": 4.9449819745284e-06, "loss": 0.0843, "step": 575 }, { "epoch": 1.0087565674255692, "grad_norm": 0.8469678132359898, "learning_rate": 4.931228248845502e-06, "loss": 0.1477, "step": 576 }, { "epoch": 1.010507880910683, "grad_norm": 0.7442182106279001, "learning_rate": 4.9174750436106076e-06, "loss": 0.0892, "step": 577 }, { "epoch": 1.0122591943957968, "grad_norm": 0.7468024313770749, "learning_rate": 4.903722462904653e-06, "loss": 0.0948, "step": 578 }, { "epoch": 1.0140105078809107, "grad_norm": 0.7166080541638878, "learning_rate": 4.889970610803845e-06, "loss": 0.0991, "step": 579 }, { "epoch": 1.0157618213660244, "grad_norm": 0.7762519026289294, "learning_rate": 4.8762195913788825e-06, "loss": 0.0774, "step": 580 }, { "epoch": 1.0175131348511384, "grad_norm": 0.808868055025971, "learning_rate": 4.862469508694157e-06, "loss": 0.1099, "step": 581 }, { "epoch": 1.0192644483362523, "grad_norm": 0.5819265746424057, "learning_rate": 4.8487204668069735e-06, "loss": 0.0695, "step": 582 }, { "epoch": 1.021015761821366, "grad_norm": 0.6156072907006408, "learning_rate": 4.834972569766762e-06, "loss": 0.0838, "step": 583 }, { "epoch": 1.02276707530648, "grad_norm": 0.7635944643048334, "learning_rate": 4.8212259216142835e-06, "loss": 0.104, "step": 584 }, { "epoch": 1.0245183887915936, "grad_norm": 0.7637571188691638, "learning_rate": 4.8074806263808565e-06, "loss": 0.0708, "step": 585 }, { "epoch": 1.0262697022767076, "grad_norm": 0.9551589547852135, "learning_rate": 4.7937367880875514e-06, "loss": 0.1281, "step": 586 }, { "epoch": 1.0280210157618215, "grad_norm": 0.7293559825967365, "learning_rate": 4.779994510744419e-06, "loss": 0.0912, "step": 587 }, { "epoch": 1.0297723292469352, "grad_norm": 0.8566789761394177, "learning_rate": 4.766253898349694e-06, "loss": 0.1606, "step": 588 }, { "epoch": 1.031523642732049, "grad_norm": 0.6627533346656157, "learning_rate": 4.752515054889012e-06, "loss": 0.0718, "step": 589 }, { "epoch": 1.0332749562171628, "grad_norm": 0.5815945433400204, "learning_rate": 4.738778084334625e-06, "loss": 0.0695, "step": 590 }, { "epoch": 1.0350262697022767, "grad_norm": 0.760838011156959, "learning_rate": 4.725043090644606e-06, "loss": 0.0884, "step": 591 }, { "epoch": 1.0367775831873904, "grad_norm": 0.7483668041512324, "learning_rate": 4.711310177762072e-06, "loss": 0.0669, "step": 592 }, { "epoch": 1.0385288966725044, "grad_norm": 0.8799384240976829, "learning_rate": 4.697579449614389e-06, "loss": 0.0998, "step": 593 }, { "epoch": 1.0402802101576183, "grad_norm": 0.8082415948151013, "learning_rate": 4.683851010112391e-06, "loss": 0.0985, "step": 594 }, { "epoch": 1.042031523642732, "grad_norm": 0.6580267314761, "learning_rate": 4.670124963149596e-06, "loss": 0.1115, "step": 595 }, { "epoch": 1.043782837127846, "grad_norm": 0.7118138502690213, "learning_rate": 4.656401412601408e-06, "loss": 0.0662, "step": 596 }, { "epoch": 1.0455341506129596, "grad_norm": 0.7285566494410792, "learning_rate": 4.642680462324348e-06, "loss": 0.1036, "step": 597 }, { "epoch": 1.0472854640980735, "grad_norm": 0.6952238511426204, "learning_rate": 4.628962216155249e-06, "loss": 0.0882, "step": 598 }, { "epoch": 1.0490367775831875, "grad_norm": 0.7543264293172796, "learning_rate": 4.615246777910485e-06, "loss": 0.087, "step": 599 }, { "epoch": 1.0507880910683012, "grad_norm": 0.6191909832818346, "learning_rate": 4.6015342513851854e-06, "loss": 0.0745, "step": 600 }, { "epoch": 1.052539404553415, "grad_norm": 0.922433858210772, "learning_rate": 4.587824740352435e-06, "loss": 0.1058, "step": 601 }, { "epoch": 1.0542907180385288, "grad_norm": 0.6821453050325335, "learning_rate": 4.5741183485625044e-06, "loss": 0.0771, "step": 602 }, { "epoch": 1.0560420315236427, "grad_norm": 0.9489221869695271, "learning_rate": 4.560415179742052e-06, "loss": 0.0955, "step": 603 }, { "epoch": 1.0577933450087567, "grad_norm": 0.7026344715382692, "learning_rate": 4.546715337593354e-06, "loss": 0.0819, "step": 604 }, { "epoch": 1.0595446584938704, "grad_norm": 0.6671067987720858, "learning_rate": 4.533018925793508e-06, "loss": 0.0727, "step": 605 }, { "epoch": 1.0612959719789843, "grad_norm": 0.8642821874175421, "learning_rate": 4.519326047993647e-06, "loss": 0.0937, "step": 606 }, { "epoch": 1.063047285464098, "grad_norm": 0.8793543522695341, "learning_rate": 4.505636807818166e-06, "loss": 0.1301, "step": 607 }, { "epoch": 1.064798598949212, "grad_norm": 0.8237519754604328, "learning_rate": 4.491951308863926e-06, "loss": 0.0825, "step": 608 }, { "epoch": 1.0665499124343258, "grad_norm": 0.8459623510016205, "learning_rate": 4.478269654699478e-06, "loss": 0.0821, "step": 609 }, { "epoch": 1.0683012259194395, "grad_norm": 0.8255877830028168, "learning_rate": 4.464591948864281e-06, "loss": 0.0842, "step": 610 }, { "epoch": 1.0700525394045535, "grad_norm": 0.8718483520086847, "learning_rate": 4.4509182948679035e-06, "loss": 0.0821, "step": 611 }, { "epoch": 1.0718038528896672, "grad_norm": 0.9897123154664441, "learning_rate": 4.43724879618926e-06, "loss": 0.1109, "step": 612 }, { "epoch": 1.073555166374781, "grad_norm": 0.8636401435184293, "learning_rate": 4.423583556275814e-06, "loss": 0.0904, "step": 613 }, { "epoch": 1.0753064798598948, "grad_norm": 0.8916326658281433, "learning_rate": 4.409922678542799e-06, "loss": 0.0695, "step": 614 }, { "epoch": 1.0770577933450087, "grad_norm": 0.6527711082304235, "learning_rate": 4.396266266372443e-06, "loss": 0.0512, "step": 615 }, { "epoch": 1.0788091068301227, "grad_norm": 0.7314392682365592, "learning_rate": 4.382614423113171e-06, "loss": 0.0772, "step": 616 }, { "epoch": 1.0805604203152364, "grad_norm": 0.7896716564759495, "learning_rate": 4.368967252078838e-06, "loss": 0.0837, "step": 617 }, { "epoch": 1.0823117338003503, "grad_norm": 0.8904478092266286, "learning_rate": 4.355324856547936e-06, "loss": 0.0984, "step": 618 }, { "epoch": 1.084063047285464, "grad_norm": 0.7997582578314417, "learning_rate": 4.341687339762822e-06, "loss": 0.0719, "step": 619 }, { "epoch": 1.085814360770578, "grad_norm": 0.9328053345880932, "learning_rate": 4.3280548049289275e-06, "loss": 0.1102, "step": 620 }, { "epoch": 1.0875656742556918, "grad_norm": 0.7402218059098891, "learning_rate": 4.314427355213984e-06, "loss": 0.0882, "step": 621 }, { "epoch": 1.0893169877408055, "grad_norm": 1.0242119598597839, "learning_rate": 4.3008050937472424e-06, "loss": 0.0971, "step": 622 }, { "epoch": 1.0910683012259195, "grad_norm": 1.078920725370126, "learning_rate": 4.2871881236186835e-06, "loss": 0.1252, "step": 623 }, { "epoch": 1.0928196147110332, "grad_norm": 0.8096017533492185, "learning_rate": 4.273576547878252e-06, "loss": 0.0717, "step": 624 }, { "epoch": 1.094570928196147, "grad_norm": 0.7940757501651525, "learning_rate": 4.259970469535061e-06, "loss": 0.1207, "step": 625 }, { "epoch": 1.096322241681261, "grad_norm": 0.6730235221485272, "learning_rate": 4.24636999155663e-06, "loss": 0.0642, "step": 626 }, { "epoch": 1.0980735551663747, "grad_norm": 0.8410423617949386, "learning_rate": 4.2327752168680904e-06, "loss": 0.1123, "step": 627 }, { "epoch": 1.0998248686514887, "grad_norm": 0.7165164918449669, "learning_rate": 4.219186248351413e-06, "loss": 0.1079, "step": 628 }, { "epoch": 1.1015761821366024, "grad_norm": 0.9046144772499766, "learning_rate": 4.20560318884463e-06, "loss": 0.0889, "step": 629 }, { "epoch": 1.1033274956217163, "grad_norm": 0.6944164438470994, "learning_rate": 4.192026141141054e-06, "loss": 0.0726, "step": 630 }, { "epoch": 1.1050788091068302, "grad_norm": 0.6933343999073917, "learning_rate": 4.178455207988504e-06, "loss": 0.103, "step": 631 }, { "epoch": 1.106830122591944, "grad_norm": 0.9114549106270846, "learning_rate": 4.164890492088527e-06, "loss": 0.0816, "step": 632 }, { "epoch": 1.1085814360770578, "grad_norm": 0.8832236692461997, "learning_rate": 4.151332096095613e-06, "loss": 0.0716, "step": 633 }, { "epoch": 1.1103327495621715, "grad_norm": 0.6560477009246709, "learning_rate": 4.13778012261643e-06, "loss": 0.0495, "step": 634 }, { "epoch": 1.1120840630472855, "grad_norm": 0.7697309148882717, "learning_rate": 4.124234674209038e-06, "loss": 0.0784, "step": 635 }, { "epoch": 1.1138353765323994, "grad_norm": 0.8584207785015194, "learning_rate": 4.110695853382123e-06, "loss": 0.0838, "step": 636 }, { "epoch": 1.115586690017513, "grad_norm": 0.8178593101603066, "learning_rate": 4.09716376259421e-06, "loss": 0.0885, "step": 637 }, { "epoch": 1.117338003502627, "grad_norm": 0.7493368271272891, "learning_rate": 4.083638504252891e-06, "loss": 0.0755, "step": 638 }, { "epoch": 1.1190893169877407, "grad_norm": 0.8921855827679555, "learning_rate": 4.070120180714059e-06, "loss": 0.1016, "step": 639 }, { "epoch": 1.1208406304728546, "grad_norm": 0.6566522468773398, "learning_rate": 4.056608894281118e-06, "loss": 0.0621, "step": 640 }, { "epoch": 1.1225919439579686, "grad_norm": 1.0368825738027343, "learning_rate": 4.043104747204222e-06, "loss": 0.0964, "step": 641 }, { "epoch": 1.1243432574430823, "grad_norm": 0.7545101954718236, "learning_rate": 4.029607841679496e-06, "loss": 0.0743, "step": 642 }, { "epoch": 1.1260945709281962, "grad_norm": 0.7795780796724158, "learning_rate": 4.016118279848259e-06, "loss": 0.0818, "step": 643 }, { "epoch": 1.12784588441331, "grad_norm": 0.8102419707778211, "learning_rate": 4.002636163796259e-06, "loss": 0.0601, "step": 644 }, { "epoch": 1.1295971978984238, "grad_norm": 0.8912319933395433, "learning_rate": 3.989161595552891e-06, "loss": 0.1056, "step": 645 }, { "epoch": 1.1313485113835378, "grad_norm": 0.9227762274712196, "learning_rate": 3.975694677090436e-06, "loss": 0.0946, "step": 646 }, { "epoch": 1.1330998248686515, "grad_norm": 1.0277331150492526, "learning_rate": 3.9622355103232805e-06, "loss": 0.0943, "step": 647 }, { "epoch": 1.1348511383537654, "grad_norm": 0.7898037783030375, "learning_rate": 3.948784197107146e-06, "loss": 0.0724, "step": 648 }, { "epoch": 1.136602451838879, "grad_norm": 0.8906513390558273, "learning_rate": 3.935340839238325e-06, "loss": 0.0978, "step": 649 }, { "epoch": 1.138353765323993, "grad_norm": 0.8134210787173696, "learning_rate": 3.9219055384529e-06, "loss": 0.0743, "step": 650 }, { "epoch": 1.140105078809107, "grad_norm": 0.7115092884012872, "learning_rate": 3.9084783964259855e-06, "loss": 0.0492, "step": 651 }, { "epoch": 1.1418563922942206, "grad_norm": 0.7227199245003441, "learning_rate": 3.895059514770947e-06, "loss": 0.089, "step": 652 }, { "epoch": 1.1436077057793346, "grad_norm": 0.8212458686585175, "learning_rate": 3.88164899503864e-06, "loss": 0.0873, "step": 653 }, { "epoch": 1.1453590192644483, "grad_norm": 0.6189120343911374, "learning_rate": 3.868246938716643e-06, "loss": 0.0519, "step": 654 }, { "epoch": 1.1471103327495622, "grad_norm": 0.9135905892477679, "learning_rate": 3.854853447228475e-06, "loss": 0.0815, "step": 655 }, { "epoch": 1.1488616462346761, "grad_norm": 0.9650411991425408, "learning_rate": 3.841468621932851e-06, "loss": 0.0864, "step": 656 }, { "epoch": 1.1506129597197898, "grad_norm": 0.8715517114024203, "learning_rate": 3.828092564122893e-06, "loss": 0.0808, "step": 657 }, { "epoch": 1.1523642732049038, "grad_norm": 0.7812734597619906, "learning_rate": 3.814725375025376e-06, "loss": 0.0681, "step": 658 }, { "epoch": 1.1541155866900175, "grad_norm": 0.9464561590385424, "learning_rate": 3.801367155799959e-06, "loss": 0.0967, "step": 659 }, { "epoch": 1.1558669001751314, "grad_norm": 0.8197419560752517, "learning_rate": 3.788018007538419e-06, "loss": 0.0876, "step": 660 }, { "epoch": 1.157618213660245, "grad_norm": 0.7844620890716654, "learning_rate": 3.774678031263887e-06, "loss": 0.0949, "step": 661 }, { "epoch": 1.159369527145359, "grad_norm": 0.6684997290021009, "learning_rate": 3.7613473279300804e-06, "loss": 0.0659, "step": 662 }, { "epoch": 1.161120840630473, "grad_norm": 0.6607818899113638, "learning_rate": 3.7480259984205426e-06, "loss": 0.0797, "step": 663 }, { "epoch": 1.1628721541155866, "grad_norm": 0.8000383670556351, "learning_rate": 3.734714143547879e-06, "loss": 0.0982, "step": 664 }, { "epoch": 1.1646234676007006, "grad_norm": 0.7750030818236461, "learning_rate": 3.7214118640529894e-06, "loss": 0.0755, "step": 665 }, { "epoch": 1.1663747810858143, "grad_norm": 0.7718230303441634, "learning_rate": 3.708119260604317e-06, "loss": 0.0775, "step": 666 }, { "epoch": 1.1681260945709282, "grad_norm": 0.7606884085579109, "learning_rate": 3.694836433797071e-06, "loss": 0.0652, "step": 667 }, { "epoch": 1.1698774080560421, "grad_norm": 0.9648178738887017, "learning_rate": 3.681563484152477e-06, "loss": 0.0892, "step": 668 }, { "epoch": 1.1716287215411558, "grad_norm": 1.0030218510210618, "learning_rate": 3.668300512117014e-06, "loss": 0.0996, "step": 669 }, { "epoch": 1.1733800350262698, "grad_norm": 0.602908672449069, "learning_rate": 3.655047618061648e-06, "loss": 0.0583, "step": 670 }, { "epoch": 1.1751313485113835, "grad_norm": 0.9365604274710315, "learning_rate": 3.6418049022810843e-06, "loss": 0.0884, "step": 671 }, { "epoch": 1.1768826619964974, "grad_norm": 0.9882368559103631, "learning_rate": 3.6285724649929944e-06, "loss": 0.1015, "step": 672 }, { "epoch": 1.178633975481611, "grad_norm": 0.965569447935582, "learning_rate": 3.615350406337269e-06, "loss": 0.097, "step": 673 }, { "epoch": 1.180385288966725, "grad_norm": 1.0268385342532949, "learning_rate": 3.6021388263752566e-06, "loss": 0.1107, "step": 674 }, { "epoch": 1.182136602451839, "grad_norm": 0.9192005858295202, "learning_rate": 3.588937825088999e-06, "loss": 0.1, "step": 675 }, { "epoch": 1.1838879159369526, "grad_norm": 0.9617400889226272, "learning_rate": 3.5757475023804907e-06, "loss": 0.0692, "step": 676 }, { "epoch": 1.1856392294220666, "grad_norm": 0.8649057821650563, "learning_rate": 3.562567958070905e-06, "loss": 0.1033, "step": 677 }, { "epoch": 1.1873905429071803, "grad_norm": 1.0420162692258215, "learning_rate": 3.549399291899851e-06, "loss": 0.099, "step": 678 }, { "epoch": 1.1891418563922942, "grad_norm": 0.5439954275486397, "learning_rate": 3.536241603524616e-06, "loss": 0.0514, "step": 679 }, { "epoch": 1.1908931698774081, "grad_norm": 0.8221945556326056, "learning_rate": 3.5230949925194034e-06, "loss": 0.0841, "step": 680 }, { "epoch": 1.1926444833625218, "grad_norm": 1.0298523052786546, "learning_rate": 3.5099595583745947e-06, "loss": 0.102, "step": 681 }, { "epoch": 1.1943957968476357, "grad_norm": 0.9729770963676866, "learning_rate": 3.4968354004959804e-06, "loss": 0.0959, "step": 682 }, { "epoch": 1.1961471103327495, "grad_norm": 0.8687339216607315, "learning_rate": 3.4837226182040184e-06, "loss": 0.0723, "step": 683 }, { "epoch": 1.1978984238178634, "grad_norm": 1.1187226087659867, "learning_rate": 3.470621310733078e-06, "loss": 0.1072, "step": 684 }, { "epoch": 1.1996497373029773, "grad_norm": 0.8518811353036616, "learning_rate": 3.4575315772306894e-06, "loss": 0.1147, "step": 685 }, { "epoch": 1.201401050788091, "grad_norm": 1.0529027501197052, "learning_rate": 3.444453516756796e-06, "loss": 0.1036, "step": 686 }, { "epoch": 1.203152364273205, "grad_norm": 1.0032377659240923, "learning_rate": 3.4313872282829998e-06, "loss": 0.1128, "step": 687 }, { "epoch": 1.2049036777583186, "grad_norm": 1.0018061327702659, "learning_rate": 3.4183328106918177e-06, "loss": 0.092, "step": 688 }, { "epoch": 1.2066549912434326, "grad_norm": 0.8566035601149397, "learning_rate": 3.4052903627759264e-06, "loss": 0.0936, "step": 689 }, { "epoch": 1.2084063047285465, "grad_norm": 0.9125746690092138, "learning_rate": 3.3922599832374224e-06, "loss": 0.0788, "step": 690 }, { "epoch": 1.2101576182136602, "grad_norm": 1.0426073296986158, "learning_rate": 3.379241770687074e-06, "loss": 0.0799, "step": 691 }, { "epoch": 1.2119089316987741, "grad_norm": 1.06100141802292, "learning_rate": 3.3662358236435664e-06, "loss": 0.1105, "step": 692 }, { "epoch": 1.2136602451838878, "grad_norm": 0.9390043654644253, "learning_rate": 3.353242240532769e-06, "loss": 0.1165, "step": 693 }, { "epoch": 1.2154115586690017, "grad_norm": 0.8503852226528796, "learning_rate": 3.3402611196869764e-06, "loss": 0.1161, "step": 694 }, { "epoch": 1.2171628721541157, "grad_norm": 0.9078220049525988, "learning_rate": 3.327292559344178e-06, "loss": 0.0681, "step": 695 }, { "epoch": 1.2189141856392294, "grad_norm": 0.7127386687186886, "learning_rate": 3.314336657647308e-06, "loss": 0.0752, "step": 696 }, { "epoch": 1.2206654991243433, "grad_norm": 0.9358408817071951, "learning_rate": 3.3013935126434994e-06, "loss": 0.1019, "step": 697 }, { "epoch": 1.222416812609457, "grad_norm": 0.7742420094838459, "learning_rate": 3.288463222283349e-06, "loss": 0.0931, "step": 698 }, { "epoch": 1.224168126094571, "grad_norm": 0.941039147307977, "learning_rate": 3.2755458844201692e-06, "loss": 0.094, "step": 699 }, { "epoch": 1.2259194395796849, "grad_norm": 0.8228135353803155, "learning_rate": 3.262641596809254e-06, "loss": 0.0752, "step": 700 }, { "epoch": 1.2276707530647986, "grad_norm": 1.0260217425221851, "learning_rate": 3.249750457107138e-06, "loss": 0.1434, "step": 701 }, { "epoch": 1.2294220665499125, "grad_norm": 0.8764995569070175, "learning_rate": 3.2368725628708507e-06, "loss": 0.0942, "step": 702 }, { "epoch": 1.2311733800350262, "grad_norm": 0.8810808130600745, "learning_rate": 3.224008011557186e-06, "loss": 0.0773, "step": 703 }, { "epoch": 1.2329246935201401, "grad_norm": 0.8339945067432337, "learning_rate": 3.211156900521961e-06, "loss": 0.0577, "step": 704 }, { "epoch": 1.234676007005254, "grad_norm": 0.9539133650050169, "learning_rate": 3.1983193270192787e-06, "loss": 0.0854, "step": 705 }, { "epoch": 1.2364273204903677, "grad_norm": 0.7988973560578649, "learning_rate": 3.185495388200799e-06, "loss": 0.0718, "step": 706 }, { "epoch": 1.2381786339754817, "grad_norm": 1.074214838443568, "learning_rate": 3.1726851811149907e-06, "loss": 0.0927, "step": 707 }, { "epoch": 1.2399299474605954, "grad_norm": 0.8023394631186075, "learning_rate": 3.159888802706408e-06, "loss": 0.076, "step": 708 }, { "epoch": 1.2416812609457093, "grad_norm": 0.6592103260258445, "learning_rate": 3.147106349814951e-06, "loss": 0.0602, "step": 709 }, { "epoch": 1.2434325744308232, "grad_norm": 1.1114735422528423, "learning_rate": 3.1343379191751366e-06, "loss": 0.0904, "step": 710 }, { "epoch": 1.245183887915937, "grad_norm": 0.8722207507134669, "learning_rate": 3.1215836074153666e-06, "loss": 0.0553, "step": 711 }, { "epoch": 1.2469352014010509, "grad_norm": 0.9701640956470772, "learning_rate": 3.1088435110571884e-06, "loss": 0.0951, "step": 712 }, { "epoch": 1.2486865148861646, "grad_norm": 0.7785105550035741, "learning_rate": 3.0961177265145776e-06, "loss": 0.0744, "step": 713 }, { "epoch": 1.2504378283712785, "grad_norm": 1.1865316263907835, "learning_rate": 3.0834063500931947e-06, "loss": 0.1155, "step": 714 }, { "epoch": 1.2521891418563924, "grad_norm": 0.670146861520249, "learning_rate": 3.0707094779896695e-06, "loss": 0.0737, "step": 715 }, { "epoch": 1.253940455341506, "grad_norm": 1.0183130513166565, "learning_rate": 3.0580272062908605e-06, "loss": 0.1009, "step": 716 }, { "epoch": 1.25569176882662, "grad_norm": 1.3895554101906575, "learning_rate": 3.0453596309731396e-06, "loss": 0.1295, "step": 717 }, { "epoch": 1.2574430823117337, "grad_norm": 0.8401700996571929, "learning_rate": 3.032706847901658e-06, "loss": 0.1052, "step": 718 }, { "epoch": 1.2591943957968477, "grad_norm": 0.9557568744741772, "learning_rate": 3.020068952829619e-06, "loss": 0.1099, "step": 719 }, { "epoch": 1.2609457092819616, "grad_norm": 0.7781143616664629, "learning_rate": 3.0074460413975636e-06, "loss": 0.0603, "step": 720 }, { "epoch": 1.2626970227670753, "grad_norm": 0.8080335461170193, "learning_rate": 2.9948382091326328e-06, "loss": 0.0971, "step": 721 }, { "epoch": 1.2644483362521892, "grad_norm": 0.8725117432766293, "learning_rate": 2.98224555144786e-06, "loss": 0.0565, "step": 722 }, { "epoch": 1.266199649737303, "grad_norm": 1.0281518070875828, "learning_rate": 2.9696681636414372e-06, "loss": 0.1, "step": 723 }, { "epoch": 1.2679509632224168, "grad_norm": 0.9107780574366331, "learning_rate": 2.9571061408959943e-06, "loss": 0.0812, "step": 724 }, { "epoch": 1.2697022767075308, "grad_norm": 0.8277139085800029, "learning_rate": 2.944559578277889e-06, "loss": 0.0669, "step": 725 }, { "epoch": 1.2714535901926445, "grad_norm": 0.8815637667439311, "learning_rate": 2.932028570736474e-06, "loss": 0.097, "step": 726 }, { "epoch": 1.2732049036777582, "grad_norm": 1.1364269200108077, "learning_rate": 2.919513213103391e-06, "loss": 0.0706, "step": 727 }, { "epoch": 1.274956217162872, "grad_norm": 0.7498765483852221, "learning_rate": 2.9070136000918426e-06, "loss": 0.0687, "step": 728 }, { "epoch": 1.276707530647986, "grad_norm": 0.7962376754602092, "learning_rate": 2.89452982629588e-06, "loss": 0.0609, "step": 729 }, { "epoch": 1.2784588441331, "grad_norm": 1.0565112515662811, "learning_rate": 2.8820619861896908e-06, "loss": 0.089, "step": 730 }, { "epoch": 1.2802101576182137, "grad_norm": 1.0984844227065096, "learning_rate": 2.8696101741268765e-06, "loss": 0.0609, "step": 731 }, { "epoch": 1.2819614711033274, "grad_norm": 1.3486910096875888, "learning_rate": 2.8571744843397412e-06, "loss": 0.1295, "step": 732 }, { "epoch": 1.2837127845884413, "grad_norm": 0.7596179516412704, "learning_rate": 2.844755010938586e-06, "loss": 0.0552, "step": 733 }, { "epoch": 1.2854640980735552, "grad_norm": 0.8293822539041598, "learning_rate": 2.8323518479109824e-06, "loss": 0.0673, "step": 734 }, { "epoch": 1.287215411558669, "grad_norm": 1.0021410590514732, "learning_rate": 2.819965089121076e-06, "loss": 0.079, "step": 735 }, { "epoch": 1.2889667250437828, "grad_norm": 0.98205909089748, "learning_rate": 2.8075948283088637e-06, "loss": 0.0956, "step": 736 }, { "epoch": 1.2907180385288965, "grad_norm": 0.9484379433282005, "learning_rate": 2.7952411590894914e-06, "loss": 0.0836, "step": 737 }, { "epoch": 1.2924693520140105, "grad_norm": 0.8548057872579898, "learning_rate": 2.7829041749525455e-06, "loss": 0.0698, "step": 738 }, { "epoch": 1.2942206654991244, "grad_norm": 0.6958930931575743, "learning_rate": 2.77058396926134e-06, "loss": 0.0472, "step": 739 }, { "epoch": 1.295971978984238, "grad_norm": 0.8590285326110315, "learning_rate": 2.7582806352522194e-06, "loss": 0.1035, "step": 740 }, { "epoch": 1.297723292469352, "grad_norm": 0.7506949834567328, "learning_rate": 2.7459942660338434e-06, "loss": 0.0844, "step": 741 }, { "epoch": 1.2994746059544657, "grad_norm": 0.718144378394281, "learning_rate": 2.733724954586483e-06, "loss": 0.072, "step": 742 }, { "epoch": 1.3012259194395797, "grad_norm": 0.9554449618063786, "learning_rate": 2.7214727937613293e-06, "loss": 0.0738, "step": 743 }, { "epoch": 1.3029772329246936, "grad_norm": 1.281934191339505, "learning_rate": 2.709237876279772e-06, "loss": 0.0861, "step": 744 }, { "epoch": 1.3047285464098073, "grad_norm": 0.8591603075335503, "learning_rate": 2.6970202947327156e-06, "loss": 0.0738, "step": 745 }, { "epoch": 1.3064798598949212, "grad_norm": 1.077686551423765, "learning_rate": 2.6848201415798646e-06, "loss": 0.1006, "step": 746 }, { "epoch": 1.308231173380035, "grad_norm": 1.0140620923015204, "learning_rate": 2.6726375091490313e-06, "loss": 0.1179, "step": 747 }, { "epoch": 1.3099824868651488, "grad_norm": 0.8465851580721724, "learning_rate": 2.6604724896354338e-06, "loss": 0.095, "step": 748 }, { "epoch": 1.3117338003502628, "grad_norm": 0.8292233955682955, "learning_rate": 2.648325175101004e-06, "loss": 0.078, "step": 749 }, { "epoch": 1.3134851138353765, "grad_norm": 0.9961150572832315, "learning_rate": 2.6361956574736867e-06, "loss": 0.1037, "step": 750 }, { "epoch": 1.3152364273204904, "grad_norm": 1.0125252057907417, "learning_rate": 2.624084028546739e-06, "loss": 0.1302, "step": 751 }, { "epoch": 1.316987740805604, "grad_norm": 1.4492932570805932, "learning_rate": 2.6119903799780445e-06, "loss": 0.1018, "step": 752 }, { "epoch": 1.318739054290718, "grad_norm": 1.0313545636415973, "learning_rate": 2.5999148032894116e-06, "loss": 0.1301, "step": 753 }, { "epoch": 1.320490367775832, "grad_norm": 0.7640005278295612, "learning_rate": 2.587857389865891e-06, "loss": 0.0705, "step": 754 }, { "epoch": 1.3222416812609457, "grad_norm": 0.9241800146530195, "learning_rate": 2.5758182309550773e-06, "loss": 0.1024, "step": 755 }, { "epoch": 1.3239929947460596, "grad_norm": 0.8476677958931723, "learning_rate": 2.5637974176664156e-06, "loss": 0.075, "step": 756 }, { "epoch": 1.3257443082311733, "grad_norm": 0.9539049936089635, "learning_rate": 2.5517950409705173e-06, "loss": 0.0732, "step": 757 }, { "epoch": 1.3274956217162872, "grad_norm": 0.7787415845648707, "learning_rate": 2.539811191698469e-06, "loss": 0.068, "step": 758 }, { "epoch": 1.3292469352014011, "grad_norm": 0.6740960966163063, "learning_rate": 2.52784596054115e-06, "loss": 0.0807, "step": 759 }, { "epoch": 1.3309982486865148, "grad_norm": 0.9272432887557794, "learning_rate": 2.5158994380485403e-06, "loss": 0.1073, "step": 760 }, { "epoch": 1.3327495621716288, "grad_norm": 0.805162044866896, "learning_rate": 2.5039717146290365e-06, "loss": 0.1363, "step": 761 }, { "epoch": 1.3345008756567425, "grad_norm": 0.9017129027314272, "learning_rate": 2.4920628805487684e-06, "loss": 0.093, "step": 762 }, { "epoch": 1.3362521891418564, "grad_norm": 0.9079872727729905, "learning_rate": 2.4801730259309136e-06, "loss": 0.0808, "step": 763 }, { "epoch": 1.3380035026269703, "grad_norm": 0.7596071384023176, "learning_rate": 2.468302240755023e-06, "loss": 0.0811, "step": 764 }, { "epoch": 1.339754816112084, "grad_norm": 0.8763250650248721, "learning_rate": 2.456450614856333e-06, "loss": 0.0887, "step": 765 }, { "epoch": 1.341506129597198, "grad_norm": 0.9923301577394401, "learning_rate": 2.4446182379250843e-06, "loss": 0.0893, "step": 766 }, { "epoch": 1.3432574430823117, "grad_norm": 0.8633434639363498, "learning_rate": 2.4328051995058482e-06, "loss": 0.088, "step": 767 }, { "epoch": 1.3450087565674256, "grad_norm": 1.0475821217828167, "learning_rate": 2.4210115889968446e-06, "loss": 0.0924, "step": 768 }, { "epoch": 1.3467600700525395, "grad_norm": 0.9107712943754082, "learning_rate": 2.409237495649271e-06, "loss": 0.0728, "step": 769 }, { "epoch": 1.3485113835376532, "grad_norm": 1.5257662309673086, "learning_rate": 2.397483008566624e-06, "loss": 0.1125, "step": 770 }, { "epoch": 1.3502626970227671, "grad_norm": 0.876528919503985, "learning_rate": 2.3857482167040215e-06, "loss": 0.0974, "step": 771 }, { "epoch": 1.3520140105078808, "grad_norm": 0.924196021469823, "learning_rate": 2.374033208867534e-06, "loss": 0.0915, "step": 772 }, { "epoch": 1.3537653239929948, "grad_norm": 0.792737877765985, "learning_rate": 2.3623380737135094e-06, "loss": 0.0678, "step": 773 }, { "epoch": 1.3555166374781087, "grad_norm": 0.7778836639012523, "learning_rate": 2.3506628997479085e-06, "loss": 0.0653, "step": 774 }, { "epoch": 1.3572679509632224, "grad_norm": 0.8527989490917307, "learning_rate": 2.339007775325629e-06, "loss": 0.0833, "step": 775 }, { "epoch": 1.3590192644483363, "grad_norm": 0.7022012295601906, "learning_rate": 2.3273727886498372e-06, "loss": 0.0593, "step": 776 }, { "epoch": 1.36077057793345, "grad_norm": 0.957527967610434, "learning_rate": 2.3157580277713004e-06, "loss": 0.0669, "step": 777 }, { "epoch": 1.362521891418564, "grad_norm": 1.5092316384911828, "learning_rate": 2.304163580587724e-06, "loss": 0.1074, "step": 778 }, { "epoch": 1.3642732049036779, "grad_norm": 0.899000777013253, "learning_rate": 2.2925895348430856e-06, "loss": 0.0835, "step": 779 }, { "epoch": 1.3660245183887916, "grad_norm": 0.790034586719747, "learning_rate": 2.2810359781269657e-06, "loss": 0.0719, "step": 780 }, { "epoch": 1.3677758318739055, "grad_norm": 0.7907785753390943, "learning_rate": 2.269502997873895e-06, "loss": 0.0781, "step": 781 }, { "epoch": 1.3695271453590192, "grad_norm": 0.6926955937004445, "learning_rate": 2.2579906813626807e-06, "loss": 0.0728, "step": 782 }, { "epoch": 1.3712784588441331, "grad_norm": 0.6166696427484202, "learning_rate": 2.246499115715751e-06, "loss": 0.0644, "step": 783 }, { "epoch": 1.373029772329247, "grad_norm": 0.7255433228039453, "learning_rate": 2.235028387898504e-06, "loss": 0.0979, "step": 784 }, { "epoch": 1.3747810858143608, "grad_norm": 0.7617740643442561, "learning_rate": 2.2235785847186338e-06, "loss": 0.0924, "step": 785 }, { "epoch": 1.3765323992994747, "grad_norm": 1.0217536171541641, "learning_rate": 2.212149792825489e-06, "loss": 0.0979, "step": 786 }, { "epoch": 1.3782837127845884, "grad_norm": 0.8732538774678509, "learning_rate": 2.2007420987094036e-06, "loss": 0.0734, "step": 787 }, { "epoch": 1.3800350262697023, "grad_norm": 1.0019292167266876, "learning_rate": 2.189355588701051e-06, "loss": 0.1069, "step": 788 }, { "epoch": 1.3817863397548162, "grad_norm": 0.8315824634330226, "learning_rate": 2.177990348970792e-06, "loss": 0.0909, "step": 789 }, { "epoch": 1.38353765323993, "grad_norm": 0.647602160309111, "learning_rate": 2.1666464655280133e-06, "loss": 0.0812, "step": 790 }, { "epoch": 1.3852889667250436, "grad_norm": 0.9640769615135378, "learning_rate": 2.1553240242204876e-06, "loss": 0.0873, "step": 791 }, { "epoch": 1.3870402802101576, "grad_norm": 0.9202817145346947, "learning_rate": 2.1440231107337147e-06, "loss": 0.0792, "step": 792 }, { "epoch": 1.3887915936952715, "grad_norm": 0.912675153291401, "learning_rate": 2.1327438105902763e-06, "loss": 0.0773, "step": 793 }, { "epoch": 1.3905429071803854, "grad_norm": 0.9415049365667928, "learning_rate": 2.1214862091491966e-06, "loss": 0.1135, "step": 794 }, { "epoch": 1.3922942206654991, "grad_norm": 0.9970178640835594, "learning_rate": 2.1102503916052797e-06, "loss": 0.0847, "step": 795 }, { "epoch": 1.3940455341506128, "grad_norm": 0.9832327330756709, "learning_rate": 2.0990364429884828e-06, "loss": 0.1235, "step": 796 }, { "epoch": 1.3957968476357268, "grad_norm": 0.8721793805349013, "learning_rate": 2.0878444481632597e-06, "loss": 0.1004, "step": 797 }, { "epoch": 1.3975481611208407, "grad_norm": 0.7747485377067997, "learning_rate": 2.076674491827922e-06, "loss": 0.069, "step": 798 }, { "epoch": 1.3992994746059544, "grad_norm": 0.8456173036014264, "learning_rate": 2.0655266585140045e-06, "loss": 0.0754, "step": 799 }, { "epoch": 1.4010507880910683, "grad_norm": 0.9391798871992534, "learning_rate": 2.0544010325856146e-06, "loss": 0.0969, "step": 800 }, { "epoch": 1.402802101576182, "grad_norm": 0.8875537072650175, "learning_rate": 2.043297698238805e-06, "loss": 0.0678, "step": 801 }, { "epoch": 1.404553415061296, "grad_norm": 0.7833633854106007, "learning_rate": 2.0322167395009286e-06, "loss": 0.0877, "step": 802 }, { "epoch": 1.4063047285464099, "grad_norm": 0.9708805413159364, "learning_rate": 2.0211582402300007e-06, "loss": 0.0937, "step": 803 }, { "epoch": 1.4080560420315236, "grad_norm": 0.8290033085639599, "learning_rate": 2.0101222841140775e-06, "loss": 0.0722, "step": 804 }, { "epoch": 1.4098073555166375, "grad_norm": 0.7734317269111782, "learning_rate": 1.9991089546706067e-06, "loss": 0.0788, "step": 805 }, { "epoch": 1.4115586690017512, "grad_norm": 0.8912988277705161, "learning_rate": 1.9881183352458083e-06, "loss": 0.0923, "step": 806 }, { "epoch": 1.4133099824868651, "grad_norm": 0.8930959140982987, "learning_rate": 1.9771505090140343e-06, "loss": 0.0858, "step": 807 }, { "epoch": 1.415061295971979, "grad_norm": 1.0486079636516412, "learning_rate": 1.9662055589771427e-06, "loss": 0.0848, "step": 808 }, { "epoch": 1.4168126094570928, "grad_norm": 0.869268490663677, "learning_rate": 1.955283567963876e-06, "loss": 0.1326, "step": 809 }, { "epoch": 1.4185639229422067, "grad_norm": 0.6923293806693147, "learning_rate": 1.9443846186292204e-06, "loss": 0.0594, "step": 810 }, { "epoch": 1.4203152364273204, "grad_norm": 0.7354927224586634, "learning_rate": 1.9335087934537956e-06, "loss": 0.0806, "step": 811 }, { "epoch": 1.4220665499124343, "grad_norm": 0.8259624368436999, "learning_rate": 1.9226561747432188e-06, "loss": 0.0857, "step": 812 }, { "epoch": 1.4238178633975482, "grad_norm": 0.8711192744207589, "learning_rate": 1.911826844627485e-06, "loss": 0.0752, "step": 813 }, { "epoch": 1.425569176882662, "grad_norm": 2.6069679207164667, "learning_rate": 1.901020885060353e-06, "loss": 0.0866, "step": 814 }, { "epoch": 1.4273204903677759, "grad_norm": 0.8510608016527956, "learning_rate": 1.8902383778187106e-06, "loss": 0.1021, "step": 815 }, { "epoch": 1.4290718038528896, "grad_norm": 0.9912516765235937, "learning_rate": 1.8794794045019727e-06, "loss": 0.1037, "step": 816 }, { "epoch": 1.4308231173380035, "grad_norm": 1.4280563242119932, "learning_rate": 1.8687440465314493e-06, "loss": 0.125, "step": 817 }, { "epoch": 1.4325744308231174, "grad_norm": 0.8887782023449038, "learning_rate": 1.858032385149735e-06, "loss": 0.0961, "step": 818 }, { "epoch": 1.4343257443082311, "grad_norm": 0.8092352027580832, "learning_rate": 1.8473445014200992e-06, "loss": 0.0784, "step": 819 }, { "epoch": 1.436077057793345, "grad_norm": 0.8508624184842186, "learning_rate": 1.8366804762258612e-06, "loss": 0.0993, "step": 820 }, { "epoch": 1.4378283712784588, "grad_norm": 0.8476088885858866, "learning_rate": 1.826040390269792e-06, "loss": 0.097, "step": 821 }, { "epoch": 1.4395796847635727, "grad_norm": 0.7858581848448734, "learning_rate": 1.8154243240734904e-06, "loss": 0.0545, "step": 822 }, { "epoch": 1.4413309982486866, "grad_norm": 0.7511491075823152, "learning_rate": 1.8048323579767796e-06, "loss": 0.0614, "step": 823 }, { "epoch": 1.4430823117338003, "grad_norm": 0.8916221736098047, "learning_rate": 1.7942645721371043e-06, "loss": 0.0688, "step": 824 }, { "epoch": 1.4448336252189142, "grad_norm": 1.0386802641836865, "learning_rate": 1.7837210465289129e-06, "loss": 0.1243, "step": 825 }, { "epoch": 1.446584938704028, "grad_norm": 0.8600832619931907, "learning_rate": 1.773201860943063e-06, "loss": 0.0591, "step": 826 }, { "epoch": 1.4483362521891419, "grad_norm": 0.85335509068009, "learning_rate": 1.7627070949862095e-06, "loss": 0.0897, "step": 827 }, { "epoch": 1.4500875656742558, "grad_norm": 1.0290314674102257, "learning_rate": 1.7522368280802048e-06, "loss": 0.1101, "step": 828 }, { "epoch": 1.4518388791593695, "grad_norm": 0.7360509013463974, "learning_rate": 1.7417911394615033e-06, "loss": 0.073, "step": 829 }, { "epoch": 1.4535901926444834, "grad_norm": 1.0355631885341359, "learning_rate": 1.7313701081805506e-06, "loss": 0.1144, "step": 830 }, { "epoch": 1.4553415061295971, "grad_norm": 0.6596592368497637, "learning_rate": 1.7209738131011977e-06, "loss": 0.0815, "step": 831 }, { "epoch": 1.457092819614711, "grad_norm": 0.8157916293097928, "learning_rate": 1.7106023329000932e-06, "loss": 0.0825, "step": 832 }, { "epoch": 1.458844133099825, "grad_norm": 0.8170690188009203, "learning_rate": 1.700255746066093e-06, "loss": 0.0768, "step": 833 }, { "epoch": 1.4605954465849387, "grad_norm": 0.7806828063640731, "learning_rate": 1.6899341308996704e-06, "loss": 0.0828, "step": 834 }, { "epoch": 1.4623467600700526, "grad_norm": 0.7615625289107787, "learning_rate": 1.6796375655123126e-06, "loss": 0.0983, "step": 835 }, { "epoch": 1.4640980735551663, "grad_norm": 1.1145859850353201, "learning_rate": 1.6693661278259438e-06, "loss": 0.1593, "step": 836 }, { "epoch": 1.4658493870402802, "grad_norm": 0.8460031372441591, "learning_rate": 1.659119895572322e-06, "loss": 0.0713, "step": 837 }, { "epoch": 1.4676007005253942, "grad_norm": 0.8666355376799286, "learning_rate": 1.648898946292456e-06, "loss": 0.0795, "step": 838 }, { "epoch": 1.4693520140105079, "grad_norm": 1.098767971036189, "learning_rate": 1.6387033573360244e-06, "loss": 0.1291, "step": 839 }, { "epoch": 1.4711033274956218, "grad_norm": 1.0895405573642913, "learning_rate": 1.62853320586078e-06, "loss": 0.1035, "step": 840 }, { "epoch": 1.4728546409807355, "grad_norm": 1.0610577874468181, "learning_rate": 1.6183885688319755e-06, "loss": 0.1761, "step": 841 }, { "epoch": 1.4746059544658494, "grad_norm": 0.9087423490771874, "learning_rate": 1.6082695230217721e-06, "loss": 0.0903, "step": 842 }, { "epoch": 1.4763572679509633, "grad_norm": 1.4022223726835983, "learning_rate": 1.5981761450086647e-06, "loss": 0.1407, "step": 843 }, { "epoch": 1.478108581436077, "grad_norm": 0.8453730182740572, "learning_rate": 1.588108511176899e-06, "loss": 0.0801, "step": 844 }, { "epoch": 1.479859894921191, "grad_norm": 0.7650759951583646, "learning_rate": 1.5780666977158976e-06, "loss": 0.0898, "step": 845 }, { "epoch": 1.4816112084063047, "grad_norm": 1.0534592243091558, "learning_rate": 1.5680507806196815e-06, "loss": 0.1065, "step": 846 }, { "epoch": 1.4833625218914186, "grad_norm": 0.7581070203659848, "learning_rate": 1.558060835686291e-06, "loss": 0.0768, "step": 847 }, { "epoch": 1.4851138353765325, "grad_norm": 0.8228217306630333, "learning_rate": 1.548096938517215e-06, "loss": 0.0864, "step": 848 }, { "epoch": 1.4868651488616462, "grad_norm": 0.6948138142476058, "learning_rate": 1.5381591645168214e-06, "loss": 0.0727, "step": 849 }, { "epoch": 1.4886164623467601, "grad_norm": 1.1215312769377082, "learning_rate": 1.5282475888917837e-06, "loss": 0.1084, "step": 850 }, { "epoch": 1.4903677758318739, "grad_norm": 0.556612824562337, "learning_rate": 1.5183622866505149e-06, "loss": 0.0378, "step": 851 }, { "epoch": 1.4921190893169878, "grad_norm": 0.8334893621906851, "learning_rate": 1.5085033326025933e-06, "loss": 0.1058, "step": 852 }, { "epoch": 1.4938704028021017, "grad_norm": 0.8483067154096786, "learning_rate": 1.4986708013582013e-06, "loss": 0.0593, "step": 853 }, { "epoch": 1.4956217162872154, "grad_norm": 0.7563089529959395, "learning_rate": 1.4888647673275598e-06, "loss": 0.0881, "step": 854 }, { "epoch": 1.4973730297723291, "grad_norm": 0.9581064139110383, "learning_rate": 1.4790853047203674e-06, "loss": 0.1231, "step": 855 }, { "epoch": 1.499124343257443, "grad_norm": 0.8646013332942565, "learning_rate": 1.4693324875452369e-06, "loss": 0.0962, "step": 856 }, { "epoch": 1.500875656742557, "grad_norm": 0.8161715950753782, "learning_rate": 1.4596063896091316e-06, "loss": 0.0984, "step": 857 }, { "epoch": 1.5026269702276709, "grad_norm": 0.85028135397844, "learning_rate": 1.4499070845168112e-06, "loss": 0.0998, "step": 858 }, { "epoch": 1.5043782837127846, "grad_norm": 0.8492942866077009, "learning_rate": 1.4402346456702737e-06, "loss": 0.0802, "step": 859 }, { "epoch": 1.5061295971978983, "grad_norm": 1.0442880468905495, "learning_rate": 1.4305891462682004e-06, "loss": 0.1154, "step": 860 }, { "epoch": 1.5078809106830122, "grad_norm": 0.8309778737447728, "learning_rate": 1.420970659305404e-06, "loss": 0.1184, "step": 861 }, { "epoch": 1.5096322241681261, "grad_norm": 1.0968471389762762, "learning_rate": 1.4113792575722684e-06, "loss": 0.0877, "step": 862 }, { "epoch": 1.51138353765324, "grad_norm": 0.7388443383068837, "learning_rate": 1.4018150136542063e-06, "loss": 0.0431, "step": 863 }, { "epoch": 1.5131348511383538, "grad_norm": 0.978572938428472, "learning_rate": 1.3922779999311032e-06, "loss": 0.0662, "step": 864 }, { "epoch": 1.5148861646234675, "grad_norm": 0.825879975581395, "learning_rate": 1.3827682885767778e-06, "loss": 0.0741, "step": 865 }, { "epoch": 1.5166374781085814, "grad_norm": 0.8275889522903741, "learning_rate": 1.3732859515584306e-06, "loss": 0.0719, "step": 866 }, { "epoch": 1.5183887915936953, "grad_norm": 0.8930447963765076, "learning_rate": 1.363831060636096e-06, "loss": 0.101, "step": 867 }, { "epoch": 1.5201401050788093, "grad_norm": 1.0626174646711952, "learning_rate": 1.3544036873621054e-06, "loss": 0.1285, "step": 868 }, { "epoch": 1.521891418563923, "grad_norm": 0.9900839577095674, "learning_rate": 1.345003903080541e-06, "loss": 0.072, "step": 869 }, { "epoch": 1.5236427320490367, "grad_norm": 0.9536264805723799, "learning_rate": 1.335631778926702e-06, "loss": 0.1401, "step": 870 }, { "epoch": 1.5253940455341506, "grad_norm": 0.816870903519518, "learning_rate": 1.3262873858265618e-06, "loss": 0.0764, "step": 871 }, { "epoch": 1.5271453590192645, "grad_norm": 1.1150610380119643, "learning_rate": 1.316970794496229e-06, "loss": 0.0694, "step": 872 }, { "epoch": 1.5288966725043784, "grad_norm": 0.955595059805072, "learning_rate": 1.3076820754414165e-06, "loss": 0.0844, "step": 873 }, { "epoch": 1.5306479859894921, "grad_norm": 0.8892461247283927, "learning_rate": 1.2984212989569055e-06, "loss": 0.0709, "step": 874 }, { "epoch": 1.5323992994746058, "grad_norm": 1.0183197020080885, "learning_rate": 1.2891885351260191e-06, "loss": 0.0835, "step": 875 }, { "epoch": 1.5341506129597198, "grad_norm": 0.7689697067022396, "learning_rate": 1.2799838538200804e-06, "loss": 0.0865, "step": 876 }, { "epoch": 1.5359019264448337, "grad_norm": 0.9433827530845643, "learning_rate": 1.270807324697898e-06, "loss": 0.0831, "step": 877 }, { "epoch": 1.5376532399299476, "grad_norm": 0.7106251655802119, "learning_rate": 1.2616590172052268e-06, "loss": 0.0772, "step": 878 }, { "epoch": 1.5394045534150613, "grad_norm": 0.787230689946638, "learning_rate": 1.252539000574246e-06, "loss": 0.0839, "step": 879 }, { "epoch": 1.541155866900175, "grad_norm": 0.7954726817869575, "learning_rate": 1.2434473438230426e-06, "loss": 0.0655, "step": 880 }, { "epoch": 1.542907180385289, "grad_norm": 0.8902609833747344, "learning_rate": 1.2343841157550757e-06, "loss": 0.0812, "step": 881 }, { "epoch": 1.5446584938704029, "grad_norm": 0.8517459203382189, "learning_rate": 1.2253493849586695e-06, "loss": 0.091, "step": 882 }, { "epoch": 1.5464098073555166, "grad_norm": 0.9150985106553249, "learning_rate": 1.2163432198064834e-06, "loss": 0.0957, "step": 883 }, { "epoch": 1.5481611208406305, "grad_norm": 1.0872606970611478, "learning_rate": 1.207365688454999e-06, "loss": 0.0643, "step": 884 }, { "epoch": 1.5499124343257442, "grad_norm": 1.0678678238356631, "learning_rate": 1.1984168588440075e-06, "loss": 0.089, "step": 885 }, { "epoch": 1.5516637478108581, "grad_norm": 0.8484700364176362, "learning_rate": 1.1894967986960877e-06, "loss": 0.089, "step": 886 }, { "epoch": 1.553415061295972, "grad_norm": 0.8940833430222472, "learning_rate": 1.1806055755161029e-06, "loss": 0.107, "step": 887 }, { "epoch": 1.5551663747810858, "grad_norm": 0.8190347787792227, "learning_rate": 1.1717432565906817e-06, "loss": 0.0787, "step": 888 }, { "epoch": 1.5569176882661997, "grad_norm": 0.71210611816795, "learning_rate": 1.1629099089877116e-06, "loss": 0.0665, "step": 889 }, { "epoch": 1.5586690017513134, "grad_norm": 0.9042648925525666, "learning_rate": 1.154105599555837e-06, "loss": 0.0886, "step": 890 }, { "epoch": 1.5604203152364273, "grad_norm": 0.9505703326726503, "learning_rate": 1.1453303949239431e-06, "loss": 0.097, "step": 891 }, { "epoch": 1.5621716287215412, "grad_norm": 1.3798512911645553, "learning_rate": 1.1365843615006606e-06, "loss": 0.0818, "step": 892 }, { "epoch": 1.563922942206655, "grad_norm": 0.877599922223935, "learning_rate": 1.127867565473858e-06, "loss": 0.0697, "step": 893 }, { "epoch": 1.5656742556917689, "grad_norm": 0.8465883171108056, "learning_rate": 1.11918007281014e-06, "loss": 0.0831, "step": 894 }, { "epoch": 1.5674255691768826, "grad_norm": 0.9027728204073343, "learning_rate": 1.1105219492543567e-06, "loss": 0.1178, "step": 895 }, { "epoch": 1.5691768826619965, "grad_norm": 0.976849713853511, "learning_rate": 1.1018932603290927e-06, "loss": 0.1209, "step": 896 }, { "epoch": 1.5709281961471104, "grad_norm": 0.9584305854695117, "learning_rate": 1.0932940713341843e-06, "loss": 0.1158, "step": 897 }, { "epoch": 1.5726795096322241, "grad_norm": 0.8686276598286167, "learning_rate": 1.0847244473462165e-06, "loss": 0.0715, "step": 898 }, { "epoch": 1.5744308231173378, "grad_norm": 1.0091500961047988, "learning_rate": 1.0761844532180322e-06, "loss": 0.0961, "step": 899 }, { "epoch": 1.5761821366024518, "grad_norm": 0.7807508240613471, "learning_rate": 1.067674153578247e-06, "loss": 0.0664, "step": 900 }, { "epoch": 1.5779334500875657, "grad_norm": 0.8067749796026943, "learning_rate": 1.05919361283075e-06, "loss": 0.0636, "step": 901 }, { "epoch": 1.5796847635726796, "grad_norm": 0.8759620340871858, "learning_rate": 1.0507428951542293e-06, "loss": 0.0753, "step": 902 }, { "epoch": 1.5814360770577933, "grad_norm": 0.975397189576844, "learning_rate": 1.042322064501673e-06, "loss": 0.0825, "step": 903 }, { "epoch": 1.583187390542907, "grad_norm": 0.8598966333634731, "learning_rate": 1.0339311845998929e-06, "loss": 0.0713, "step": 904 }, { "epoch": 1.584938704028021, "grad_norm": 0.7086427629579961, "learning_rate": 1.025570318949044e-06, "loss": 0.0588, "step": 905 }, { "epoch": 1.5866900175131349, "grad_norm": 0.9371811065539839, "learning_rate": 1.0172395308221355e-06, "loss": 0.1025, "step": 906 }, { "epoch": 1.5884413309982488, "grad_norm": 0.8523909056255194, "learning_rate": 1.008938883264563e-06, "loss": 0.0785, "step": 907 }, { "epoch": 1.5901926444833625, "grad_norm": 0.6932575906112352, "learning_rate": 1.0006684390936206e-06, "loss": 0.0527, "step": 908 }, { "epoch": 1.5919439579684762, "grad_norm": 0.7895131639078904, "learning_rate": 9.924282608980318e-07, "loss": 0.0672, "step": 909 }, { "epoch": 1.5936952714535901, "grad_norm": 0.9109376997806669, "learning_rate": 9.84218411037477e-07, "loss": 0.0695, "step": 910 }, { "epoch": 1.595446584938704, "grad_norm": 1.375391449767245, "learning_rate": 9.760389516421143e-07, "loss": 0.1032, "step": 911 }, { "epoch": 1.597197898423818, "grad_norm": 0.7645671144180791, "learning_rate": 9.678899446121205e-07, "loss": 0.0487, "step": 912 }, { "epoch": 1.5989492119089317, "grad_norm": 0.98070657894628, "learning_rate": 9.597714516172107e-07, "loss": 0.1004, "step": 913 }, { "epoch": 1.6007005253940454, "grad_norm": 0.8547769727295768, "learning_rate": 9.516835340961783e-07, "loss": 0.0743, "step": 914 }, { "epoch": 1.6024518388791593, "grad_norm": 1.0122657120554246, "learning_rate": 9.436262532564316e-07, "loss": 0.1235, "step": 915 }, { "epoch": 1.6042031523642732, "grad_norm": 1.0918909664473564, "learning_rate": 9.355996700735242e-07, "loss": 0.0997, "step": 916 }, { "epoch": 1.6059544658493872, "grad_norm": 0.9405618536030825, "learning_rate": 9.276038452907016e-07, "loss": 0.0692, "step": 917 }, { "epoch": 1.6077057793345009, "grad_norm": 0.9492717016651747, "learning_rate": 9.19638839418433e-07, "loss": 0.0828, "step": 918 }, { "epoch": 1.6094570928196146, "grad_norm": 0.7689394884673381, "learning_rate": 9.117047127339579e-07, "loss": 0.0973, "step": 919 }, { "epoch": 1.6112084063047285, "grad_norm": 0.8844949815830236, "learning_rate": 9.038015252808335e-07, "loss": 0.0863, "step": 920 }, { "epoch": 1.6129597197898424, "grad_norm": 0.7795777404725, "learning_rate": 8.959293368684713e-07, "loss": 0.0707, "step": 921 }, { "epoch": 1.6147110332749564, "grad_norm": 0.9595778145894932, "learning_rate": 8.880882070716945e-07, "loss": 0.0936, "step": 922 }, { "epoch": 1.61646234676007, "grad_norm": 0.9497307647401774, "learning_rate": 8.80278195230278e-07, "loss": 0.0941, "step": 923 }, { "epoch": 1.6182136602451838, "grad_norm": 1.1659675647916, "learning_rate": 8.724993604485044e-07, "loss": 0.1023, "step": 924 }, { "epoch": 1.6199649737302977, "grad_norm": 1.1234743594327323, "learning_rate": 8.647517615947193e-07, "loss": 0.0776, "step": 925 }, { "epoch": 1.6217162872154116, "grad_norm": 0.8528927864364849, "learning_rate": 8.57035457300876e-07, "loss": 0.0807, "step": 926 }, { "epoch": 1.6234676007005255, "grad_norm": 0.83886810565477, "learning_rate": 8.49350505962106e-07, "loss": 0.0594, "step": 927 }, { "epoch": 1.6252189141856392, "grad_norm": 0.855463864117844, "learning_rate": 8.416969657362622e-07, "loss": 0.0819, "step": 928 }, { "epoch": 1.626970227670753, "grad_norm": 0.908767552315424, "learning_rate": 8.340748945434879e-07, "loss": 0.1285, "step": 929 }, { "epoch": 1.6287215411558669, "grad_norm": 1.0012453661601142, "learning_rate": 8.264843500657799e-07, "loss": 0.0861, "step": 930 }, { "epoch": 1.6304728546409808, "grad_norm": 1.0333069552134135, "learning_rate": 8.189253897465433e-07, "loss": 0.0753, "step": 931 }, { "epoch": 1.6322241681260947, "grad_norm": 0.905280465006129, "learning_rate": 8.113980707901653e-07, "loss": 0.0899, "step": 932 }, { "epoch": 1.6339754816112084, "grad_norm": 1.0338044714761787, "learning_rate": 8.039024501615777e-07, "loss": 0.0938, "step": 933 }, { "epoch": 1.6357267950963221, "grad_norm": 2.359076466144793, "learning_rate": 7.964385845858258e-07, "loss": 0.1217, "step": 934 }, { "epoch": 1.637478108581436, "grad_norm": 0.9826730256245049, "learning_rate": 7.890065305476441e-07, "loss": 0.1191, "step": 935 }, { "epoch": 1.63922942206655, "grad_norm": 0.8747332183619412, "learning_rate": 7.816063442910193e-07, "loss": 0.0993, "step": 936 }, { "epoch": 1.640980735551664, "grad_norm": 0.8075496453024368, "learning_rate": 7.742380818187772e-07, "loss": 0.071, "step": 937 }, { "epoch": 1.6427320490367776, "grad_norm": 1.001355554288775, "learning_rate": 7.669017988921474e-07, "loss": 0.1207, "step": 938 }, { "epoch": 1.6444833625218913, "grad_norm": 0.8066142991914813, "learning_rate": 7.595975510303466e-07, "loss": 0.0833, "step": 939 }, { "epoch": 1.6462346760070052, "grad_norm": 0.845058894943196, "learning_rate": 7.523253935101577e-07, "loss": 0.0838, "step": 940 }, { "epoch": 1.6479859894921192, "grad_norm": 0.8892397239300569, "learning_rate": 7.45085381365514e-07, "loss": 0.0842, "step": 941 }, { "epoch": 1.649737302977233, "grad_norm": 0.7913749401745233, "learning_rate": 7.378775693870793e-07, "loss": 0.0656, "step": 942 }, { "epoch": 1.6514886164623468, "grad_norm": 0.9412135475034102, "learning_rate": 7.307020121218333e-07, "loss": 0.0988, "step": 943 }, { "epoch": 1.6532399299474605, "grad_norm": 0.6922791542112726, "learning_rate": 7.235587638726599e-07, "loss": 0.0644, "step": 944 }, { "epoch": 1.6549912434325744, "grad_norm": 0.683548226075825, "learning_rate": 7.164478786979356e-07, "loss": 0.0507, "step": 945 }, { "epoch": 1.6567425569176883, "grad_norm": 0.9731267616312939, "learning_rate": 7.093694104111237e-07, "loss": 0.078, "step": 946 }, { "epoch": 1.658493870402802, "grad_norm": 0.9269892309271431, "learning_rate": 7.023234125803635e-07, "loss": 0.1005, "step": 947 }, { "epoch": 1.660245183887916, "grad_norm": 0.9430923254149106, "learning_rate": 6.953099385280632e-07, "loss": 0.063, "step": 948 }, { "epoch": 1.6619964973730297, "grad_norm": 0.954580758978347, "learning_rate": 6.883290413305011e-07, "loss": 0.1154, "step": 949 }, { "epoch": 1.6637478108581436, "grad_norm": 0.9178349147320257, "learning_rate": 6.813807738174199e-07, "loss": 0.0574, "step": 950 }, { "epoch": 1.6654991243432575, "grad_norm": 0.8477054090991074, "learning_rate": 6.744651885716313e-07, "loss": 0.0713, "step": 951 }, { "epoch": 1.6672504378283712, "grad_norm": 1.033767975754156, "learning_rate": 6.675823379286151e-07, "loss": 0.1363, "step": 952 }, { "epoch": 1.6690017513134852, "grad_norm": 0.8140614446219004, "learning_rate": 6.607322739761219e-07, "loss": 0.0811, "step": 953 }, { "epoch": 1.6707530647985989, "grad_norm": 0.8580159228495404, "learning_rate": 6.53915048553781e-07, "loss": 0.0786, "step": 954 }, { "epoch": 1.6725043782837128, "grad_norm": 1.0237229957514502, "learning_rate": 6.471307132527071e-07, "loss": 0.072, "step": 955 }, { "epoch": 1.6742556917688267, "grad_norm": 1.1139363365853103, "learning_rate": 6.40379319415112e-07, "loss": 0.1586, "step": 956 }, { "epoch": 1.6760070052539404, "grad_norm": 0.8039008743188236, "learning_rate": 6.336609181339148e-07, "loss": 0.074, "step": 957 }, { "epoch": 1.6777583187390543, "grad_norm": 0.955504225000358, "learning_rate": 6.269755602523531e-07, "loss": 0.0941, "step": 958 }, { "epoch": 1.679509632224168, "grad_norm": 1.1369679103050756, "learning_rate": 6.203232963636003e-07, "loss": 0.0953, "step": 959 }, { "epoch": 1.681260945709282, "grad_norm": 0.8138093638660885, "learning_rate": 6.137041768103819e-07, "loss": 0.0682, "step": 960 }, { "epoch": 1.683012259194396, "grad_norm": 0.9438149839513782, "learning_rate": 6.071182516845974e-07, "loss": 0.0759, "step": 961 }, { "epoch": 1.6847635726795096, "grad_norm": 1.1428154671493533, "learning_rate": 6.005655708269386e-07, "loss": 0.0851, "step": 962 }, { "epoch": 1.6865148861646233, "grad_norm": 0.8193942407950339, "learning_rate": 5.9404618382651e-07, "loss": 0.0985, "step": 963 }, { "epoch": 1.6882661996497372, "grad_norm": 1.054106752133417, "learning_rate": 5.87560140020459e-07, "loss": 0.0707, "step": 964 }, { "epoch": 1.6900175131348512, "grad_norm": 0.8940576683459753, "learning_rate": 5.811074884935964e-07, "loss": 0.0683, "step": 965 }, { "epoch": 1.691768826619965, "grad_norm": 0.6287784415256515, "learning_rate": 5.746882780780322e-07, "loss": 0.0589, "step": 966 }, { "epoch": 1.6935201401050788, "grad_norm": 0.9750764230003164, "learning_rate": 5.683025573528017e-07, "loss": 0.1097, "step": 967 }, { "epoch": 1.6952714535901925, "grad_norm": 0.9218101868960946, "learning_rate": 5.619503746434956e-07, "loss": 0.1168, "step": 968 }, { "epoch": 1.6970227670753064, "grad_norm": 1.1755520131559736, "learning_rate": 5.55631778021899e-07, "loss": 0.0959, "step": 969 }, { "epoch": 1.6987740805604203, "grad_norm": 0.8414667003154775, "learning_rate": 5.493468153056236e-07, "loss": 0.0664, "step": 970 }, { "epoch": 1.7005253940455343, "grad_norm": 0.94270536937066, "learning_rate": 5.430955340577515e-07, "loss": 0.0608, "step": 971 }, { "epoch": 1.702276707530648, "grad_norm": 1.1274506094616077, "learning_rate": 5.368779815864678e-07, "loss": 0.1253, "step": 972 }, { "epoch": 1.7040280210157617, "grad_norm": 1.0875056272885322, "learning_rate": 5.306942049447095e-07, "loss": 0.0803, "step": 973 }, { "epoch": 1.7057793345008756, "grad_norm": 0.7586215304561557, "learning_rate": 5.245442509298038e-07, "loss": 0.0707, "step": 974 }, { "epoch": 1.7075306479859895, "grad_norm": 0.9854236513125099, "learning_rate": 5.184281660831158e-07, "loss": 0.0862, "step": 975 }, { "epoch": 1.7092819614711035, "grad_norm": 1.1000068183348335, "learning_rate": 5.123459966897021e-07, "loss": 0.0804, "step": 976 }, { "epoch": 1.7110332749562172, "grad_norm": 0.9054286431304135, "learning_rate": 5.062977887779486e-07, "loss": 0.0605, "step": 977 }, { "epoch": 1.7127845884413309, "grad_norm": 0.7861214593313163, "learning_rate": 5.002835881192336e-07, "loss": 0.0827, "step": 978 }, { "epoch": 1.7145359019264448, "grad_norm": 0.8032715915593343, "learning_rate": 4.943034402275754e-07, "loss": 0.0983, "step": 979 }, { "epoch": 1.7162872154115587, "grad_norm": 1.0469596242294776, "learning_rate": 4.88357390359287e-07, "loss": 0.0669, "step": 980 }, { "epoch": 1.7180385288966726, "grad_norm": 1.2227292003086079, "learning_rate": 4.824454835126402e-07, "loss": 0.1081, "step": 981 }, { "epoch": 1.7197898423817863, "grad_norm": 0.8383307838231276, "learning_rate": 4.765677644275163e-07, "loss": 0.1177, "step": 982 }, { "epoch": 1.7215411558669, "grad_norm": 0.8845101264170805, "learning_rate": 4.707242775850751e-07, "loss": 0.0825, "step": 983 }, { "epoch": 1.723292469352014, "grad_norm": 0.8738347124266663, "learning_rate": 4.6491506720741376e-07, "loss": 0.0767, "step": 984 }, { "epoch": 1.725043782837128, "grad_norm": 0.9235780941896881, "learning_rate": 4.591401772572313e-07, "loss": 0.1073, "step": 985 }, { "epoch": 1.7267950963222418, "grad_norm": 0.808520171929526, "learning_rate": 4.533996514375033e-07, "loss": 0.0888, "step": 986 }, { "epoch": 1.7285464098073555, "grad_norm": 0.8040022329220664, "learning_rate": 4.476935331911397e-07, "loss": 0.0689, "step": 987 }, { "epoch": 1.7302977232924692, "grad_norm": 1.024560760379506, "learning_rate": 4.4202186570066753e-07, "loss": 0.0624, "step": 988 }, { "epoch": 1.7320490367775832, "grad_norm": 0.7655215543483458, "learning_rate": 4.363846918878961e-07, "loss": 0.0641, "step": 989 }, { "epoch": 1.733800350262697, "grad_norm": 0.8347354918909909, "learning_rate": 4.307820544135938e-07, "loss": 0.065, "step": 990 }, { "epoch": 1.735551663747811, "grad_norm": 1.0085283172237212, "learning_rate": 4.2521399567717004e-07, "loss": 0.0696, "step": 991 }, { "epoch": 1.7373029772329247, "grad_norm": 0.9981732466737275, "learning_rate": 4.1968055781634655e-07, "loss": 0.0668, "step": 992 }, { "epoch": 1.7390542907180384, "grad_norm": 0.6502067313094136, "learning_rate": 4.1418178270684727e-07, "loss": 0.067, "step": 993 }, { "epoch": 1.7408056042031523, "grad_norm": 0.9620108166680914, "learning_rate": 4.0871771196207223e-07, "loss": 0.0865, "step": 994 }, { "epoch": 1.7425569176882663, "grad_norm": 0.7915472204717325, "learning_rate": 4.032883869327886e-07, "loss": 0.0725, "step": 995 }, { "epoch": 1.7443082311733802, "grad_norm": 1.11392133448476, "learning_rate": 3.9789384870681904e-07, "loss": 0.0976, "step": 996 }, { "epoch": 1.746059544658494, "grad_norm": 0.8699184608686867, "learning_rate": 3.925341381087239e-07, "loss": 0.0631, "step": 997 }, { "epoch": 1.7478108581436076, "grad_norm": 0.7718373090564703, "learning_rate": 3.872092956995005e-07, "loss": 0.0555, "step": 998 }, { "epoch": 1.7495621716287215, "grad_norm": 1.2956166388511012, "learning_rate": 3.81919361776269e-07, "loss": 0.1143, "step": 999 }, { "epoch": 1.7513134851138354, "grad_norm": 1.1331409176270781, "learning_rate": 3.7666437637197127e-07, "loss": 0.0937, "step": 1000 }, { "epoch": 1.7513134851138354, "eval_loss": 0.20276139676570892, "eval_runtime": 1.9019, "eval_samples_per_second": 24.712, "eval_steps_per_second": 6.31, "step": 1000 }, { "epoch": 1.7530647985989494, "grad_norm": 0.7722864692389659, "learning_rate": 3.714443792550687e-07, "loss": 0.0783, "step": 1001 }, { "epoch": 1.754816112084063, "grad_norm": 0.859006539210779, "learning_rate": 3.6625940992923826e-07, "loss": 0.0823, "step": 1002 }, { "epoch": 1.7565674255691768, "grad_norm": 0.9250172746239839, "learning_rate": 3.611095076330762e-07, "loss": 0.1252, "step": 1003 }, { "epoch": 1.7583187390542907, "grad_norm": 1.125672327130624, "learning_rate": 3.559947113397988e-07, "loss": 0.0956, "step": 1004 }, { "epoch": 1.7600700525394046, "grad_norm": 1.0498908967100793, "learning_rate": 3.509150597569483e-07, "loss": 0.0776, "step": 1005 }, { "epoch": 1.7618213660245186, "grad_norm": 0.9371798046914668, "learning_rate": 3.458705913261029e-07, "loss": 0.0605, "step": 1006 }, { "epoch": 1.7635726795096323, "grad_norm": 0.8138873352682954, "learning_rate": 3.4086134422257945e-07, "loss": 0.0592, "step": 1007 }, { "epoch": 1.765323992994746, "grad_norm": 0.9071537379118976, "learning_rate": 3.3588735635515177e-07, "loss": 0.1014, "step": 1008 }, { "epoch": 1.7670753064798599, "grad_norm": 0.9437847210944791, "learning_rate": 3.309486653657584e-07, "loss": 0.1097, "step": 1009 }, { "epoch": 1.7688266199649738, "grad_norm": 1.0469256700567728, "learning_rate": 3.260453086292187e-07, "loss": 0.0508, "step": 1010 }, { "epoch": 1.7705779334500875, "grad_norm": 0.9028655810648165, "learning_rate": 3.2117732325295416e-07, "loss": 0.0708, "step": 1011 }, { "epoch": 1.7723292469352014, "grad_norm": 1.0851027389495533, "learning_rate": 3.163447460767005e-07, "loss": 0.0761, "step": 1012 }, { "epoch": 1.7740805604203151, "grad_norm": 0.9493103571999805, "learning_rate": 3.115476136722362e-07, "loss": 0.0996, "step": 1013 }, { "epoch": 1.775831873905429, "grad_norm": 0.7586774662370825, "learning_rate": 3.067859623431008e-07, "loss": 0.0727, "step": 1014 }, { "epoch": 1.777583187390543, "grad_norm": 0.9384004769032015, "learning_rate": 3.0205982812431924e-07, "loss": 0.0723, "step": 1015 }, { "epoch": 1.7793345008756567, "grad_norm": 1.4973320222136197, "learning_rate": 2.973692467821371e-07, "loss": 0.125, "step": 1016 }, { "epoch": 1.7810858143607706, "grad_norm": 0.8533671156495338, "learning_rate": 2.927142538137384e-07, "loss": 0.0596, "step": 1017 }, { "epoch": 1.7828371278458843, "grad_norm": 0.7496043304937501, "learning_rate": 2.880948844469872e-07, "loss": 0.0788, "step": 1018 }, { "epoch": 1.7845884413309983, "grad_norm": 1.3612555191011448, "learning_rate": 2.8351117364015526e-07, "loss": 0.0955, "step": 1019 }, { "epoch": 1.7863397548161122, "grad_norm": 0.9630356160518716, "learning_rate": 2.78963156081658e-07, "loss": 0.0988, "step": 1020 }, { "epoch": 1.7880910683012259, "grad_norm": 1.1121115936311063, "learning_rate": 2.744508661897949e-07, "loss": 0.1138, "step": 1021 }, { "epoch": 1.7898423817863398, "grad_norm": 0.7339445298389599, "learning_rate": 2.6997433811248475e-07, "loss": 0.0927, "step": 1022 }, { "epoch": 1.7915936952714535, "grad_norm": 0.8953178801892129, "learning_rate": 2.6553360572701195e-07, "loss": 0.1039, "step": 1023 }, { "epoch": 1.7933450087565674, "grad_norm": 1.0398978963579788, "learning_rate": 2.6112870263976686e-07, "loss": 0.1398, "step": 1024 }, { "epoch": 1.7950963222416814, "grad_norm": 1.182068666933184, "learning_rate": 2.5675966218599136e-07, "loss": 0.1103, "step": 1025 }, { "epoch": 1.796847635726795, "grad_norm": 0.8892213526204024, "learning_rate": 2.524265174295293e-07, "loss": 0.0669, "step": 1026 }, { "epoch": 1.7985989492119088, "grad_norm": 0.8059578982086425, "learning_rate": 2.481293011625724e-07, "loss": 0.0648, "step": 1027 }, { "epoch": 1.8003502626970227, "grad_norm": 0.7102065798669428, "learning_rate": 2.438680459054171e-07, "loss": 0.0718, "step": 1028 }, { "epoch": 1.8021015761821366, "grad_norm": 0.6852586354069934, "learning_rate": 2.3964278390621374e-07, "loss": 0.0826, "step": 1029 }, { "epoch": 1.8038528896672505, "grad_norm": 0.6825735878520617, "learning_rate": 2.3545354714072265e-07, "loss": 0.0569, "step": 1030 }, { "epoch": 1.8056042031523643, "grad_norm": 0.945132269971334, "learning_rate": 2.3130036731207893e-07, "loss": 0.1009, "step": 1031 }, { "epoch": 1.807355516637478, "grad_norm": 0.7485096153539492, "learning_rate": 2.2718327585054156e-07, "loss": 0.0551, "step": 1032 }, { "epoch": 1.8091068301225919, "grad_norm": 1.0031751267167337, "learning_rate": 2.2310230391326682e-07, "loss": 0.1056, "step": 1033 }, { "epoch": 1.8108581436077058, "grad_norm": 0.7465291965071548, "learning_rate": 2.190574823840641e-07, "loss": 0.0799, "step": 1034 }, { "epoch": 1.8126094570928197, "grad_norm": 1.0587014217350015, "learning_rate": 2.15048841873165e-07, "loss": 0.0851, "step": 1035 }, { "epoch": 1.8143607705779334, "grad_norm": 1.1062177226269723, "learning_rate": 2.110764127169923e-07, "loss": 0.1016, "step": 1036 }, { "epoch": 1.8161120840630471, "grad_norm": 0.8981661281109393, "learning_rate": 2.0714022497793197e-07, "loss": 0.0762, "step": 1037 }, { "epoch": 1.817863397548161, "grad_norm": 0.9608421098019754, "learning_rate": 2.0324030844410204e-07, "loss": 0.1099, "step": 1038 }, { "epoch": 1.819614711033275, "grad_norm": 0.9681573187969551, "learning_rate": 1.993766926291285e-07, "loss": 0.0735, "step": 1039 }, { "epoch": 1.821366024518389, "grad_norm": 0.8273362165098569, "learning_rate": 1.9554940677192213e-07, "loss": 0.0981, "step": 1040 }, { "epoch": 1.8231173380035026, "grad_norm": 0.8824152506254714, "learning_rate": 1.9175847983645857e-07, "loss": 0.1064, "step": 1041 }, { "epoch": 1.8248686514886163, "grad_norm": 1.0042241694921472, "learning_rate": 1.880039405115569e-07, "loss": 0.0947, "step": 1042 }, { "epoch": 1.8266199649737302, "grad_norm": 1.0580797717184558, "learning_rate": 1.8428581721066486e-07, "loss": 0.077, "step": 1043 }, { "epoch": 1.8283712784588442, "grad_norm": 0.8676736423566411, "learning_rate": 1.806041380716411e-07, "loss": 0.0854, "step": 1044 }, { "epoch": 1.830122591943958, "grad_norm": 1.0605831500963205, "learning_rate": 1.769589309565445e-07, "loss": 0.1308, "step": 1045 }, { "epoch": 1.8318739054290718, "grad_norm": 1.14451694965405, "learning_rate": 1.733502234514206e-07, "loss": 0.0877, "step": 1046 }, { "epoch": 1.8336252189141855, "grad_norm": 1.0185258661977419, "learning_rate": 1.6977804286609777e-07, "loss": 0.0884, "step": 1047 }, { "epoch": 1.8353765323992994, "grad_norm": 0.9397169836007261, "learning_rate": 1.6624241623397598e-07, "loss": 0.1227, "step": 1048 }, { "epoch": 1.8371278458844134, "grad_norm": 0.988120401495064, "learning_rate": 1.6274337031182362e-07, "loss": 0.0721, "step": 1049 }, { "epoch": 1.8388791593695273, "grad_norm": 0.842379722481448, "learning_rate": 1.5928093157957403e-07, "loss": 0.0883, "step": 1050 }, { "epoch": 1.840630472854641, "grad_norm": 0.7650106307160991, "learning_rate": 1.5585512624012812e-07, "loss": 0.0627, "step": 1051 }, { "epoch": 1.8423817863397547, "grad_norm": 0.7520245228487481, "learning_rate": 1.5246598021915304e-07, "loss": 0.0583, "step": 1052 }, { "epoch": 1.8441330998248686, "grad_norm": 0.7816067951505854, "learning_rate": 1.4911351916488849e-07, "loss": 0.0629, "step": 1053 }, { "epoch": 1.8458844133099825, "grad_norm": 0.8206548278069278, "learning_rate": 1.4579776844794834e-07, "loss": 0.0629, "step": 1054 }, { "epoch": 1.8476357267950965, "grad_norm": 0.8413065831699674, "learning_rate": 1.4251875316113495e-07, "loss": 0.0918, "step": 1055 }, { "epoch": 1.8493870402802102, "grad_norm": 0.8646047693995625, "learning_rate": 1.3927649811924182e-07, "loss": 0.1067, "step": 1056 }, { "epoch": 1.8511383537653239, "grad_norm": 1.1648039728609827, "learning_rate": 1.3607102785887393e-07, "loss": 0.1264, "step": 1057 }, { "epoch": 1.8528896672504378, "grad_norm": 0.7719934600369042, "learning_rate": 1.3290236663825562e-07, "loss": 0.083, "step": 1058 }, { "epoch": 1.8546409807355517, "grad_norm": 0.8950213917800804, "learning_rate": 1.2977053843704957e-07, "loss": 0.0847, "step": 1059 }, { "epoch": 1.8563922942206657, "grad_norm": 1.0026565240342258, "learning_rate": 1.2667556695617534e-07, "loss": 0.1044, "step": 1060 }, { "epoch": 1.8581436077057794, "grad_norm": 0.744107605196941, "learning_rate": 1.236174756176295e-07, "loss": 0.0721, "step": 1061 }, { "epoch": 1.859894921190893, "grad_norm": 0.9509055299042167, "learning_rate": 1.2059628756430797e-07, "loss": 0.0818, "step": 1062 }, { "epoch": 1.861646234676007, "grad_norm": 0.9112160743275268, "learning_rate": 1.1761202565983399e-07, "loss": 0.0645, "step": 1063 }, { "epoch": 1.863397548161121, "grad_norm": 1.0272055127705473, "learning_rate": 1.1466471248837985e-07, "loss": 0.0748, "step": 1064 }, { "epoch": 1.8651488616462348, "grad_norm": 0.8706834973807851, "learning_rate": 1.1175437035450043e-07, "loss": 0.0809, "step": 1065 }, { "epoch": 1.8669001751313485, "grad_norm": 0.9563935615496911, "learning_rate": 1.0888102128296052e-07, "loss": 0.0581, "step": 1066 }, { "epoch": 1.8686514886164622, "grad_norm": 0.5939777496692319, "learning_rate": 1.0604468701857384e-07, "loss": 0.0715, "step": 1067 }, { "epoch": 1.8704028021015762, "grad_norm": 0.8243817906122596, "learning_rate": 1.0324538902603154e-07, "loss": 0.0795, "step": 1068 }, { "epoch": 1.87215411558669, "grad_norm": 0.9622902905039842, "learning_rate": 1.0048314848974616e-07, "loss": 0.1115, "step": 1069 }, { "epoch": 1.873905429071804, "grad_norm": 0.8979703410249232, "learning_rate": 9.775798631368627e-08, "loss": 0.0807, "step": 1070 }, { "epoch": 1.8756567425569177, "grad_norm": 1.0203580216733263, "learning_rate": 9.506992312122044e-08, "loss": 0.1578, "step": 1071 }, { "epoch": 1.8774080560420314, "grad_norm": 0.7923499980875016, "learning_rate": 9.24189792549629e-08, "loss": 0.0717, "step": 1072 }, { "epoch": 1.8791593695271454, "grad_norm": 0.9641425590050212, "learning_rate": 8.980517477661543e-08, "loss": 0.0915, "step": 1073 }, { "epoch": 1.8809106830122593, "grad_norm": 1.3669709921921882, "learning_rate": 8.722852946682014e-08, "loss": 0.1012, "step": 1074 }, { "epoch": 1.882661996497373, "grad_norm": 0.9665788733716942, "learning_rate": 8.468906282500577e-08, "loss": 0.0614, "step": 1075 }, { "epoch": 1.884413309982487, "grad_norm": 1.400962552491105, "learning_rate": 8.218679406924279e-08, "loss": 0.1118, "step": 1076 }, { "epoch": 1.8861646234676006, "grad_norm": 0.9219968558833442, "learning_rate": 7.972174213609684e-08, "loss": 0.0779, "step": 1077 }, { "epoch": 1.8879159369527145, "grad_norm": 0.8045432273483273, "learning_rate": 7.7293925680485e-08, "loss": 0.0767, "step": 1078 }, { "epoch": 1.8896672504378285, "grad_norm": 0.9755957257233182, "learning_rate": 7.490336307553691e-08, "loss": 0.0859, "step": 1079 }, { "epoch": 1.8914185639229422, "grad_norm": 1.0420255755068712, "learning_rate": 7.255007241245227e-08, "loss": 0.0811, "step": 1080 }, { "epoch": 1.893169877408056, "grad_norm": 0.9551488849265928, "learning_rate": 7.023407150036632e-08, "loss": 0.1306, "step": 1081 }, { "epoch": 1.8949211908931698, "grad_norm": 0.77813629526978, "learning_rate": 6.795537786621564e-08, "loss": 0.0741, "step": 1082 }, { "epoch": 1.8966725043782837, "grad_norm": 0.8084153621002628, "learning_rate": 6.571400875460154e-08, "loss": 0.0814, "step": 1083 }, { "epoch": 1.8984238178633976, "grad_norm": 0.8812494181028698, "learning_rate": 6.350998112766626e-08, "loss": 0.0897, "step": 1084 }, { "epoch": 1.9001751313485113, "grad_norm": 0.7735066758206781, "learning_rate": 6.1343311664957e-08, "loss": 0.0895, "step": 1085 }, { "epoch": 1.9019264448336253, "grad_norm": 0.8161894034164808, "learning_rate": 5.92140167633054e-08, "loss": 0.0899, "step": 1086 }, { "epoch": 1.903677758318739, "grad_norm": 0.9086034533332163, "learning_rate": 5.712211253670108e-08, "loss": 0.0896, "step": 1087 }, { "epoch": 1.905429071803853, "grad_norm": 0.75646864097532, "learning_rate": 5.5067614816169955e-08, "loss": 0.0766, "step": 1088 }, { "epoch": 1.9071803852889668, "grad_norm": 0.731979768510242, "learning_rate": 5.3050539149654964e-08, "loss": 0.0686, "step": 1089 }, { "epoch": 1.9089316987740805, "grad_norm": 0.8139155119262608, "learning_rate": 5.107090080189725e-08, "loss": 0.0801, "step": 1090 }, { "epoch": 1.9106830122591942, "grad_norm": 0.9723091179497511, "learning_rate": 4.9128714754321794e-08, "loss": 0.0919, "step": 1091 }, { "epoch": 1.9124343257443082, "grad_norm": 0.8354742100084726, "learning_rate": 4.722399570492309e-08, "loss": 0.0675, "step": 1092 }, { "epoch": 1.914185639229422, "grad_norm": 0.8372591413758104, "learning_rate": 4.535675806815576e-08, "loss": 0.0645, "step": 1093 }, { "epoch": 1.915936952714536, "grad_norm": 1.046373747378417, "learning_rate": 4.352701597482245e-08, "loss": 0.0907, "step": 1094 }, { "epoch": 1.9176882661996497, "grad_norm": 0.8526193691562309, "learning_rate": 4.173478327197e-08, "loss": 0.084, "step": 1095 }, { "epoch": 1.9194395796847634, "grad_norm": 0.8722546887931946, "learning_rate": 3.998007352278233e-08, "loss": 0.1041, "step": 1096 }, { "epoch": 1.9211908931698773, "grad_norm": 0.9484274530627599, "learning_rate": 3.826290000647881e-08, "loss": 0.0926, "step": 1097 }, { "epoch": 1.9229422066549913, "grad_norm": 1.064819089816097, "learning_rate": 3.6583275718214406e-08, "loss": 0.1145, "step": 1098 }, { "epoch": 1.9246935201401052, "grad_norm": 0.6922227153177929, "learning_rate": 3.4941213368980264e-08, "loss": 0.0826, "step": 1099 }, { "epoch": 1.926444833625219, "grad_norm": 0.9547055760475679, "learning_rate": 3.333672538550714e-08, "loss": 0.088, "step": 1100 }, { "epoch": 1.9281961471103326, "grad_norm": 1.1501182412152315, "learning_rate": 3.176982391017214e-08, "loss": 0.1087, "step": 1101 }, { "epoch": 1.9299474605954465, "grad_norm": 0.9179486185891369, "learning_rate": 3.024052080090822e-08, "loss": 0.0724, "step": 1102 }, { "epoch": 1.9316987740805605, "grad_norm": 0.664000841194001, "learning_rate": 2.874882763111153e-08, "loss": 0.0474, "step": 1103 }, { "epoch": 1.9334500875656744, "grad_norm": 0.7849197413612594, "learning_rate": 2.7294755689555307e-08, "loss": 0.0624, "step": 1104 }, { "epoch": 1.935201401050788, "grad_norm": 0.768814756583036, "learning_rate": 2.5878315980305548e-08, "loss": 0.0741, "step": 1105 }, { "epoch": 1.9369527145359018, "grad_norm": 1.003735722548012, "learning_rate": 2.4499519222635493e-08, "loss": 0.064, "step": 1106 }, { "epoch": 1.9387040280210157, "grad_norm": 1.1491273999902563, "learning_rate": 2.3158375850946268e-08, "loss": 0.1346, "step": 1107 }, { "epoch": 1.9404553415061296, "grad_norm": 0.7589932497855878, "learning_rate": 2.1854896014686376e-08, "loss": 0.0762, "step": 1108 }, { "epoch": 1.9422066549912436, "grad_norm": 0.8331017479251108, "learning_rate": 2.0589089578276767e-08, "loss": 0.0484, "step": 1109 }, { "epoch": 1.9439579684763573, "grad_norm": 0.9832005255060994, "learning_rate": 1.936096612103533e-08, "loss": 0.1037, "step": 1110 }, { "epoch": 1.945709281961471, "grad_norm": 0.8225952913071677, "learning_rate": 1.817053493710308e-08, "loss": 0.0494, "step": 1111 }, { "epoch": 1.947460595446585, "grad_norm": 0.7543806570940519, "learning_rate": 1.7017805035375866e-08, "loss": 0.0601, "step": 1112 }, { "epoch": 1.9492119089316988, "grad_norm": 0.8293133630950005, "learning_rate": 1.590278513943555e-08, "loss": 0.0803, "step": 1113 }, { "epoch": 1.9509632224168127, "grad_norm": 0.8768099650131057, "learning_rate": 1.4825483687483377e-08, "loss": 0.0659, "step": 1114 }, { "epoch": 1.9527145359019265, "grad_norm": 0.9469149296349607, "learning_rate": 1.3785908832275596e-08, "loss": 0.0824, "step": 1115 }, { "epoch": 1.9544658493870402, "grad_norm": 0.8351027145369079, "learning_rate": 1.2784068441064611e-08, "loss": 0.0678, "step": 1116 }, { "epoch": 1.956217162872154, "grad_norm": 1.020423409119886, "learning_rate": 1.1819970095536814e-08, "loss": 0.1034, "step": 1117 }, { "epoch": 1.957968476357268, "grad_norm": 0.787927571900196, "learning_rate": 1.0893621091754847e-08, "loss": 0.068, "step": 1118 }, { "epoch": 1.959719789842382, "grad_norm": 0.6892271318399958, "learning_rate": 1.0005028440104313e-08, "loss": 0.0547, "step": 1119 }, { "epoch": 1.9614711033274956, "grad_norm": 0.9205823435107259, "learning_rate": 9.154198865239938e-09, "loss": 0.1156, "step": 1120 }, { "epoch": 1.9632224168126093, "grad_norm": 1.742536478730652, "learning_rate": 8.341138806035043e-09, "loss": 0.1909, "step": 1121 }, { "epoch": 1.9649737302977233, "grad_norm": 1.0405677994948164, "learning_rate": 7.565854415531037e-09, "loss": 0.1204, "step": 1122 }, { "epoch": 1.9667250437828372, "grad_norm": 0.9209713667024632, "learning_rate": 6.8283515608924545e-09, "loss": 0.0842, "step": 1123 }, { "epoch": 1.9684763572679511, "grad_norm": 0.847022736110267, "learning_rate": 6.128635823364204e-09, "loss": 0.0647, "step": 1124 }, { "epoch": 1.9702276707530648, "grad_norm": 0.8569688135314069, "learning_rate": 5.466712498225501e-09, "loss": 0.0984, "step": 1125 }, { "epoch": 1.9719789842381785, "grad_norm": 0.7807927515616613, "learning_rate": 4.8425865947515635e-09, "loss": 0.0919, "step": 1126 }, { "epoch": 1.9737302977232924, "grad_norm": 0.803314368964672, "learning_rate": 4.256262836176972e-09, "loss": 0.0748, "step": 1127 }, { "epoch": 1.9754816112084064, "grad_norm": 0.8192070789421884, "learning_rate": 3.7077456596584793e-09, "loss": 0.0622, "step": 1128 }, { "epoch": 1.9772329246935203, "grad_norm": 0.7402355614601918, "learning_rate": 3.197039216241149e-09, "loss": 0.0611, "step": 1129 }, { "epoch": 1.978984238178634, "grad_norm": 0.8005803168162587, "learning_rate": 2.7241473708283784e-09, "loss": 0.082, "step": 1130 }, { "epoch": 1.9807355516637477, "grad_norm": 0.9029956857496308, "learning_rate": 2.2890737021513675e-09, "loss": 0.099, "step": 1131 }, { "epoch": 1.9824868651488616, "grad_norm": 0.786670113306005, "learning_rate": 1.8918215027424746e-09, "loss": 0.1038, "step": 1132 }, { "epoch": 1.9842381786339756, "grad_norm": 0.8219187516038525, "learning_rate": 1.532393778910235e-09, "loss": 0.0785, "step": 1133 }, { "epoch": 1.9859894921190895, "grad_norm": 0.7919831938955016, "learning_rate": 1.2107932507177123e-09, "loss": 0.0784, "step": 1134 }, { "epoch": 1.9877408056042032, "grad_norm": 0.9217680061030283, "learning_rate": 9.270223519586285e-10, "loss": 0.086, "step": 1135 }, { "epoch": 1.989492119089317, "grad_norm": 0.7642493509648998, "learning_rate": 6.810832301440417e-10, "loss": 0.0862, "step": 1136 }, { "epoch": 1.9912434325744308, "grad_norm": 0.8154622682049354, "learning_rate": 4.729777464806961e-10, "loss": 0.0821, "step": 1137 }, { "epoch": 1.9929947460595447, "grad_norm": 0.8127834975137689, "learning_rate": 3.0270747586103045e-10, "loss": 0.1092, "step": 1138 }, { "epoch": 1.9947460595446584, "grad_norm": 1.0067550313335858, "learning_rate": 1.702737068492999e-10, "loss": 0.0985, "step": 1139 }, { "epoch": 1.9964973730297724, "grad_norm": 0.9383181919963424, "learning_rate": 7.567744167269464e-11, "loss": 0.0958, "step": 1140 }, { "epoch": 1.998248686514886, "grad_norm": 0.8519146037451801, "learning_rate": 1.8919396212457865e-11, "loss": 0.0717, "step": 1141 }, { "epoch": 2.0, "grad_norm": 0.6152399598695977, "learning_rate": 0.0, "loss": 0.0677, "step": 1142 }, { "epoch": 2.0, "step": 1142, "total_flos": 3773592600576.0, "train_loss": 0.13601795624086924, "train_runtime": 863.4724, "train_samples_per_second": 10.581, "train_steps_per_second": 1.323 } ], "logging_steps": 1, "max_steps": 1142, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 50000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3773592600576.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }