{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.2999670003299966, "eval_steps": 50000, "global_step": 150000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021999780002199976, "grad_norm": 1.3319692611694336, "learning_rate": 9.9e-06, "loss": 0.5606, "step": 100 }, { "epoch": 0.004399956000439995, "grad_norm": 1.4906107187271118, "learning_rate": 9.99564212611423e-06, "loss": 0.5478, "step": 200 }, { "epoch": 0.006599934000659994, "grad_norm": 1.542100191116333, "learning_rate": 9.99124023330032e-06, "loss": 0.5508, "step": 300 }, { "epoch": 0.00879991200087999, "grad_norm": 1.59752357006073, "learning_rate": 9.98683834048641e-06, "loss": 0.566, "step": 400 }, { "epoch": 0.010999890001099988, "grad_norm": 1.535962462425232, "learning_rate": 9.9824364476725e-06, "loss": 0.5604, "step": 500 }, { "epoch": 0.013199868001319988, "grad_norm": 1.6737797260284424, "learning_rate": 9.97803455485859e-06, "loss": 0.5651, "step": 600 }, { "epoch": 0.015399846001539985, "grad_norm": 1.5698915719985962, "learning_rate": 9.97363266204468e-06, "loss": 0.5384, "step": 700 }, { "epoch": 0.01759982400175998, "grad_norm": 1.501681923866272, "learning_rate": 9.96923076923077e-06, "loss": 0.5454, "step": 800 }, { "epoch": 0.01979980200197998, "grad_norm": 1.6730457544326782, "learning_rate": 9.96482887641686e-06, "loss": 0.5515, "step": 900 }, { "epoch": 0.021999780002199976, "grad_norm": 1.7415289878845215, "learning_rate": 9.960426983602949e-06, "loss": 0.5641, "step": 1000 }, { "epoch": 0.024199758002419976, "grad_norm": 1.7273190021514893, "learning_rate": 9.95602509078904e-06, "loss": 0.559, "step": 1100 }, { "epoch": 0.026399736002639975, "grad_norm": 1.7402335405349731, "learning_rate": 9.95162319797513e-06, "loss": 0.5391, "step": 1200 }, { "epoch": 0.02859971400285997, "grad_norm": 1.8390350341796875, "learning_rate": 9.94722130516122e-06, "loss": 0.5563, "step": 1300 }, { "epoch": 0.03079969200307997, "grad_norm": 1.3122905492782593, "learning_rate": 9.94281941234731e-06, "loss": 0.5594, "step": 1400 }, { "epoch": 0.032999670003299966, "grad_norm": 1.3811813592910767, "learning_rate": 9.9384175195334e-06, "loss": 0.5592, "step": 1500 }, { "epoch": 0.03519964800351996, "grad_norm": 1.8546792268753052, "learning_rate": 9.934015626719489e-06, "loss": 0.5522, "step": 1600 }, { "epoch": 0.037399626003739965, "grad_norm": 1.6485520601272583, "learning_rate": 9.92961373390558e-06, "loss": 0.5354, "step": 1700 }, { "epoch": 0.03959960400395996, "grad_norm": 1.366682767868042, "learning_rate": 9.92521184109167e-06, "loss": 0.5507, "step": 1800 }, { "epoch": 0.04179958200417996, "grad_norm": 1.7690378427505493, "learning_rate": 9.92080994827776e-06, "loss": 0.5444, "step": 1900 }, { "epoch": 0.04399956000439995, "grad_norm": 1.5437382459640503, "learning_rate": 9.91640805546385e-06, "loss": 0.5651, "step": 2000 }, { "epoch": 0.046199538004619956, "grad_norm": 1.156587839126587, "learning_rate": 9.91200616264994e-06, "loss": 0.562, "step": 2100 }, { "epoch": 0.04839951600483995, "grad_norm": 1.7941553592681885, "learning_rate": 9.90760426983603e-06, "loss": 0.5659, "step": 2200 }, { "epoch": 0.05059949400505995, "grad_norm": 1.4848283529281616, "learning_rate": 9.903202377022121e-06, "loss": 0.5629, "step": 2300 }, { "epoch": 0.05279947200527995, "grad_norm": 1.4486836194992065, "learning_rate": 9.898800484208211e-06, "loss": 0.5459, "step": 2400 }, { "epoch": 0.054999450005499946, "grad_norm": 1.731554388999939, "learning_rate": 9.894398591394302e-06, "loss": 0.5626, "step": 2500 }, { "epoch": 0.05719942800571994, "grad_norm": 1.6251667737960815, "learning_rate": 9.88999669858039e-06, "loss": 0.5516, "step": 2600 }, { "epoch": 0.05939940600593994, "grad_norm": 1.256371021270752, "learning_rate": 9.88559480576648e-06, "loss": 0.5459, "step": 2700 }, { "epoch": 0.06159938400615994, "grad_norm": 1.418700933456421, "learning_rate": 9.88119291295257e-06, "loss": 0.5495, "step": 2800 }, { "epoch": 0.06379936200637994, "grad_norm": 1.6376900672912598, "learning_rate": 9.876791020138661e-06, "loss": 0.5641, "step": 2900 }, { "epoch": 0.06599934000659993, "grad_norm": 1.5085667371749878, "learning_rate": 9.872389127324751e-06, "loss": 0.5625, "step": 3000 }, { "epoch": 0.06819931800681993, "grad_norm": 1.5381278991699219, "learning_rate": 9.86798723451084e-06, "loss": 0.5603, "step": 3100 }, { "epoch": 0.07039929600703992, "grad_norm": 1.5536515712738037, "learning_rate": 9.86358534169693e-06, "loss": 0.5529, "step": 3200 }, { "epoch": 0.07259927400725993, "grad_norm": 1.9047861099243164, "learning_rate": 9.85918344888302e-06, "loss": 0.549, "step": 3300 }, { "epoch": 0.07479925200747993, "grad_norm": 1.517338514328003, "learning_rate": 9.85478155606911e-06, "loss": 0.561, "step": 3400 }, { "epoch": 0.07699923000769993, "grad_norm": 1.5779054164886475, "learning_rate": 9.850379663255201e-06, "loss": 0.5706, "step": 3500 }, { "epoch": 0.07919920800791992, "grad_norm": 1.704124927520752, "learning_rate": 9.845977770441291e-06, "loss": 0.5523, "step": 3600 }, { "epoch": 0.08139918600813992, "grad_norm": 1.5121921300888062, "learning_rate": 9.84157587762738e-06, "loss": 0.5539, "step": 3700 }, { "epoch": 0.08359916400835991, "grad_norm": 1.6511967182159424, "learning_rate": 9.83717398481347e-06, "loss": 0.5443, "step": 3800 }, { "epoch": 0.08579914200857991, "grad_norm": 1.719138503074646, "learning_rate": 9.83277209199956e-06, "loss": 0.55, "step": 3900 }, { "epoch": 0.0879991200087999, "grad_norm": 1.6003084182739258, "learning_rate": 9.82837019918565e-06, "loss": 0.5588, "step": 4000 }, { "epoch": 0.09019909800901992, "grad_norm": 1.787855625152588, "learning_rate": 9.823968306371741e-06, "loss": 0.5636, "step": 4100 }, { "epoch": 0.09239907600923991, "grad_norm": 1.6582859754562378, "learning_rate": 9.819566413557831e-06, "loss": 0.5618, "step": 4200 }, { "epoch": 0.09459905400945991, "grad_norm": 1.696978211402893, "learning_rate": 9.81516452074392e-06, "loss": 0.5546, "step": 4300 }, { "epoch": 0.0967990320096799, "grad_norm": 1.8410296440124512, "learning_rate": 9.81076262793001e-06, "loss": 0.5471, "step": 4400 }, { "epoch": 0.0989990100098999, "grad_norm": 1.736607313156128, "learning_rate": 9.8063607351161e-06, "loss": 0.5461, "step": 4500 }, { "epoch": 0.1011989880101199, "grad_norm": 1.507016897201538, "learning_rate": 9.80195884230219e-06, "loss": 0.5609, "step": 4600 }, { "epoch": 0.10339896601033989, "grad_norm": 1.6941606998443604, "learning_rate": 9.797556949488281e-06, "loss": 0.5656, "step": 4700 }, { "epoch": 0.1055989440105599, "grad_norm": 1.6578975915908813, "learning_rate": 9.793155056674371e-06, "loss": 0.5624, "step": 4800 }, { "epoch": 0.1077989220107799, "grad_norm": 1.6376292705535889, "learning_rate": 9.78875316386046e-06, "loss": 0.5483, "step": 4900 }, { "epoch": 0.10999890001099989, "grad_norm": 1.8150690793991089, "learning_rate": 9.78435127104655e-06, "loss": 0.5739, "step": 5000 }, { "epoch": 0.11219887801121989, "grad_norm": 1.8733948469161987, "learning_rate": 9.77994937823264e-06, "loss": 0.5511, "step": 5100 }, { "epoch": 0.11439885601143988, "grad_norm": 1.3109201192855835, "learning_rate": 9.77554748541873e-06, "loss": 0.5584, "step": 5200 }, { "epoch": 0.11659883401165988, "grad_norm": 2.0025064945220947, "learning_rate": 9.771145592604821e-06, "loss": 0.5638, "step": 5300 }, { "epoch": 0.11879881201187988, "grad_norm": 1.584830641746521, "learning_rate": 9.76674369979091e-06, "loss": 0.575, "step": 5400 }, { "epoch": 0.12099879001209989, "grad_norm": 1.7688754796981812, "learning_rate": 9.762341806977e-06, "loss": 0.5603, "step": 5500 }, { "epoch": 0.12319876801231988, "grad_norm": 1.6688051223754883, "learning_rate": 9.75793991416309e-06, "loss": 0.5746, "step": 5600 }, { "epoch": 0.12539874601253986, "grad_norm": 1.6409167051315308, "learning_rate": 9.753538021349182e-06, "loss": 0.5469, "step": 5700 }, { "epoch": 0.12759872401275987, "grad_norm": 1.5867542028427124, "learning_rate": 9.74913612853527e-06, "loss": 0.5414, "step": 5800 }, { "epoch": 0.12979870201297988, "grad_norm": 1.7665027379989624, "learning_rate": 9.744734235721361e-06, "loss": 0.5574, "step": 5900 }, { "epoch": 0.13199868001319986, "grad_norm": 1.298757553100586, "learning_rate": 9.740332342907451e-06, "loss": 0.5356, "step": 6000 }, { "epoch": 0.13419865801341987, "grad_norm": 1.381654143333435, "learning_rate": 9.735930450093542e-06, "loss": 0.5525, "step": 6100 }, { "epoch": 0.13639863601363986, "grad_norm": 1.398958683013916, "learning_rate": 9.731528557279632e-06, "loss": 0.5427, "step": 6200 }, { "epoch": 0.13859861401385987, "grad_norm": 1.4779409170150757, "learning_rate": 9.727126664465722e-06, "loss": 0.5583, "step": 6300 }, { "epoch": 0.14079859201407985, "grad_norm": 1.5421425104141235, "learning_rate": 9.72272477165181e-06, "loss": 0.5484, "step": 6400 }, { "epoch": 0.14299857001429986, "grad_norm": 1.7208441495895386, "learning_rate": 9.718322878837901e-06, "loss": 0.5478, "step": 6500 }, { "epoch": 0.14519854801451987, "grad_norm": 1.643373727798462, "learning_rate": 9.713920986023991e-06, "loss": 0.5742, "step": 6600 }, { "epoch": 0.14739852601473985, "grad_norm": 1.5801072120666504, "learning_rate": 9.709519093210082e-06, "loss": 0.5516, "step": 6700 }, { "epoch": 0.14959850401495986, "grad_norm": 1.5034841299057007, "learning_rate": 9.705117200396172e-06, "loss": 0.558, "step": 6800 }, { "epoch": 0.15179848201517984, "grad_norm": 1.6282888650894165, "learning_rate": 9.70071530758226e-06, "loss": 0.5575, "step": 6900 }, { "epoch": 0.15399846001539985, "grad_norm": 1.4846858978271484, "learning_rate": 9.69631341476835e-06, "loss": 0.5487, "step": 7000 }, { "epoch": 0.15619843801561983, "grad_norm": 1.6254215240478516, "learning_rate": 9.691911521954441e-06, "loss": 0.5443, "step": 7100 }, { "epoch": 0.15839841601583984, "grad_norm": 1.7018550634384155, "learning_rate": 9.687509629140531e-06, "loss": 0.556, "step": 7200 }, { "epoch": 0.16059839401605983, "grad_norm": 1.6466326713562012, "learning_rate": 9.683107736326622e-06, "loss": 0.5541, "step": 7300 }, { "epoch": 0.16279837201627984, "grad_norm": 1.4446876049041748, "learning_rate": 9.678705843512712e-06, "loss": 0.5464, "step": 7400 }, { "epoch": 0.16499835001649985, "grad_norm": 1.5896605253219604, "learning_rate": 9.6743039506988e-06, "loss": 0.5394, "step": 7500 }, { "epoch": 0.16719832801671983, "grad_norm": 1.837875485420227, "learning_rate": 9.66990205788489e-06, "loss": 0.5351, "step": 7600 }, { "epoch": 0.16939830601693984, "grad_norm": 1.5089105367660522, "learning_rate": 9.665500165070981e-06, "loss": 0.5434, "step": 7700 }, { "epoch": 0.17159828401715982, "grad_norm": 1.5068552494049072, "learning_rate": 9.661098272257071e-06, "loss": 0.5542, "step": 7800 }, { "epoch": 0.17379826201737983, "grad_norm": 1.7671160697937012, "learning_rate": 9.656696379443162e-06, "loss": 0.5434, "step": 7900 }, { "epoch": 0.1759982400175998, "grad_norm": 1.612404227256775, "learning_rate": 9.652294486629252e-06, "loss": 0.5481, "step": 8000 }, { "epoch": 0.17819821801781982, "grad_norm": 1.403520941734314, "learning_rate": 9.64789259381534e-06, "loss": 0.5436, "step": 8100 }, { "epoch": 0.18039819601803983, "grad_norm": 1.786060094833374, "learning_rate": 9.64349070100143e-06, "loss": 0.5571, "step": 8200 }, { "epoch": 0.1825981740182598, "grad_norm": 1.6619782447814941, "learning_rate": 9.639088808187521e-06, "loss": 0.5402, "step": 8300 }, { "epoch": 0.18479815201847982, "grad_norm": 1.805365800857544, "learning_rate": 9.634686915373611e-06, "loss": 0.5705, "step": 8400 }, { "epoch": 0.1869981300186998, "grad_norm": 1.5753322839736938, "learning_rate": 9.630285022559702e-06, "loss": 0.5477, "step": 8500 }, { "epoch": 0.18919810801891981, "grad_norm": 1.688490629196167, "learning_rate": 9.625883129745792e-06, "loss": 0.5497, "step": 8600 }, { "epoch": 0.1913980860191398, "grad_norm": 1.5862349271774292, "learning_rate": 9.62148123693188e-06, "loss": 0.5374, "step": 8700 }, { "epoch": 0.1935980640193598, "grad_norm": 1.8771247863769531, "learning_rate": 9.61707934411797e-06, "loss": 0.5445, "step": 8800 }, { "epoch": 0.19579804201957982, "grad_norm": 1.432055115699768, "learning_rate": 9.612677451304061e-06, "loss": 0.5478, "step": 8900 }, { "epoch": 0.1979980200197998, "grad_norm": 1.7091459035873413, "learning_rate": 9.608275558490151e-06, "loss": 0.5509, "step": 9000 }, { "epoch": 0.2001979980200198, "grad_norm": 1.5979877710342407, "learning_rate": 9.603873665676242e-06, "loss": 0.5439, "step": 9100 }, { "epoch": 0.2023979760202398, "grad_norm": 1.5256608724594116, "learning_rate": 9.599471772862332e-06, "loss": 0.546, "step": 9200 }, { "epoch": 0.2045979540204598, "grad_norm": 1.7038841247558594, "learning_rate": 9.595069880048422e-06, "loss": 0.5455, "step": 9300 }, { "epoch": 0.20679793202067978, "grad_norm": 1.6116039752960205, "learning_rate": 9.590667987234512e-06, "loss": 0.5448, "step": 9400 }, { "epoch": 0.2089979100208998, "grad_norm": 1.6021257638931274, "learning_rate": 9.586266094420603e-06, "loss": 0.5373, "step": 9500 }, { "epoch": 0.2111978880211198, "grad_norm": 1.8599495887756348, "learning_rate": 9.581864201606691e-06, "loss": 0.5445, "step": 9600 }, { "epoch": 0.21339786602133978, "grad_norm": 1.5737359523773193, "learning_rate": 9.577462308792782e-06, "loss": 0.554, "step": 9700 }, { "epoch": 0.2155978440215598, "grad_norm": 1.9932422637939453, "learning_rate": 9.573060415978872e-06, "loss": 0.5466, "step": 9800 }, { "epoch": 0.21779782202177977, "grad_norm": 1.2846128940582275, "learning_rate": 9.568658523164962e-06, "loss": 0.552, "step": 9900 }, { "epoch": 0.21999780002199978, "grad_norm": 1.845566987991333, "learning_rate": 9.564256630351052e-06, "loss": 0.5351, "step": 10000 }, { "epoch": 0.22219777802221977, "grad_norm": 1.7098534107208252, "learning_rate": 9.559854737537143e-06, "loss": 0.5701, "step": 10100 }, { "epoch": 0.22439775602243978, "grad_norm": 1.6359370946884155, "learning_rate": 9.555452844723231e-06, "loss": 0.5399, "step": 10200 }, { "epoch": 0.22659773402265979, "grad_norm": 1.8628222942352295, "learning_rate": 9.551050951909322e-06, "loss": 0.5428, "step": 10300 }, { "epoch": 0.22879771202287977, "grad_norm": 1.7202619314193726, "learning_rate": 9.546649059095412e-06, "loss": 0.5473, "step": 10400 }, { "epoch": 0.23099769002309978, "grad_norm": 1.6408450603485107, "learning_rate": 9.542247166281502e-06, "loss": 0.5566, "step": 10500 }, { "epoch": 0.23319766802331976, "grad_norm": 1.6586904525756836, "learning_rate": 9.537845273467592e-06, "loss": 0.5357, "step": 10600 }, { "epoch": 0.23539764602353977, "grad_norm": 1.8505043983459473, "learning_rate": 9.533443380653683e-06, "loss": 0.5596, "step": 10700 }, { "epoch": 0.23759762402375975, "grad_norm": 1.9244803190231323, "learning_rate": 9.529041487839771e-06, "loss": 0.5428, "step": 10800 }, { "epoch": 0.23979760202397976, "grad_norm": 1.5375540256500244, "learning_rate": 9.524639595025862e-06, "loss": 0.5478, "step": 10900 }, { "epoch": 0.24199758002419977, "grad_norm": 1.7372453212738037, "learning_rate": 9.520237702211952e-06, "loss": 0.5458, "step": 11000 }, { "epoch": 0.24419755802441975, "grad_norm": 1.5542049407958984, "learning_rate": 9.515835809398042e-06, "loss": 0.5412, "step": 11100 }, { "epoch": 0.24639753602463976, "grad_norm": 1.5235602855682373, "learning_rate": 9.511433916584132e-06, "loss": 0.5631, "step": 11200 }, { "epoch": 0.24859751402485974, "grad_norm": 1.7347521781921387, "learning_rate": 9.507032023770221e-06, "loss": 0.5508, "step": 11300 }, { "epoch": 0.2507974920250797, "grad_norm": 1.8189500570297241, "learning_rate": 9.502630130956311e-06, "loss": 0.5346, "step": 11400 }, { "epoch": 0.25299747002529976, "grad_norm": 1.5607105493545532, "learning_rate": 9.498228238142402e-06, "loss": 0.5454, "step": 11500 }, { "epoch": 0.25519744802551975, "grad_norm": 1.5799516439437866, "learning_rate": 9.493826345328492e-06, "loss": 0.5271, "step": 11600 }, { "epoch": 0.25739742602573973, "grad_norm": 1.4460997581481934, "learning_rate": 9.489424452514582e-06, "loss": 0.5437, "step": 11700 }, { "epoch": 0.25959740402595977, "grad_norm": 1.368635892868042, "learning_rate": 9.485022559700672e-06, "loss": 0.5442, "step": 11800 }, { "epoch": 0.26179738202617975, "grad_norm": 1.8246245384216309, "learning_rate": 9.480620666886761e-06, "loss": 0.5321, "step": 11900 }, { "epoch": 0.26399736002639973, "grad_norm": 1.8881937265396118, "learning_rate": 9.476218774072851e-06, "loss": 0.5639, "step": 12000 }, { "epoch": 0.2661973380266197, "grad_norm": 1.39218008518219, "learning_rate": 9.471816881258942e-06, "loss": 0.5634, "step": 12100 }, { "epoch": 0.26839731602683975, "grad_norm": 1.5577659606933594, "learning_rate": 9.467414988445032e-06, "loss": 0.5422, "step": 12200 }, { "epoch": 0.27059729402705973, "grad_norm": 1.9022492170333862, "learning_rate": 9.463013095631122e-06, "loss": 0.5429, "step": 12300 }, { "epoch": 0.2727972720272797, "grad_norm": 1.7101701498031616, "learning_rate": 9.458611202817212e-06, "loss": 0.5473, "step": 12400 }, { "epoch": 0.27499725002749975, "grad_norm": 2.0155210494995117, "learning_rate": 9.454209310003301e-06, "loss": 0.5689, "step": 12500 }, { "epoch": 0.27719722802771973, "grad_norm": 1.994775414466858, "learning_rate": 9.449807417189393e-06, "loss": 0.514, "step": 12600 }, { "epoch": 0.2793972060279397, "grad_norm": 1.5826818943023682, "learning_rate": 9.445405524375483e-06, "loss": 0.5413, "step": 12700 }, { "epoch": 0.2815971840281597, "grad_norm": 1.589729905128479, "learning_rate": 9.441003631561574e-06, "loss": 0.5339, "step": 12800 }, { "epoch": 0.28379716202837973, "grad_norm": 1.8156132698059082, "learning_rate": 9.436601738747662e-06, "loss": 0.5546, "step": 12900 }, { "epoch": 0.2859971400285997, "grad_norm": 1.576416254043579, "learning_rate": 9.432199845933752e-06, "loss": 0.5465, "step": 13000 }, { "epoch": 0.2881971180288197, "grad_norm": 1.9609074592590332, "learning_rate": 9.427797953119843e-06, "loss": 0.553, "step": 13100 }, { "epoch": 0.29039709602903974, "grad_norm": 1.5881434679031372, "learning_rate": 9.423396060305933e-06, "loss": 0.5377, "step": 13200 }, { "epoch": 0.2925970740292597, "grad_norm": 1.569200038909912, "learning_rate": 9.418994167492023e-06, "loss": 0.5467, "step": 13300 }, { "epoch": 0.2947970520294797, "grad_norm": 1.7305947542190552, "learning_rate": 9.414592274678112e-06, "loss": 0.5388, "step": 13400 }, { "epoch": 0.2969970300296997, "grad_norm": 1.9278624057769775, "learning_rate": 9.410190381864202e-06, "loss": 0.5419, "step": 13500 }, { "epoch": 0.2991970080299197, "grad_norm": 1.6430861949920654, "learning_rate": 9.405788489050292e-06, "loss": 0.5579, "step": 13600 }, { "epoch": 0.3013969860301397, "grad_norm": 1.4233689308166504, "learning_rate": 9.401386596236383e-06, "loss": 0.5385, "step": 13700 }, { "epoch": 0.3035969640303597, "grad_norm": 1.705346941947937, "learning_rate": 9.396984703422473e-06, "loss": 0.5491, "step": 13800 }, { "epoch": 0.3057969420305797, "grad_norm": 1.7933902740478516, "learning_rate": 9.392582810608563e-06, "loss": 0.5513, "step": 13900 }, { "epoch": 0.3079969200307997, "grad_norm": 1.901663899421692, "learning_rate": 9.388180917794652e-06, "loss": 0.5614, "step": 14000 }, { "epoch": 0.3101968980310197, "grad_norm": 1.6877708435058594, "learning_rate": 9.383779024980742e-06, "loss": 0.5334, "step": 14100 }, { "epoch": 0.31239687603123967, "grad_norm": 1.7979609966278076, "learning_rate": 9.379377132166832e-06, "loss": 0.5527, "step": 14200 }, { "epoch": 0.3145968540314597, "grad_norm": 1.7708429098129272, "learning_rate": 9.374975239352923e-06, "loss": 0.5386, "step": 14300 }, { "epoch": 0.3167968320316797, "grad_norm": 1.3621147871017456, "learning_rate": 9.370573346539013e-06, "loss": 0.5626, "step": 14400 }, { "epoch": 0.31899681003189967, "grad_norm": 1.5842787027359009, "learning_rate": 9.366171453725103e-06, "loss": 0.529, "step": 14500 }, { "epoch": 0.32119678803211965, "grad_norm": 1.817987084388733, "learning_rate": 9.361769560911192e-06, "loss": 0.538, "step": 14600 }, { "epoch": 0.3233967660323397, "grad_norm": 1.6293082237243652, "learning_rate": 9.357367668097282e-06, "loss": 0.5481, "step": 14700 }, { "epoch": 0.32559674403255967, "grad_norm": 1.5916519165039062, "learning_rate": 9.352965775283372e-06, "loss": 0.5534, "step": 14800 }, { "epoch": 0.32779672203277965, "grad_norm": 1.5773463249206543, "learning_rate": 9.348563882469463e-06, "loss": 0.5501, "step": 14900 }, { "epoch": 0.3299967000329997, "grad_norm": 1.9787790775299072, "learning_rate": 9.344161989655553e-06, "loss": 0.541, "step": 15000 }, { "epoch": 0.3321966780332197, "grad_norm": 1.3281339406967163, "learning_rate": 9.339760096841642e-06, "loss": 0.539, "step": 15100 }, { "epoch": 0.33439665603343965, "grad_norm": 2.091588020324707, "learning_rate": 9.335358204027732e-06, "loss": 0.5393, "step": 15200 }, { "epoch": 0.33659663403365964, "grad_norm": 1.912660837173462, "learning_rate": 9.330956311213822e-06, "loss": 0.5168, "step": 15300 }, { "epoch": 0.3387966120338797, "grad_norm": 1.7248882055282593, "learning_rate": 9.326554418399912e-06, "loss": 0.538, "step": 15400 }, { "epoch": 0.34099659003409966, "grad_norm": 1.8949754238128662, "learning_rate": 9.322152525586003e-06, "loss": 0.5444, "step": 15500 }, { "epoch": 0.34319656803431964, "grad_norm": 1.4323865175247192, "learning_rate": 9.317750632772093e-06, "loss": 0.542, "step": 15600 }, { "epoch": 0.3453965460345397, "grad_norm": 1.7454142570495605, "learning_rate": 9.313348739958182e-06, "loss": 0.5346, "step": 15700 }, { "epoch": 0.34759652403475966, "grad_norm": 2.214750289916992, "learning_rate": 9.308946847144272e-06, "loss": 0.5391, "step": 15800 }, { "epoch": 0.34979650203497964, "grad_norm": 1.7991106510162354, "learning_rate": 9.304544954330362e-06, "loss": 0.551, "step": 15900 }, { "epoch": 0.3519964800351996, "grad_norm": 1.7487062215805054, "learning_rate": 9.300143061516452e-06, "loss": 0.5536, "step": 16000 }, { "epoch": 0.35419645803541966, "grad_norm": 1.7137202024459839, "learning_rate": 9.295741168702543e-06, "loss": 0.5472, "step": 16100 }, { "epoch": 0.35639643603563964, "grad_norm": 1.569287657737732, "learning_rate": 9.291339275888633e-06, "loss": 0.5286, "step": 16200 }, { "epoch": 0.3585964140358596, "grad_norm": 1.805232286453247, "learning_rate": 9.286937383074723e-06, "loss": 0.535, "step": 16300 }, { "epoch": 0.36079639203607966, "grad_norm": 1.8445895910263062, "learning_rate": 9.282535490260814e-06, "loss": 0.5297, "step": 16400 }, { "epoch": 0.36299637003629964, "grad_norm": 1.8282471895217896, "learning_rate": 9.278133597446904e-06, "loss": 0.5341, "step": 16500 }, { "epoch": 0.3651963480365196, "grad_norm": 1.5979552268981934, "learning_rate": 9.273731704632994e-06, "loss": 0.5471, "step": 16600 }, { "epoch": 0.3673963260367396, "grad_norm": 1.6148823499679565, "learning_rate": 9.269329811819083e-06, "loss": 0.534, "step": 16700 }, { "epoch": 0.36959630403695964, "grad_norm": 1.7306467294692993, "learning_rate": 9.264927919005173e-06, "loss": 0.5475, "step": 16800 }, { "epoch": 0.3717962820371796, "grad_norm": 1.5774517059326172, "learning_rate": 9.260526026191263e-06, "loss": 0.5604, "step": 16900 }, { "epoch": 0.3739962600373996, "grad_norm": 1.6581697463989258, "learning_rate": 9.256124133377354e-06, "loss": 0.5474, "step": 17000 }, { "epoch": 0.37619623803761965, "grad_norm": 1.8324202299118042, "learning_rate": 9.251722240563444e-06, "loss": 0.5341, "step": 17100 }, { "epoch": 0.37839621603783963, "grad_norm": 1.7121940851211548, "learning_rate": 9.247320347749532e-06, "loss": 0.5538, "step": 17200 }, { "epoch": 0.3805961940380596, "grad_norm": 1.8483502864837646, "learning_rate": 9.242918454935623e-06, "loss": 0.5231, "step": 17300 }, { "epoch": 0.3827961720382796, "grad_norm": 1.7600507736206055, "learning_rate": 9.238516562121713e-06, "loss": 0.5581, "step": 17400 }, { "epoch": 0.38499615003849963, "grad_norm": 1.779398798942566, "learning_rate": 9.234114669307803e-06, "loss": 0.5468, "step": 17500 }, { "epoch": 0.3871961280387196, "grad_norm": 1.7732363939285278, "learning_rate": 9.229712776493894e-06, "loss": 0.558, "step": 17600 }, { "epoch": 0.3893961060389396, "grad_norm": 1.7597503662109375, "learning_rate": 9.225310883679984e-06, "loss": 0.5231, "step": 17700 }, { "epoch": 0.39159608403915963, "grad_norm": 1.8344216346740723, "learning_rate": 9.220908990866072e-06, "loss": 0.5428, "step": 17800 }, { "epoch": 0.3937960620393796, "grad_norm": 1.662919044494629, "learning_rate": 9.216507098052163e-06, "loss": 0.5314, "step": 17900 }, { "epoch": 0.3959960400395996, "grad_norm": 1.3180632591247559, "learning_rate": 9.212105205238253e-06, "loss": 0.5335, "step": 18000 }, { "epoch": 0.3981960180398196, "grad_norm": 1.8466808795928955, "learning_rate": 9.207703312424343e-06, "loss": 0.5251, "step": 18100 }, { "epoch": 0.4003959960400396, "grad_norm": 1.942530632019043, "learning_rate": 9.203301419610434e-06, "loss": 0.5361, "step": 18200 }, { "epoch": 0.4025959740402596, "grad_norm": 1.6795586347579956, "learning_rate": 9.198899526796524e-06, "loss": 0.5322, "step": 18300 }, { "epoch": 0.4047959520404796, "grad_norm": 1.8028258085250854, "learning_rate": 9.194497633982612e-06, "loss": 0.5332, "step": 18400 }, { "epoch": 0.4069959300406996, "grad_norm": 1.9072916507720947, "learning_rate": 9.190095741168703e-06, "loss": 0.5436, "step": 18500 }, { "epoch": 0.4091959080409196, "grad_norm": 1.849950909614563, "learning_rate": 9.185693848354793e-06, "loss": 0.5464, "step": 18600 }, { "epoch": 0.4113958860411396, "grad_norm": 1.8676297664642334, "learning_rate": 9.181291955540883e-06, "loss": 0.5598, "step": 18700 }, { "epoch": 0.41359586404135956, "grad_norm": 1.8260865211486816, "learning_rate": 9.176890062726974e-06, "loss": 0.5433, "step": 18800 }, { "epoch": 0.4157958420415796, "grad_norm": 1.6370753049850464, "learning_rate": 9.172488169913064e-06, "loss": 0.5473, "step": 18900 }, { "epoch": 0.4179958200417996, "grad_norm": 1.583030104637146, "learning_rate": 9.168086277099152e-06, "loss": 0.5478, "step": 19000 }, { "epoch": 0.42019579804201956, "grad_norm": 1.895065188407898, "learning_rate": 9.163684384285243e-06, "loss": 0.5391, "step": 19100 }, { "epoch": 0.4223957760422396, "grad_norm": 1.6694116592407227, "learning_rate": 9.159282491471333e-06, "loss": 0.5206, "step": 19200 }, { "epoch": 0.4245957540424596, "grad_norm": 1.630575180053711, "learning_rate": 9.154880598657423e-06, "loss": 0.5451, "step": 19300 }, { "epoch": 0.42679573204267957, "grad_norm": 2.0224249362945557, "learning_rate": 9.150478705843514e-06, "loss": 0.5334, "step": 19400 }, { "epoch": 0.42899571004289955, "grad_norm": 1.6329941749572754, "learning_rate": 9.146076813029602e-06, "loss": 0.5279, "step": 19500 }, { "epoch": 0.4311956880431196, "grad_norm": 1.3999661207199097, "learning_rate": 9.141674920215694e-06, "loss": 0.5366, "step": 19600 }, { "epoch": 0.43339566604333957, "grad_norm": 1.5041108131408691, "learning_rate": 9.137273027401784e-06, "loss": 0.5324, "step": 19700 }, { "epoch": 0.43559564404355955, "grad_norm": 1.714513897895813, "learning_rate": 9.132871134587875e-06, "loss": 0.5341, "step": 19800 }, { "epoch": 0.4377956220437796, "grad_norm": 1.7554248571395874, "learning_rate": 9.128469241773963e-06, "loss": 0.5436, "step": 19900 }, { "epoch": 0.43999560004399957, "grad_norm": 1.665436029434204, "learning_rate": 9.124067348960054e-06, "loss": 0.5299, "step": 20000 }, { "epoch": 0.44219557804421955, "grad_norm": 1.668437123298645, "learning_rate": 9.119665456146144e-06, "loss": 0.5188, "step": 20100 }, { "epoch": 0.44439555604443953, "grad_norm": 1.9339295625686646, "learning_rate": 9.115263563332234e-06, "loss": 0.5574, "step": 20200 }, { "epoch": 0.44659553404465957, "grad_norm": 1.7263190746307373, "learning_rate": 9.110861670518324e-06, "loss": 0.5469, "step": 20300 }, { "epoch": 0.44879551204487955, "grad_norm": 1.5733555555343628, "learning_rate": 9.106459777704415e-06, "loss": 0.529, "step": 20400 }, { "epoch": 0.45099549004509953, "grad_norm": 1.6786284446716309, "learning_rate": 9.102057884890503e-06, "loss": 0.539, "step": 20500 }, { "epoch": 0.45319546804531957, "grad_norm": 1.6025316715240479, "learning_rate": 9.097655992076594e-06, "loss": 0.5394, "step": 20600 }, { "epoch": 0.45539544604553955, "grad_norm": 1.7945187091827393, "learning_rate": 9.093254099262684e-06, "loss": 0.5233, "step": 20700 }, { "epoch": 0.45759542404575954, "grad_norm": 1.6407737731933594, "learning_rate": 9.088852206448774e-06, "loss": 0.547, "step": 20800 }, { "epoch": 0.4597954020459795, "grad_norm": 1.623547911643982, "learning_rate": 9.084450313634864e-06, "loss": 0.5609, "step": 20900 }, { "epoch": 0.46199538004619956, "grad_norm": 1.7454668283462524, "learning_rate": 9.080048420820953e-06, "loss": 0.5484, "step": 21000 }, { "epoch": 0.46419535804641954, "grad_norm": 2.0362443923950195, "learning_rate": 9.075646528007043e-06, "loss": 0.5199, "step": 21100 }, { "epoch": 0.4663953360466395, "grad_norm": 1.8968782424926758, "learning_rate": 9.071244635193134e-06, "loss": 0.5471, "step": 21200 }, { "epoch": 0.46859531404685956, "grad_norm": 1.7040385007858276, "learning_rate": 9.066842742379224e-06, "loss": 0.5167, "step": 21300 }, { "epoch": 0.47079529204707954, "grad_norm": 1.8420989513397217, "learning_rate": 9.062440849565314e-06, "loss": 0.5359, "step": 21400 }, { "epoch": 0.4729952700472995, "grad_norm": 1.6311464309692383, "learning_rate": 9.058038956751404e-06, "loss": 0.5375, "step": 21500 }, { "epoch": 0.4751952480475195, "grad_norm": 2.0437209606170654, "learning_rate": 9.053637063937493e-06, "loss": 0.5427, "step": 21600 }, { "epoch": 0.47739522604773954, "grad_norm": 1.6111825704574585, "learning_rate": 9.049235171123583e-06, "loss": 0.526, "step": 21700 }, { "epoch": 0.4795952040479595, "grad_norm": 1.3677709102630615, "learning_rate": 9.044833278309674e-06, "loss": 0.5328, "step": 21800 }, { "epoch": 0.4817951820481795, "grad_norm": 2.1056365966796875, "learning_rate": 9.040431385495764e-06, "loss": 0.5391, "step": 21900 }, { "epoch": 0.48399516004839954, "grad_norm": 1.807760238647461, "learning_rate": 9.036029492681854e-06, "loss": 0.5606, "step": 22000 }, { "epoch": 0.4861951380486195, "grad_norm": 1.8556056022644043, "learning_rate": 9.031627599867944e-06, "loss": 0.5351, "step": 22100 }, { "epoch": 0.4883951160488395, "grad_norm": 2.0106847286224365, "learning_rate": 9.027225707054033e-06, "loss": 0.5542, "step": 22200 }, { "epoch": 0.4905950940490595, "grad_norm": 1.6676563024520874, "learning_rate": 9.022823814240123e-06, "loss": 0.538, "step": 22300 }, { "epoch": 0.4927950720492795, "grad_norm": 1.4103186130523682, "learning_rate": 9.018421921426214e-06, "loss": 0.5241, "step": 22400 }, { "epoch": 0.4949950500494995, "grad_norm": 1.8032267093658447, "learning_rate": 9.014020028612304e-06, "loss": 0.5367, "step": 22500 }, { "epoch": 0.4971950280497195, "grad_norm": 1.6195557117462158, "learning_rate": 9.009618135798394e-06, "loss": 0.5434, "step": 22600 }, { "epoch": 0.4993950060499395, "grad_norm": 1.7808386087417603, "learning_rate": 9.005216242984484e-06, "loss": 0.5421, "step": 22700 }, { "epoch": 0.5015949840501595, "grad_norm": 1.746341586112976, "learning_rate": 9.000814350170573e-06, "loss": 0.5362, "step": 22800 }, { "epoch": 0.5037949620503795, "grad_norm": 2.1744487285614014, "learning_rate": 8.996412457356663e-06, "loss": 0.5243, "step": 22900 }, { "epoch": 0.5059949400505995, "grad_norm": 1.7973219156265259, "learning_rate": 8.992010564542755e-06, "loss": 0.5504, "step": 23000 }, { "epoch": 0.5081949180508195, "grad_norm": 1.6203027963638306, "learning_rate": 8.987608671728844e-06, "loss": 0.5426, "step": 23100 }, { "epoch": 0.5103948960510395, "grad_norm": 1.6453986167907715, "learning_rate": 8.983206778914934e-06, "loss": 0.548, "step": 23200 }, { "epoch": 0.5125948740512595, "grad_norm": 1.8163201808929443, "learning_rate": 8.978804886101024e-06, "loss": 0.5306, "step": 23300 }, { "epoch": 0.5147948520514795, "grad_norm": 1.7606194019317627, "learning_rate": 8.974402993287115e-06, "loss": 0.5318, "step": 23400 }, { "epoch": 0.5169948300516994, "grad_norm": 1.9621275663375854, "learning_rate": 8.970001100473205e-06, "loss": 0.5289, "step": 23500 }, { "epoch": 0.5191948080519195, "grad_norm": 1.707217812538147, "learning_rate": 8.965599207659295e-06, "loss": 0.5374, "step": 23600 }, { "epoch": 0.5213947860521395, "grad_norm": 1.9041409492492676, "learning_rate": 8.961197314845384e-06, "loss": 0.5512, "step": 23700 }, { "epoch": 0.5235947640523595, "grad_norm": 1.7021831274032593, "learning_rate": 8.956795422031474e-06, "loss": 0.5363, "step": 23800 }, { "epoch": 0.5257947420525795, "grad_norm": 1.6546313762664795, "learning_rate": 8.952393529217564e-06, "loss": 0.5355, "step": 23900 }, { "epoch": 0.5279947200527995, "grad_norm": 2.1298437118530273, "learning_rate": 8.947991636403655e-06, "loss": 0.5336, "step": 24000 }, { "epoch": 0.5301946980530194, "grad_norm": 1.6351710557937622, "learning_rate": 8.943589743589745e-06, "loss": 0.5298, "step": 24100 }, { "epoch": 0.5323946760532394, "grad_norm": 1.7850167751312256, "learning_rate": 8.939187850775835e-06, "loss": 0.5295, "step": 24200 }, { "epoch": 0.5345946540534595, "grad_norm": 1.6639127731323242, "learning_rate": 8.934785957961924e-06, "loss": 0.5482, "step": 24300 }, { "epoch": 0.5367946320536795, "grad_norm": 1.6761794090270996, "learning_rate": 8.930384065148014e-06, "loss": 0.5398, "step": 24400 }, { "epoch": 0.5389946100538995, "grad_norm": 2.0362918376922607, "learning_rate": 8.925982172334104e-06, "loss": 0.5387, "step": 24500 }, { "epoch": 0.5411945880541195, "grad_norm": 1.5029228925704956, "learning_rate": 8.921580279520195e-06, "loss": 0.5296, "step": 24600 }, { "epoch": 0.5433945660543394, "grad_norm": 1.7153294086456299, "learning_rate": 8.917178386706285e-06, "loss": 0.5395, "step": 24700 }, { "epoch": 0.5455945440545594, "grad_norm": 1.6009351015090942, "learning_rate": 8.912776493892375e-06, "loss": 0.5301, "step": 24800 }, { "epoch": 0.5477945220547794, "grad_norm": 1.7909400463104248, "learning_rate": 8.908374601078464e-06, "loss": 0.5292, "step": 24900 }, { "epoch": 0.5499945000549995, "grad_norm": 2.1847472190856934, "learning_rate": 8.903972708264554e-06, "loss": 0.5326, "step": 25000 }, { "epoch": 0.5521944780552195, "grad_norm": 2.270923614501953, "learning_rate": 8.899570815450644e-06, "loss": 0.545, "step": 25100 }, { "epoch": 0.5543944560554395, "grad_norm": 2.044668436050415, "learning_rate": 8.895168922636735e-06, "loss": 0.5335, "step": 25200 }, { "epoch": 0.5565944340556594, "grad_norm": 1.9989433288574219, "learning_rate": 8.890767029822825e-06, "loss": 0.5516, "step": 25300 }, { "epoch": 0.5587944120558794, "grad_norm": 1.7529683113098145, "learning_rate": 8.886365137008914e-06, "loss": 0.5379, "step": 25400 }, { "epoch": 0.5609943900560994, "grad_norm": 1.4954921007156372, "learning_rate": 8.881963244195004e-06, "loss": 0.5346, "step": 25500 }, { "epoch": 0.5631943680563194, "grad_norm": 1.7510510683059692, "learning_rate": 8.877561351381094e-06, "loss": 0.5186, "step": 25600 }, { "epoch": 0.5653943460565395, "grad_norm": 1.8264451026916504, "learning_rate": 8.873159458567184e-06, "loss": 0.5419, "step": 25700 }, { "epoch": 0.5675943240567595, "grad_norm": 2.1004931926727295, "learning_rate": 8.868757565753275e-06, "loss": 0.5419, "step": 25800 }, { "epoch": 0.5697943020569795, "grad_norm": 1.9316984415054321, "learning_rate": 8.864355672939365e-06, "loss": 0.5209, "step": 25900 }, { "epoch": 0.5719942800571994, "grad_norm": 2.182731866836548, "learning_rate": 8.859953780125454e-06, "loss": 0.5356, "step": 26000 }, { "epoch": 0.5741942580574194, "grad_norm": 1.6151630878448486, "learning_rate": 8.855551887311544e-06, "loss": 0.5419, "step": 26100 }, { "epoch": 0.5763942360576394, "grad_norm": 1.8083909749984741, "learning_rate": 8.851149994497634e-06, "loss": 0.5218, "step": 26200 }, { "epoch": 0.5785942140578594, "grad_norm": 1.6356123685836792, "learning_rate": 8.846748101683724e-06, "loss": 0.5256, "step": 26300 }, { "epoch": 0.5807941920580795, "grad_norm": 2.2701175212860107, "learning_rate": 8.842346208869815e-06, "loss": 0.534, "step": 26400 }, { "epoch": 0.5829941700582995, "grad_norm": 1.9146398305892944, "learning_rate": 8.837944316055905e-06, "loss": 0.5399, "step": 26500 }, { "epoch": 0.5851941480585194, "grad_norm": 1.9954113960266113, "learning_rate": 8.833542423241995e-06, "loss": 0.537, "step": 26600 }, { "epoch": 0.5873941260587394, "grad_norm": 1.6357481479644775, "learning_rate": 8.829140530428086e-06, "loss": 0.5322, "step": 26700 }, { "epoch": 0.5895941040589594, "grad_norm": 1.7142163515090942, "learning_rate": 8.824738637614176e-06, "loss": 0.5475, "step": 26800 }, { "epoch": 0.5917940820591794, "grad_norm": 1.7539161443710327, "learning_rate": 8.820336744800266e-06, "loss": 0.523, "step": 26900 }, { "epoch": 0.5939940600593994, "grad_norm": 1.6141777038574219, "learning_rate": 8.815934851986355e-06, "loss": 0.5318, "step": 27000 }, { "epoch": 0.5961940380596195, "grad_norm": 2.0629382133483887, "learning_rate": 8.811532959172445e-06, "loss": 0.5334, "step": 27100 }, { "epoch": 0.5983940160598394, "grad_norm": 1.999254584312439, "learning_rate": 8.807131066358535e-06, "loss": 0.5504, "step": 27200 }, { "epoch": 0.6005939940600594, "grad_norm": 1.8531382083892822, "learning_rate": 8.802729173544626e-06, "loss": 0.5376, "step": 27300 }, { "epoch": 0.6027939720602794, "grad_norm": 1.4768983125686646, "learning_rate": 8.798327280730716e-06, "loss": 0.5344, "step": 27400 }, { "epoch": 0.6049939500604994, "grad_norm": 1.7571672201156616, "learning_rate": 8.793925387916804e-06, "loss": 0.5342, "step": 27500 }, { "epoch": 0.6071939280607194, "grad_norm": 1.7986180782318115, "learning_rate": 8.789523495102895e-06, "loss": 0.5474, "step": 27600 }, { "epoch": 0.6093939060609393, "grad_norm": 1.9569381475448608, "learning_rate": 8.785121602288985e-06, "loss": 0.5403, "step": 27700 }, { "epoch": 0.6115938840611594, "grad_norm": 2.1773102283477783, "learning_rate": 8.780719709475075e-06, "loss": 0.5239, "step": 27800 }, { "epoch": 0.6137938620613794, "grad_norm": 2.050550937652588, "learning_rate": 8.776317816661166e-06, "loss": 0.5253, "step": 27900 }, { "epoch": 0.6159938400615994, "grad_norm": 1.7763617038726807, "learning_rate": 8.771915923847256e-06, "loss": 0.5283, "step": 28000 }, { "epoch": 0.6181938180618194, "grad_norm": 1.6701637506484985, "learning_rate": 8.767514031033344e-06, "loss": 0.5316, "step": 28100 }, { "epoch": 0.6203937960620394, "grad_norm": 1.6922410726547241, "learning_rate": 8.763112138219435e-06, "loss": 0.5384, "step": 28200 }, { "epoch": 0.6225937740622594, "grad_norm": 2.3351800441741943, "learning_rate": 8.758710245405525e-06, "loss": 0.5462, "step": 28300 }, { "epoch": 0.6247937520624793, "grad_norm": 1.7946525812149048, "learning_rate": 8.754308352591615e-06, "loss": 0.5341, "step": 28400 }, { "epoch": 0.6269937300626994, "grad_norm": 1.6485981941223145, "learning_rate": 8.749906459777706e-06, "loss": 0.5229, "step": 28500 }, { "epoch": 0.6291937080629194, "grad_norm": 2.138338327407837, "learning_rate": 8.745504566963796e-06, "loss": 0.5489, "step": 28600 }, { "epoch": 0.6313936860631394, "grad_norm": 1.7668613195419312, "learning_rate": 8.741102674149884e-06, "loss": 0.5239, "step": 28700 }, { "epoch": 0.6335936640633594, "grad_norm": 2.0970587730407715, "learning_rate": 8.736700781335975e-06, "loss": 0.5313, "step": 28800 }, { "epoch": 0.6357936420635794, "grad_norm": 1.7800394296646118, "learning_rate": 8.732298888522065e-06, "loss": 0.5322, "step": 28900 }, { "epoch": 0.6379936200637993, "grad_norm": 1.7388654947280884, "learning_rate": 8.727896995708155e-06, "loss": 0.5291, "step": 29000 }, { "epoch": 0.6401935980640193, "grad_norm": 1.6228729486465454, "learning_rate": 8.723495102894246e-06, "loss": 0.5318, "step": 29100 }, { "epoch": 0.6423935760642393, "grad_norm": 2.1541671752929688, "learning_rate": 8.719093210080334e-06, "loss": 0.5376, "step": 29200 }, { "epoch": 0.6445935540644594, "grad_norm": 2.0600032806396484, "learning_rate": 8.714691317266424e-06, "loss": 0.5342, "step": 29300 }, { "epoch": 0.6467935320646794, "grad_norm": 1.673624873161316, "learning_rate": 8.710289424452515e-06, "loss": 0.5533, "step": 29400 }, { "epoch": 0.6489935100648994, "grad_norm": 1.8217624425888062, "learning_rate": 8.705887531638605e-06, "loss": 0.526, "step": 29500 }, { "epoch": 0.6511934880651193, "grad_norm": 2.1350643634796143, "learning_rate": 8.701485638824695e-06, "loss": 0.5254, "step": 29600 }, { "epoch": 0.6533934660653393, "grad_norm": 1.7675269842147827, "learning_rate": 8.697083746010786e-06, "loss": 0.5191, "step": 29700 }, { "epoch": 0.6555934440655593, "grad_norm": 2.134058952331543, "learning_rate": 8.692681853196874e-06, "loss": 0.5329, "step": 29800 }, { "epoch": 0.6577934220657793, "grad_norm": 1.6623740196228027, "learning_rate": 8.688279960382964e-06, "loss": 0.5287, "step": 29900 }, { "epoch": 0.6599934000659994, "grad_norm": 2.05334210395813, "learning_rate": 8.683878067569056e-06, "loss": 0.5393, "step": 30000 }, { "epoch": 0.6621933780662194, "grad_norm": 1.7684849500656128, "learning_rate": 8.679476174755147e-06, "loss": 0.527, "step": 30100 }, { "epoch": 0.6643933560664393, "grad_norm": 1.825725793838501, "learning_rate": 8.675074281941235e-06, "loss": 0.5314, "step": 30200 }, { "epoch": 0.6665933340666593, "grad_norm": 1.9619163274765015, "learning_rate": 8.670672389127326e-06, "loss": 0.5238, "step": 30300 }, { "epoch": 0.6687933120668793, "grad_norm": 1.7254787683486938, "learning_rate": 8.666270496313416e-06, "loss": 0.5253, "step": 30400 }, { "epoch": 0.6709932900670993, "grad_norm": 1.739046335220337, "learning_rate": 8.661868603499506e-06, "loss": 0.5452, "step": 30500 }, { "epoch": 0.6731932680673193, "grad_norm": 1.9458619356155396, "learning_rate": 8.657466710685596e-06, "loss": 0.5253, "step": 30600 }, { "epoch": 0.6753932460675394, "grad_norm": 1.9501069784164429, "learning_rate": 8.653064817871687e-06, "loss": 0.5313, "step": 30700 }, { "epoch": 0.6775932240677593, "grad_norm": 1.4754610061645508, "learning_rate": 8.648662925057775e-06, "loss": 0.5409, "step": 30800 }, { "epoch": 0.6797932020679793, "grad_norm": 1.7951412200927734, "learning_rate": 8.644261032243866e-06, "loss": 0.558, "step": 30900 }, { "epoch": 0.6819931800681993, "grad_norm": 1.5883880853652954, "learning_rate": 8.639859139429956e-06, "loss": 0.5668, "step": 31000 }, { "epoch": 0.6841931580684193, "grad_norm": 1.7715564966201782, "learning_rate": 8.635457246616046e-06, "loss": 0.5567, "step": 31100 }, { "epoch": 0.6863931360686393, "grad_norm": 1.7103959321975708, "learning_rate": 8.631055353802136e-06, "loss": 0.5646, "step": 31200 }, { "epoch": 0.6885931140688593, "grad_norm": 2.053924322128296, "learning_rate": 8.626653460988225e-06, "loss": 0.5554, "step": 31300 }, { "epoch": 0.6907930920690794, "grad_norm": 1.3964165449142456, "learning_rate": 8.622251568174315e-06, "loss": 0.5341, "step": 31400 }, { "epoch": 0.6929930700692993, "grad_norm": 1.623286485671997, "learning_rate": 8.617849675360406e-06, "loss": 0.5475, "step": 31500 }, { "epoch": 0.6951930480695193, "grad_norm": 1.5909929275512695, "learning_rate": 8.613447782546496e-06, "loss": 0.543, "step": 31600 }, { "epoch": 0.6973930260697393, "grad_norm": 1.6793596744537354, "learning_rate": 8.609045889732586e-06, "loss": 0.5642, "step": 31700 }, { "epoch": 0.6995930040699593, "grad_norm": 1.5003210306167603, "learning_rate": 8.604643996918676e-06, "loss": 0.5528, "step": 31800 }, { "epoch": 0.7017929820701793, "grad_norm": 1.6098058223724365, "learning_rate": 8.600242104104765e-06, "loss": 0.5591, "step": 31900 }, { "epoch": 0.7039929600703992, "grad_norm": 1.8180344104766846, "learning_rate": 8.595840211290855e-06, "loss": 0.5575, "step": 32000 }, { "epoch": 0.7061929380706193, "grad_norm": 1.6185832023620605, "learning_rate": 8.591438318476946e-06, "loss": 0.5555, "step": 32100 }, { "epoch": 0.7083929160708393, "grad_norm": 1.7686482667922974, "learning_rate": 8.587036425663036e-06, "loss": 0.5562, "step": 32200 }, { "epoch": 0.7105928940710593, "grad_norm": 1.6809719800949097, "learning_rate": 8.582634532849126e-06, "loss": 0.5519, "step": 32300 }, { "epoch": 0.7127928720712793, "grad_norm": 1.8532384634017944, "learning_rate": 8.578232640035216e-06, "loss": 0.5466, "step": 32400 }, { "epoch": 0.7149928500714993, "grad_norm": 1.6389007568359375, "learning_rate": 8.573830747221305e-06, "loss": 0.5527, "step": 32500 }, { "epoch": 0.7171928280717192, "grad_norm": 1.6388925313949585, "learning_rate": 8.569428854407395e-06, "loss": 0.5439, "step": 32600 }, { "epoch": 0.7193928060719392, "grad_norm": 1.7384296655654907, "learning_rate": 8.565026961593486e-06, "loss": 0.5375, "step": 32700 }, { "epoch": 0.7215927840721593, "grad_norm": 1.7327488660812378, "learning_rate": 8.560625068779576e-06, "loss": 0.5548, "step": 32800 }, { "epoch": 0.7237927620723793, "grad_norm": 1.564349889755249, "learning_rate": 8.556223175965666e-06, "loss": 0.5573, "step": 32900 }, { "epoch": 0.7259927400725993, "grad_norm": 1.8052953481674194, "learning_rate": 8.551821283151756e-06, "loss": 0.524, "step": 33000 }, { "epoch": 0.7281927180728193, "grad_norm": 1.5981229543685913, "learning_rate": 8.547419390337845e-06, "loss": 0.5449, "step": 33100 }, { "epoch": 0.7303926960730392, "grad_norm": 1.4789613485336304, "learning_rate": 8.543017497523935e-06, "loss": 0.5356, "step": 33200 }, { "epoch": 0.7325926740732592, "grad_norm": 1.8192943334579468, "learning_rate": 8.538615604710026e-06, "loss": 0.5691, "step": 33300 }, { "epoch": 0.7347926520734792, "grad_norm": 1.874607801437378, "learning_rate": 8.534213711896116e-06, "loss": 0.5539, "step": 33400 }, { "epoch": 0.7369926300736993, "grad_norm": 1.6394860744476318, "learning_rate": 8.529811819082206e-06, "loss": 0.5653, "step": 33500 }, { "epoch": 0.7391926080739193, "grad_norm": 1.9063067436218262, "learning_rate": 8.525409926268296e-06, "loss": 0.5515, "step": 33600 }, { "epoch": 0.7413925860741393, "grad_norm": 1.6854544878005981, "learning_rate": 8.521008033454387e-06, "loss": 0.5534, "step": 33700 }, { "epoch": 0.7435925640743593, "grad_norm": 1.7821418046951294, "learning_rate": 8.516606140640477e-06, "loss": 0.5521, "step": 33800 }, { "epoch": 0.7457925420745792, "grad_norm": 1.5063166618347168, "learning_rate": 8.512204247826567e-06, "loss": 0.5667, "step": 33900 }, { "epoch": 0.7479925200747992, "grad_norm": 1.9604572057724, "learning_rate": 8.507802355012656e-06, "loss": 0.5434, "step": 34000 }, { "epoch": 0.7501924980750192, "grad_norm": 1.8538181781768799, "learning_rate": 8.503400462198746e-06, "loss": 0.5366, "step": 34100 }, { "epoch": 0.7523924760752393, "grad_norm": 1.8284313678741455, "learning_rate": 8.498998569384836e-06, "loss": 0.5549, "step": 34200 }, { "epoch": 0.7545924540754593, "grad_norm": 1.5392765998840332, "learning_rate": 8.494596676570927e-06, "loss": 0.5459, "step": 34300 }, { "epoch": 0.7567924320756793, "grad_norm": 1.601608157157898, "learning_rate": 8.490194783757017e-06, "loss": 0.5478, "step": 34400 }, { "epoch": 0.7589924100758992, "grad_norm": 1.602129340171814, "learning_rate": 8.485792890943107e-06, "loss": 0.5264, "step": 34500 }, { "epoch": 0.7611923880761192, "grad_norm": 1.5455442667007446, "learning_rate": 8.481390998129196e-06, "loss": 0.5452, "step": 34600 }, { "epoch": 0.7633923660763392, "grad_norm": 1.7308459281921387, "learning_rate": 8.476989105315286e-06, "loss": 0.5346, "step": 34700 }, { "epoch": 0.7655923440765592, "grad_norm": 1.9421132802963257, "learning_rate": 8.472587212501376e-06, "loss": 0.5502, "step": 34800 }, { "epoch": 0.7677923220767793, "grad_norm": 1.6126275062561035, "learning_rate": 8.468185319687467e-06, "loss": 0.5531, "step": 34900 }, { "epoch": 0.7699923000769993, "grad_norm": 1.9307098388671875, "learning_rate": 8.463783426873557e-06, "loss": 0.5451, "step": 35000 }, { "epoch": 0.7721922780772192, "grad_norm": 1.785501480102539, "learning_rate": 8.459381534059646e-06, "loss": 0.5657, "step": 35100 }, { "epoch": 0.7743922560774392, "grad_norm": 1.3118321895599365, "learning_rate": 8.454979641245736e-06, "loss": 0.5425, "step": 35200 }, { "epoch": 0.7765922340776592, "grad_norm": 1.6785212755203247, "learning_rate": 8.450577748431826e-06, "loss": 0.5608, "step": 35300 }, { "epoch": 0.7787922120778792, "grad_norm": 1.687156081199646, "learning_rate": 8.446175855617916e-06, "loss": 0.5268, "step": 35400 }, { "epoch": 0.7809921900780992, "grad_norm": 1.6766939163208008, "learning_rate": 8.441773962804007e-06, "loss": 0.5505, "step": 35500 }, { "epoch": 0.7831921680783193, "grad_norm": 1.3873755931854248, "learning_rate": 8.437372069990097e-06, "loss": 0.5346, "step": 35600 }, { "epoch": 0.7853921460785392, "grad_norm": 1.4507646560668945, "learning_rate": 8.432970177176186e-06, "loss": 0.5456, "step": 35700 }, { "epoch": 0.7875921240787592, "grad_norm": 1.7354850769042969, "learning_rate": 8.428568284362276e-06, "loss": 0.5502, "step": 35800 }, { "epoch": 0.7897921020789792, "grad_norm": 1.4922300577163696, "learning_rate": 8.424166391548366e-06, "loss": 0.5628, "step": 35900 }, { "epoch": 0.7919920800791992, "grad_norm": 1.722380518913269, "learning_rate": 8.419764498734456e-06, "loss": 0.5556, "step": 36000 }, { "epoch": 0.7941920580794192, "grad_norm": 1.905194640159607, "learning_rate": 8.415362605920547e-06, "loss": 0.5529, "step": 36100 }, { "epoch": 0.7963920360796392, "grad_norm": 2.140815496444702, "learning_rate": 8.410960713106637e-06, "loss": 0.5567, "step": 36200 }, { "epoch": 0.7985920140798592, "grad_norm": 1.5261491537094116, "learning_rate": 8.406558820292726e-06, "loss": 0.554, "step": 36300 }, { "epoch": 0.8007919920800792, "grad_norm": 1.6273101568222046, "learning_rate": 8.402156927478816e-06, "loss": 0.5534, "step": 36400 }, { "epoch": 0.8029919700802992, "grad_norm": 1.7818236351013184, "learning_rate": 8.397755034664906e-06, "loss": 0.5408, "step": 36500 }, { "epoch": 0.8051919480805192, "grad_norm": 1.9317457675933838, "learning_rate": 8.393353141850996e-06, "loss": 0.5726, "step": 36600 }, { "epoch": 0.8073919260807392, "grad_norm": 1.813769817352295, "learning_rate": 8.388951249037087e-06, "loss": 0.5605, "step": 36700 }, { "epoch": 0.8095919040809592, "grad_norm": 1.9883424043655396, "learning_rate": 8.384549356223177e-06, "loss": 0.5489, "step": 36800 }, { "epoch": 0.8117918820811791, "grad_norm": 1.709024429321289, "learning_rate": 8.380147463409267e-06, "loss": 0.5411, "step": 36900 }, { "epoch": 0.8139918600813992, "grad_norm": 1.4431244134902954, "learning_rate": 8.375745570595357e-06, "loss": 0.5472, "step": 37000 }, { "epoch": 0.8161918380816192, "grad_norm": 1.5251537561416626, "learning_rate": 8.371343677781448e-06, "loss": 0.5479, "step": 37100 }, { "epoch": 0.8183918160818392, "grad_norm": 1.687023401260376, "learning_rate": 8.366941784967536e-06, "loss": 0.543, "step": 37200 }, { "epoch": 0.8205917940820592, "grad_norm": 1.5462446212768555, "learning_rate": 8.362539892153627e-06, "loss": 0.55, "step": 37300 }, { "epoch": 0.8227917720822792, "grad_norm": 1.984750747680664, "learning_rate": 8.358137999339717e-06, "loss": 0.5495, "step": 37400 }, { "epoch": 0.8249917500824991, "grad_norm": 1.6375317573547363, "learning_rate": 8.353736106525807e-06, "loss": 0.5479, "step": 37500 }, { "epoch": 0.8271917280827191, "grad_norm": 1.8285633325576782, "learning_rate": 8.349334213711897e-06, "loss": 0.5398, "step": 37600 }, { "epoch": 0.8293917060829392, "grad_norm": 1.7603964805603027, "learning_rate": 8.344932320897988e-06, "loss": 0.5343, "step": 37700 }, { "epoch": 0.8315916840831592, "grad_norm": 1.4836808443069458, "learning_rate": 8.340530428084076e-06, "loss": 0.5559, "step": 37800 }, { "epoch": 0.8337916620833792, "grad_norm": 1.4867973327636719, "learning_rate": 8.336128535270167e-06, "loss": 0.5433, "step": 37900 }, { "epoch": 0.8359916400835992, "grad_norm": 1.784264326095581, "learning_rate": 8.331726642456257e-06, "loss": 0.5451, "step": 38000 }, { "epoch": 0.8381916180838191, "grad_norm": 1.3747423887252808, "learning_rate": 8.327324749642347e-06, "loss": 0.538, "step": 38100 }, { "epoch": 0.8403915960840391, "grad_norm": 1.8073352575302124, "learning_rate": 8.322922856828437e-06, "loss": 0.545, "step": 38200 }, { "epoch": 0.8425915740842591, "grad_norm": 1.6162651777267456, "learning_rate": 8.318520964014528e-06, "loss": 0.5448, "step": 38300 }, { "epoch": 0.8447915520844792, "grad_norm": 1.6627821922302246, "learning_rate": 8.314119071200616e-06, "loss": 0.5504, "step": 38400 }, { "epoch": 0.8469915300846992, "grad_norm": 1.594759464263916, "learning_rate": 8.309717178386707e-06, "loss": 0.5344, "step": 38500 }, { "epoch": 0.8491915080849192, "grad_norm": 1.7449952363967896, "learning_rate": 8.305315285572797e-06, "loss": 0.5558, "step": 38600 }, { "epoch": 0.8513914860851391, "grad_norm": 1.6787577867507935, "learning_rate": 8.300913392758887e-06, "loss": 0.5282, "step": 38700 }, { "epoch": 0.8535914640853591, "grad_norm": 2.2145471572875977, "learning_rate": 8.296511499944977e-06, "loss": 0.5371, "step": 38800 }, { "epoch": 0.8557914420855791, "grad_norm": 1.7959023714065552, "learning_rate": 8.292109607131068e-06, "loss": 0.5467, "step": 38900 }, { "epoch": 0.8579914200857991, "grad_norm": 1.7362741231918335, "learning_rate": 8.287707714317156e-06, "loss": 0.5334, "step": 39000 }, { "epoch": 0.8601913980860192, "grad_norm": 1.471660852432251, "learning_rate": 8.283305821503247e-06, "loss": 0.5563, "step": 39100 }, { "epoch": 0.8623913760862392, "grad_norm": 1.9247560501098633, "learning_rate": 8.278903928689337e-06, "loss": 0.5422, "step": 39200 }, { "epoch": 0.8645913540864592, "grad_norm": 1.4459770917892456, "learning_rate": 8.274502035875427e-06, "loss": 0.5549, "step": 39300 }, { "epoch": 0.8667913320866791, "grad_norm": 1.8843663930892944, "learning_rate": 8.270100143061517e-06, "loss": 0.5463, "step": 39400 }, { "epoch": 0.8689913100868991, "grad_norm": 1.6664437055587769, "learning_rate": 8.265698250247606e-06, "loss": 0.557, "step": 39500 }, { "epoch": 0.8711912880871191, "grad_norm": 1.8281344175338745, "learning_rate": 8.261296357433696e-06, "loss": 0.5306, "step": 39600 }, { "epoch": 0.8733912660873391, "grad_norm": 1.9608473777770996, "learning_rate": 8.256894464619787e-06, "loss": 0.5458, "step": 39700 }, { "epoch": 0.8755912440875592, "grad_norm": 1.9003684520721436, "learning_rate": 8.252492571805877e-06, "loss": 0.55, "step": 39800 }, { "epoch": 0.8777912220877792, "grad_norm": 1.8628289699554443, "learning_rate": 8.248090678991967e-06, "loss": 0.5379, "step": 39900 }, { "epoch": 0.8799912000879991, "grad_norm": 1.5854053497314453, "learning_rate": 8.243688786178057e-06, "loss": 0.5352, "step": 40000 }, { "epoch": 0.8821911780882191, "grad_norm": 1.957435965538025, "learning_rate": 8.239286893364146e-06, "loss": 0.5358, "step": 40100 }, { "epoch": 0.8843911560884391, "grad_norm": 1.838132381439209, "learning_rate": 8.234885000550236e-06, "loss": 0.5423, "step": 40200 }, { "epoch": 0.8865911340886591, "grad_norm": 1.936266541481018, "learning_rate": 8.230483107736327e-06, "loss": 0.5335, "step": 40300 }, { "epoch": 0.8887911120888791, "grad_norm": 1.5629870891571045, "learning_rate": 8.226081214922419e-06, "loss": 0.5354, "step": 40400 }, { "epoch": 0.8909910900890992, "grad_norm": 1.7080520391464233, "learning_rate": 8.221679322108507e-06, "loss": 0.5532, "step": 40500 }, { "epoch": 0.8931910680893191, "grad_norm": 1.795921802520752, "learning_rate": 8.217277429294597e-06, "loss": 0.5528, "step": 40600 }, { "epoch": 0.8953910460895391, "grad_norm": 1.955198884010315, "learning_rate": 8.212875536480688e-06, "loss": 0.5598, "step": 40700 }, { "epoch": 0.8975910240897591, "grad_norm": 1.865143895149231, "learning_rate": 8.208473643666778e-06, "loss": 0.5371, "step": 40800 }, { "epoch": 0.8997910020899791, "grad_norm": 1.8305407762527466, "learning_rate": 8.204071750852868e-06, "loss": 0.5459, "step": 40900 }, { "epoch": 0.9019909800901991, "grad_norm": 2.158996820449829, "learning_rate": 8.199669858038959e-06, "loss": 0.5477, "step": 41000 }, { "epoch": 0.904190958090419, "grad_norm": 1.5184693336486816, "learning_rate": 8.195267965225047e-06, "loss": 0.5536, "step": 41100 }, { "epoch": 0.9063909360906391, "grad_norm": 1.2580761909484863, "learning_rate": 8.190866072411137e-06, "loss": 0.5444, "step": 41200 }, { "epoch": 0.9085909140908591, "grad_norm": 1.5662882328033447, "learning_rate": 8.186464179597228e-06, "loss": 0.5474, "step": 41300 }, { "epoch": 0.9107908920910791, "grad_norm": 1.775161623954773, "learning_rate": 8.182062286783318e-06, "loss": 0.5405, "step": 41400 }, { "epoch": 0.9129908700912991, "grad_norm": 1.604435920715332, "learning_rate": 8.177660393969408e-06, "loss": 0.5425, "step": 41500 }, { "epoch": 0.9151908480915191, "grad_norm": 1.9549158811569214, "learning_rate": 8.173258501155497e-06, "loss": 0.5398, "step": 41600 }, { "epoch": 0.917390826091739, "grad_norm": 1.4547535181045532, "learning_rate": 8.168856608341587e-06, "loss": 0.5511, "step": 41700 }, { "epoch": 0.919590804091959, "grad_norm": 1.8771201372146606, "learning_rate": 8.164454715527677e-06, "loss": 0.5481, "step": 41800 }, { "epoch": 0.9217907820921791, "grad_norm": 2.0473129749298096, "learning_rate": 8.160052822713768e-06, "loss": 0.5418, "step": 41900 }, { "epoch": 0.9239907600923991, "grad_norm": 1.8082759380340576, "learning_rate": 8.155650929899858e-06, "loss": 0.5346, "step": 42000 }, { "epoch": 0.9261907380926191, "grad_norm": 1.8849467039108276, "learning_rate": 8.151249037085948e-06, "loss": 0.5563, "step": 42100 }, { "epoch": 0.9283907160928391, "grad_norm": 1.6767569780349731, "learning_rate": 8.146847144272037e-06, "loss": 0.536, "step": 42200 }, { "epoch": 0.9305906940930591, "grad_norm": 1.9930092096328735, "learning_rate": 8.142445251458127e-06, "loss": 0.5507, "step": 42300 }, { "epoch": 0.932790672093279, "grad_norm": 1.9420870542526245, "learning_rate": 8.138043358644217e-06, "loss": 0.5405, "step": 42400 }, { "epoch": 0.934990650093499, "grad_norm": 1.6965640783309937, "learning_rate": 8.133641465830308e-06, "loss": 0.5469, "step": 42500 }, { "epoch": 0.9371906280937191, "grad_norm": 1.4808323383331299, "learning_rate": 8.129239573016398e-06, "loss": 0.5341, "step": 42600 }, { "epoch": 0.9393906060939391, "grad_norm": 1.516119122505188, "learning_rate": 8.124837680202488e-06, "loss": 0.5515, "step": 42700 }, { "epoch": 0.9415905840941591, "grad_norm": 1.6243934631347656, "learning_rate": 8.120435787388577e-06, "loss": 0.541, "step": 42800 }, { "epoch": 0.9437905620943791, "grad_norm": 1.6918444633483887, "learning_rate": 8.116033894574667e-06, "loss": 0.5302, "step": 42900 }, { "epoch": 0.945990540094599, "grad_norm": 1.6359889507293701, "learning_rate": 8.111632001760757e-06, "loss": 0.5295, "step": 43000 }, { "epoch": 0.948190518094819, "grad_norm": 1.7587625980377197, "learning_rate": 8.107230108946848e-06, "loss": 0.5415, "step": 43100 }, { "epoch": 0.950390496095039, "grad_norm": 1.8017805814743042, "learning_rate": 8.102828216132938e-06, "loss": 0.5422, "step": 43200 }, { "epoch": 0.9525904740952591, "grad_norm": 1.970982313156128, "learning_rate": 8.098426323319027e-06, "loss": 0.5296, "step": 43300 }, { "epoch": 0.9547904520954791, "grad_norm": 1.8112688064575195, "learning_rate": 8.094024430505117e-06, "loss": 0.5539, "step": 43400 }, { "epoch": 0.9569904300956991, "grad_norm": 1.7808321714401245, "learning_rate": 8.089622537691207e-06, "loss": 0.5498, "step": 43500 }, { "epoch": 0.959190408095919, "grad_norm": 1.9657952785491943, "learning_rate": 8.085220644877297e-06, "loss": 0.5424, "step": 43600 }, { "epoch": 0.961390386096139, "grad_norm": 1.8520526885986328, "learning_rate": 8.080818752063388e-06, "loss": 0.5392, "step": 43700 }, { "epoch": 0.963590364096359, "grad_norm": 1.7919948101043701, "learning_rate": 8.076416859249478e-06, "loss": 0.532, "step": 43800 }, { "epoch": 0.965790342096579, "grad_norm": 1.600967288017273, "learning_rate": 8.072014966435568e-06, "loss": 0.5406, "step": 43900 }, { "epoch": 0.9679903200967991, "grad_norm": 1.638075351715088, "learning_rate": 8.067613073621659e-06, "loss": 0.553, "step": 44000 }, { "epoch": 0.9701902980970191, "grad_norm": 1.5249767303466797, "learning_rate": 8.063211180807749e-06, "loss": 0.5533, "step": 44100 }, { "epoch": 0.972390276097239, "grad_norm": 1.6304973363876343, "learning_rate": 8.05880928799384e-06, "loss": 0.5377, "step": 44200 }, { "epoch": 0.974590254097459, "grad_norm": 1.8152045011520386, "learning_rate": 8.054407395179928e-06, "loss": 0.5284, "step": 44300 }, { "epoch": 0.976790232097679, "grad_norm": 1.652199625968933, "learning_rate": 8.050005502366018e-06, "loss": 0.5448, "step": 44400 }, { "epoch": 0.978990210097899, "grad_norm": 1.7338589429855347, "learning_rate": 8.045603609552108e-06, "loss": 0.5395, "step": 44500 }, { "epoch": 0.981190188098119, "grad_norm": 1.5801849365234375, "learning_rate": 8.041201716738199e-06, "loss": 0.5297, "step": 44600 }, { "epoch": 0.9833901660983391, "grad_norm": 2.031813621520996, "learning_rate": 8.036799823924289e-06, "loss": 0.5617, "step": 44700 }, { "epoch": 0.985590144098559, "grad_norm": 1.934370756149292, "learning_rate": 8.03239793111038e-06, "loss": 0.5329, "step": 44800 }, { "epoch": 0.987790122098779, "grad_norm": 1.849741816520691, "learning_rate": 8.027996038296468e-06, "loss": 0.5413, "step": 44900 }, { "epoch": 0.989990100098999, "grad_norm": 1.757784366607666, "learning_rate": 8.023594145482558e-06, "loss": 0.5319, "step": 45000 }, { "epoch": 0.992190078099219, "grad_norm": 1.6084299087524414, "learning_rate": 8.019192252668648e-06, "loss": 0.5465, "step": 45100 }, { "epoch": 0.994390056099439, "grad_norm": 1.9279767274856567, "learning_rate": 8.014790359854739e-06, "loss": 0.5425, "step": 45200 }, { "epoch": 0.996590034099659, "grad_norm": 1.5739712715148926, "learning_rate": 8.010388467040829e-06, "loss": 0.5471, "step": 45300 }, { "epoch": 0.998790012099879, "grad_norm": 1.5087926387786865, "learning_rate": 8.005986574226917e-06, "loss": 0.5417, "step": 45400 }, { "epoch": 1.000989990100099, "grad_norm": 2.411069393157959, "learning_rate": 8.001584681413008e-06, "loss": 0.5328, "step": 45500 }, { "epoch": 1.003189968100319, "grad_norm": 2.560279607772827, "learning_rate": 7.997182788599098e-06, "loss": 0.5018, "step": 45600 }, { "epoch": 1.005389946100539, "grad_norm": 1.8764352798461914, "learning_rate": 7.992780895785188e-06, "loss": 0.4947, "step": 45700 }, { "epoch": 1.007589924100759, "grad_norm": 2.0531773567199707, "learning_rate": 7.988379002971279e-06, "loss": 0.5016, "step": 45800 }, { "epoch": 1.009789902100979, "grad_norm": 2.1719043254852295, "learning_rate": 7.983977110157369e-06, "loss": 0.504, "step": 45900 }, { "epoch": 1.011989880101199, "grad_norm": 1.8235334157943726, "learning_rate": 7.979575217343457e-06, "loss": 0.4967, "step": 46000 }, { "epoch": 1.014189858101419, "grad_norm": 2.329827308654785, "learning_rate": 7.975173324529548e-06, "loss": 0.5121, "step": 46100 }, { "epoch": 1.016389836101639, "grad_norm": 2.2712931632995605, "learning_rate": 7.970771431715638e-06, "loss": 0.4901, "step": 46200 }, { "epoch": 1.018589814101859, "grad_norm": 1.9942501783370972, "learning_rate": 7.966369538901728e-06, "loss": 0.5052, "step": 46300 }, { "epoch": 1.020789792102079, "grad_norm": 2.014451742172241, "learning_rate": 7.961967646087819e-06, "loss": 0.5117, "step": 46400 }, { "epoch": 1.022989770102299, "grad_norm": 2.1809909343719482, "learning_rate": 7.957565753273909e-06, "loss": 0.5106, "step": 46500 }, { "epoch": 1.025189748102519, "grad_norm": 1.6118221282958984, "learning_rate": 7.953163860459997e-06, "loss": 0.4959, "step": 46600 }, { "epoch": 1.027389726102739, "grad_norm": 1.9853328466415405, "learning_rate": 7.948761967646088e-06, "loss": 0.5127, "step": 46700 }, { "epoch": 1.029589704102959, "grad_norm": 2.3931078910827637, "learning_rate": 7.944360074832178e-06, "loss": 0.5084, "step": 46800 }, { "epoch": 1.031789682103179, "grad_norm": 1.6679604053497314, "learning_rate": 7.939958182018268e-06, "loss": 0.4913, "step": 46900 }, { "epoch": 1.0339896601033989, "grad_norm": 2.377412796020508, "learning_rate": 7.935556289204359e-06, "loss": 0.4915, "step": 47000 }, { "epoch": 1.0361896381036189, "grad_norm": 2.0759618282318115, "learning_rate": 7.931154396390449e-06, "loss": 0.5011, "step": 47100 }, { "epoch": 1.038389616103839, "grad_norm": 2.061979055404663, "learning_rate": 7.926752503576537e-06, "loss": 0.4945, "step": 47200 }, { "epoch": 1.040589594104059, "grad_norm": 1.912423849105835, "learning_rate": 7.92235061076263e-06, "loss": 0.496, "step": 47300 }, { "epoch": 1.042789572104279, "grad_norm": 2.3455774784088135, "learning_rate": 7.91794871794872e-06, "loss": 0.5063, "step": 47400 }, { "epoch": 1.044989550104499, "grad_norm": 1.7976536750793457, "learning_rate": 7.913546825134808e-06, "loss": 0.5053, "step": 47500 }, { "epoch": 1.047189528104719, "grad_norm": 2.056267023086548, "learning_rate": 7.909144932320899e-06, "loss": 0.4939, "step": 47600 }, { "epoch": 1.049389506104939, "grad_norm": 2.216721534729004, "learning_rate": 7.904743039506989e-06, "loss": 0.5007, "step": 47700 }, { "epoch": 1.051589484105159, "grad_norm": 1.4782536029815674, "learning_rate": 7.90034114669308e-06, "loss": 0.4765, "step": 47800 }, { "epoch": 1.053789462105379, "grad_norm": 1.739716649055481, "learning_rate": 7.89593925387917e-06, "loss": 0.5245, "step": 47900 }, { "epoch": 1.055989440105599, "grad_norm": 1.5695744752883911, "learning_rate": 7.89153736106526e-06, "loss": 0.511, "step": 48000 }, { "epoch": 1.058189418105819, "grad_norm": 2.0835139751434326, "learning_rate": 7.887135468251348e-06, "loss": 0.4989, "step": 48100 }, { "epoch": 1.0603893961060389, "grad_norm": 1.9040948152542114, "learning_rate": 7.882733575437439e-06, "loss": 0.5001, "step": 48200 }, { "epoch": 1.0625893741062589, "grad_norm": 2.1570136547088623, "learning_rate": 7.878331682623529e-06, "loss": 0.5031, "step": 48300 }, { "epoch": 1.0647893521064788, "grad_norm": 1.8248552083969116, "learning_rate": 7.873929789809619e-06, "loss": 0.504, "step": 48400 }, { "epoch": 1.0669893301066988, "grad_norm": 1.8128606081008911, "learning_rate": 7.86952789699571e-06, "loss": 0.4825, "step": 48500 }, { "epoch": 1.069189308106919, "grad_norm": 2.15380597114563, "learning_rate": 7.8651260041818e-06, "loss": 0.4843, "step": 48600 }, { "epoch": 1.071389286107139, "grad_norm": 2.4410858154296875, "learning_rate": 7.860724111367888e-06, "loss": 0.4973, "step": 48700 }, { "epoch": 1.073589264107359, "grad_norm": 1.9602640867233276, "learning_rate": 7.856322218553979e-06, "loss": 0.5039, "step": 48800 }, { "epoch": 1.075789242107579, "grad_norm": 2.189321994781494, "learning_rate": 7.851920325740069e-06, "loss": 0.5002, "step": 48900 }, { "epoch": 1.077989220107799, "grad_norm": 2.153059244155884, "learning_rate": 7.847518432926159e-06, "loss": 0.5074, "step": 49000 }, { "epoch": 1.080189198108019, "grad_norm": 1.9804766178131104, "learning_rate": 7.84311654011225e-06, "loss": 0.4981, "step": 49100 }, { "epoch": 1.082389176108239, "grad_norm": 2.228227376937866, "learning_rate": 7.838714647298338e-06, "loss": 0.5115, "step": 49200 }, { "epoch": 1.084589154108459, "grad_norm": 2.639230489730835, "learning_rate": 7.834312754484428e-06, "loss": 0.4956, "step": 49300 }, { "epoch": 1.086789132108679, "grad_norm": 2.2388269901275635, "learning_rate": 7.829910861670519e-06, "loss": 0.4957, "step": 49400 }, { "epoch": 1.0889891101088989, "grad_norm": 2.2344448566436768, "learning_rate": 7.825508968856609e-06, "loss": 0.5191, "step": 49500 }, { "epoch": 1.0911890881091189, "grad_norm": 2.1383955478668213, "learning_rate": 7.821107076042699e-06, "loss": 0.5035, "step": 49600 }, { "epoch": 1.0933890661093388, "grad_norm": 2.0469112396240234, "learning_rate": 7.81670518322879e-06, "loss": 0.4991, "step": 49700 }, { "epoch": 1.0955890441095588, "grad_norm": 2.091733694076538, "learning_rate": 7.812303290414878e-06, "loss": 0.5213, "step": 49800 }, { "epoch": 1.0977890221097788, "grad_norm": 2.2485196590423584, "learning_rate": 7.807901397600968e-06, "loss": 0.5159, "step": 49900 }, { "epoch": 1.099989000109999, "grad_norm": 2.335508108139038, "learning_rate": 7.803499504787059e-06, "loss": 0.5035, "step": 50000 }, { "epoch": 1.099989000109999, "eval_loss": 0.579010546207428, "eval_runtime": 378.8096, "eval_samples_per_second": 158.391, "eval_steps_per_second": 4.95, "step": 50000 }, { "epoch": 1.102188978110219, "grad_norm": 2.1119778156280518, "learning_rate": 7.799097611973149e-06, "loss": 0.5081, "step": 50100 }, { "epoch": 1.104388956110439, "grad_norm": 2.182777166366577, "learning_rate": 7.794695719159239e-06, "loss": 0.4925, "step": 50200 }, { "epoch": 1.106588934110659, "grad_norm": 2.2675302028656006, "learning_rate": 7.79029382634533e-06, "loss": 0.4865, "step": 50300 }, { "epoch": 1.108788912110879, "grad_norm": 1.858472228050232, "learning_rate": 7.785891933531418e-06, "loss": 0.5118, "step": 50400 }, { "epoch": 1.110988890111099, "grad_norm": 1.8882789611816406, "learning_rate": 7.781490040717508e-06, "loss": 0.5087, "step": 50500 }, { "epoch": 1.113188868111319, "grad_norm": 1.9170640707015991, "learning_rate": 7.777088147903599e-06, "loss": 0.491, "step": 50600 }, { "epoch": 1.1153888461115389, "grad_norm": 1.9825174808502197, "learning_rate": 7.772686255089689e-06, "loss": 0.5072, "step": 50700 }, { "epoch": 1.1175888241117589, "grad_norm": 2.3916232585906982, "learning_rate": 7.768284362275779e-06, "loss": 0.5111, "step": 50800 }, { "epoch": 1.1197888021119788, "grad_norm": 2.069160223007202, "learning_rate": 7.76388246946187e-06, "loss": 0.4927, "step": 50900 }, { "epoch": 1.1219887801121988, "grad_norm": 1.780382752418518, "learning_rate": 7.75948057664796e-06, "loss": 0.4959, "step": 51000 }, { "epoch": 1.1241887581124188, "grad_norm": 2.5268094539642334, "learning_rate": 7.75507868383405e-06, "loss": 0.4975, "step": 51100 }, { "epoch": 1.1263887361126388, "grad_norm": 1.9989362955093384, "learning_rate": 7.75067679102014e-06, "loss": 0.504, "step": 51200 }, { "epoch": 1.1285887141128588, "grad_norm": 2.230954647064209, "learning_rate": 7.746274898206229e-06, "loss": 0.5172, "step": 51300 }, { "epoch": 1.1307886921130788, "grad_norm": 2.2332351207733154, "learning_rate": 7.741873005392319e-06, "loss": 0.5026, "step": 51400 }, { "epoch": 1.132988670113299, "grad_norm": 2.234415054321289, "learning_rate": 7.73747111257841e-06, "loss": 0.5169, "step": 51500 }, { "epoch": 1.135188648113519, "grad_norm": 1.9074784517288208, "learning_rate": 7.7330692197645e-06, "loss": 0.4878, "step": 51600 }, { "epoch": 1.137388626113739, "grad_norm": 1.9809048175811768, "learning_rate": 7.72866732695059e-06, "loss": 0.4794, "step": 51700 }, { "epoch": 1.139588604113959, "grad_norm": 1.90762460231781, "learning_rate": 7.72426543413668e-06, "loss": 0.4996, "step": 51800 }, { "epoch": 1.1417885821141789, "grad_norm": 2.3830220699310303, "learning_rate": 7.719863541322769e-06, "loss": 0.5028, "step": 51900 }, { "epoch": 1.1439885601143989, "grad_norm": 2.052335023880005, "learning_rate": 7.715461648508859e-06, "loss": 0.5189, "step": 52000 }, { "epoch": 1.1461885381146188, "grad_norm": 2.3055222034454346, "learning_rate": 7.71105975569495e-06, "loss": 0.5117, "step": 52100 }, { "epoch": 1.1483885161148388, "grad_norm": 2.7478485107421875, "learning_rate": 7.70665786288104e-06, "loss": 0.503, "step": 52200 }, { "epoch": 1.1505884941150588, "grad_norm": 1.8724684715270996, "learning_rate": 7.70225597006713e-06, "loss": 0.5017, "step": 52300 }, { "epoch": 1.1527884721152788, "grad_norm": 2.1905338764190674, "learning_rate": 7.69785407725322e-06, "loss": 0.4995, "step": 52400 }, { "epoch": 1.1549884501154988, "grad_norm": 2.169680118560791, "learning_rate": 7.693452184439309e-06, "loss": 0.5012, "step": 52500 }, { "epoch": 1.1571884281157188, "grad_norm": 2.3531687259674072, "learning_rate": 7.689050291625399e-06, "loss": 0.4835, "step": 52600 }, { "epoch": 1.159388406115939, "grad_norm": 1.9876978397369385, "learning_rate": 7.68464839881149e-06, "loss": 0.4949, "step": 52700 }, { "epoch": 1.161588384116159, "grad_norm": 2.463718891143799, "learning_rate": 7.68024650599758e-06, "loss": 0.5121, "step": 52800 }, { "epoch": 1.163788362116379, "grad_norm": 2.4976985454559326, "learning_rate": 7.67584461318367e-06, "loss": 0.4882, "step": 52900 }, { "epoch": 1.165988340116599, "grad_norm": 1.968513011932373, "learning_rate": 7.67144272036976e-06, "loss": 0.5052, "step": 53000 }, { "epoch": 1.168188318116819, "grad_norm": 1.998396396636963, "learning_rate": 7.667040827555849e-06, "loss": 0.4912, "step": 53100 }, { "epoch": 1.1703882961170389, "grad_norm": 2.0211946964263916, "learning_rate": 7.662638934741939e-06, "loss": 0.5087, "step": 53200 }, { "epoch": 1.1725882741172589, "grad_norm": 1.97858464717865, "learning_rate": 7.65823704192803e-06, "loss": 0.5015, "step": 53300 }, { "epoch": 1.1747882521174788, "grad_norm": 2.1665027141571045, "learning_rate": 7.65383514911412e-06, "loss": 0.5088, "step": 53400 }, { "epoch": 1.1769882301176988, "grad_norm": 2.3747305870056152, "learning_rate": 7.64943325630021e-06, "loss": 0.4971, "step": 53500 }, { "epoch": 1.1791882081179188, "grad_norm": 2.0653445720672607, "learning_rate": 7.645031363486299e-06, "loss": 0.4999, "step": 53600 }, { "epoch": 1.1813881861181388, "grad_norm": 2.0202314853668213, "learning_rate": 7.640629470672389e-06, "loss": 0.4857, "step": 53700 }, { "epoch": 1.1835881641183588, "grad_norm": 2.1644513607025146, "learning_rate": 7.636227577858479e-06, "loss": 0.4925, "step": 53800 }, { "epoch": 1.1857881421185787, "grad_norm": 2.2284882068634033, "learning_rate": 7.63182568504457e-06, "loss": 0.5076, "step": 53900 }, { "epoch": 1.1879881201187987, "grad_norm": 1.9216992855072021, "learning_rate": 7.62742379223066e-06, "loss": 0.4937, "step": 54000 }, { "epoch": 1.1901880981190187, "grad_norm": 2.151033401489258, "learning_rate": 7.623021899416749e-06, "loss": 0.5042, "step": 54100 }, { "epoch": 1.1923880761192387, "grad_norm": 2.544735908508301, "learning_rate": 7.618620006602839e-06, "loss": 0.5016, "step": 54200 }, { "epoch": 1.194588054119459, "grad_norm": 2.404811382293701, "learning_rate": 7.6142181137889306e-06, "loss": 0.4859, "step": 54300 }, { "epoch": 1.1967880321196789, "grad_norm": 2.071399450302124, "learning_rate": 7.60981622097502e-06, "loss": 0.5009, "step": 54400 }, { "epoch": 1.1989880101198989, "grad_norm": 2.0729258060455322, "learning_rate": 7.60541432816111e-06, "loss": 0.5068, "step": 54500 }, { "epoch": 1.2011879881201188, "grad_norm": 1.9438556432724, "learning_rate": 7.6010124353472006e-06, "loss": 0.5151, "step": 54600 }, { "epoch": 1.2033879661203388, "grad_norm": 2.3928163051605225, "learning_rate": 7.59661054253329e-06, "loss": 0.5152, "step": 54700 }, { "epoch": 1.2055879441205588, "grad_norm": 2.0218889713287354, "learning_rate": 7.59220864971938e-06, "loss": 0.4935, "step": 54800 }, { "epoch": 1.2077879221207788, "grad_norm": 2.0265040397644043, "learning_rate": 7.5878067569054706e-06, "loss": 0.4995, "step": 54900 }, { "epoch": 1.2099879001209988, "grad_norm": 2.6148312091827393, "learning_rate": 7.58340486409156e-06, "loss": 0.5082, "step": 55000 }, { "epoch": 1.2121878781212188, "grad_norm": 2.4383389949798584, "learning_rate": 7.57900297127765e-06, "loss": 0.4982, "step": 55100 }, { "epoch": 1.2143878561214387, "grad_norm": 2.649778366088867, "learning_rate": 7.5746010784637406e-06, "loss": 0.4974, "step": 55200 }, { "epoch": 1.2165878341216587, "grad_norm": 2.525026559829712, "learning_rate": 7.57019918564983e-06, "loss": 0.4953, "step": 55300 }, { "epoch": 1.2187878121218787, "grad_norm": 2.795290470123291, "learning_rate": 7.56579729283592e-06, "loss": 0.5118, "step": 55400 }, { "epoch": 1.220987790122099, "grad_norm": 1.8484504222869873, "learning_rate": 7.5613954000220105e-06, "loss": 0.4897, "step": 55500 }, { "epoch": 1.2231877681223189, "grad_norm": 2.673802614212036, "learning_rate": 7.5569935072081e-06, "loss": 0.4856, "step": 55600 }, { "epoch": 1.2253877461225389, "grad_norm": 2.250032663345337, "learning_rate": 7.55259161439419e-06, "loss": 0.4942, "step": 55700 }, { "epoch": 1.2275877241227588, "grad_norm": 2.281285285949707, "learning_rate": 7.5481897215802805e-06, "loss": 0.492, "step": 55800 }, { "epoch": 1.2297877021229788, "grad_norm": 2.1768269538879395, "learning_rate": 7.54378782876637e-06, "loss": 0.5014, "step": 55900 }, { "epoch": 1.2319876801231988, "grad_norm": 2.172852039337158, "learning_rate": 7.53938593595246e-06, "loss": 0.5055, "step": 56000 }, { "epoch": 1.2341876581234188, "grad_norm": 2.2055068016052246, "learning_rate": 7.5349840431385505e-06, "loss": 0.4994, "step": 56100 }, { "epoch": 1.2363876361236388, "grad_norm": 2.2056238651275635, "learning_rate": 7.53058215032464e-06, "loss": 0.5082, "step": 56200 }, { "epoch": 1.2385876141238588, "grad_norm": 1.8684000968933105, "learning_rate": 7.52618025751073e-06, "loss": 0.5001, "step": 56300 }, { "epoch": 1.2407875921240787, "grad_norm": 1.8799563646316528, "learning_rate": 7.52177836469682e-06, "loss": 0.4863, "step": 56400 }, { "epoch": 1.2429875701242987, "grad_norm": 2.0053553581237793, "learning_rate": 7.51737647188291e-06, "loss": 0.5019, "step": 56500 }, { "epoch": 1.2451875481245187, "grad_norm": 2.526304244995117, "learning_rate": 7.512974579069e-06, "loss": 0.4783, "step": 56600 }, { "epoch": 1.2473875261247387, "grad_norm": 2.2301254272460938, "learning_rate": 7.50857268625509e-06, "loss": 0.4975, "step": 56700 }, { "epoch": 1.2495875041249587, "grad_norm": 1.8377426862716675, "learning_rate": 7.50417079344118e-06, "loss": 0.4929, "step": 56800 }, { "epoch": 1.2517874821251787, "grad_norm": 2.6410109996795654, "learning_rate": 7.49976890062727e-06, "loss": 0.4816, "step": 56900 }, { "epoch": 1.2539874601253986, "grad_norm": 2.0295798778533936, "learning_rate": 7.49536700781336e-06, "loss": 0.5038, "step": 57000 }, { "epoch": 1.2561874381256186, "grad_norm": 2.7886478900909424, "learning_rate": 7.49096511499945e-06, "loss": 0.5147, "step": 57100 }, { "epoch": 1.2583874161258388, "grad_norm": 2.330388307571411, "learning_rate": 7.48656322218554e-06, "loss": 0.4929, "step": 57200 }, { "epoch": 1.2605873941260588, "grad_norm": 1.756525993347168, "learning_rate": 7.48216132937163e-06, "loss": 0.4873, "step": 57300 }, { "epoch": 1.2627873721262788, "grad_norm": 1.7345948219299316, "learning_rate": 7.47775943655772e-06, "loss": 0.4906, "step": 57400 }, { "epoch": 1.2649873501264988, "grad_norm": 2.1234254837036133, "learning_rate": 7.47335754374381e-06, "loss": 0.5082, "step": 57500 }, { "epoch": 1.2671873281267187, "grad_norm": 1.7519376277923584, "learning_rate": 7.4689556509299e-06, "loss": 0.5061, "step": 57600 }, { "epoch": 1.2693873061269387, "grad_norm": 2.4111804962158203, "learning_rate": 7.464553758115991e-06, "loss": 0.4903, "step": 57700 }, { "epoch": 1.2715872841271587, "grad_norm": 1.9729013442993164, "learning_rate": 7.460151865302081e-06, "loss": 0.4881, "step": 57800 }, { "epoch": 1.2737872621273787, "grad_norm": 2.7246460914611816, "learning_rate": 7.455749972488171e-06, "loss": 0.517, "step": 57900 }, { "epoch": 1.2759872401275987, "grad_norm": 1.660434603691101, "learning_rate": 7.451348079674261e-06, "loss": 0.502, "step": 58000 }, { "epoch": 1.2781872181278187, "grad_norm": 2.782742500305176, "learning_rate": 7.446946186860351e-06, "loss": 0.4985, "step": 58100 }, { "epoch": 1.2803871961280386, "grad_norm": 2.264404296875, "learning_rate": 7.4425442940464405e-06, "loss": 0.5016, "step": 58200 }, { "epoch": 1.2825871741282588, "grad_norm": 2.0111939907073975, "learning_rate": 7.438142401232531e-06, "loss": 0.4905, "step": 58300 }, { "epoch": 1.2847871521284788, "grad_norm": 2.0050606727600098, "learning_rate": 7.433740508418621e-06, "loss": 0.4864, "step": 58400 }, { "epoch": 1.2869871301286988, "grad_norm": 1.3107115030288696, "learning_rate": 7.4293386156047105e-06, "loss": 0.4915, "step": 58500 }, { "epoch": 1.2891871081289188, "grad_norm": 1.8996055126190186, "learning_rate": 7.424936722790801e-06, "loss": 0.49, "step": 58600 }, { "epoch": 1.2913870861291388, "grad_norm": 1.7696682214736938, "learning_rate": 7.420534829976891e-06, "loss": 0.4968, "step": 58700 }, { "epoch": 1.2935870641293588, "grad_norm": 2.1315739154815674, "learning_rate": 7.4161329371629805e-06, "loss": 0.4916, "step": 58800 }, { "epoch": 1.2957870421295787, "grad_norm": 2.2679789066314697, "learning_rate": 7.411731044349071e-06, "loss": 0.5021, "step": 58900 }, { "epoch": 1.2979870201297987, "grad_norm": 2.128899097442627, "learning_rate": 7.407329151535161e-06, "loss": 0.5148, "step": 59000 }, { "epoch": 1.3001869981300187, "grad_norm": 2.085585832595825, "learning_rate": 7.4029272587212505e-06, "loss": 0.4804, "step": 59100 }, { "epoch": 1.3023869761302387, "grad_norm": 2.367190361022949, "learning_rate": 7.398525365907341e-06, "loss": 0.4921, "step": 59200 }, { "epoch": 1.3045869541304587, "grad_norm": 2.3802804946899414, "learning_rate": 7.394123473093431e-06, "loss": 0.4974, "step": 59300 }, { "epoch": 1.3067869321306786, "grad_norm": 2.332484483718872, "learning_rate": 7.3897215802795205e-06, "loss": 0.5115, "step": 59400 }, { "epoch": 1.3089869101308986, "grad_norm": 2.1906321048736572, "learning_rate": 7.385319687465611e-06, "loss": 0.505, "step": 59500 }, { "epoch": 1.3111868881311186, "grad_norm": 1.942108154296875, "learning_rate": 7.380917794651701e-06, "loss": 0.4937, "step": 59600 }, { "epoch": 1.3133868661313386, "grad_norm": 2.0868446826934814, "learning_rate": 7.3765159018377905e-06, "loss": 0.4963, "step": 59700 }, { "epoch": 1.3155868441315586, "grad_norm": 2.3469884395599365, "learning_rate": 7.372114009023881e-06, "loss": 0.5038, "step": 59800 }, { "epoch": 1.3177868221317786, "grad_norm": 2.1203341484069824, "learning_rate": 7.367712116209971e-06, "loss": 0.4891, "step": 59900 }, { "epoch": 1.3199868001319988, "grad_norm": 1.7752751111984253, "learning_rate": 7.3633102233960605e-06, "loss": 0.5036, "step": 60000 }, { "epoch": 1.3221867781322187, "grad_norm": 2.311631441116333, "learning_rate": 7.358908330582151e-06, "loss": 0.5104, "step": 60100 }, { "epoch": 1.3243867561324387, "grad_norm": 1.9225836992263794, "learning_rate": 7.354506437768241e-06, "loss": 0.4926, "step": 60200 }, { "epoch": 1.3265867341326587, "grad_norm": 1.9772847890853882, "learning_rate": 7.3501045449543305e-06, "loss": 0.4923, "step": 60300 }, { "epoch": 1.3287867121328787, "grad_norm": 1.6036473512649536, "learning_rate": 7.345702652140421e-06, "loss": 0.4955, "step": 60400 }, { "epoch": 1.3309866901330987, "grad_norm": 1.8488271236419678, "learning_rate": 7.34130075932651e-06, "loss": 0.512, "step": 60500 }, { "epoch": 1.3331866681333187, "grad_norm": 2.149338722229004, "learning_rate": 7.3368988665126005e-06, "loss": 0.4914, "step": 60600 }, { "epoch": 1.3353866461335386, "grad_norm": 2.4873788356781006, "learning_rate": 7.332496973698691e-06, "loss": 0.4965, "step": 60700 }, { "epoch": 1.3375866241337586, "grad_norm": 2.4446520805358887, "learning_rate": 7.32809508088478e-06, "loss": 0.4917, "step": 60800 }, { "epoch": 1.3397866021339786, "grad_norm": 2.2292611598968506, "learning_rate": 7.3236931880708705e-06, "loss": 0.4876, "step": 60900 }, { "epoch": 1.3419865801341986, "grad_norm": 2.0160257816314697, "learning_rate": 7.319291295256961e-06, "loss": 0.4875, "step": 61000 }, { "epoch": 1.3441865581344188, "grad_norm": 2.0969207286834717, "learning_rate": 7.31488940244305e-06, "loss": 0.5031, "step": 61100 }, { "epoch": 1.3463865361346388, "grad_norm": 2.283207416534424, "learning_rate": 7.310487509629141e-06, "loss": 0.4907, "step": 61200 }, { "epoch": 1.3485865141348587, "grad_norm": 1.9769617319107056, "learning_rate": 7.306085616815232e-06, "loss": 0.4942, "step": 61300 }, { "epoch": 1.3507864921350787, "grad_norm": 2.156163454055786, "learning_rate": 7.301683724001322e-06, "loss": 0.4992, "step": 61400 }, { "epoch": 1.3529864701352987, "grad_norm": 1.6328924894332886, "learning_rate": 7.297281831187411e-06, "loss": 0.4861, "step": 61500 }, { "epoch": 1.3551864481355187, "grad_norm": 2.365056276321411, "learning_rate": 7.292879938373502e-06, "loss": 0.4915, "step": 61600 }, { "epoch": 1.3573864261357387, "grad_norm": 2.6308701038360596, "learning_rate": 7.288478045559592e-06, "loss": 0.4837, "step": 61700 }, { "epoch": 1.3595864041359587, "grad_norm": 2.454827070236206, "learning_rate": 7.284076152745681e-06, "loss": 0.4921, "step": 61800 }, { "epoch": 1.3617863821361786, "grad_norm": 2.19412899017334, "learning_rate": 7.279674259931772e-06, "loss": 0.501, "step": 61900 }, { "epoch": 1.3639863601363986, "grad_norm": 2.183582305908203, "learning_rate": 7.275272367117862e-06, "loss": 0.4934, "step": 62000 }, { "epoch": 1.3661863381366186, "grad_norm": 2.2355942726135254, "learning_rate": 7.270870474303951e-06, "loss": 0.5037, "step": 62100 }, { "epoch": 1.3683863161368386, "grad_norm": 1.8665735721588135, "learning_rate": 7.266468581490042e-06, "loss": 0.5054, "step": 62200 }, { "epoch": 1.3705862941370586, "grad_norm": 2.457763433456421, "learning_rate": 7.262066688676131e-06, "loss": 0.4986, "step": 62300 }, { "epoch": 1.3727862721372786, "grad_norm": 2.2373385429382324, "learning_rate": 7.257664795862221e-06, "loss": 0.4807, "step": 62400 }, { "epoch": 1.3749862501374985, "grad_norm": 2.129803419113159, "learning_rate": 7.253262903048312e-06, "loss": 0.4877, "step": 62500 }, { "epoch": 1.3771862281377185, "grad_norm": 2.2858309745788574, "learning_rate": 7.248861010234401e-06, "loss": 0.492, "step": 62600 }, { "epoch": 1.3793862061379385, "grad_norm": 2.4332919120788574, "learning_rate": 7.244459117420491e-06, "loss": 0.4907, "step": 62700 }, { "epoch": 1.3815861841381587, "grad_norm": 1.7995531558990479, "learning_rate": 7.240057224606582e-06, "loss": 0.5037, "step": 62800 }, { "epoch": 1.3837861621383787, "grad_norm": 2.672942876815796, "learning_rate": 7.235655331792671e-06, "loss": 0.4968, "step": 62900 }, { "epoch": 1.3859861401385987, "grad_norm": 2.1194186210632324, "learning_rate": 7.231253438978761e-06, "loss": 0.4944, "step": 63000 }, { "epoch": 1.3881861181388186, "grad_norm": 2.5758581161499023, "learning_rate": 7.226851546164852e-06, "loss": 0.4844, "step": 63100 }, { "epoch": 1.3903860961390386, "grad_norm": 2.359781503677368, "learning_rate": 7.222449653350941e-06, "loss": 0.4847, "step": 63200 }, { "epoch": 1.3925860741392586, "grad_norm": 2.3243279457092285, "learning_rate": 7.218047760537031e-06, "loss": 0.4986, "step": 63300 }, { "epoch": 1.3947860521394786, "grad_norm": 2.4134695529937744, "learning_rate": 7.213645867723122e-06, "loss": 0.4961, "step": 63400 }, { "epoch": 1.3969860301396986, "grad_norm": 2.3432512283325195, "learning_rate": 7.209243974909211e-06, "loss": 0.5028, "step": 63500 }, { "epoch": 1.3991860081399186, "grad_norm": 2.474076747894287, "learning_rate": 7.204842082095301e-06, "loss": 0.5004, "step": 63600 }, { "epoch": 1.4013859861401385, "grad_norm": 2.43440580368042, "learning_rate": 7.200440189281392e-06, "loss": 0.5031, "step": 63700 }, { "epoch": 1.4035859641403585, "grad_norm": 2.1737067699432373, "learning_rate": 7.196038296467481e-06, "loss": 0.4871, "step": 63800 }, { "epoch": 1.4057859421405787, "grad_norm": 1.9419715404510498, "learning_rate": 7.191636403653571e-06, "loss": 0.4903, "step": 63900 }, { "epoch": 1.4079859201407987, "grad_norm": 2.1449568271636963, "learning_rate": 7.187234510839662e-06, "loss": 0.4819, "step": 64000 }, { "epoch": 1.4101858981410187, "grad_norm": 2.1790225505828857, "learning_rate": 7.182832618025751e-06, "loss": 0.5155, "step": 64100 }, { "epoch": 1.4123858761412387, "grad_norm": 2.4493134021759033, "learning_rate": 7.178430725211841e-06, "loss": 0.4922, "step": 64200 }, { "epoch": 1.4145858541414587, "grad_norm": 2.250734806060791, "learning_rate": 7.174028832397932e-06, "loss": 0.4911, "step": 64300 }, { "epoch": 1.4167858321416786, "grad_norm": 2.312277317047119, "learning_rate": 7.169626939584021e-06, "loss": 0.4884, "step": 64400 }, { "epoch": 1.4189858101418986, "grad_norm": 2.0889904499053955, "learning_rate": 7.165225046770111e-06, "loss": 0.5023, "step": 64500 }, { "epoch": 1.4211857881421186, "grad_norm": 2.2084124088287354, "learning_rate": 7.160823153956201e-06, "loss": 0.4974, "step": 64600 }, { "epoch": 1.4233857661423386, "grad_norm": 2.046213150024414, "learning_rate": 7.156421261142292e-06, "loss": 0.4935, "step": 64700 }, { "epoch": 1.4255857441425586, "grad_norm": 2.1457226276397705, "learning_rate": 7.152019368328382e-06, "loss": 0.4903, "step": 64800 }, { "epoch": 1.4277857221427785, "grad_norm": 2.058285713195801, "learning_rate": 7.1476174755144725e-06, "loss": 0.5002, "step": 64900 }, { "epoch": 1.4299857001429985, "grad_norm": 2.269285202026367, "learning_rate": 7.143215582700562e-06, "loss": 0.4891, "step": 65000 }, { "epoch": 1.4321856781432185, "grad_norm": 2.030383586883545, "learning_rate": 7.138813689886652e-06, "loss": 0.5101, "step": 65100 }, { "epoch": 1.4343856561434385, "grad_norm": 2.0629866123199463, "learning_rate": 7.1344117970727425e-06, "loss": 0.4931, "step": 65200 }, { "epoch": 1.4365856341436585, "grad_norm": 2.064944267272949, "learning_rate": 7.130009904258832e-06, "loss": 0.4992, "step": 65300 }, { "epoch": 1.4387856121438785, "grad_norm": 2.1032135486602783, "learning_rate": 7.125608011444922e-06, "loss": 0.4919, "step": 65400 }, { "epoch": 1.4409855901440984, "grad_norm": 2.3275599479675293, "learning_rate": 7.1212061186310125e-06, "loss": 0.5119, "step": 65500 }, { "epoch": 1.4431855681443184, "grad_norm": 2.2477211952209473, "learning_rate": 7.116804225817102e-06, "loss": 0.5092, "step": 65600 }, { "epoch": 1.4453855461445386, "grad_norm": 1.8756898641586304, "learning_rate": 7.112402333003192e-06, "loss": 0.4977, "step": 65700 }, { "epoch": 1.4475855241447586, "grad_norm": 2.839963436126709, "learning_rate": 7.1080004401892825e-06, "loss": 0.4939, "step": 65800 }, { "epoch": 1.4497855021449786, "grad_norm": 1.8775593042373657, "learning_rate": 7.103598547375372e-06, "loss": 0.4851, "step": 65900 }, { "epoch": 1.4519854801451986, "grad_norm": 2.1938886642456055, "learning_rate": 7.099196654561462e-06, "loss": 0.4797, "step": 66000 }, { "epoch": 1.4541854581454186, "grad_norm": 2.063523769378662, "learning_rate": 7.0947947617475525e-06, "loss": 0.4949, "step": 66100 }, { "epoch": 1.4563854361456385, "grad_norm": 2.156369924545288, "learning_rate": 7.090392868933642e-06, "loss": 0.4936, "step": 66200 }, { "epoch": 1.4585854141458585, "grad_norm": 2.4886789321899414, "learning_rate": 7.085990976119732e-06, "loss": 0.4979, "step": 66300 }, { "epoch": 1.4607853921460785, "grad_norm": 2.3196351528167725, "learning_rate": 7.081589083305822e-06, "loss": 0.5121, "step": 66400 }, { "epoch": 1.4629853701462985, "grad_norm": 2.057623863220215, "learning_rate": 7.077187190491912e-06, "loss": 0.4827, "step": 66500 }, { "epoch": 1.4651853481465185, "grad_norm": 1.9187816381454468, "learning_rate": 7.072785297678002e-06, "loss": 0.4972, "step": 66600 }, { "epoch": 1.4673853261467387, "grad_norm": 1.9243098497390747, "learning_rate": 7.068383404864092e-06, "loss": 0.48, "step": 66700 }, { "epoch": 1.4695853041469586, "grad_norm": 2.221501111984253, "learning_rate": 7.063981512050182e-06, "loss": 0.4817, "step": 66800 }, { "epoch": 1.4717852821471786, "grad_norm": 2.145901679992676, "learning_rate": 7.059579619236272e-06, "loss": 0.4974, "step": 66900 }, { "epoch": 1.4739852601473986, "grad_norm": 2.7018229961395264, "learning_rate": 7.055177726422362e-06, "loss": 0.4776, "step": 67000 }, { "epoch": 1.4761852381476186, "grad_norm": 1.826542854309082, "learning_rate": 7.050775833608452e-06, "loss": 0.494, "step": 67100 }, { "epoch": 1.4783852161478386, "grad_norm": 2.528482437133789, "learning_rate": 7.046373940794542e-06, "loss": 0.4804, "step": 67200 }, { "epoch": 1.4805851941480586, "grad_norm": 2.3805463314056396, "learning_rate": 7.041972047980632e-06, "loss": 0.5, "step": 67300 }, { "epoch": 1.4827851721482785, "grad_norm": 2.379004716873169, "learning_rate": 7.037570155166722e-06, "loss": 0.5008, "step": 67400 }, { "epoch": 1.4849851501484985, "grad_norm": 2.351308584213257, "learning_rate": 7.033168262352812e-06, "loss": 0.4917, "step": 67500 }, { "epoch": 1.4871851281487185, "grad_norm": 2.390312910079956, "learning_rate": 7.028766369538902e-06, "loss": 0.4962, "step": 67600 }, { "epoch": 1.4893851061489385, "grad_norm": 2.4329919815063477, "learning_rate": 7.024364476724992e-06, "loss": 0.4877, "step": 67700 }, { "epoch": 1.4915850841491585, "grad_norm": 2.452253580093384, "learning_rate": 7.019962583911082e-06, "loss": 0.4908, "step": 67800 }, { "epoch": 1.4937850621493785, "grad_norm": 2.1782665252685547, "learning_rate": 7.015560691097172e-06, "loss": 0.4804, "step": 67900 }, { "epoch": 1.4959850401495984, "grad_norm": 2.0464863777160645, "learning_rate": 7.011158798283262e-06, "loss": 0.4947, "step": 68000 }, { "epoch": 1.4981850181498184, "grad_norm": 1.713578701019287, "learning_rate": 7.006756905469353e-06, "loss": 0.4875, "step": 68100 }, { "epoch": 1.5003849961500384, "grad_norm": 2.025834560394287, "learning_rate": 7.002355012655443e-06, "loss": 0.5027, "step": 68200 }, { "epoch": 1.5025849741502584, "grad_norm": 2.509138822555542, "learning_rate": 6.997953119841533e-06, "loss": 0.4822, "step": 68300 }, { "epoch": 1.5047849521504784, "grad_norm": 2.0234317779541016, "learning_rate": 6.993551227027623e-06, "loss": 0.4975, "step": 68400 }, { "epoch": 1.5069849301506983, "grad_norm": 2.465769052505493, "learning_rate": 6.9891493342137125e-06, "loss": 0.5012, "step": 68500 }, { "epoch": 1.5091849081509183, "grad_norm": 2.5200085639953613, "learning_rate": 6.984747441399803e-06, "loss": 0.5017, "step": 68600 }, { "epoch": 1.5113848861511385, "grad_norm": 2.2190017700195312, "learning_rate": 6.980345548585893e-06, "loss": 0.4898, "step": 68700 }, { "epoch": 1.5135848641513585, "grad_norm": 2.2302262783050537, "learning_rate": 6.9759436557719825e-06, "loss": 0.4989, "step": 68800 }, { "epoch": 1.5157848421515785, "grad_norm": 2.4511725902557373, "learning_rate": 6.971541762958073e-06, "loss": 0.4934, "step": 68900 }, { "epoch": 1.5179848201517985, "grad_norm": 2.3731210231781006, "learning_rate": 6.967139870144163e-06, "loss": 0.4724, "step": 69000 }, { "epoch": 1.5201847981520185, "grad_norm": 2.2834906578063965, "learning_rate": 6.9627379773302525e-06, "loss": 0.4833, "step": 69100 }, { "epoch": 1.5223847761522384, "grad_norm": 2.483689785003662, "learning_rate": 6.958336084516343e-06, "loss": 0.4923, "step": 69200 }, { "epoch": 1.5245847541524584, "grad_norm": 2.316864490509033, "learning_rate": 6.953934191702433e-06, "loss": 0.5233, "step": 69300 }, { "epoch": 1.5267847321526786, "grad_norm": 2.1905770301818848, "learning_rate": 6.9495322988885225e-06, "loss": 0.5233, "step": 69400 }, { "epoch": 1.5289847101528986, "grad_norm": 2.5095105171203613, "learning_rate": 6.945130406074613e-06, "loss": 0.4927, "step": 69500 }, { "epoch": 1.5311846881531186, "grad_norm": 2.210827112197876, "learning_rate": 6.940728513260703e-06, "loss": 0.4965, "step": 69600 }, { "epoch": 1.5333846661533386, "grad_norm": 2.6142313480377197, "learning_rate": 6.9363266204467925e-06, "loss": 0.5025, "step": 69700 }, { "epoch": 1.5355846441535586, "grad_norm": 2.3923892974853516, "learning_rate": 6.931924727632883e-06, "loss": 0.4793, "step": 69800 }, { "epoch": 1.5377846221537785, "grad_norm": 2.1831846237182617, "learning_rate": 6.927522834818973e-06, "loss": 0.4935, "step": 69900 }, { "epoch": 1.5399846001539985, "grad_norm": 2.030944347381592, "learning_rate": 6.9231209420050625e-06, "loss": 0.494, "step": 70000 }, { "epoch": 1.5421845781542185, "grad_norm": 2.089087724685669, "learning_rate": 6.918719049191153e-06, "loss": 0.4989, "step": 70100 }, { "epoch": 1.5443845561544385, "grad_norm": 2.7058706283569336, "learning_rate": 6.914317156377243e-06, "loss": 0.4982, "step": 70200 }, { "epoch": 1.5465845341546585, "grad_norm": 2.312584638595581, "learning_rate": 6.9099152635633325e-06, "loss": 0.4981, "step": 70300 }, { "epoch": 1.5487845121548784, "grad_norm": 2.5172085762023926, "learning_rate": 6.905513370749423e-06, "loss": 0.4871, "step": 70400 }, { "epoch": 1.5509844901550984, "grad_norm": 2.035313367843628, "learning_rate": 6.901111477935512e-06, "loss": 0.4859, "step": 70500 }, { "epoch": 1.5531844681553184, "grad_norm": 2.3374691009521484, "learning_rate": 6.8967095851216025e-06, "loss": 0.4831, "step": 70600 }, { "epoch": 1.5553844461555384, "grad_norm": 2.2027342319488525, "learning_rate": 6.892307692307693e-06, "loss": 0.4974, "step": 70700 }, { "epoch": 1.5575844241557584, "grad_norm": 2.4372105598449707, "learning_rate": 6.887905799493782e-06, "loss": 0.4902, "step": 70800 }, { "epoch": 1.5597844021559784, "grad_norm": 2.320554256439209, "learning_rate": 6.8835039066798725e-06, "loss": 0.4917, "step": 70900 }, { "epoch": 1.5619843801561983, "grad_norm": 2.323988437652588, "learning_rate": 6.879102013865963e-06, "loss": 0.5034, "step": 71000 }, { "epoch": 1.5641843581564183, "grad_norm": 2.111454725265503, "learning_rate": 6.874700121052052e-06, "loss": 0.492, "step": 71100 }, { "epoch": 1.5663843361566383, "grad_norm": 2.664884328842163, "learning_rate": 6.8702982282381425e-06, "loss": 0.4982, "step": 71200 }, { "epoch": 1.5685843141568583, "grad_norm": 1.9500539302825928, "learning_rate": 6.865896335424233e-06, "loss": 0.5147, "step": 71300 }, { "epoch": 1.5707842921570783, "grad_norm": 2.3592636585235596, "learning_rate": 6.861494442610322e-06, "loss": 0.4825, "step": 71400 }, { "epoch": 1.5729842701572985, "grad_norm": 2.4548308849334717, "learning_rate": 6.8570925497964125e-06, "loss": 0.4949, "step": 71500 }, { "epoch": 1.5751842481575185, "grad_norm": 2.971724033355713, "learning_rate": 6.852690656982504e-06, "loss": 0.4945, "step": 71600 }, { "epoch": 1.5773842261577384, "grad_norm": 2.399245023727417, "learning_rate": 6.848288764168594e-06, "loss": 0.4888, "step": 71700 }, { "epoch": 1.5795842041579584, "grad_norm": 2.2702841758728027, "learning_rate": 6.843886871354683e-06, "loss": 0.49, "step": 71800 }, { "epoch": 1.5817841821581784, "grad_norm": 1.9252210855484009, "learning_rate": 6.839484978540774e-06, "loss": 0.494, "step": 71900 }, { "epoch": 1.5839841601583984, "grad_norm": 2.4878454208374023, "learning_rate": 6.835083085726864e-06, "loss": 0.4984, "step": 72000 }, { "epoch": 1.5861841381586184, "grad_norm": 2.035708427429199, "learning_rate": 6.830681192912953e-06, "loss": 0.4825, "step": 72100 }, { "epoch": 1.5883841161588386, "grad_norm": 2.55355167388916, "learning_rate": 6.826279300099044e-06, "loss": 0.5056, "step": 72200 }, { "epoch": 1.5905840941590585, "grad_norm": 2.4391555786132812, "learning_rate": 6.821877407285133e-06, "loss": 0.4928, "step": 72300 }, { "epoch": 1.5927840721592785, "grad_norm": 2.2338058948516846, "learning_rate": 6.817475514471223e-06, "loss": 0.4874, "step": 72400 }, { "epoch": 1.5949840501594985, "grad_norm": 2.7937569618225098, "learning_rate": 6.813073621657314e-06, "loss": 0.477, "step": 72500 }, { "epoch": 1.5971840281597185, "grad_norm": 2.2559831142425537, "learning_rate": 6.808671728843403e-06, "loss": 0.501, "step": 72600 }, { "epoch": 1.5993840061599385, "grad_norm": 2.1428000926971436, "learning_rate": 6.804269836029493e-06, "loss": 0.4872, "step": 72700 }, { "epoch": 1.6015839841601585, "grad_norm": 2.306943655014038, "learning_rate": 6.799867943215584e-06, "loss": 0.5002, "step": 72800 }, { "epoch": 1.6037839621603784, "grad_norm": 2.3396975994110107, "learning_rate": 6.795466050401673e-06, "loss": 0.4951, "step": 72900 }, { "epoch": 1.6059839401605984, "grad_norm": 1.8894736766815186, "learning_rate": 6.791064157587763e-06, "loss": 0.4872, "step": 73000 }, { "epoch": 1.6081839181608184, "grad_norm": 2.0049326419830322, "learning_rate": 6.786662264773854e-06, "loss": 0.4877, "step": 73100 }, { "epoch": 1.6103838961610384, "grad_norm": 2.3615005016326904, "learning_rate": 6.782260371959943e-06, "loss": 0.4925, "step": 73200 }, { "epoch": 1.6125838741612584, "grad_norm": 2.386545419692993, "learning_rate": 6.777858479146033e-06, "loss": 0.4881, "step": 73300 }, { "epoch": 1.6147838521614784, "grad_norm": 2.3752076625823975, "learning_rate": 6.773456586332124e-06, "loss": 0.4813, "step": 73400 }, { "epoch": 1.6169838301616983, "grad_norm": 2.156837224960327, "learning_rate": 6.769054693518213e-06, "loss": 0.4793, "step": 73500 }, { "epoch": 1.6191838081619183, "grad_norm": 2.788848638534546, "learning_rate": 6.764652800704303e-06, "loss": 0.4946, "step": 73600 }, { "epoch": 1.6213837861621383, "grad_norm": 2.1992275714874268, "learning_rate": 6.760250907890394e-06, "loss": 0.5019, "step": 73700 }, { "epoch": 1.6235837641623583, "grad_norm": 2.664424419403076, "learning_rate": 6.755849015076483e-06, "loss": 0.4885, "step": 73800 }, { "epoch": 1.6257837421625783, "grad_norm": 2.3380892276763916, "learning_rate": 6.751447122262573e-06, "loss": 0.4947, "step": 73900 }, { "epoch": 1.6279837201627982, "grad_norm": 2.3588438034057617, "learning_rate": 6.747045229448664e-06, "loss": 0.4652, "step": 74000 }, { "epoch": 1.6301836981630182, "grad_norm": 2.6669723987579346, "learning_rate": 6.742643336634753e-06, "loss": 0.491, "step": 74100 }, { "epoch": 1.6323836761632382, "grad_norm": 2.4595651626586914, "learning_rate": 6.738241443820843e-06, "loss": 0.501, "step": 74200 }, { "epoch": 1.6345836541634584, "grad_norm": 2.2686636447906494, "learning_rate": 6.733839551006934e-06, "loss": 0.482, "step": 74300 }, { "epoch": 1.6367836321636784, "grad_norm": 2.4227776527404785, "learning_rate": 6.729437658193023e-06, "loss": 0.4958, "step": 74400 }, { "epoch": 1.6389836101638984, "grad_norm": 1.9847477674484253, "learning_rate": 6.725035765379113e-06, "loss": 0.4834, "step": 74500 }, { "epoch": 1.6411835881641184, "grad_norm": 2.6502370834350586, "learning_rate": 6.720633872565203e-06, "loss": 0.4815, "step": 74600 }, { "epoch": 1.6433835661643383, "grad_norm": 2.2831785678863525, "learning_rate": 6.716231979751293e-06, "loss": 0.4826, "step": 74700 }, { "epoch": 1.6455835441645583, "grad_norm": 1.8865406513214111, "learning_rate": 6.711830086937383e-06, "loss": 0.4986, "step": 74800 }, { "epoch": 1.6477835221647783, "grad_norm": 2.026791572570801, "learning_rate": 6.707428194123473e-06, "loss": 0.4872, "step": 74900 }, { "epoch": 1.6499835001649985, "grad_norm": 2.772639036178589, "learning_rate": 6.703026301309563e-06, "loss": 0.4891, "step": 75000 }, { "epoch": 1.6521834781652185, "grad_norm": 2.4932167530059814, "learning_rate": 6.698624408495654e-06, "loss": 0.4868, "step": 75100 }, { "epoch": 1.6543834561654385, "grad_norm": 2.5153396129608154, "learning_rate": 6.6942225156817445e-06, "loss": 0.5073, "step": 75200 }, { "epoch": 1.6565834341656585, "grad_norm": 1.7845731973648071, "learning_rate": 6.689820622867834e-06, "loss": 0.496, "step": 75300 }, { "epoch": 1.6587834121658784, "grad_norm": 2.392333745956421, "learning_rate": 6.685418730053924e-06, "loss": 0.5044, "step": 75400 }, { "epoch": 1.6609833901660984, "grad_norm": 2.624262809753418, "learning_rate": 6.6810168372400145e-06, "loss": 0.5196, "step": 75500 }, { "epoch": 1.6631833681663184, "grad_norm": 2.421013355255127, "learning_rate": 6.676614944426104e-06, "loss": 0.4938, "step": 75600 }, { "epoch": 1.6653833461665384, "grad_norm": 2.1836936473846436, "learning_rate": 6.672213051612194e-06, "loss": 0.4735, "step": 75700 }, { "epoch": 1.6675833241667584, "grad_norm": 2.523780345916748, "learning_rate": 6.6678111587982845e-06, "loss": 0.4868, "step": 75800 }, { "epoch": 1.6697833021669783, "grad_norm": 3.20668363571167, "learning_rate": 6.663409265984374e-06, "loss": 0.4902, "step": 75900 }, { "epoch": 1.6719832801671983, "grad_norm": 2.6450743675231934, "learning_rate": 6.659007373170464e-06, "loss": 0.4852, "step": 76000 }, { "epoch": 1.6741832581674183, "grad_norm": 2.3257484436035156, "learning_rate": 6.6546054803565545e-06, "loss": 0.4913, "step": 76100 }, { "epoch": 1.6763832361676383, "grad_norm": 1.7676602602005005, "learning_rate": 6.650203587542644e-06, "loss": 0.5051, "step": 76200 }, { "epoch": 1.6785832141678583, "grad_norm": 2.2192280292510986, "learning_rate": 6.645801694728734e-06, "loss": 0.4959, "step": 76300 }, { "epoch": 1.6807831921680783, "grad_norm": 2.4453659057617188, "learning_rate": 6.641399801914824e-06, "loss": 0.4841, "step": 76400 }, { "epoch": 1.6829831701682982, "grad_norm": 1.9458132982254028, "learning_rate": 6.636997909100914e-06, "loss": 0.4911, "step": 76500 }, { "epoch": 1.6851831481685182, "grad_norm": 2.2809267044067383, "learning_rate": 6.632596016287004e-06, "loss": 0.4871, "step": 76600 }, { "epoch": 1.6873831261687382, "grad_norm": 2.630840301513672, "learning_rate": 6.628194123473094e-06, "loss": 0.4813, "step": 76700 }, { "epoch": 1.6895831041689582, "grad_norm": 2.8288991451263428, "learning_rate": 6.623792230659184e-06, "loss": 0.4918, "step": 76800 }, { "epoch": 1.6917830821691782, "grad_norm": 2.220552921295166, "learning_rate": 6.619390337845274e-06, "loss": 0.4958, "step": 76900 }, { "epoch": 1.6939830601693981, "grad_norm": 2.3790931701660156, "learning_rate": 6.614988445031364e-06, "loss": 0.5098, "step": 77000 }, { "epoch": 1.6961830381696184, "grad_norm": 2.605365753173828, "learning_rate": 6.610586552217454e-06, "loss": 0.4999, "step": 77100 }, { "epoch": 1.6983830161698383, "grad_norm": 2.526428461074829, "learning_rate": 6.606184659403544e-06, "loss": 0.5008, "step": 77200 }, { "epoch": 1.7005829941700583, "grad_norm": 2.2195465564727783, "learning_rate": 6.601782766589634e-06, "loss": 0.4846, "step": 77300 }, { "epoch": 1.7027829721702783, "grad_norm": 2.925656318664551, "learning_rate": 6.597380873775724e-06, "loss": 0.4773, "step": 77400 }, { "epoch": 1.7049829501704983, "grad_norm": 2.5258848667144775, "learning_rate": 6.592978980961814e-06, "loss": 0.4972, "step": 77500 }, { "epoch": 1.7071829281707183, "grad_norm": 3.0461318492889404, "learning_rate": 6.588577088147904e-06, "loss": 0.4857, "step": 77600 }, { "epoch": 1.7093829061709382, "grad_norm": 2.3932976722717285, "learning_rate": 6.584175195333994e-06, "loss": 0.4999, "step": 77700 }, { "epoch": 1.7115828841711584, "grad_norm": 2.044865369796753, "learning_rate": 6.579773302520084e-06, "loss": 0.4898, "step": 77800 }, { "epoch": 1.7137828621713784, "grad_norm": 2.366441011428833, "learning_rate": 6.575371409706174e-06, "loss": 0.4786, "step": 77900 }, { "epoch": 1.7159828401715984, "grad_norm": 2.57084584236145, "learning_rate": 6.570969516892264e-06, "loss": 0.4766, "step": 78000 }, { "epoch": 1.7181828181718184, "grad_norm": 2.560520887374878, "learning_rate": 6.566567624078354e-06, "loss": 0.4891, "step": 78100 }, { "epoch": 1.7203827961720384, "grad_norm": 2.1307547092437744, "learning_rate": 6.562165731264444e-06, "loss": 0.4852, "step": 78200 }, { "epoch": 1.7225827741722584, "grad_norm": 2.4924020767211914, "learning_rate": 6.557763838450534e-06, "loss": 0.4836, "step": 78300 }, { "epoch": 1.7247827521724783, "grad_norm": 2.323122978210449, "learning_rate": 6.553361945636624e-06, "loss": 0.4926, "step": 78400 }, { "epoch": 1.7269827301726983, "grad_norm": 2.1391868591308594, "learning_rate": 6.5489600528227145e-06, "loss": 0.4974, "step": 78500 }, { "epoch": 1.7291827081729183, "grad_norm": 2.2388463020324707, "learning_rate": 6.544558160008805e-06, "loss": 0.4825, "step": 78600 }, { "epoch": 1.7313826861731383, "grad_norm": 2.617159843444824, "learning_rate": 6.540156267194895e-06, "loss": 0.4969, "step": 78700 }, { "epoch": 1.7335826641733583, "grad_norm": 1.9445505142211914, "learning_rate": 6.5357543743809845e-06, "loss": 0.494, "step": 78800 }, { "epoch": 1.7357826421735782, "grad_norm": 1.8033205270767212, "learning_rate": 6.531352481567075e-06, "loss": 0.4901, "step": 78900 }, { "epoch": 1.7379826201737982, "grad_norm": 2.480191469192505, "learning_rate": 6.526950588753165e-06, "loss": 0.4756, "step": 79000 }, { "epoch": 1.7401825981740182, "grad_norm": 2.203779697418213, "learning_rate": 6.5225486959392545e-06, "loss": 0.4949, "step": 79100 }, { "epoch": 1.7423825761742382, "grad_norm": 2.6420180797576904, "learning_rate": 6.518146803125345e-06, "loss": 0.476, "step": 79200 }, { "epoch": 1.7445825541744582, "grad_norm": 2.4949381351470947, "learning_rate": 6.513744910311435e-06, "loss": 0.4805, "step": 79300 }, { "epoch": 1.7467825321746782, "grad_norm": 1.6507716178894043, "learning_rate": 6.5093430174975245e-06, "loss": 0.4928, "step": 79400 }, { "epoch": 1.7489825101748981, "grad_norm": 2.849067211151123, "learning_rate": 6.504941124683615e-06, "loss": 0.4879, "step": 79500 }, { "epoch": 1.7511824881751181, "grad_norm": 2.404705047607422, "learning_rate": 6.500539231869705e-06, "loss": 0.4761, "step": 79600 }, { "epoch": 1.753382466175338, "grad_norm": 2.653310537338257, "learning_rate": 6.4961373390557945e-06, "loss": 0.5017, "step": 79700 }, { "epoch": 1.755582444175558, "grad_norm": 2.31355619430542, "learning_rate": 6.491735446241885e-06, "loss": 0.4802, "step": 79800 }, { "epoch": 1.7577824221757783, "grad_norm": 2.361945867538452, "learning_rate": 6.487333553427975e-06, "loss": 0.4816, "step": 79900 }, { "epoch": 1.7599824001759983, "grad_norm": 2.199768304824829, "learning_rate": 6.4829316606140645e-06, "loss": 0.4632, "step": 80000 }, { "epoch": 1.7621823781762183, "grad_norm": 1.8634425401687622, "learning_rate": 6.478529767800155e-06, "loss": 0.4909, "step": 80100 }, { "epoch": 1.7643823561764382, "grad_norm": 2.742694616317749, "learning_rate": 6.474127874986245e-06, "loss": 0.4939, "step": 80200 }, { "epoch": 1.7665823341766582, "grad_norm": 2.8734514713287354, "learning_rate": 6.4697259821723345e-06, "loss": 0.4917, "step": 80300 }, { "epoch": 1.7687823121768782, "grad_norm": 2.59197735786438, "learning_rate": 6.465324089358425e-06, "loss": 0.4781, "step": 80400 }, { "epoch": 1.7709822901770982, "grad_norm": 2.3575127124786377, "learning_rate": 6.460922196544514e-06, "loss": 0.4801, "step": 80500 }, { "epoch": 1.7731822681773184, "grad_norm": 2.599222421646118, "learning_rate": 6.4565203037306045e-06, "loss": 0.4891, "step": 80600 }, { "epoch": 1.7753822461775384, "grad_norm": 2.7138659954071045, "learning_rate": 6.452118410916695e-06, "loss": 0.491, "step": 80700 }, { "epoch": 1.7775822241777584, "grad_norm": 2.467128038406372, "learning_rate": 6.447716518102784e-06, "loss": 0.4984, "step": 80800 }, { "epoch": 1.7797822021779783, "grad_norm": 2.4047677516937256, "learning_rate": 6.4433146252888745e-06, "loss": 0.4756, "step": 80900 }, { "epoch": 1.7819821801781983, "grad_norm": 2.0229098796844482, "learning_rate": 6.438912732474965e-06, "loss": 0.4792, "step": 81000 }, { "epoch": 1.7841821581784183, "grad_norm": 2.463090658187866, "learning_rate": 6.434510839661054e-06, "loss": 0.4824, "step": 81100 }, { "epoch": 1.7863821361786383, "grad_norm": 2.3522398471832275, "learning_rate": 6.4301089468471445e-06, "loss": 0.4938, "step": 81200 }, { "epoch": 1.7885821141788583, "grad_norm": 2.1566226482391357, "learning_rate": 6.425707054033235e-06, "loss": 0.4858, "step": 81300 }, { "epoch": 1.7907820921790782, "grad_norm": 2.452099084854126, "learning_rate": 6.421305161219324e-06, "loss": 0.4879, "step": 81400 }, { "epoch": 1.7929820701792982, "grad_norm": 2.3728647232055664, "learning_rate": 6.4169032684054144e-06, "loss": 0.499, "step": 81500 }, { "epoch": 1.7951820481795182, "grad_norm": 2.499342441558838, "learning_rate": 6.412501375591505e-06, "loss": 0.4799, "step": 81600 }, { "epoch": 1.7973820261797382, "grad_norm": 2.281799077987671, "learning_rate": 6.408099482777594e-06, "loss": 0.4823, "step": 81700 }, { "epoch": 1.7995820041799582, "grad_norm": 2.5670275688171387, "learning_rate": 6.4036975899636844e-06, "loss": 0.4956, "step": 81800 }, { "epoch": 1.8017819821801782, "grad_norm": 2.830780506134033, "learning_rate": 6.399295697149775e-06, "loss": 0.4909, "step": 81900 }, { "epoch": 1.8039819601803981, "grad_norm": 2.3581204414367676, "learning_rate": 6.394893804335866e-06, "loss": 0.4906, "step": 82000 }, { "epoch": 1.8061819381806181, "grad_norm": 2.6061856746673584, "learning_rate": 6.390491911521955e-06, "loss": 0.488, "step": 82100 }, { "epoch": 1.808381916180838, "grad_norm": 2.3762636184692383, "learning_rate": 6.386090018708046e-06, "loss": 0.4957, "step": 82200 }, { "epoch": 1.810581894181058, "grad_norm": 2.7238190174102783, "learning_rate": 6.381688125894136e-06, "loss": 0.4866, "step": 82300 }, { "epoch": 1.812781872181278, "grad_norm": 2.1085996627807617, "learning_rate": 6.377286233080225e-06, "loss": 0.4666, "step": 82400 }, { "epoch": 1.814981850181498, "grad_norm": 2.127675771713257, "learning_rate": 6.372884340266316e-06, "loss": 0.4975, "step": 82500 }, { "epoch": 1.817181828181718, "grad_norm": 2.0977835655212402, "learning_rate": 6.368482447452405e-06, "loss": 0.5016, "step": 82600 }, { "epoch": 1.8193818061819382, "grad_norm": 2.5928144454956055, "learning_rate": 6.364080554638495e-06, "loss": 0.4904, "step": 82700 }, { "epoch": 1.8215817841821582, "grad_norm": 2.5363171100616455, "learning_rate": 6.359678661824586e-06, "loss": 0.4739, "step": 82800 }, { "epoch": 1.8237817621823782, "grad_norm": 1.779845952987671, "learning_rate": 6.355276769010675e-06, "loss": 0.475, "step": 82900 }, { "epoch": 1.8259817401825982, "grad_norm": 2.3891873359680176, "learning_rate": 6.350874876196765e-06, "loss": 0.4867, "step": 83000 }, { "epoch": 1.8281817181828182, "grad_norm": 2.5663325786590576, "learning_rate": 6.3464729833828556e-06, "loss": 0.4706, "step": 83100 }, { "epoch": 1.8303816961830381, "grad_norm": 2.2070469856262207, "learning_rate": 6.342071090568945e-06, "loss": 0.4894, "step": 83200 }, { "epoch": 1.8325816741832581, "grad_norm": 2.3300230503082275, "learning_rate": 6.337669197755035e-06, "loss": 0.4843, "step": 83300 }, { "epoch": 1.8347816521834783, "grad_norm": 2.1778311729431152, "learning_rate": 6.3332673049411256e-06, "loss": 0.5032, "step": 83400 }, { "epoch": 1.8369816301836983, "grad_norm": 2.106933832168579, "learning_rate": 6.328865412127215e-06, "loss": 0.4875, "step": 83500 }, { "epoch": 1.8391816081839183, "grad_norm": 2.6579482555389404, "learning_rate": 6.324463519313305e-06, "loss": 0.4892, "step": 83600 }, { "epoch": 1.8413815861841383, "grad_norm": 2.3309366703033447, "learning_rate": 6.3200616264993956e-06, "loss": 0.4699, "step": 83700 }, { "epoch": 1.8435815641843583, "grad_norm": 2.503455400466919, "learning_rate": 6.315659733685485e-06, "loss": 0.4801, "step": 83800 }, { "epoch": 1.8457815421845782, "grad_norm": 2.5221006870269775, "learning_rate": 6.311257840871575e-06, "loss": 0.4834, "step": 83900 }, { "epoch": 1.8479815201847982, "grad_norm": 2.271540403366089, "learning_rate": 6.3068559480576656e-06, "loss": 0.4759, "step": 84000 }, { "epoch": 1.8501814981850182, "grad_norm": 2.2240519523620605, "learning_rate": 6.302454055243755e-06, "loss": 0.4858, "step": 84100 }, { "epoch": 1.8523814761852382, "grad_norm": 2.41463041305542, "learning_rate": 6.298052162429845e-06, "loss": 0.4951, "step": 84200 }, { "epoch": 1.8545814541854582, "grad_norm": 2.420825242996216, "learning_rate": 6.2936502696159356e-06, "loss": 0.4949, "step": 84300 }, { "epoch": 1.8567814321856781, "grad_norm": 2.6283483505249023, "learning_rate": 6.289248376802025e-06, "loss": 0.4928, "step": 84400 }, { "epoch": 1.8589814101858981, "grad_norm": 2.6053175926208496, "learning_rate": 6.284846483988115e-06, "loss": 0.4951, "step": 84500 }, { "epoch": 1.8611813881861181, "grad_norm": 2.556842803955078, "learning_rate": 6.280444591174205e-06, "loss": 0.4766, "step": 84600 }, { "epoch": 1.863381366186338, "grad_norm": 2.583364248275757, "learning_rate": 6.276042698360295e-06, "loss": 0.4964, "step": 84700 }, { "epoch": 1.865581344186558, "grad_norm": 2.407144069671631, "learning_rate": 6.271640805546385e-06, "loss": 0.4882, "step": 84800 }, { "epoch": 1.867781322186778, "grad_norm": 2.20274019241333, "learning_rate": 6.267238912732475e-06, "loss": 0.488, "step": 84900 }, { "epoch": 1.869981300186998, "grad_norm": 2.537299871444702, "learning_rate": 6.262837019918565e-06, "loss": 0.4912, "step": 85000 }, { "epoch": 1.872181278187218, "grad_norm": 2.4242103099823, "learning_rate": 6.258435127104655e-06, "loss": 0.4857, "step": 85100 }, { "epoch": 1.874381256187438, "grad_norm": 1.9029467105865479, "learning_rate": 6.254033234290745e-06, "loss": 0.4969, "step": 85200 }, { "epoch": 1.876581234187658, "grad_norm": 3.0369937419891357, "learning_rate": 6.249631341476835e-06, "loss": 0.4854, "step": 85300 }, { "epoch": 1.878781212187878, "grad_norm": 2.6991753578186035, "learning_rate": 6.245229448662925e-06, "loss": 0.4771, "step": 85400 }, { "epoch": 1.8809811901880982, "grad_norm": 2.336350679397583, "learning_rate": 6.240827555849016e-06, "loss": 0.4922, "step": 85500 }, { "epoch": 1.8831811681883182, "grad_norm": 2.731637477874756, "learning_rate": 6.236425663035106e-06, "loss": 0.4877, "step": 85600 }, { "epoch": 1.8853811461885381, "grad_norm": 2.438896417617798, "learning_rate": 6.232023770221196e-06, "loss": 0.4743, "step": 85700 }, { "epoch": 1.8875811241887581, "grad_norm": 2.8118035793304443, "learning_rate": 6.227621877407286e-06, "loss": 0.4804, "step": 85800 }, { "epoch": 1.889781102188978, "grad_norm": 2.5621535778045654, "learning_rate": 6.223219984593376e-06, "loss": 0.4849, "step": 85900 }, { "epoch": 1.891981080189198, "grad_norm": 2.3240880966186523, "learning_rate": 6.218818091779466e-06, "loss": 0.4919, "step": 86000 }, { "epoch": 1.894181058189418, "grad_norm": 2.481004238128662, "learning_rate": 6.214416198965556e-06, "loss": 0.4794, "step": 86100 }, { "epoch": 1.8963810361896383, "grad_norm": 2.4835259914398193, "learning_rate": 6.210014306151646e-06, "loss": 0.479, "step": 86200 }, { "epoch": 1.8985810141898583, "grad_norm": 2.3219950199127197, "learning_rate": 6.205612413337736e-06, "loss": 0.4743, "step": 86300 }, { "epoch": 1.9007809921900782, "grad_norm": 2.9407191276550293, "learning_rate": 6.201210520523826e-06, "loss": 0.4641, "step": 86400 }, { "epoch": 1.9029809701902982, "grad_norm": 2.64907169342041, "learning_rate": 6.196808627709916e-06, "loss": 0.4821, "step": 86500 }, { "epoch": 1.9051809481905182, "grad_norm": 2.1783690452575684, "learning_rate": 6.192406734896006e-06, "loss": 0.4709, "step": 86600 }, { "epoch": 1.9073809261907382, "grad_norm": 2.755631685256958, "learning_rate": 6.1880048420820956e-06, "loss": 0.4816, "step": 86700 }, { "epoch": 1.9095809041909582, "grad_norm": 2.761409044265747, "learning_rate": 6.183602949268186e-06, "loss": 0.4833, "step": 86800 }, { "epoch": 1.9117808821911781, "grad_norm": 2.676274061203003, "learning_rate": 6.179201056454276e-06, "loss": 0.4962, "step": 86900 }, { "epoch": 1.9139808601913981, "grad_norm": 2.450660467147827, "learning_rate": 6.1747991636403656e-06, "loss": 0.473, "step": 87000 }, { "epoch": 1.916180838191618, "grad_norm": 2.693134069442749, "learning_rate": 6.170397270826456e-06, "loss": 0.4781, "step": 87100 }, { "epoch": 1.918380816191838, "grad_norm": 2.411348581314087, "learning_rate": 6.165995378012546e-06, "loss": 0.4804, "step": 87200 }, { "epoch": 1.920580794192058, "grad_norm": 2.500234842300415, "learning_rate": 6.1615934851986356e-06, "loss": 0.4837, "step": 87300 }, { "epoch": 1.922780772192278, "grad_norm": 3.033048391342163, "learning_rate": 6.157191592384726e-06, "loss": 0.471, "step": 87400 }, { "epoch": 1.924980750192498, "grad_norm": 1.847033143043518, "learning_rate": 6.152789699570816e-06, "loss": 0.4823, "step": 87500 }, { "epoch": 1.927180728192718, "grad_norm": 2.5302257537841797, "learning_rate": 6.1483878067569056e-06, "loss": 0.4826, "step": 87600 }, { "epoch": 1.929380706192938, "grad_norm": 1.998494029045105, "learning_rate": 6.143985913942996e-06, "loss": 0.4891, "step": 87700 }, { "epoch": 1.931580684193158, "grad_norm": 2.995784044265747, "learning_rate": 6.139584021129086e-06, "loss": 0.4847, "step": 87800 }, { "epoch": 1.933780662193378, "grad_norm": 2.2645761966705322, "learning_rate": 6.1351821283151756e-06, "loss": 0.5042, "step": 87900 }, { "epoch": 1.935980640193598, "grad_norm": 2.3474481105804443, "learning_rate": 6.130780235501266e-06, "loss": 0.4845, "step": 88000 }, { "epoch": 1.938180618193818, "grad_norm": 2.570206880569458, "learning_rate": 6.126378342687356e-06, "loss": 0.4794, "step": 88100 }, { "epoch": 1.940380596194038, "grad_norm": 1.8715978860855103, "learning_rate": 6.1219764498734456e-06, "loss": 0.4775, "step": 88200 }, { "epoch": 1.942580574194258, "grad_norm": 2.443993330001831, "learning_rate": 6.117574557059536e-06, "loss": 0.4824, "step": 88300 }, { "epoch": 1.944780552194478, "grad_norm": 2.4730186462402344, "learning_rate": 6.113172664245626e-06, "loss": 0.4914, "step": 88400 }, { "epoch": 1.946980530194698, "grad_norm": 2.6471264362335205, "learning_rate": 6.1087707714317156e-06, "loss": 0.4826, "step": 88500 }, { "epoch": 1.949180508194918, "grad_norm": 2.5795907974243164, "learning_rate": 6.104368878617806e-06, "loss": 0.4871, "step": 88600 }, { "epoch": 1.951380486195138, "grad_norm": 2.3072896003723145, "learning_rate": 6.099966985803895e-06, "loss": 0.4937, "step": 88700 }, { "epoch": 1.953580464195358, "grad_norm": 2.5398294925689697, "learning_rate": 6.0955650929899856e-06, "loss": 0.4919, "step": 88800 }, { "epoch": 1.955780442195578, "grad_norm": 2.15952730178833, "learning_rate": 6.091163200176077e-06, "loss": 0.4934, "step": 88900 }, { "epoch": 1.957980420195798, "grad_norm": 2.4487977027893066, "learning_rate": 6.086761307362167e-06, "loss": 0.4842, "step": 89000 }, { "epoch": 1.9601803981960182, "grad_norm": 2.4906442165374756, "learning_rate": 6.082359414548256e-06, "loss": 0.484, "step": 89100 }, { "epoch": 1.9623803761962382, "grad_norm": 2.605121374130249, "learning_rate": 6.077957521734347e-06, "loss": 0.4903, "step": 89200 }, { "epoch": 1.9645803541964582, "grad_norm": 2.7144834995269775, "learning_rate": 6.073555628920437e-06, "loss": 0.4931, "step": 89300 }, { "epoch": 1.9667803321966781, "grad_norm": 2.7881131172180176, "learning_rate": 6.069153736106526e-06, "loss": 0.495, "step": 89400 }, { "epoch": 1.9689803101968981, "grad_norm": 3.044265031814575, "learning_rate": 6.064751843292617e-06, "loss": 0.4757, "step": 89500 }, { "epoch": 1.971180288197118, "grad_norm": 2.3652849197387695, "learning_rate": 6.060349950478707e-06, "loss": 0.4761, "step": 89600 }, { "epoch": 1.973380266197338, "grad_norm": 1.9909372329711914, "learning_rate": 6.055948057664796e-06, "loss": 0.492, "step": 89700 }, { "epoch": 1.975580244197558, "grad_norm": 2.1215572357177734, "learning_rate": 6.051546164850887e-06, "loss": 0.4787, "step": 89800 }, { "epoch": 1.977780222197778, "grad_norm": 2.807328701019287, "learning_rate": 6.047144272036977e-06, "loss": 0.4845, "step": 89900 }, { "epoch": 1.979980200197998, "grad_norm": 2.344365358352661, "learning_rate": 6.042742379223066e-06, "loss": 0.4892, "step": 90000 }, { "epoch": 1.982180178198218, "grad_norm": 2.1772940158843994, "learning_rate": 6.038340486409157e-06, "loss": 0.4849, "step": 90100 }, { "epoch": 1.984380156198438, "grad_norm": 2.4292235374450684, "learning_rate": 6.033938593595247e-06, "loss": 0.4869, "step": 90200 }, { "epoch": 1.986580134198658, "grad_norm": 2.350494861602783, "learning_rate": 6.029536700781336e-06, "loss": 0.4945, "step": 90300 }, { "epoch": 1.988780112198878, "grad_norm": 2.447011709213257, "learning_rate": 6.025134807967427e-06, "loss": 0.4632, "step": 90400 }, { "epoch": 1.990980090199098, "grad_norm": 2.229335069656372, "learning_rate": 6.020732915153516e-06, "loss": 0.491, "step": 90500 }, { "epoch": 1.993180068199318, "grad_norm": 2.659064292907715, "learning_rate": 6.016331022339606e-06, "loss": 0.4788, "step": 90600 }, { "epoch": 1.995380046199538, "grad_norm": 2.435239791870117, "learning_rate": 6.011929129525697e-06, "loss": 0.4947, "step": 90700 }, { "epoch": 1.9975800241997579, "grad_norm": 2.0373647212982178, "learning_rate": 6.007527236711786e-06, "loss": 0.4832, "step": 90800 }, { "epoch": 1.9997800021999779, "grad_norm": 2.644747734069824, "learning_rate": 6.003125343897876e-06, "loss": 0.4884, "step": 90900 }, { "epoch": 2.001979980200198, "grad_norm": 2.4957003593444824, "learning_rate": 5.998723451083967e-06, "loss": 0.4441, "step": 91000 }, { "epoch": 2.004179958200418, "grad_norm": 2.8672921657562256, "learning_rate": 5.994321558270056e-06, "loss": 0.4586, "step": 91100 }, { "epoch": 2.006379936200638, "grad_norm": 2.2238707542419434, "learning_rate": 5.989919665456146e-06, "loss": 0.4508, "step": 91200 }, { "epoch": 2.008579914200858, "grad_norm": 3.085266590118408, "learning_rate": 5.985517772642237e-06, "loss": 0.4454, "step": 91300 }, { "epoch": 2.010779892201078, "grad_norm": 2.7190568447113037, "learning_rate": 5.981115879828326e-06, "loss": 0.4421, "step": 91400 }, { "epoch": 2.012979870201298, "grad_norm": 2.966407537460327, "learning_rate": 5.976713987014416e-06, "loss": 0.4334, "step": 91500 }, { "epoch": 2.015179848201518, "grad_norm": 2.963914394378662, "learning_rate": 5.972312094200507e-06, "loss": 0.4428, "step": 91600 }, { "epoch": 2.017379826201738, "grad_norm": 3.2475080490112305, "learning_rate": 5.967910201386596e-06, "loss": 0.4387, "step": 91700 }, { "epoch": 2.019579804201958, "grad_norm": 2.248386859893799, "learning_rate": 5.963508308572686e-06, "loss": 0.4509, "step": 91800 }, { "epoch": 2.021779782202178, "grad_norm": 2.9276363849639893, "learning_rate": 5.959106415758777e-06, "loss": 0.4509, "step": 91900 }, { "epoch": 2.023979760202398, "grad_norm": 3.2354319095611572, "learning_rate": 5.954704522944866e-06, "loss": 0.4396, "step": 92000 }, { "epoch": 2.026179738202618, "grad_norm": 3.478252649307251, "learning_rate": 5.950302630130956e-06, "loss": 0.454, "step": 92100 }, { "epoch": 2.028379716202838, "grad_norm": 2.1570658683776855, "learning_rate": 5.945900737317047e-06, "loss": 0.4426, "step": 92200 }, { "epoch": 2.030579694203058, "grad_norm": 3.555510997772217, "learning_rate": 5.941498844503136e-06, "loss": 0.4278, "step": 92300 }, { "epoch": 2.032779672203278, "grad_norm": 3.0837221145629883, "learning_rate": 5.937096951689227e-06, "loss": 0.4582, "step": 92400 }, { "epoch": 2.034979650203498, "grad_norm": 3.023439407348633, "learning_rate": 5.9326950588753175e-06, "loss": 0.445, "step": 92500 }, { "epoch": 2.037179628203718, "grad_norm": 2.8164618015289307, "learning_rate": 5.928293166061407e-06, "loss": 0.4474, "step": 92600 }, { "epoch": 2.039379606203938, "grad_norm": 2.4497897624969482, "learning_rate": 5.923891273247497e-06, "loss": 0.4581, "step": 92700 }, { "epoch": 2.041579584204158, "grad_norm": 2.560822010040283, "learning_rate": 5.9194893804335875e-06, "loss": 0.4402, "step": 92800 }, { "epoch": 2.043779562204378, "grad_norm": 2.457819938659668, "learning_rate": 5.915087487619677e-06, "loss": 0.457, "step": 92900 }, { "epoch": 2.045979540204598, "grad_norm": 2.840198278427124, "learning_rate": 5.910685594805767e-06, "loss": 0.4457, "step": 93000 }, { "epoch": 2.048179518204818, "grad_norm": 3.289562940597534, "learning_rate": 5.9062837019918575e-06, "loss": 0.4458, "step": 93100 }, { "epoch": 2.050379496205038, "grad_norm": 3.20574688911438, "learning_rate": 5.901881809177947e-06, "loss": 0.4464, "step": 93200 }, { "epoch": 2.052579474205258, "grad_norm": 3.1382062435150146, "learning_rate": 5.897479916364037e-06, "loss": 0.4407, "step": 93300 }, { "epoch": 2.054779452205478, "grad_norm": 2.4946656227111816, "learning_rate": 5.8930780235501275e-06, "loss": 0.4404, "step": 93400 }, { "epoch": 2.056979430205698, "grad_norm": 3.4237630367279053, "learning_rate": 5.888676130736217e-06, "loss": 0.4549, "step": 93500 }, { "epoch": 2.059179408205918, "grad_norm": 2.6181180477142334, "learning_rate": 5.884274237922307e-06, "loss": 0.4305, "step": 93600 }, { "epoch": 2.061379386206138, "grad_norm": 2.9076225757598877, "learning_rate": 5.8798723451083975e-06, "loss": 0.4543, "step": 93700 }, { "epoch": 2.063579364206358, "grad_norm": 2.6111700534820557, "learning_rate": 5.875470452294487e-06, "loss": 0.4426, "step": 93800 }, { "epoch": 2.0657793422065778, "grad_norm": 3.1381430625915527, "learning_rate": 5.871068559480577e-06, "loss": 0.4509, "step": 93900 }, { "epoch": 2.0679793202067978, "grad_norm": 2.934509754180908, "learning_rate": 5.8666666666666675e-06, "loss": 0.4538, "step": 94000 }, { "epoch": 2.0701792982070177, "grad_norm": 2.8510279655456543, "learning_rate": 5.862264773852757e-06, "loss": 0.4396, "step": 94100 }, { "epoch": 2.0723792762072377, "grad_norm": 2.753408670425415, "learning_rate": 5.857862881038847e-06, "loss": 0.4498, "step": 94200 }, { "epoch": 2.0745792542074577, "grad_norm": 2.5191516876220703, "learning_rate": 5.8534609882249375e-06, "loss": 0.4355, "step": 94300 }, { "epoch": 2.076779232207678, "grad_norm": 3.058117628097534, "learning_rate": 5.849059095411027e-06, "loss": 0.4496, "step": 94400 }, { "epoch": 2.078979210207898, "grad_norm": 2.3892626762390137, "learning_rate": 5.844657202597117e-06, "loss": 0.448, "step": 94500 }, { "epoch": 2.081179188208118, "grad_norm": 3.303252935409546, "learning_rate": 5.840255309783207e-06, "loss": 0.4423, "step": 94600 }, { "epoch": 2.083379166208338, "grad_norm": 2.571668863296509, "learning_rate": 5.835853416969297e-06, "loss": 0.4477, "step": 94700 }, { "epoch": 2.085579144208558, "grad_norm": 2.8675763607025146, "learning_rate": 5.831451524155387e-06, "loss": 0.4402, "step": 94800 }, { "epoch": 2.087779122208778, "grad_norm": 1.920617938041687, "learning_rate": 5.827049631341477e-06, "loss": 0.4469, "step": 94900 }, { "epoch": 2.089979100208998, "grad_norm": 2.4607462882995605, "learning_rate": 5.822647738527567e-06, "loss": 0.4578, "step": 95000 }, { "epoch": 2.092179078209218, "grad_norm": 2.3950858116149902, "learning_rate": 5.818245845713657e-06, "loss": 0.449, "step": 95100 }, { "epoch": 2.094379056209438, "grad_norm": 2.5188486576080322, "learning_rate": 5.813843952899747e-06, "loss": 0.4411, "step": 95200 }, { "epoch": 2.096579034209658, "grad_norm": 2.665241003036499, "learning_rate": 5.809442060085837e-06, "loss": 0.4555, "step": 95300 }, { "epoch": 2.098779012209878, "grad_norm": 3.0195603370666504, "learning_rate": 5.805040167271927e-06, "loss": 0.4605, "step": 95400 }, { "epoch": 2.100978990210098, "grad_norm": 3.2705276012420654, "learning_rate": 5.800638274458017e-06, "loss": 0.437, "step": 95500 }, { "epoch": 2.103178968210318, "grad_norm": 2.4358837604522705, "learning_rate": 5.796236381644107e-06, "loss": 0.4556, "step": 95600 }, { "epoch": 2.105378946210538, "grad_norm": 2.609314203262329, "learning_rate": 5.791834488830197e-06, "loss": 0.4396, "step": 95700 }, { "epoch": 2.107578924210758, "grad_norm": 2.715202808380127, "learning_rate": 5.787432596016287e-06, "loss": 0.4409, "step": 95800 }, { "epoch": 2.109778902210978, "grad_norm": 2.89326548576355, "learning_rate": 5.783030703202378e-06, "loss": 0.4473, "step": 95900 }, { "epoch": 2.111978880211198, "grad_norm": 2.722426414489746, "learning_rate": 5.778628810388468e-06, "loss": 0.4392, "step": 96000 }, { "epoch": 2.114178858211418, "grad_norm": 2.5516304969787598, "learning_rate": 5.774226917574558e-06, "loss": 0.4327, "step": 96100 }, { "epoch": 2.116378836211638, "grad_norm": 1.6953123807907104, "learning_rate": 5.769825024760648e-06, "loss": 0.4354, "step": 96200 }, { "epoch": 2.118578814211858, "grad_norm": 3.260712146759033, "learning_rate": 5.765423131946738e-06, "loss": 0.4587, "step": 96300 }, { "epoch": 2.1207787922120778, "grad_norm": 3.15496826171875, "learning_rate": 5.761021239132828e-06, "loss": 0.4455, "step": 96400 }, { "epoch": 2.1229787702122977, "grad_norm": 3.02713680267334, "learning_rate": 5.756619346318918e-06, "loss": 0.443, "step": 96500 }, { "epoch": 2.1251787482125177, "grad_norm": 2.6551177501678467, "learning_rate": 5.752217453505008e-06, "loss": 0.4361, "step": 96600 }, { "epoch": 2.1273787262127377, "grad_norm": 3.143676996231079, "learning_rate": 5.7478155606910975e-06, "loss": 0.4463, "step": 96700 }, { "epoch": 2.1295787042129577, "grad_norm": 3.07769775390625, "learning_rate": 5.743413667877188e-06, "loss": 0.4563, "step": 96800 }, { "epoch": 2.1317786822131777, "grad_norm": 2.862227439880371, "learning_rate": 5.739011775063278e-06, "loss": 0.4393, "step": 96900 }, { "epoch": 2.1339786602133977, "grad_norm": 2.652214288711548, "learning_rate": 5.7346098822493675e-06, "loss": 0.443, "step": 97000 }, { "epoch": 2.136178638213618, "grad_norm": 2.3733363151550293, "learning_rate": 5.730207989435458e-06, "loss": 0.4449, "step": 97100 }, { "epoch": 2.138378616213838, "grad_norm": 2.734473705291748, "learning_rate": 5.725806096621548e-06, "loss": 0.4357, "step": 97200 }, { "epoch": 2.140578594214058, "grad_norm": 2.783421039581299, "learning_rate": 5.7214042038076375e-06, "loss": 0.434, "step": 97300 }, { "epoch": 2.142778572214278, "grad_norm": 2.4740219116210938, "learning_rate": 5.717002310993728e-06, "loss": 0.4417, "step": 97400 }, { "epoch": 2.144978550214498, "grad_norm": 2.809589147567749, "learning_rate": 5.712600418179818e-06, "loss": 0.4507, "step": 97500 }, { "epoch": 2.147178528214718, "grad_norm": 2.179594039916992, "learning_rate": 5.7081985253659075e-06, "loss": 0.4552, "step": 97600 }, { "epoch": 2.149378506214938, "grad_norm": 2.5812172889709473, "learning_rate": 5.703796632551998e-06, "loss": 0.4462, "step": 97700 }, { "epoch": 2.151578484215158, "grad_norm": 2.6970343589782715, "learning_rate": 5.699394739738088e-06, "loss": 0.4448, "step": 97800 }, { "epoch": 2.153778462215378, "grad_norm": 3.2081048488616943, "learning_rate": 5.6949928469241775e-06, "loss": 0.4477, "step": 97900 }, { "epoch": 2.155978440215598, "grad_norm": 2.283027410507202, "learning_rate": 5.690590954110268e-06, "loss": 0.4554, "step": 98000 }, { "epoch": 2.158178418215818, "grad_norm": 2.4790256023406982, "learning_rate": 5.686189061296358e-06, "loss": 0.4443, "step": 98100 }, { "epoch": 2.160378396216038, "grad_norm": 3.0653131008148193, "learning_rate": 5.6817871684824475e-06, "loss": 0.4435, "step": 98200 }, { "epoch": 2.162578374216258, "grad_norm": 3.14249849319458, "learning_rate": 5.677385275668538e-06, "loss": 0.4528, "step": 98300 }, { "epoch": 2.164778352216478, "grad_norm": 3.3730337619781494, "learning_rate": 5.672983382854628e-06, "loss": 0.4397, "step": 98400 }, { "epoch": 2.166978330216698, "grad_norm": 3.2641589641571045, "learning_rate": 5.6685814900407175e-06, "loss": 0.4365, "step": 98500 }, { "epoch": 2.169178308216918, "grad_norm": 3.698474407196045, "learning_rate": 5.664179597226808e-06, "loss": 0.4416, "step": 98600 }, { "epoch": 2.171378286217138, "grad_norm": 2.253495454788208, "learning_rate": 5.659777704412897e-06, "loss": 0.4534, "step": 98700 }, { "epoch": 2.173578264217358, "grad_norm": 3.342864990234375, "learning_rate": 5.6553758115989875e-06, "loss": 0.4546, "step": 98800 }, { "epoch": 2.1757782422175778, "grad_norm": 2.818357229232788, "learning_rate": 5.650973918785078e-06, "loss": 0.4327, "step": 98900 }, { "epoch": 2.1779782202177977, "grad_norm": 3.623086452484131, "learning_rate": 5.646572025971167e-06, "loss": 0.4566, "step": 99000 }, { "epoch": 2.1801781982180177, "grad_norm": 3.0294673442840576, "learning_rate": 5.6421701331572575e-06, "loss": 0.4437, "step": 99100 }, { "epoch": 2.1823781762182377, "grad_norm": 2.562649726867676, "learning_rate": 5.637768240343348e-06, "loss": 0.4504, "step": 99200 }, { "epoch": 2.1845781542184577, "grad_norm": 2.9399819374084473, "learning_rate": 5.633366347529439e-06, "loss": 0.4405, "step": 99300 }, { "epoch": 2.1867781322186777, "grad_norm": 2.589012861251831, "learning_rate": 5.628964454715528e-06, "loss": 0.4332, "step": 99400 }, { "epoch": 2.1889781102188977, "grad_norm": 3.24257230758667, "learning_rate": 5.624562561901619e-06, "loss": 0.4486, "step": 99500 }, { "epoch": 2.1911780882191176, "grad_norm": 2.6864874362945557, "learning_rate": 5.620160669087709e-06, "loss": 0.4476, "step": 99600 }, { "epoch": 2.1933780662193376, "grad_norm": 2.183894634246826, "learning_rate": 5.615758776273798e-06, "loss": 0.4517, "step": 99700 }, { "epoch": 2.1955780442195576, "grad_norm": 2.297757625579834, "learning_rate": 5.611356883459889e-06, "loss": 0.4414, "step": 99800 }, { "epoch": 2.1977780222197776, "grad_norm": 2.6887316703796387, "learning_rate": 5.606954990645979e-06, "loss": 0.4359, "step": 99900 }, { "epoch": 2.199978000219998, "grad_norm": 2.8383491039276123, "learning_rate": 5.602553097832068e-06, "loss": 0.4455, "step": 100000 }, { "epoch": 2.199978000219998, "eval_loss": 0.5539576411247253, "eval_runtime": 386.4228, "eval_samples_per_second": 155.27, "eval_steps_per_second": 4.852, "step": 100000 }, { "epoch": 2.202177978220218, "grad_norm": 2.4842607975006104, "learning_rate": 5.598151205018159e-06, "loss": 0.4421, "step": 100100 }, { "epoch": 2.204377956220438, "grad_norm": 2.3061771392822266, "learning_rate": 5.593749312204249e-06, "loss": 0.4529, "step": 100200 }, { "epoch": 2.206577934220658, "grad_norm": 2.9890830516815186, "learning_rate": 5.589347419390338e-06, "loss": 0.4251, "step": 100300 }, { "epoch": 2.208777912220878, "grad_norm": 2.5472826957702637, "learning_rate": 5.584945526576429e-06, "loss": 0.4384, "step": 100400 }, { "epoch": 2.210977890221098, "grad_norm": 3.314694881439209, "learning_rate": 5.580543633762519e-06, "loss": 0.4372, "step": 100500 }, { "epoch": 2.213177868221318, "grad_norm": 3.1046979427337646, "learning_rate": 5.576141740948608e-06, "loss": 0.434, "step": 100600 }, { "epoch": 2.215377846221538, "grad_norm": 2.180180788040161, "learning_rate": 5.571739848134699e-06, "loss": 0.43, "step": 100700 }, { "epoch": 2.217577824221758, "grad_norm": 3.7238945960998535, "learning_rate": 5.567337955320788e-06, "loss": 0.4404, "step": 100800 }, { "epoch": 2.219777802221978, "grad_norm": 3.2101945877075195, "learning_rate": 5.562936062506878e-06, "loss": 0.4393, "step": 100900 }, { "epoch": 2.221977780222198, "grad_norm": 2.822737455368042, "learning_rate": 5.558534169692969e-06, "loss": 0.4407, "step": 101000 }, { "epoch": 2.224177758222418, "grad_norm": 2.736593723297119, "learning_rate": 5.554132276879058e-06, "loss": 0.4603, "step": 101100 }, { "epoch": 2.226377736222638, "grad_norm": 2.5259158611297607, "learning_rate": 5.549730384065148e-06, "loss": 0.438, "step": 101200 }, { "epoch": 2.2285777142228578, "grad_norm": 2.8023760318756104, "learning_rate": 5.545328491251239e-06, "loss": 0.4476, "step": 101300 }, { "epoch": 2.2307776922230778, "grad_norm": 3.469649076461792, "learning_rate": 5.540926598437328e-06, "loss": 0.4498, "step": 101400 }, { "epoch": 2.2329776702232977, "grad_norm": 2.2170920372009277, "learning_rate": 5.536524705623418e-06, "loss": 0.4531, "step": 101500 }, { "epoch": 2.2351776482235177, "grad_norm": 2.9399514198303223, "learning_rate": 5.532122812809509e-06, "loss": 0.4496, "step": 101600 }, { "epoch": 2.2373776262237377, "grad_norm": 3.1350746154785156, "learning_rate": 5.527720919995598e-06, "loss": 0.4412, "step": 101700 }, { "epoch": 2.2395776042239577, "grad_norm": 2.7231826782226562, "learning_rate": 5.523319027181688e-06, "loss": 0.4434, "step": 101800 }, { "epoch": 2.2417775822241777, "grad_norm": 2.8241002559661865, "learning_rate": 5.518917134367779e-06, "loss": 0.4405, "step": 101900 }, { "epoch": 2.2439775602243976, "grad_norm": 2.6854066848754883, "learning_rate": 5.514515241553868e-06, "loss": 0.4558, "step": 102000 }, { "epoch": 2.2461775382246176, "grad_norm": 3.1952197551727295, "learning_rate": 5.510113348739958e-06, "loss": 0.4354, "step": 102100 }, { "epoch": 2.2483775162248376, "grad_norm": 2.9026472568511963, "learning_rate": 5.505711455926049e-06, "loss": 0.4485, "step": 102200 }, { "epoch": 2.2505774942250576, "grad_norm": 3.1712558269500732, "learning_rate": 5.501309563112138e-06, "loss": 0.4468, "step": 102300 }, { "epoch": 2.2527774722252776, "grad_norm": 2.9717068672180176, "learning_rate": 5.496907670298228e-06, "loss": 0.4386, "step": 102400 }, { "epoch": 2.2549774502254976, "grad_norm": 2.8104095458984375, "learning_rate": 5.492505777484319e-06, "loss": 0.4452, "step": 102500 }, { "epoch": 2.2571774282257175, "grad_norm": 3.142512798309326, "learning_rate": 5.488103884670408e-06, "loss": 0.4487, "step": 102600 }, { "epoch": 2.259377406225938, "grad_norm": 3.723659038543701, "learning_rate": 5.483701991856498e-06, "loss": 0.449, "step": 102700 }, { "epoch": 2.2615773842261575, "grad_norm": 3.365520477294922, "learning_rate": 5.4793000990425895e-06, "loss": 0.4409, "step": 102800 }, { "epoch": 2.263777362226378, "grad_norm": 2.1158196926116943, "learning_rate": 5.474898206228679e-06, "loss": 0.4526, "step": 102900 }, { "epoch": 2.265977340226598, "grad_norm": 2.77187442779541, "learning_rate": 5.470496313414769e-06, "loss": 0.4597, "step": 103000 }, { "epoch": 2.268177318226818, "grad_norm": 3.1668035984039307, "learning_rate": 5.4660944206008595e-06, "loss": 0.4515, "step": 103100 }, { "epoch": 2.270377296227038, "grad_norm": 3.3199713230133057, "learning_rate": 5.461692527786949e-06, "loss": 0.4421, "step": 103200 }, { "epoch": 2.272577274227258, "grad_norm": 3.0452702045440674, "learning_rate": 5.457290634973039e-06, "loss": 0.451, "step": 103300 }, { "epoch": 2.274777252227478, "grad_norm": 2.889191150665283, "learning_rate": 5.4528887421591295e-06, "loss": 0.4433, "step": 103400 }, { "epoch": 2.276977230227698, "grad_norm": 3.1005496978759766, "learning_rate": 5.448486849345219e-06, "loss": 0.459, "step": 103500 }, { "epoch": 2.279177208227918, "grad_norm": 3.024289131164551, "learning_rate": 5.444084956531309e-06, "loss": 0.4369, "step": 103600 }, { "epoch": 2.281377186228138, "grad_norm": 2.3427116870880127, "learning_rate": 5.4396830637173995e-06, "loss": 0.4461, "step": 103700 }, { "epoch": 2.2835771642283578, "grad_norm": 3.6452486515045166, "learning_rate": 5.435281170903489e-06, "loss": 0.4626, "step": 103800 }, { "epoch": 2.2857771422285778, "grad_norm": 3.5883066654205322, "learning_rate": 5.430879278089579e-06, "loss": 0.4439, "step": 103900 }, { "epoch": 2.2879771202287977, "grad_norm": 3.1896305084228516, "learning_rate": 5.4264773852756695e-06, "loss": 0.4342, "step": 104000 }, { "epoch": 2.2901770982290177, "grad_norm": 3.0149104595184326, "learning_rate": 5.422075492461759e-06, "loss": 0.4503, "step": 104100 }, { "epoch": 2.2923770762292377, "grad_norm": 3.1118035316467285, "learning_rate": 5.417673599647849e-06, "loss": 0.4402, "step": 104200 }, { "epoch": 2.2945770542294577, "grad_norm": 3.0000152587890625, "learning_rate": 5.4132717068339395e-06, "loss": 0.4321, "step": 104300 }, { "epoch": 2.2967770322296777, "grad_norm": 3.1988613605499268, "learning_rate": 5.408869814020029e-06, "loss": 0.4458, "step": 104400 }, { "epoch": 2.2989770102298976, "grad_norm": 2.5336127281188965, "learning_rate": 5.404467921206119e-06, "loss": 0.4412, "step": 104500 }, { "epoch": 2.3011769882301176, "grad_norm": 2.478907823562622, "learning_rate": 5.400066028392209e-06, "loss": 0.4391, "step": 104600 }, { "epoch": 2.3033769662303376, "grad_norm": 3.0680346488952637, "learning_rate": 5.395664135578299e-06, "loss": 0.4466, "step": 104700 }, { "epoch": 2.3055769442305576, "grad_norm": 2.976754665374756, "learning_rate": 5.391262242764389e-06, "loss": 0.4534, "step": 104800 }, { "epoch": 2.3077769222307776, "grad_norm": 2.921550989151001, "learning_rate": 5.386860349950479e-06, "loss": 0.4461, "step": 104900 }, { "epoch": 2.3099769002309976, "grad_norm": 2.6085400581359863, "learning_rate": 5.382458457136569e-06, "loss": 0.439, "step": 105000 }, { "epoch": 2.3121768782312175, "grad_norm": 3.231365203857422, "learning_rate": 5.378056564322659e-06, "loss": 0.4511, "step": 105100 }, { "epoch": 2.3143768562314375, "grad_norm": 3.2471604347229004, "learning_rate": 5.373654671508749e-06, "loss": 0.434, "step": 105200 }, { "epoch": 2.3165768342316575, "grad_norm": 3.265526056289673, "learning_rate": 5.369252778694839e-06, "loss": 0.4414, "step": 105300 }, { "epoch": 2.318776812231878, "grad_norm": 2.632627010345459, "learning_rate": 5.364850885880929e-06, "loss": 0.4469, "step": 105400 }, { "epoch": 2.3209767902320975, "grad_norm": 3.3575692176818848, "learning_rate": 5.360448993067019e-06, "loss": 0.4517, "step": 105500 }, { "epoch": 2.323176768232318, "grad_norm": 2.751236915588379, "learning_rate": 5.356047100253109e-06, "loss": 0.4321, "step": 105600 }, { "epoch": 2.3253767462325374, "grad_norm": 3.4512314796447754, "learning_rate": 5.351645207439199e-06, "loss": 0.4513, "step": 105700 }, { "epoch": 2.327576724232758, "grad_norm": 2.5892844200134277, "learning_rate": 5.347243314625289e-06, "loss": 0.448, "step": 105800 }, { "epoch": 2.329776702232978, "grad_norm": 3.1810543537139893, "learning_rate": 5.342841421811379e-06, "loss": 0.4489, "step": 105900 }, { "epoch": 2.331976680233198, "grad_norm": 3.527425765991211, "learning_rate": 5.338439528997469e-06, "loss": 0.4362, "step": 106000 }, { "epoch": 2.334176658233418, "grad_norm": 2.97705340385437, "learning_rate": 5.334037636183559e-06, "loss": 0.4424, "step": 106100 }, { "epoch": 2.336376636233638, "grad_norm": 2.3554928302764893, "learning_rate": 5.329635743369649e-06, "loss": 0.4354, "step": 106200 }, { "epoch": 2.3385766142338578, "grad_norm": 3.598785161972046, "learning_rate": 5.32523385055574e-06, "loss": 0.4429, "step": 106300 }, { "epoch": 2.3407765922340777, "grad_norm": 3.603203058242798, "learning_rate": 5.32083195774183e-06, "loss": 0.4508, "step": 106400 }, { "epoch": 2.3429765702342977, "grad_norm": 2.5761771202087402, "learning_rate": 5.31643006492792e-06, "loss": 0.448, "step": 106500 }, { "epoch": 2.3451765482345177, "grad_norm": 3.6221818923950195, "learning_rate": 5.31202817211401e-06, "loss": 0.4305, "step": 106600 }, { "epoch": 2.3473765262347377, "grad_norm": 3.062361717224121, "learning_rate": 5.3076262793000995e-06, "loss": 0.45, "step": 106700 }, { "epoch": 2.3495765042349577, "grad_norm": 2.6559677124023438, "learning_rate": 5.30322438648619e-06, "loss": 0.4569, "step": 106800 }, { "epoch": 2.3517764822351777, "grad_norm": 2.8080978393554688, "learning_rate": 5.29882249367228e-06, "loss": 0.4376, "step": 106900 }, { "epoch": 2.3539764602353976, "grad_norm": 2.880061388015747, "learning_rate": 5.2944206008583695e-06, "loss": 0.4435, "step": 107000 }, { "epoch": 2.3561764382356176, "grad_norm": 2.902592420578003, "learning_rate": 5.29001870804446e-06, "loss": 0.4446, "step": 107100 }, { "epoch": 2.3583764162358376, "grad_norm": 2.560408592224121, "learning_rate": 5.28561681523055e-06, "loss": 0.4533, "step": 107200 }, { "epoch": 2.3605763942360576, "grad_norm": 3.5301778316497803, "learning_rate": 5.2812149224166395e-06, "loss": 0.4499, "step": 107300 }, { "epoch": 2.3627763722362776, "grad_norm": 3.1170268058776855, "learning_rate": 5.27681302960273e-06, "loss": 0.4392, "step": 107400 }, { "epoch": 2.3649763502364975, "grad_norm": 2.9975242614746094, "learning_rate": 5.27241113678882e-06, "loss": 0.4443, "step": 107500 }, { "epoch": 2.3671763282367175, "grad_norm": 2.9318737983703613, "learning_rate": 5.2680092439749095e-06, "loss": 0.4382, "step": 107600 }, { "epoch": 2.3693763062369375, "grad_norm": 2.7009778022766113, "learning_rate": 5.263607351161e-06, "loss": 0.4486, "step": 107700 }, { "epoch": 2.3715762842371575, "grad_norm": 3.265301465988159, "learning_rate": 5.25920545834709e-06, "loss": 0.4386, "step": 107800 }, { "epoch": 2.3737762622373775, "grad_norm": 3.5099949836730957, "learning_rate": 5.2548035655331795e-06, "loss": 0.4354, "step": 107900 }, { "epoch": 2.3759762402375975, "grad_norm": 2.997199296951294, "learning_rate": 5.25040167271927e-06, "loss": 0.4449, "step": 108000 }, { "epoch": 2.3781762182378174, "grad_norm": 3.5661022663116455, "learning_rate": 5.24599977990536e-06, "loss": 0.4533, "step": 108100 }, { "epoch": 2.3803761962380374, "grad_norm": 2.6311588287353516, "learning_rate": 5.2415978870914495e-06, "loss": 0.4535, "step": 108200 }, { "epoch": 2.382576174238258, "grad_norm": 2.3854992389678955, "learning_rate": 5.23719599427754e-06, "loss": 0.441, "step": 108300 }, { "epoch": 2.3847761522384774, "grad_norm": 3.3768720626831055, "learning_rate": 5.23279410146363e-06, "loss": 0.4467, "step": 108400 }, { "epoch": 2.386976130238698, "grad_norm": 3.2119550704956055, "learning_rate": 5.2283922086497195e-06, "loss": 0.4608, "step": 108500 }, { "epoch": 2.389176108238918, "grad_norm": 3.434720277786255, "learning_rate": 5.22399031583581e-06, "loss": 0.4415, "step": 108600 }, { "epoch": 2.3913760862391378, "grad_norm": 2.7094149589538574, "learning_rate": 5.219588423021899e-06, "loss": 0.4562, "step": 108700 }, { "epoch": 2.3935760642393578, "grad_norm": 2.9980342388153076, "learning_rate": 5.2151865302079895e-06, "loss": 0.4564, "step": 108800 }, { "epoch": 2.3957760422395777, "grad_norm": 3.241988182067871, "learning_rate": 5.21078463739408e-06, "loss": 0.4494, "step": 108900 }, { "epoch": 2.3979760202397977, "grad_norm": 3.08487606048584, "learning_rate": 5.206382744580169e-06, "loss": 0.439, "step": 109000 }, { "epoch": 2.4001759982400177, "grad_norm": 3.0313308238983154, "learning_rate": 5.2019808517662595e-06, "loss": 0.4412, "step": 109100 }, { "epoch": 2.4023759762402377, "grad_norm": 3.303107738494873, "learning_rate": 5.19757895895235e-06, "loss": 0.4437, "step": 109200 }, { "epoch": 2.4045759542404577, "grad_norm": 2.7623887062072754, "learning_rate": 5.193177066138439e-06, "loss": 0.4426, "step": 109300 }, { "epoch": 2.4067759322406777, "grad_norm": 3.3932597637176514, "learning_rate": 5.1887751733245295e-06, "loss": 0.4381, "step": 109400 }, { "epoch": 2.4089759102408976, "grad_norm": 2.618532657623291, "learning_rate": 5.18437328051062e-06, "loss": 0.4466, "step": 109500 }, { "epoch": 2.4111758882411176, "grad_norm": 2.4478089809417725, "learning_rate": 5.179971387696709e-06, "loss": 0.4494, "step": 109600 }, { "epoch": 2.4133758662413376, "grad_norm": 3.646751642227173, "learning_rate": 5.1755694948828e-06, "loss": 0.4684, "step": 109700 }, { "epoch": 2.4155758442415576, "grad_norm": 2.54317569732666, "learning_rate": 5.171167602068891e-06, "loss": 0.4467, "step": 109800 }, { "epoch": 2.4177758222417776, "grad_norm": 2.939131021499634, "learning_rate": 5.166765709254981e-06, "loss": 0.4438, "step": 109900 }, { "epoch": 2.4199758002419975, "grad_norm": 3.4897677898406982, "learning_rate": 5.16236381644107e-06, "loss": 0.4444, "step": 110000 }, { "epoch": 2.4221757782422175, "grad_norm": 3.108614683151245, "learning_rate": 5.157961923627161e-06, "loss": 0.4639, "step": 110100 }, { "epoch": 2.4243757562424375, "grad_norm": 3.135338068008423, "learning_rate": 5.153560030813251e-06, "loss": 0.4424, "step": 110200 }, { "epoch": 2.4265757342426575, "grad_norm": 2.249314785003662, "learning_rate": 5.14915813799934e-06, "loss": 0.4298, "step": 110300 }, { "epoch": 2.4287757122428775, "grad_norm": 2.582010269165039, "learning_rate": 5.144756245185431e-06, "loss": 0.4404, "step": 110400 }, { "epoch": 2.4309756902430975, "grad_norm": 3.1904852390289307, "learning_rate": 5.140354352371521e-06, "loss": 0.4526, "step": 110500 }, { "epoch": 2.4331756682433174, "grad_norm": 3.054769277572632, "learning_rate": 5.13595245955761e-06, "loss": 0.4332, "step": 110600 }, { "epoch": 2.4353756462435374, "grad_norm": 2.8248226642608643, "learning_rate": 5.131550566743701e-06, "loss": 0.446, "step": 110700 }, { "epoch": 2.4375756242437574, "grad_norm": 3.6720070838928223, "learning_rate": 5.12714867392979e-06, "loss": 0.445, "step": 110800 }, { "epoch": 2.4397756022439774, "grad_norm": 2.920863389968872, "learning_rate": 5.12274678111588e-06, "loss": 0.4482, "step": 110900 }, { "epoch": 2.441975580244198, "grad_norm": 2.426818370819092, "learning_rate": 5.118344888301971e-06, "loss": 0.452, "step": 111000 }, { "epoch": 2.4441755582444173, "grad_norm": 3.390026330947876, "learning_rate": 5.11394299548806e-06, "loss": 0.4413, "step": 111100 }, { "epoch": 2.4463755362446378, "grad_norm": 3.1187210083007812, "learning_rate": 5.10954110267415e-06, "loss": 0.4381, "step": 111200 }, { "epoch": 2.4485755142448573, "grad_norm": 3.0595436096191406, "learning_rate": 5.105139209860241e-06, "loss": 0.4432, "step": 111300 }, { "epoch": 2.4507754922450777, "grad_norm": 2.9489197731018066, "learning_rate": 5.10073731704633e-06, "loss": 0.438, "step": 111400 }, { "epoch": 2.4529754702452977, "grad_norm": 2.4834353923797607, "learning_rate": 5.09633542423242e-06, "loss": 0.4452, "step": 111500 }, { "epoch": 2.4551754482455177, "grad_norm": 3.135232925415039, "learning_rate": 5.091933531418511e-06, "loss": 0.4493, "step": 111600 }, { "epoch": 2.4573754262457377, "grad_norm": 3.158200263977051, "learning_rate": 5.0875316386046e-06, "loss": 0.446, "step": 111700 }, { "epoch": 2.4595754042459577, "grad_norm": 3.199408531188965, "learning_rate": 5.08312974579069e-06, "loss": 0.4457, "step": 111800 }, { "epoch": 2.4617753822461776, "grad_norm": 2.692122459411621, "learning_rate": 5.078727852976781e-06, "loss": 0.4453, "step": 111900 }, { "epoch": 2.4639753602463976, "grad_norm": 2.708963632583618, "learning_rate": 5.07432596016287e-06, "loss": 0.4375, "step": 112000 }, { "epoch": 2.4661753382466176, "grad_norm": 2.9427683353424072, "learning_rate": 5.06992406734896e-06, "loss": 0.4419, "step": 112100 }, { "epoch": 2.4683753162468376, "grad_norm": 3.154421329498291, "learning_rate": 5.065522174535051e-06, "loss": 0.4539, "step": 112200 }, { "epoch": 2.4705752942470576, "grad_norm": 2.364830255508423, "learning_rate": 5.06112028172114e-06, "loss": 0.4535, "step": 112300 }, { "epoch": 2.4727752722472776, "grad_norm": 3.534681797027588, "learning_rate": 5.05671838890723e-06, "loss": 0.4457, "step": 112400 }, { "epoch": 2.4749752502474975, "grad_norm": 3.2701926231384277, "learning_rate": 5.052316496093321e-06, "loss": 0.447, "step": 112500 }, { "epoch": 2.4771752282477175, "grad_norm": 2.395355701446533, "learning_rate": 5.04791460327941e-06, "loss": 0.4399, "step": 112600 }, { "epoch": 2.4793752062479375, "grad_norm": 3.506509780883789, "learning_rate": 5.0435127104655e-06, "loss": 0.4527, "step": 112700 }, { "epoch": 2.4815751842481575, "grad_norm": 2.844534397125244, "learning_rate": 5.03911081765159e-06, "loss": 0.4436, "step": 112800 }, { "epoch": 2.4837751622483775, "grad_norm": 3.3356661796569824, "learning_rate": 5.03470892483768e-06, "loss": 0.4432, "step": 112900 }, { "epoch": 2.4859751402485974, "grad_norm": 2.6262450218200684, "learning_rate": 5.03030703202377e-06, "loss": 0.4508, "step": 113000 }, { "epoch": 2.4881751182488174, "grad_norm": 2.852914810180664, "learning_rate": 5.02590513920986e-06, "loss": 0.453, "step": 113100 }, { "epoch": 2.4903750962490374, "grad_norm": 3.224490165710449, "learning_rate": 5.021503246395951e-06, "loss": 0.4637, "step": 113200 }, { "epoch": 2.4925750742492574, "grad_norm": 2.180025577545166, "learning_rate": 5.017101353582041e-06, "loss": 0.4316, "step": 113300 }, { "epoch": 2.4947750522494774, "grad_norm": 2.4598264694213867, "learning_rate": 5.0126994607681314e-06, "loss": 0.4381, "step": 113400 }, { "epoch": 2.4969750302496974, "grad_norm": 2.587557315826416, "learning_rate": 5.008297567954221e-06, "loss": 0.4469, "step": 113500 }, { "epoch": 2.4991750082499173, "grad_norm": 2.93766450881958, "learning_rate": 5.003895675140311e-06, "loss": 0.459, "step": 113600 }, { "epoch": 2.5013749862501378, "grad_norm": 2.926692485809326, "learning_rate": 4.999493782326401e-06, "loss": 0.4444, "step": 113700 }, { "epoch": 2.5035749642503573, "grad_norm": 3.22385311126709, "learning_rate": 4.995091889512491e-06, "loss": 0.4395, "step": 113800 }, { "epoch": 2.5057749422505777, "grad_norm": 2.241689682006836, "learning_rate": 4.99068999669858e-06, "loss": 0.4461, "step": 113900 }, { "epoch": 2.5079749202507973, "grad_norm": 3.1882591247558594, "learning_rate": 4.986288103884671e-06, "loss": 0.4482, "step": 114000 }, { "epoch": 2.5101748982510177, "grad_norm": 2.0357823371887207, "learning_rate": 4.981886211070761e-06, "loss": 0.4291, "step": 114100 }, { "epoch": 2.5123748762512372, "grad_norm": 3.0307114124298096, "learning_rate": 4.977484318256851e-06, "loss": 0.4556, "step": 114200 }, { "epoch": 2.5145748542514577, "grad_norm": 3.274482488632202, "learning_rate": 4.9730824254429414e-06, "loss": 0.4561, "step": 114300 }, { "epoch": 2.5167748322516776, "grad_norm": 2.6364364624023438, "learning_rate": 4.968680532629031e-06, "loss": 0.4462, "step": 114400 }, { "epoch": 2.5189748102518976, "grad_norm": 3.74102520942688, "learning_rate": 4.964278639815121e-06, "loss": 0.4343, "step": 114500 }, { "epoch": 2.5211747882521176, "grad_norm": 2.6400420665740967, "learning_rate": 4.9598767470012114e-06, "loss": 0.431, "step": 114600 }, { "epoch": 2.5233747662523376, "grad_norm": 3.313936948776245, "learning_rate": 4.955474854187301e-06, "loss": 0.4361, "step": 114700 }, { "epoch": 2.5255747442525576, "grad_norm": 3.1733415126800537, "learning_rate": 4.951072961373391e-06, "loss": 0.4346, "step": 114800 }, { "epoch": 2.5277747222527776, "grad_norm": 3.2925596237182617, "learning_rate": 4.946671068559481e-06, "loss": 0.4382, "step": 114900 }, { "epoch": 2.5299747002529975, "grad_norm": 3.0472724437713623, "learning_rate": 4.942269175745571e-06, "loss": 0.4294, "step": 115000 }, { "epoch": 2.5321746782532175, "grad_norm": 2.684380054473877, "learning_rate": 4.937867282931661e-06, "loss": 0.446, "step": 115100 }, { "epoch": 2.5343746562534375, "grad_norm": 2.86007022857666, "learning_rate": 4.933465390117751e-06, "loss": 0.4445, "step": 115200 }, { "epoch": 2.5365746342536575, "grad_norm": 3.1201276779174805, "learning_rate": 4.929063497303841e-06, "loss": 0.4363, "step": 115300 }, { "epoch": 2.5387746122538775, "grad_norm": 2.558084726333618, "learning_rate": 4.924661604489931e-06, "loss": 0.4272, "step": 115400 }, { "epoch": 2.5409745902540974, "grad_norm": 3.3204970359802246, "learning_rate": 4.920259711676021e-06, "loss": 0.425, "step": 115500 }, { "epoch": 2.5431745682543174, "grad_norm": 2.824089288711548, "learning_rate": 4.915857818862111e-06, "loss": 0.4428, "step": 115600 }, { "epoch": 2.5453745462545374, "grad_norm": 3.430072784423828, "learning_rate": 4.911455926048201e-06, "loss": 0.435, "step": 115700 }, { "epoch": 2.5475745242547574, "grad_norm": 3.415888547897339, "learning_rate": 4.9070540332342914e-06, "loss": 0.4546, "step": 115800 }, { "epoch": 2.5497745022549774, "grad_norm": 3.0503039360046387, "learning_rate": 4.902652140420382e-06, "loss": 0.4359, "step": 115900 }, { "epoch": 2.5519744802551974, "grad_norm": 3.0880868434906006, "learning_rate": 4.898250247606471e-06, "loss": 0.445, "step": 116000 }, { "epoch": 2.5541744582554173, "grad_norm": 3.1325924396514893, "learning_rate": 4.8938483547925614e-06, "loss": 0.4339, "step": 116100 }, { "epoch": 2.5563744362556373, "grad_norm": 3.6278367042541504, "learning_rate": 4.889446461978652e-06, "loss": 0.4573, "step": 116200 }, { "epoch": 2.5585744142558573, "grad_norm": 2.8526251316070557, "learning_rate": 4.885044569164741e-06, "loss": 0.4359, "step": 116300 }, { "epoch": 2.5607743922560773, "grad_norm": 2.335749864578247, "learning_rate": 4.8806426763508314e-06, "loss": 0.4336, "step": 116400 }, { "epoch": 2.5629743702562973, "grad_norm": 2.912464141845703, "learning_rate": 4.876240783536922e-06, "loss": 0.4463, "step": 116500 }, { "epoch": 2.5651743482565177, "grad_norm": 3.1221654415130615, "learning_rate": 4.871838890723011e-06, "loss": 0.4394, "step": 116600 }, { "epoch": 2.5673743262567372, "grad_norm": 3.0189766883850098, "learning_rate": 4.8674369979091014e-06, "loss": 0.4279, "step": 116700 }, { "epoch": 2.5695743042569577, "grad_norm": 2.7130327224731445, "learning_rate": 4.863035105095191e-06, "loss": 0.4397, "step": 116800 }, { "epoch": 2.571774282257177, "grad_norm": 3.0579302310943604, "learning_rate": 4.858633212281281e-06, "loss": 0.4509, "step": 116900 }, { "epoch": 2.5739742602573976, "grad_norm": 2.568333625793457, "learning_rate": 4.8542313194673714e-06, "loss": 0.4391, "step": 117000 }, { "epoch": 2.576174238257617, "grad_norm": 3.574082612991333, "learning_rate": 4.849829426653461e-06, "loss": 0.4354, "step": 117100 }, { "epoch": 2.5783742162578376, "grad_norm": 2.516016721725464, "learning_rate": 4.845427533839551e-06, "loss": 0.4417, "step": 117200 }, { "epoch": 2.5805741942580576, "grad_norm": 2.464613199234009, "learning_rate": 4.8410256410256414e-06, "loss": 0.4464, "step": 117300 }, { "epoch": 2.5827741722582775, "grad_norm": 2.841379404067993, "learning_rate": 4.836623748211731e-06, "loss": 0.45, "step": 117400 }, { "epoch": 2.5849741502584975, "grad_norm": 2.6032309532165527, "learning_rate": 4.832221855397821e-06, "loss": 0.4258, "step": 117500 }, { "epoch": 2.5871741282587175, "grad_norm": 2.964139938354492, "learning_rate": 4.8278199625839114e-06, "loss": 0.4495, "step": 117600 }, { "epoch": 2.5893741062589375, "grad_norm": 4.020530700683594, "learning_rate": 4.823418069770002e-06, "loss": 0.435, "step": 117700 }, { "epoch": 2.5915740842591575, "grad_norm": 2.493910312652588, "learning_rate": 4.819016176956092e-06, "loss": 0.4371, "step": 117800 }, { "epoch": 2.5937740622593775, "grad_norm": 3.542248249053955, "learning_rate": 4.8146142841421814e-06, "loss": 0.4489, "step": 117900 }, { "epoch": 2.5959740402595974, "grad_norm": 3.5736639499664307, "learning_rate": 4.810212391328272e-06, "loss": 0.43, "step": 118000 }, { "epoch": 2.5981740182598174, "grad_norm": 2.6904780864715576, "learning_rate": 4.805810498514362e-06, "loss": 0.4326, "step": 118100 }, { "epoch": 2.6003739962600374, "grad_norm": 3.0727078914642334, "learning_rate": 4.8014086057004514e-06, "loss": 0.4338, "step": 118200 }, { "epoch": 2.6025739742602574, "grad_norm": 3.355623722076416, "learning_rate": 4.797006712886542e-06, "loss": 0.4489, "step": 118300 }, { "epoch": 2.6047739522604774, "grad_norm": 2.657305955886841, "learning_rate": 4.792604820072632e-06, "loss": 0.4416, "step": 118400 }, { "epoch": 2.6069739302606973, "grad_norm": 2.6770079135894775, "learning_rate": 4.7882029272587214e-06, "loss": 0.4317, "step": 118500 }, { "epoch": 2.6091739082609173, "grad_norm": 3.4358301162719727, "learning_rate": 4.783801034444812e-06, "loss": 0.4307, "step": 118600 }, { "epoch": 2.6113738862611373, "grad_norm": 2.719377040863037, "learning_rate": 4.779399141630901e-06, "loss": 0.4452, "step": 118700 }, { "epoch": 2.6135738642613573, "grad_norm": 3.2705419063568115, "learning_rate": 4.7749972488169914e-06, "loss": 0.4391, "step": 118800 }, { "epoch": 2.6157738422615773, "grad_norm": 2.5769264698028564, "learning_rate": 4.770595356003082e-06, "loss": 0.4566, "step": 118900 }, { "epoch": 2.6179738202617973, "grad_norm": 3.0442352294921875, "learning_rate": 4.766193463189171e-06, "loss": 0.4394, "step": 119000 }, { "epoch": 2.6201737982620172, "grad_norm": 2.913459539413452, "learning_rate": 4.7617915703752614e-06, "loss": 0.4445, "step": 119100 }, { "epoch": 2.622373776262237, "grad_norm": 2.6879146099090576, "learning_rate": 4.757389677561352e-06, "loss": 0.4433, "step": 119200 }, { "epoch": 2.6245737542624576, "grad_norm": 3.6051576137542725, "learning_rate": 4.752987784747442e-06, "loss": 0.4385, "step": 119300 }, { "epoch": 2.626773732262677, "grad_norm": 2.6867752075195312, "learning_rate": 4.748585891933532e-06, "loss": 0.4569, "step": 119400 }, { "epoch": 2.6289737102628976, "grad_norm": 2.795522928237915, "learning_rate": 4.744183999119622e-06, "loss": 0.4413, "step": 119500 }, { "epoch": 2.631173688263117, "grad_norm": 3.2469236850738525, "learning_rate": 4.739782106305712e-06, "loss": 0.4539, "step": 119600 }, { "epoch": 2.6333736662633376, "grad_norm": 3.3304011821746826, "learning_rate": 4.735380213491802e-06, "loss": 0.46, "step": 119700 }, { "epoch": 2.635573644263557, "grad_norm": 3.8114166259765625, "learning_rate": 4.730978320677892e-06, "loss": 0.4472, "step": 119800 }, { "epoch": 2.6377736222637775, "grad_norm": 3.1319470405578613, "learning_rate": 4.726576427863982e-06, "loss": 0.4381, "step": 119900 }, { "epoch": 2.6399736002639975, "grad_norm": 3.4921023845672607, "learning_rate": 4.722174535050072e-06, "loss": 0.4397, "step": 120000 }, { "epoch": 2.6421735782642175, "grad_norm": 2.191277265548706, "learning_rate": 4.717772642236162e-06, "loss": 0.4329, "step": 120100 }, { "epoch": 2.6443735562644375, "grad_norm": 3.3526830673217773, "learning_rate": 4.713370749422252e-06, "loss": 0.4425, "step": 120200 }, { "epoch": 2.6465735342646575, "grad_norm": 3.1728663444519043, "learning_rate": 4.708968856608342e-06, "loss": 0.4304, "step": 120300 }, { "epoch": 2.6487735122648774, "grad_norm": 2.8686399459838867, "learning_rate": 4.704566963794432e-06, "loss": 0.4414, "step": 120400 }, { "epoch": 2.6509734902650974, "grad_norm": 3.0160744190216064, "learning_rate": 4.700165070980522e-06, "loss": 0.4318, "step": 120500 }, { "epoch": 2.6531734682653174, "grad_norm": 3.1512398719787598, "learning_rate": 4.695763178166612e-06, "loss": 0.4469, "step": 120600 }, { "epoch": 2.6553734462655374, "grad_norm": 3.48527193069458, "learning_rate": 4.691361285352702e-06, "loss": 0.4239, "step": 120700 }, { "epoch": 2.6575734242657574, "grad_norm": 3.1018311977386475, "learning_rate": 4.686959392538792e-06, "loss": 0.4555, "step": 120800 }, { "epoch": 2.6597734022659774, "grad_norm": 2.867196559906006, "learning_rate": 4.6825574997248814e-06, "loss": 0.4392, "step": 120900 }, { "epoch": 2.6619733802661973, "grad_norm": 3.6857316493988037, "learning_rate": 4.6781556069109726e-06, "loss": 0.4384, "step": 121000 }, { "epoch": 2.6641733582664173, "grad_norm": 3.517019748687744, "learning_rate": 4.673753714097062e-06, "loss": 0.4323, "step": 121100 }, { "epoch": 2.6663733362666373, "grad_norm": 2.7562782764434814, "learning_rate": 4.669351821283152e-06, "loss": 0.4425, "step": 121200 }, { "epoch": 2.6685733142668573, "grad_norm": 3.2904388904571533, "learning_rate": 4.6649499284692426e-06, "loss": 0.4365, "step": 121300 }, { "epoch": 2.6707732922670773, "grad_norm": 2.5950496196746826, "learning_rate": 4.660548035655332e-06, "loss": 0.4451, "step": 121400 }, { "epoch": 2.6729732702672973, "grad_norm": 2.883255958557129, "learning_rate": 4.656146142841422e-06, "loss": 0.4327, "step": 121500 }, { "epoch": 2.6751732482675172, "grad_norm": 2.8640213012695312, "learning_rate": 4.6517442500275126e-06, "loss": 0.4507, "step": 121600 }, { "epoch": 2.677373226267737, "grad_norm": 2.547304630279541, "learning_rate": 4.647342357213602e-06, "loss": 0.4391, "step": 121700 }, { "epoch": 2.679573204267957, "grad_norm": 2.599860668182373, "learning_rate": 4.642940464399692e-06, "loss": 0.4351, "step": 121800 }, { "epoch": 2.681773182268177, "grad_norm": 2.898108959197998, "learning_rate": 4.6385385715857826e-06, "loss": 0.4486, "step": 121900 }, { "epoch": 2.683973160268397, "grad_norm": 3.2468693256378174, "learning_rate": 4.634136678771872e-06, "loss": 0.4465, "step": 122000 }, { "epoch": 2.686173138268617, "grad_norm": 2.5715901851654053, "learning_rate": 4.629734785957962e-06, "loss": 0.4464, "step": 122100 }, { "epoch": 2.6883731162688376, "grad_norm": 3.2512638568878174, "learning_rate": 4.6253328931440526e-06, "loss": 0.436, "step": 122200 }, { "epoch": 2.690573094269057, "grad_norm": 3.1215593814849854, "learning_rate": 4.620931000330142e-06, "loss": 0.4333, "step": 122300 }, { "epoch": 2.6927730722692775, "grad_norm": 3.264613151550293, "learning_rate": 4.616529107516232e-06, "loss": 0.4288, "step": 122400 }, { "epoch": 2.694973050269497, "grad_norm": 3.0146634578704834, "learning_rate": 4.6121272147023226e-06, "loss": 0.4436, "step": 122500 }, { "epoch": 2.6971730282697175, "grad_norm": 3.1037158966064453, "learning_rate": 4.607725321888412e-06, "loss": 0.4545, "step": 122600 }, { "epoch": 2.699373006269937, "grad_norm": 3.4465529918670654, "learning_rate": 4.603323429074502e-06, "loss": 0.421, "step": 122700 }, { "epoch": 2.7015729842701575, "grad_norm": 3.160284996032715, "learning_rate": 4.5989215362605926e-06, "loss": 0.4323, "step": 122800 }, { "epoch": 2.7037729622703774, "grad_norm": 2.4882216453552246, "learning_rate": 4.594519643446683e-06, "loss": 0.4477, "step": 122900 }, { "epoch": 2.7059729402705974, "grad_norm": 2.794928550720215, "learning_rate": 4.590117750632772e-06, "loss": 0.4365, "step": 123000 }, { "epoch": 2.7081729182708174, "grad_norm": 3.3920443058013916, "learning_rate": 4.5857158578188626e-06, "loss": 0.4419, "step": 123100 }, { "epoch": 2.7103728962710374, "grad_norm": 3.186394214630127, "learning_rate": 4.581313965004953e-06, "loss": 0.4226, "step": 123200 }, { "epoch": 2.7125728742712574, "grad_norm": 3.5185060501098633, "learning_rate": 4.576912072191042e-06, "loss": 0.4487, "step": 123300 }, { "epoch": 2.7147728522714774, "grad_norm": 2.8069283962249756, "learning_rate": 4.5725101793771326e-06, "loss": 0.443, "step": 123400 }, { "epoch": 2.7169728302716973, "grad_norm": 4.11994743347168, "learning_rate": 4.568108286563223e-06, "loss": 0.4387, "step": 123500 }, { "epoch": 2.7191728082719173, "grad_norm": 3.6650454998016357, "learning_rate": 4.563706393749312e-06, "loss": 0.4431, "step": 123600 }, { "epoch": 2.7213727862721373, "grad_norm": 2.7214787006378174, "learning_rate": 4.5593045009354026e-06, "loss": 0.4299, "step": 123700 }, { "epoch": 2.7235727642723573, "grad_norm": 3.1517221927642822, "learning_rate": 4.554902608121493e-06, "loss": 0.4417, "step": 123800 }, { "epoch": 2.7257727422725773, "grad_norm": 2.9995832443237305, "learning_rate": 4.550500715307582e-06, "loss": 0.4357, "step": 123900 }, { "epoch": 2.7279727202727972, "grad_norm": 3.1918044090270996, "learning_rate": 4.5460988224936726e-06, "loss": 0.442, "step": 124000 }, { "epoch": 2.7301726982730172, "grad_norm": 3.101876735687256, "learning_rate": 4.541696929679763e-06, "loss": 0.4333, "step": 124100 }, { "epoch": 2.732372676273237, "grad_norm": 3.4324588775634766, "learning_rate": 4.537295036865852e-06, "loss": 0.431, "step": 124200 }, { "epoch": 2.734572654273457, "grad_norm": 3.6786587238311768, "learning_rate": 4.5328931440519426e-06, "loss": 0.4499, "step": 124300 }, { "epoch": 2.736772632273677, "grad_norm": 2.661198139190674, "learning_rate": 4.528491251238033e-06, "loss": 0.4446, "step": 124400 }, { "epoch": 2.738972610273897, "grad_norm": 2.958374500274658, "learning_rate": 4.524089358424123e-06, "loss": 0.4364, "step": 124500 }, { "epoch": 2.741172588274117, "grad_norm": 3.012861967086792, "learning_rate": 4.519687465610213e-06, "loss": 0.432, "step": 124600 }, { "epoch": 2.743372566274337, "grad_norm": 2.911194324493408, "learning_rate": 4.515285572796303e-06, "loss": 0.4524, "step": 124700 }, { "epoch": 2.745572544274557, "grad_norm": 2.891263961791992, "learning_rate": 4.510883679982393e-06, "loss": 0.435, "step": 124800 }, { "epoch": 2.7477725222747775, "grad_norm": 3.8306422233581543, "learning_rate": 4.5064817871684826e-06, "loss": 0.4331, "step": 124900 }, { "epoch": 2.749972500274997, "grad_norm": 3.3201122283935547, "learning_rate": 4.502079894354573e-06, "loss": 0.4483, "step": 125000 }, { "epoch": 2.7521724782752175, "grad_norm": 3.1668500900268555, "learning_rate": 4.497678001540663e-06, "loss": 0.4525, "step": 125100 }, { "epoch": 2.754372456275437, "grad_norm": 2.9911599159240723, "learning_rate": 4.4932761087267525e-06, "loss": 0.426, "step": 125200 }, { "epoch": 2.7565724342756575, "grad_norm": 2.79228138923645, "learning_rate": 4.488874215912843e-06, "loss": 0.4494, "step": 125300 }, { "epoch": 2.758772412275877, "grad_norm": 3.6654934883117676, "learning_rate": 4.484472323098933e-06, "loss": 0.4246, "step": 125400 }, { "epoch": 2.7609723902760974, "grad_norm": 3.1065101623535156, "learning_rate": 4.4800704302850225e-06, "loss": 0.4419, "step": 125500 }, { "epoch": 2.7631723682763174, "grad_norm": 3.4590702056884766, "learning_rate": 4.475668537471113e-06, "loss": 0.4429, "step": 125600 }, { "epoch": 2.7653723462765374, "grad_norm": 2.956099033355713, "learning_rate": 4.471266644657203e-06, "loss": 0.444, "step": 125700 }, { "epoch": 2.7675723242767574, "grad_norm": 3.131133556365967, "learning_rate": 4.4668647518432925e-06, "loss": 0.436, "step": 125800 }, { "epoch": 2.7697723022769773, "grad_norm": 2.866058349609375, "learning_rate": 4.462462859029383e-06, "loss": 0.4308, "step": 125900 }, { "epoch": 2.7719722802771973, "grad_norm": 2.9503538608551025, "learning_rate": 4.458060966215473e-06, "loss": 0.4339, "step": 126000 }, { "epoch": 2.7741722582774173, "grad_norm": 2.9972116947174072, "learning_rate": 4.4536590734015625e-06, "loss": 0.4457, "step": 126100 }, { "epoch": 2.7763722362776373, "grad_norm": 2.7631094455718994, "learning_rate": 4.449257180587654e-06, "loss": 0.4387, "step": 126200 }, { "epoch": 2.7785722142778573, "grad_norm": 2.7902297973632812, "learning_rate": 4.444855287773743e-06, "loss": 0.44, "step": 126300 }, { "epoch": 2.7807721922780773, "grad_norm": 2.688309907913208, "learning_rate": 4.440453394959833e-06, "loss": 0.4432, "step": 126400 }, { "epoch": 2.7829721702782972, "grad_norm": 2.6654300689697266, "learning_rate": 4.436051502145924e-06, "loss": 0.4347, "step": 126500 }, { "epoch": 2.7851721482785172, "grad_norm": 3.6261539459228516, "learning_rate": 4.431649609332013e-06, "loss": 0.4267, "step": 126600 }, { "epoch": 2.787372126278737, "grad_norm": 3.265857696533203, "learning_rate": 4.427247716518103e-06, "loss": 0.4374, "step": 126700 }, { "epoch": 2.789572104278957, "grad_norm": 3.311096668243408, "learning_rate": 4.422845823704193e-06, "loss": 0.4432, "step": 126800 }, { "epoch": 2.791772082279177, "grad_norm": 3.3290562629699707, "learning_rate": 4.418443930890283e-06, "loss": 0.4326, "step": 126900 }, { "epoch": 2.793972060279397, "grad_norm": 3.593282461166382, "learning_rate": 4.414042038076373e-06, "loss": 0.4422, "step": 127000 }, { "epoch": 2.796172038279617, "grad_norm": 2.8509931564331055, "learning_rate": 4.409640145262463e-06, "loss": 0.4443, "step": 127100 }, { "epoch": 2.798372016279837, "grad_norm": 2.7307536602020264, "learning_rate": 4.405238252448553e-06, "loss": 0.4362, "step": 127200 }, { "epoch": 2.800571994280057, "grad_norm": 3.677067518234253, "learning_rate": 4.400836359634643e-06, "loss": 0.4391, "step": 127300 }, { "epoch": 2.802771972280277, "grad_norm": 3.1093156337738037, "learning_rate": 4.396434466820733e-06, "loss": 0.4299, "step": 127400 }, { "epoch": 2.804971950280497, "grad_norm": 3.1432149410247803, "learning_rate": 4.392032574006823e-06, "loss": 0.4454, "step": 127500 }, { "epoch": 2.807171928280717, "grad_norm": 3.175234079360962, "learning_rate": 4.387630681192913e-06, "loss": 0.4361, "step": 127600 }, { "epoch": 2.809371906280937, "grad_norm": 3.7147324085235596, "learning_rate": 4.383228788379003e-06, "loss": 0.434, "step": 127700 }, { "epoch": 2.8115718842811575, "grad_norm": 3.4218947887420654, "learning_rate": 4.378826895565093e-06, "loss": 0.4392, "step": 127800 }, { "epoch": 2.813771862281377, "grad_norm": 4.2883195877075195, "learning_rate": 4.374425002751183e-06, "loss": 0.4254, "step": 127900 }, { "epoch": 2.8159718402815974, "grad_norm": 2.9192450046539307, "learning_rate": 4.370023109937274e-06, "loss": 0.4596, "step": 128000 }, { "epoch": 2.818171818281817, "grad_norm": 3.3479373455047607, "learning_rate": 4.365621217123364e-06, "loss": 0.4299, "step": 128100 }, { "epoch": 2.8203717962820374, "grad_norm": 3.0389039516448975, "learning_rate": 4.361219324309453e-06, "loss": 0.4348, "step": 128200 }, { "epoch": 2.822571774282257, "grad_norm": 3.236820697784424, "learning_rate": 4.356817431495544e-06, "loss": 0.4244, "step": 128300 }, { "epoch": 2.8247717522824773, "grad_norm": 3.319667339324951, "learning_rate": 4.352415538681634e-06, "loss": 0.4316, "step": 128400 }, { "epoch": 2.8269717302826973, "grad_norm": 3.5438737869262695, "learning_rate": 4.348013645867723e-06, "loss": 0.4427, "step": 128500 }, { "epoch": 2.8291717082829173, "grad_norm": 2.995439052581787, "learning_rate": 4.343611753053814e-06, "loss": 0.4402, "step": 128600 }, { "epoch": 2.8313716862831373, "grad_norm": 2.894104480743408, "learning_rate": 4.339209860239904e-06, "loss": 0.4351, "step": 128700 }, { "epoch": 2.8335716642833573, "grad_norm": 3.5675222873687744, "learning_rate": 4.334807967425993e-06, "loss": 0.4359, "step": 128800 }, { "epoch": 2.8357716422835773, "grad_norm": 2.749405860900879, "learning_rate": 4.330406074612084e-06, "loss": 0.4353, "step": 128900 }, { "epoch": 2.8379716202837972, "grad_norm": 3.581648826599121, "learning_rate": 4.326004181798173e-06, "loss": 0.4465, "step": 129000 }, { "epoch": 2.840171598284017, "grad_norm": 2.5396058559417725, "learning_rate": 4.321602288984263e-06, "loss": 0.4505, "step": 129100 }, { "epoch": 2.842371576284237, "grad_norm": 3.2663464546203613, "learning_rate": 4.317200396170354e-06, "loss": 0.4315, "step": 129200 }, { "epoch": 2.844571554284457, "grad_norm": 3.687699556350708, "learning_rate": 4.312798503356443e-06, "loss": 0.4341, "step": 129300 }, { "epoch": 2.846771532284677, "grad_norm": 3.822061538696289, "learning_rate": 4.308396610542533e-06, "loss": 0.4204, "step": 129400 }, { "epoch": 2.848971510284897, "grad_norm": 4.063410758972168, "learning_rate": 4.303994717728624e-06, "loss": 0.4281, "step": 129500 }, { "epoch": 2.851171488285117, "grad_norm": 3.304727554321289, "learning_rate": 4.299592824914713e-06, "loss": 0.4276, "step": 129600 }, { "epoch": 2.853371466285337, "grad_norm": 3.195687770843506, "learning_rate": 4.295190932100804e-06, "loss": 0.4195, "step": 129700 }, { "epoch": 2.855571444285557, "grad_norm": 3.390817403793335, "learning_rate": 4.290789039286894e-06, "loss": 0.4442, "step": 129800 }, { "epoch": 2.857771422285777, "grad_norm": 3.095522403717041, "learning_rate": 4.286387146472984e-06, "loss": 0.4459, "step": 129900 }, { "epoch": 2.859971400285997, "grad_norm": 2.7765722274780273, "learning_rate": 4.281985253659074e-06, "loss": 0.4347, "step": 130000 }, { "epoch": 2.862171378286217, "grad_norm": 3.3501715660095215, "learning_rate": 4.277583360845164e-06, "loss": 0.4315, "step": 130100 }, { "epoch": 2.864371356286437, "grad_norm": 2.8992860317230225, "learning_rate": 4.273181468031254e-06, "loss": 0.4196, "step": 130200 }, { "epoch": 2.866571334286657, "grad_norm": 3.240837335586548, "learning_rate": 4.268779575217344e-06, "loss": 0.4253, "step": 130300 }, { "epoch": 2.868771312286877, "grad_norm": 2.687161445617676, "learning_rate": 4.264377682403434e-06, "loss": 0.4297, "step": 130400 }, { "epoch": 2.870971290287097, "grad_norm": 3.1937789916992188, "learning_rate": 4.259975789589524e-06, "loss": 0.4367, "step": 130500 }, { "epoch": 2.873171268287317, "grad_norm": 2.9205288887023926, "learning_rate": 4.255573896775614e-06, "loss": 0.434, "step": 130600 }, { "epoch": 2.8753712462875374, "grad_norm": 2.830369710922241, "learning_rate": 4.251172003961704e-06, "loss": 0.4487, "step": 130700 }, { "epoch": 2.877571224287757, "grad_norm": 3.458214044570923, "learning_rate": 4.246770111147794e-06, "loss": 0.4326, "step": 130800 }, { "epoch": 2.8797712022879773, "grad_norm": 3.2541399002075195, "learning_rate": 4.242368218333883e-06, "loss": 0.4541, "step": 130900 }, { "epoch": 2.881971180288197, "grad_norm": 3.3345773220062256, "learning_rate": 4.237966325519974e-06, "loss": 0.434, "step": 131000 }, { "epoch": 2.8841711582884173, "grad_norm": 3.172386646270752, "learning_rate": 4.233564432706064e-06, "loss": 0.4399, "step": 131100 }, { "epoch": 2.886371136288637, "grad_norm": 2.489182472229004, "learning_rate": 4.229162539892153e-06, "loss": 0.4324, "step": 131200 }, { "epoch": 2.8885711142888573, "grad_norm": 3.0109496116638184, "learning_rate": 4.224760647078244e-06, "loss": 0.4342, "step": 131300 }, { "epoch": 2.8907710922890772, "grad_norm": 3.0989527702331543, "learning_rate": 4.220358754264334e-06, "loss": 0.4317, "step": 131400 }, { "epoch": 2.8929710702892972, "grad_norm": 4.138854026794434, "learning_rate": 4.215956861450424e-06, "loss": 0.4243, "step": 131500 }, { "epoch": 2.895171048289517, "grad_norm": 2.925975799560547, "learning_rate": 4.2115549686365145e-06, "loss": 0.4307, "step": 131600 }, { "epoch": 2.897371026289737, "grad_norm": 2.7254014015197754, "learning_rate": 4.207153075822604e-06, "loss": 0.4311, "step": 131700 }, { "epoch": 2.899571004289957, "grad_norm": 3.5956203937530518, "learning_rate": 4.202751183008694e-06, "loss": 0.43, "step": 131800 }, { "epoch": 2.901770982290177, "grad_norm": 3.406620502471924, "learning_rate": 4.1983492901947845e-06, "loss": 0.4321, "step": 131900 }, { "epoch": 2.903970960290397, "grad_norm": 3.0268537998199463, "learning_rate": 4.193947397380874e-06, "loss": 0.4328, "step": 132000 }, { "epoch": 2.906170938290617, "grad_norm": 3.0812931060791016, "learning_rate": 4.189545504566964e-06, "loss": 0.4443, "step": 132100 }, { "epoch": 2.908370916290837, "grad_norm": 2.5374112129211426, "learning_rate": 4.1851436117530545e-06, "loss": 0.4324, "step": 132200 }, { "epoch": 2.910570894291057, "grad_norm": 2.906034231185913, "learning_rate": 4.180741718939144e-06, "loss": 0.4358, "step": 132300 }, { "epoch": 2.912770872291277, "grad_norm": 3.593029499053955, "learning_rate": 4.176339826125234e-06, "loss": 0.4517, "step": 132400 }, { "epoch": 2.914970850291497, "grad_norm": 2.914520025253296, "learning_rate": 4.1719379333113245e-06, "loss": 0.4485, "step": 132500 }, { "epoch": 2.917170828291717, "grad_norm": 2.874202013015747, "learning_rate": 4.167536040497414e-06, "loss": 0.4506, "step": 132600 }, { "epoch": 2.919370806291937, "grad_norm": 3.172924041748047, "learning_rate": 4.163134147683504e-06, "loss": 0.4416, "step": 132700 }, { "epoch": 2.921570784292157, "grad_norm": 4.034905433654785, "learning_rate": 4.1587322548695945e-06, "loss": 0.4297, "step": 132800 }, { "epoch": 2.923770762292377, "grad_norm": 2.940948963165283, "learning_rate": 4.154330362055684e-06, "loss": 0.4277, "step": 132900 }, { "epoch": 2.925970740292597, "grad_norm": 4.134010314941406, "learning_rate": 4.149928469241774e-06, "loss": 0.4206, "step": 133000 }, { "epoch": 2.928170718292817, "grad_norm": 3.641511917114258, "learning_rate": 4.145526576427864e-06, "loss": 0.4433, "step": 133100 }, { "epoch": 2.930370696293037, "grad_norm": 3.1284308433532715, "learning_rate": 4.141124683613955e-06, "loss": 0.42, "step": 133200 }, { "epoch": 2.932570674293257, "grad_norm": 3.499300956726074, "learning_rate": 4.136722790800044e-06, "loss": 0.4429, "step": 133300 }, { "epoch": 2.9347706522934773, "grad_norm": 3.7782890796661377, "learning_rate": 4.1323208979861345e-06, "loss": 0.4207, "step": 133400 }, { "epoch": 2.936970630293697, "grad_norm": 2.8001630306243896, "learning_rate": 4.127919005172225e-06, "loss": 0.423, "step": 133500 }, { "epoch": 2.9391706082939173, "grad_norm": 3.0386412143707275, "learning_rate": 4.123517112358314e-06, "loss": 0.4353, "step": 133600 }, { "epoch": 2.941370586294137, "grad_norm": 3.480564594268799, "learning_rate": 4.1191152195444045e-06, "loss": 0.4373, "step": 133700 }, { "epoch": 2.9435705642943573, "grad_norm": 3.148545503616333, "learning_rate": 4.114713326730495e-06, "loss": 0.4359, "step": 133800 }, { "epoch": 2.945770542294577, "grad_norm": 2.8668603897094727, "learning_rate": 4.110311433916584e-06, "loss": 0.4435, "step": 133900 }, { "epoch": 2.9479705202947972, "grad_norm": 3.410372495651245, "learning_rate": 4.1059095411026745e-06, "loss": 0.4515, "step": 134000 }, { "epoch": 2.950170498295017, "grad_norm": 3.0960798263549805, "learning_rate": 4.101507648288765e-06, "loss": 0.4359, "step": 134100 }, { "epoch": 2.952370476295237, "grad_norm": 2.3949267864227295, "learning_rate": 4.097105755474854e-06, "loss": 0.4283, "step": 134200 }, { "epoch": 2.954570454295457, "grad_norm": 3.325115442276001, "learning_rate": 4.0927038626609445e-06, "loss": 0.4281, "step": 134300 }, { "epoch": 2.956770432295677, "grad_norm": 3.046936511993408, "learning_rate": 4.088301969847035e-06, "loss": 0.4431, "step": 134400 }, { "epoch": 2.958970410295897, "grad_norm": 3.0470268726348877, "learning_rate": 4.083900077033124e-06, "loss": 0.4235, "step": 134500 }, { "epoch": 2.961170388296117, "grad_norm": 2.8730931282043457, "learning_rate": 4.0794981842192145e-06, "loss": 0.4248, "step": 134600 }, { "epoch": 2.963370366296337, "grad_norm": 2.930630922317505, "learning_rate": 4.075096291405305e-06, "loss": 0.4398, "step": 134700 }, { "epoch": 2.965570344296557, "grad_norm": 3.920790672302246, "learning_rate": 4.070694398591394e-06, "loss": 0.4423, "step": 134800 }, { "epoch": 2.967770322296777, "grad_norm": 3.2532870769500732, "learning_rate": 4.0662925057774845e-06, "loss": 0.424, "step": 134900 }, { "epoch": 2.969970300296997, "grad_norm": 3.4274468421936035, "learning_rate": 4.061890612963575e-06, "loss": 0.4408, "step": 135000 }, { "epoch": 2.972170278297217, "grad_norm": 3.6045656204223633, "learning_rate": 4.057488720149665e-06, "loss": 0.4439, "step": 135100 }, { "epoch": 2.974370256297437, "grad_norm": 3.063582181930542, "learning_rate": 4.0530868273357545e-06, "loss": 0.4305, "step": 135200 }, { "epoch": 2.976570234297657, "grad_norm": 3.4174232482910156, "learning_rate": 4.048684934521845e-06, "loss": 0.4334, "step": 135300 }, { "epoch": 2.978770212297877, "grad_norm": 2.2402751445770264, "learning_rate": 4.044283041707935e-06, "loss": 0.4376, "step": 135400 }, { "epoch": 2.980970190298097, "grad_norm": 3.14042067527771, "learning_rate": 4.0398811488940245e-06, "loss": 0.443, "step": 135500 }, { "epoch": 2.983170168298317, "grad_norm": 4.143354892730713, "learning_rate": 4.035479256080115e-06, "loss": 0.4336, "step": 135600 }, { "epoch": 2.985370146298537, "grad_norm": 3.4250524044036865, "learning_rate": 4.031077363266205e-06, "loss": 0.4286, "step": 135700 }, { "epoch": 2.987570124298757, "grad_norm": 3.041456937789917, "learning_rate": 4.0266754704522945e-06, "loss": 0.4296, "step": 135800 }, { "epoch": 2.989770102298977, "grad_norm": 2.1302220821380615, "learning_rate": 4.022273577638385e-06, "loss": 0.4336, "step": 135900 }, { "epoch": 2.991970080299197, "grad_norm": 3.928239107131958, "learning_rate": 4.017871684824475e-06, "loss": 0.4352, "step": 136000 }, { "epoch": 2.994170058299417, "grad_norm": 2.7161359786987305, "learning_rate": 4.0134697920105645e-06, "loss": 0.4409, "step": 136100 }, { "epoch": 2.996370036299637, "grad_norm": 2.8443000316619873, "learning_rate": 4.009067899196655e-06, "loss": 0.4227, "step": 136200 }, { "epoch": 2.9985700142998573, "grad_norm": 2.336637020111084, "learning_rate": 4.004666006382745e-06, "loss": 0.4296, "step": 136300 }, { "epoch": 3.000769992300077, "grad_norm": 2.747061252593994, "learning_rate": 4.0002641135688345e-06, "loss": 0.4243, "step": 136400 }, { "epoch": 3.0029699703002968, "grad_norm": 2.6186234951019287, "learning_rate": 3.995862220754925e-06, "loss": 0.3946, "step": 136500 }, { "epoch": 3.005169948300517, "grad_norm": 3.578420400619507, "learning_rate": 3.991460327941015e-06, "loss": 0.3841, "step": 136600 }, { "epoch": 3.007369926300737, "grad_norm": 3.9675164222717285, "learning_rate": 3.987058435127105e-06, "loss": 0.4017, "step": 136700 }, { "epoch": 3.009569904300957, "grad_norm": 3.1522490978240967, "learning_rate": 3.982656542313195e-06, "loss": 0.3878, "step": 136800 }, { "epoch": 3.011769882301177, "grad_norm": 3.3388068675994873, "learning_rate": 3.978254649499285e-06, "loss": 0.4005, "step": 136900 }, { "epoch": 3.013969860301397, "grad_norm": 3.6714820861816406, "learning_rate": 3.973852756685375e-06, "loss": 0.406, "step": 137000 }, { "epoch": 3.016169838301617, "grad_norm": 2.9617388248443604, "learning_rate": 3.969450863871465e-06, "loss": 0.3915, "step": 137100 }, { "epoch": 3.018369816301837, "grad_norm": 3.648895263671875, "learning_rate": 3.965048971057555e-06, "loss": 0.3919, "step": 137200 }, { "epoch": 3.020569794302057, "grad_norm": 3.127763509750366, "learning_rate": 3.960647078243645e-06, "loss": 0.3969, "step": 137300 }, { "epoch": 3.022769772302277, "grad_norm": 4.054533004760742, "learning_rate": 3.956245185429735e-06, "loss": 0.4023, "step": 137400 }, { "epoch": 3.024969750302497, "grad_norm": 3.8178627490997314, "learning_rate": 3.951843292615825e-06, "loss": 0.3848, "step": 137500 }, { "epoch": 3.027169728302717, "grad_norm": 3.450464963912964, "learning_rate": 3.947441399801915e-06, "loss": 0.3983, "step": 137600 }, { "epoch": 3.029369706302937, "grad_norm": 3.210991144180298, "learning_rate": 3.943039506988005e-06, "loss": 0.3993, "step": 137700 }, { "epoch": 3.031569684303157, "grad_norm": 3.706838369369507, "learning_rate": 3.938637614174095e-06, "loss": 0.3916, "step": 137800 }, { "epoch": 3.033769662303377, "grad_norm": 3.306352138519287, "learning_rate": 3.934235721360185e-06, "loss": 0.404, "step": 137900 }, { "epoch": 3.035969640303597, "grad_norm": 4.016099452972412, "learning_rate": 3.929833828546275e-06, "loss": 0.4058, "step": 138000 }, { "epoch": 3.038169618303817, "grad_norm": 3.057190418243408, "learning_rate": 3.925431935732365e-06, "loss": 0.4105, "step": 138100 }, { "epoch": 3.040369596304037, "grad_norm": 3.2745933532714844, "learning_rate": 3.921030042918455e-06, "loss": 0.3888, "step": 138200 }, { "epoch": 3.042569574304257, "grad_norm": 3.0036211013793945, "learning_rate": 3.916628150104545e-06, "loss": 0.4039, "step": 138300 }, { "epoch": 3.044769552304477, "grad_norm": 3.3982667922973633, "learning_rate": 3.912226257290636e-06, "loss": 0.3955, "step": 138400 }, { "epoch": 3.046969530304697, "grad_norm": 2.8318960666656494, "learning_rate": 3.907824364476725e-06, "loss": 0.406, "step": 138500 }, { "epoch": 3.049169508304917, "grad_norm": 2.669373035430908, "learning_rate": 3.903422471662816e-06, "loss": 0.4009, "step": 138600 }, { "epoch": 3.051369486305137, "grad_norm": 3.476454257965088, "learning_rate": 3.899020578848906e-06, "loss": 0.3958, "step": 138700 }, { "epoch": 3.053569464305357, "grad_norm": 3.4908926486968994, "learning_rate": 3.894618686034995e-06, "loss": 0.4008, "step": 138800 }, { "epoch": 3.055769442305577, "grad_norm": 3.15459942817688, "learning_rate": 3.890216793221086e-06, "loss": 0.3928, "step": 138900 }, { "epoch": 3.0579694203057968, "grad_norm": 3.2345471382141113, "learning_rate": 3.885814900407175e-06, "loss": 0.4051, "step": 139000 }, { "epoch": 3.0601693983060168, "grad_norm": 3.2796826362609863, "learning_rate": 3.881413007593265e-06, "loss": 0.3985, "step": 139100 }, { "epoch": 3.0623693763062367, "grad_norm": 3.1456501483917236, "learning_rate": 3.877011114779356e-06, "loss": 0.4006, "step": 139200 }, { "epoch": 3.0645693543064567, "grad_norm": 3.5905213356018066, "learning_rate": 3.872609221965445e-06, "loss": 0.4005, "step": 139300 }, { "epoch": 3.066769332306677, "grad_norm": 3.593623399734497, "learning_rate": 3.868207329151535e-06, "loss": 0.3984, "step": 139400 }, { "epoch": 3.068969310306897, "grad_norm": 3.059357166290283, "learning_rate": 3.863805436337626e-06, "loss": 0.4105, "step": 139500 }, { "epoch": 3.071169288307117, "grad_norm": 3.4862234592437744, "learning_rate": 3.859403543523715e-06, "loss": 0.3943, "step": 139600 }, { "epoch": 3.073369266307337, "grad_norm": 3.381134033203125, "learning_rate": 3.855001650709805e-06, "loss": 0.3865, "step": 139700 }, { "epoch": 3.075569244307557, "grad_norm": 3.13862681388855, "learning_rate": 3.850599757895896e-06, "loss": 0.3895, "step": 139800 }, { "epoch": 3.077769222307777, "grad_norm": 3.6578209400177, "learning_rate": 3.846197865081985e-06, "loss": 0.3972, "step": 139900 }, { "epoch": 3.079969200307997, "grad_norm": 3.353710174560547, "learning_rate": 3.841795972268075e-06, "loss": 0.3935, "step": 140000 }, { "epoch": 3.082169178308217, "grad_norm": 3.3863718509674072, "learning_rate": 3.837394079454166e-06, "loss": 0.401, "step": 140100 }, { "epoch": 3.084369156308437, "grad_norm": 3.574791669845581, "learning_rate": 3.832992186640256e-06, "loss": 0.3932, "step": 140200 }, { "epoch": 3.086569134308657, "grad_norm": 3.1452407836914062, "learning_rate": 3.828590293826346e-06, "loss": 0.3982, "step": 140300 }, { "epoch": 3.088769112308877, "grad_norm": 3.7013602256774902, "learning_rate": 3.824188401012436e-06, "loss": 0.3871, "step": 140400 }, { "epoch": 3.090969090309097, "grad_norm": 3.909804105758667, "learning_rate": 3.819786508198526e-06, "loss": 0.3843, "step": 140500 }, { "epoch": 3.093169068309317, "grad_norm": 3.576997995376587, "learning_rate": 3.815384615384616e-06, "loss": 0.3895, "step": 140600 }, { "epoch": 3.095369046309537, "grad_norm": 3.9983808994293213, "learning_rate": 3.8109827225707056e-06, "loss": 0.3864, "step": 140700 }, { "epoch": 3.097569024309757, "grad_norm": 3.6163980960845947, "learning_rate": 3.806580829756796e-06, "loss": 0.3903, "step": 140800 }, { "epoch": 3.099769002309977, "grad_norm": 3.2333366870880127, "learning_rate": 3.8021789369428858e-06, "loss": 0.3939, "step": 140900 }, { "epoch": 3.101968980310197, "grad_norm": 3.2053537368774414, "learning_rate": 3.7977770441289756e-06, "loss": 0.3983, "step": 141000 }, { "epoch": 3.104168958310417, "grad_norm": 3.423635721206665, "learning_rate": 3.793375151315066e-06, "loss": 0.3937, "step": 141100 }, { "epoch": 3.106368936310637, "grad_norm": 3.9658424854278564, "learning_rate": 3.7889732585011558e-06, "loss": 0.3952, "step": 141200 }, { "epoch": 3.108568914310857, "grad_norm": 3.184368848800659, "learning_rate": 3.7845713656872456e-06, "loss": 0.3854, "step": 141300 }, { "epoch": 3.110768892311077, "grad_norm": 3.8191583156585693, "learning_rate": 3.7801694728733355e-06, "loss": 0.3965, "step": 141400 }, { "epoch": 3.1129688703112968, "grad_norm": 3.2895469665527344, "learning_rate": 3.7757675800594258e-06, "loss": 0.4101, "step": 141500 }, { "epoch": 3.1151688483115167, "grad_norm": 3.220507860183716, "learning_rate": 3.7713656872455156e-06, "loss": 0.3977, "step": 141600 }, { "epoch": 3.1173688263117367, "grad_norm": 3.0123960971832275, "learning_rate": 3.7669637944316055e-06, "loss": 0.4042, "step": 141700 }, { "epoch": 3.1195688043119567, "grad_norm": 3.4893038272857666, "learning_rate": 3.762561901617696e-06, "loss": 0.4052, "step": 141800 }, { "epoch": 3.1217687823121767, "grad_norm": 3.987698793411255, "learning_rate": 3.758160008803786e-06, "loss": 0.3979, "step": 141900 }, { "epoch": 3.1239687603123967, "grad_norm": 3.5184412002563477, "learning_rate": 3.7537581159898763e-06, "loss": 0.4114, "step": 142000 }, { "epoch": 3.126168738312617, "grad_norm": 4.024544715881348, "learning_rate": 3.749356223175966e-06, "loss": 0.3996, "step": 142100 }, { "epoch": 3.1283687163128366, "grad_norm": 3.358962059020996, "learning_rate": 3.744954330362056e-06, "loss": 0.3981, "step": 142200 }, { "epoch": 3.130568694313057, "grad_norm": 2.8024742603302, "learning_rate": 3.7405524375481463e-06, "loss": 0.402, "step": 142300 }, { "epoch": 3.132768672313277, "grad_norm": 3.1832938194274902, "learning_rate": 3.736150544734236e-06, "loss": 0.3951, "step": 142400 }, { "epoch": 3.134968650313497, "grad_norm": 3.7349486351013184, "learning_rate": 3.731748651920326e-06, "loss": 0.3911, "step": 142500 }, { "epoch": 3.137168628313717, "grad_norm": 3.722146987915039, "learning_rate": 3.727346759106416e-06, "loss": 0.3991, "step": 142600 }, { "epoch": 3.139368606313937, "grad_norm": 3.13198184967041, "learning_rate": 3.722944866292506e-06, "loss": 0.3937, "step": 142700 }, { "epoch": 3.141568584314157, "grad_norm": 3.511359453201294, "learning_rate": 3.718542973478596e-06, "loss": 0.3989, "step": 142800 }, { "epoch": 3.143768562314377, "grad_norm": 4.363528251647949, "learning_rate": 3.714141080664686e-06, "loss": 0.4041, "step": 142900 }, { "epoch": 3.145968540314597, "grad_norm": 3.696638584136963, "learning_rate": 3.709739187850776e-06, "loss": 0.3971, "step": 143000 }, { "epoch": 3.148168518314817, "grad_norm": 3.170654058456421, "learning_rate": 3.705337295036866e-06, "loss": 0.4002, "step": 143100 }, { "epoch": 3.150368496315037, "grad_norm": 4.185492992401123, "learning_rate": 3.700935402222956e-06, "loss": 0.3912, "step": 143200 }, { "epoch": 3.152568474315257, "grad_norm": 3.829686403274536, "learning_rate": 3.696533509409046e-06, "loss": 0.3914, "step": 143300 }, { "epoch": 3.154768452315477, "grad_norm": 2.9345476627349854, "learning_rate": 3.692131616595136e-06, "loss": 0.3962, "step": 143400 }, { "epoch": 3.156968430315697, "grad_norm": 3.666574239730835, "learning_rate": 3.687729723781226e-06, "loss": 0.401, "step": 143500 }, { "epoch": 3.159168408315917, "grad_norm": 3.359739303588867, "learning_rate": 3.6833278309673166e-06, "loss": 0.3992, "step": 143600 }, { "epoch": 3.161368386316137, "grad_norm": 4.227367877960205, "learning_rate": 3.6789259381534065e-06, "loss": 0.401, "step": 143700 }, { "epoch": 3.163568364316357, "grad_norm": 2.7858405113220215, "learning_rate": 3.6745240453394963e-06, "loss": 0.3907, "step": 143800 }, { "epoch": 3.1657683423165768, "grad_norm": 3.08479905128479, "learning_rate": 3.6701221525255866e-06, "loss": 0.395, "step": 143900 }, { "epoch": 3.1679683203167968, "grad_norm": 3.606621265411377, "learning_rate": 3.6657202597116765e-06, "loss": 0.3916, "step": 144000 }, { "epoch": 3.1701682983170167, "grad_norm": 4.141706466674805, "learning_rate": 3.6613183668977663e-06, "loss": 0.3987, "step": 144100 }, { "epoch": 3.1723682763172367, "grad_norm": 3.2608320713043213, "learning_rate": 3.6569164740838566e-06, "loss": 0.4112, "step": 144200 }, { "epoch": 3.1745682543174567, "grad_norm": 3.0765554904937744, "learning_rate": 3.6525145812699465e-06, "loss": 0.4057, "step": 144300 }, { "epoch": 3.1767682323176767, "grad_norm": 3.198472738265991, "learning_rate": 3.6481126884560363e-06, "loss": 0.3928, "step": 144400 }, { "epoch": 3.1789682103178967, "grad_norm": 3.3553693294525146, "learning_rate": 3.643710795642126e-06, "loss": 0.3934, "step": 144500 }, { "epoch": 3.1811681883181167, "grad_norm": 3.7630527019500732, "learning_rate": 3.6393089028282165e-06, "loss": 0.3957, "step": 144600 }, { "epoch": 3.1833681663183366, "grad_norm": 3.3282408714294434, "learning_rate": 3.6349070100143063e-06, "loss": 0.4038, "step": 144700 }, { "epoch": 3.1855681443185566, "grad_norm": 3.896204710006714, "learning_rate": 3.630505117200396e-06, "loss": 0.4176, "step": 144800 }, { "epoch": 3.1877681223187766, "grad_norm": 3.9070045948028564, "learning_rate": 3.6261032243864865e-06, "loss": 0.394, "step": 144900 }, { "epoch": 3.189968100318997, "grad_norm": 4.341803073883057, "learning_rate": 3.6217013315725763e-06, "loss": 0.4016, "step": 145000 }, { "epoch": 3.192168078319217, "grad_norm": 3.0518646240234375, "learning_rate": 3.617299438758666e-06, "loss": 0.4021, "step": 145100 }, { "epoch": 3.194368056319437, "grad_norm": 2.9907262325286865, "learning_rate": 3.6128975459447565e-06, "loss": 0.401, "step": 145200 }, { "epoch": 3.196568034319657, "grad_norm": 3.994093894958496, "learning_rate": 3.6084956531308467e-06, "loss": 0.3869, "step": 145300 }, { "epoch": 3.198768012319877, "grad_norm": 4.31938362121582, "learning_rate": 3.6040937603169366e-06, "loss": 0.3942, "step": 145400 }, { "epoch": 3.200967990320097, "grad_norm": 3.5487558841705322, "learning_rate": 3.599691867503027e-06, "loss": 0.3958, "step": 145500 }, { "epoch": 3.203167968320317, "grad_norm": 4.53445291519165, "learning_rate": 3.5952899746891167e-06, "loss": 0.3962, "step": 145600 }, { "epoch": 3.205367946320537, "grad_norm": 3.816943645477295, "learning_rate": 3.5908880818752066e-06, "loss": 0.4112, "step": 145700 }, { "epoch": 3.207567924320757, "grad_norm": 4.102901935577393, "learning_rate": 3.586486189061297e-06, "loss": 0.3914, "step": 145800 }, { "epoch": 3.209767902320977, "grad_norm": 3.5486576557159424, "learning_rate": 3.5820842962473867e-06, "loss": 0.3993, "step": 145900 }, { "epoch": 3.211967880321197, "grad_norm": 3.8645424842834473, "learning_rate": 3.5776824034334766e-06, "loss": 0.4078, "step": 146000 }, { "epoch": 3.214167858321417, "grad_norm": 3.282376527786255, "learning_rate": 3.573280510619567e-06, "loss": 0.4007, "step": 146100 }, { "epoch": 3.216367836321637, "grad_norm": 3.293292999267578, "learning_rate": 3.5688786178056567e-06, "loss": 0.3909, "step": 146200 }, { "epoch": 3.218567814321857, "grad_norm": 3.7592716217041016, "learning_rate": 3.5644767249917466e-06, "loss": 0.3916, "step": 146300 }, { "epoch": 3.2207677923220768, "grad_norm": 3.8671295642852783, "learning_rate": 3.5600748321778365e-06, "loss": 0.4028, "step": 146400 }, { "epoch": 3.2229677703222968, "grad_norm": 3.9341673851013184, "learning_rate": 3.5556729393639267e-06, "loss": 0.3957, "step": 146500 }, { "epoch": 3.2251677483225167, "grad_norm": 3.232847213745117, "learning_rate": 3.5512710465500166e-06, "loss": 0.39, "step": 146600 }, { "epoch": 3.2273677263227367, "grad_norm": 3.2240495681762695, "learning_rate": 3.5468691537361065e-06, "loss": 0.3947, "step": 146700 }, { "epoch": 3.2295677043229567, "grad_norm": 3.527489423751831, "learning_rate": 3.5424672609221967e-06, "loss": 0.3968, "step": 146800 }, { "epoch": 3.2317676823231767, "grad_norm": 3.633652687072754, "learning_rate": 3.5380653681082866e-06, "loss": 0.3883, "step": 146900 }, { "epoch": 3.2339676603233967, "grad_norm": 3.5555477142333984, "learning_rate": 3.5336634752943773e-06, "loss": 0.3965, "step": 147000 }, { "epoch": 3.2361676383236166, "grad_norm": 3.5308194160461426, "learning_rate": 3.529261582480467e-06, "loss": 0.3881, "step": 147100 }, { "epoch": 3.2383676163238366, "grad_norm": 4.819995880126953, "learning_rate": 3.524859689666557e-06, "loss": 0.4036, "step": 147200 }, { "epoch": 3.2405675943240566, "grad_norm": 4.3929033279418945, "learning_rate": 3.5204577968526473e-06, "loss": 0.3931, "step": 147300 }, { "epoch": 3.2427675723242766, "grad_norm": 2.9747936725616455, "learning_rate": 3.516055904038737e-06, "loss": 0.3986, "step": 147400 }, { "epoch": 3.2449675503244966, "grad_norm": 3.78959321975708, "learning_rate": 3.511654011224827e-06, "loss": 0.4019, "step": 147500 }, { "epoch": 3.2471675283247166, "grad_norm": 3.943894624710083, "learning_rate": 3.507252118410917e-06, "loss": 0.3924, "step": 147600 }, { "epoch": 3.249367506324937, "grad_norm": 3.963569402694702, "learning_rate": 3.502850225597007e-06, "loss": 0.4118, "step": 147700 }, { "epoch": 3.2515674843251565, "grad_norm": 3.0138792991638184, "learning_rate": 3.498448332783097e-06, "loss": 0.3914, "step": 147800 }, { "epoch": 3.253767462325377, "grad_norm": 3.723484754562378, "learning_rate": 3.494046439969187e-06, "loss": 0.3847, "step": 147900 }, { "epoch": 3.255967440325597, "grad_norm": 3.7332823276519775, "learning_rate": 3.489644547155277e-06, "loss": 0.3943, "step": 148000 }, { "epoch": 3.258167418325817, "grad_norm": 4.216028690338135, "learning_rate": 3.485242654341367e-06, "loss": 0.3959, "step": 148100 }, { "epoch": 3.260367396326037, "grad_norm": 2.8157236576080322, "learning_rate": 3.480840761527457e-06, "loss": 0.4163, "step": 148200 }, { "epoch": 3.262567374326257, "grad_norm": 3.428497076034546, "learning_rate": 3.476438868713547e-06, "loss": 0.4064, "step": 148300 }, { "epoch": 3.264767352326477, "grad_norm": 3.9073712825775146, "learning_rate": 3.472036975899637e-06, "loss": 0.4014, "step": 148400 }, { "epoch": 3.266967330326697, "grad_norm": 4.03035306930542, "learning_rate": 3.467635083085727e-06, "loss": 0.3991, "step": 148500 }, { "epoch": 3.269167308326917, "grad_norm": 2.9807870388031006, "learning_rate": 3.4632331902718167e-06, "loss": 0.4106, "step": 148600 }, { "epoch": 3.271367286327137, "grad_norm": 2.9369282722473145, "learning_rate": 3.458831297457907e-06, "loss": 0.4048, "step": 148700 }, { "epoch": 3.273567264327357, "grad_norm": 3.9154739379882812, "learning_rate": 3.4544294046439973e-06, "loss": 0.3984, "step": 148800 }, { "epoch": 3.2757672423275768, "grad_norm": 4.0573601722717285, "learning_rate": 3.4500275118300876e-06, "loss": 0.3994, "step": 148900 }, { "epoch": 3.2779672203277967, "grad_norm": 3.521632671356201, "learning_rate": 3.4456256190161774e-06, "loss": 0.4101, "step": 149000 }, { "epoch": 3.2801671983280167, "grad_norm": 3.2614357471466064, "learning_rate": 3.4412237262022673e-06, "loss": 0.404, "step": 149100 }, { "epoch": 3.2823671763282367, "grad_norm": 2.79972767829895, "learning_rate": 3.4368218333883576e-06, "loss": 0.3997, "step": 149200 }, { "epoch": 3.2845671543284567, "grad_norm": 3.753329038619995, "learning_rate": 3.4324199405744474e-06, "loss": 0.3992, "step": 149300 }, { "epoch": 3.2867671323286767, "grad_norm": 2.569004774093628, "learning_rate": 3.4280180477605373e-06, "loss": 0.4007, "step": 149400 }, { "epoch": 3.2889671103288967, "grad_norm": 3.0934865474700928, "learning_rate": 3.423616154946627e-06, "loss": 0.3978, "step": 149500 }, { "epoch": 3.2911670883291166, "grad_norm": 3.7940945625305176, "learning_rate": 3.4192142621327174e-06, "loss": 0.4009, "step": 149600 }, { "epoch": 3.2933670663293366, "grad_norm": 3.0418498516082764, "learning_rate": 3.4148123693188073e-06, "loss": 0.3977, "step": 149700 }, { "epoch": 3.2955670443295566, "grad_norm": 3.6287832260131836, "learning_rate": 3.410410476504897e-06, "loss": 0.399, "step": 149800 }, { "epoch": 3.2977670223297766, "grad_norm": 3.9616570472717285, "learning_rate": 3.4060085836909874e-06, "loss": 0.392, "step": 149900 }, { "epoch": 3.2999670003299966, "grad_norm": 3.6250250339508057, "learning_rate": 3.4016066908770773e-06, "loss": 0.3837, "step": 150000 }, { "epoch": 3.2999670003299966, "eval_loss": 0.5354483723640442, "eval_runtime": 386.9888, "eval_samples_per_second": 155.043, "eval_steps_per_second": 4.845, "step": 150000 } ], "logging_steps": 100, "max_steps": 227275, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 50000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.913014241035995e+17, "train_batch_size": 22, "trial_name": null, "trial_params": null }