{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.031181202890604, "eval_steps": 500, "global_step": 63000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016368085506878688, "grad_norm": 0.5328027606010437, "learning_rate": 3.600654664484452e-07, "loss": 1.6968, "step": 100 }, { "epoch": 0.0032736171013757376, "grad_norm": 0.5594077706336975, "learning_rate": 7.237679578105111e-07, "loss": 1.6883, "step": 200 }, { "epoch": 0.004910425652063607, "grad_norm": 0.6636043787002563, "learning_rate": 1.087470449172577e-06, "loss": 1.6196, "step": 300 }, { "epoch": 0.006547234202751475, "grad_norm": 0.6200364828109741, "learning_rate": 1.4511729405346428e-06, "loss": 1.511, "step": 400 }, { "epoch": 0.008184042753439345, "grad_norm": 0.4777531623840332, "learning_rate": 1.8148754318967086e-06, "loss": 1.342, "step": 500 }, { "epoch": 0.009820851304127213, "grad_norm": 0.3041970133781433, "learning_rate": 2.1785779232587743e-06, "loss": 1.2154, "step": 600 }, { "epoch": 0.011457659854815082, "grad_norm": 0.21760690212249756, "learning_rate": 2.54228041462084e-06, "loss": 1.1427, "step": 700 }, { "epoch": 0.01309446840550295, "grad_norm": 0.22987280786037445, "learning_rate": 2.9059829059829063e-06, "loss": 1.0943, "step": 800 }, { "epoch": 0.014731276956190819, "grad_norm": 0.24943482875823975, "learning_rate": 3.269685397344972e-06, "loss": 1.0696, "step": 900 }, { "epoch": 0.01636808550687869, "grad_norm": 0.2619542181491852, "learning_rate": 3.633387888707038e-06, "loss": 1.0318, "step": 1000 }, { "epoch": 0.018004894057566556, "grad_norm": 0.2811136841773987, "learning_rate": 3.997090380069103e-06, "loss": 1.0035, "step": 1100 }, { "epoch": 0.019641702608254426, "grad_norm": 0.3045084476470947, "learning_rate": 4.36079287143117e-06, "loss": 0.9726, "step": 1200 }, { "epoch": 0.021278511158942293, "grad_norm": 0.3168332278728485, "learning_rate": 4.7244953627932355e-06, "loss": 0.971, "step": 1300 }, { "epoch": 0.022915319709630164, "grad_norm": 0.33685848116874695, "learning_rate": 5.088197854155301e-06, "loss": 0.952, "step": 1400 }, { "epoch": 0.02455212826031803, "grad_norm": 0.3198516368865967, "learning_rate": 5.451900345517367e-06, "loss": 0.9385, "step": 1500 }, { "epoch": 0.0261889368110059, "grad_norm": 0.3457159101963043, "learning_rate": 5.815602836879432e-06, "loss": 0.9291, "step": 1600 }, { "epoch": 0.02782574536169377, "grad_norm": 0.3343696594238281, "learning_rate": 6.179305328241499e-06, "loss": 0.9251, "step": 1700 }, { "epoch": 0.029462553912381638, "grad_norm": 0.4662475287914276, "learning_rate": 6.543007819603565e-06, "loss": 0.9328, "step": 1800 }, { "epoch": 0.03109936246306951, "grad_norm": 0.3559871017932892, "learning_rate": 6.906710310965631e-06, "loss": 0.9126, "step": 1900 }, { "epoch": 0.03273617101375738, "grad_norm": 0.3852447271347046, "learning_rate": 7.270412802327696e-06, "loss": 0.9024, "step": 2000 }, { "epoch": 0.034372979564445245, "grad_norm": 0.36482807993888855, "learning_rate": 7.634115293689762e-06, "loss": 0.9086, "step": 2100 }, { "epoch": 0.03600978811513311, "grad_norm": 0.39493420720100403, "learning_rate": 7.997817785051828e-06, "loss": 0.9144, "step": 2200 }, { "epoch": 0.03764659666582098, "grad_norm": 0.4406372010707855, "learning_rate": 8.361520276413894e-06, "loss": 0.9067, "step": 2300 }, { "epoch": 0.03928340521650885, "grad_norm": 0.43684300780296326, "learning_rate": 8.72522276777596e-06, "loss": 0.898, "step": 2400 }, { "epoch": 0.04092021376719672, "grad_norm": 0.4949699342250824, "learning_rate": 9.088925259138026e-06, "loss": 0.8893, "step": 2500 }, { "epoch": 0.04255702231788459, "grad_norm": 0.4759005308151245, "learning_rate": 9.452627750500092e-06, "loss": 0.9036, "step": 2600 }, { "epoch": 0.04419383086857246, "grad_norm": 0.4733336567878723, "learning_rate": 9.816330241862157e-06, "loss": 0.9046, "step": 2700 }, { "epoch": 0.04583063941926033, "grad_norm": 0.5515408515930176, "learning_rate": 1.0180032733224223e-05, "loss": 0.8899, "step": 2800 }, { "epoch": 0.047467447969948194, "grad_norm": 0.5026727318763733, "learning_rate": 1.054373522458629e-05, "loss": 0.8868, "step": 2900 }, { "epoch": 0.04910425652063606, "grad_norm": 0.5517929196357727, "learning_rate": 1.0907437715948354e-05, "loss": 0.8905, "step": 3000 }, { "epoch": 0.050741065071323935, "grad_norm": 0.5139409899711609, "learning_rate": 1.127114020731042e-05, "loss": 0.8711, "step": 3100 }, { "epoch": 0.0523778736220118, "grad_norm": 0.5762068033218384, "learning_rate": 1.1634842698672486e-05, "loss": 0.9, "step": 3200 }, { "epoch": 0.05401468217269967, "grad_norm": 0.5540242791175842, "learning_rate": 1.1998545190034552e-05, "loss": 0.8854, "step": 3300 }, { "epoch": 0.05565149072338754, "grad_norm": 0.6651942133903503, "learning_rate": 1.236224768139662e-05, "loss": 0.875, "step": 3400 }, { "epoch": 0.05728829927407541, "grad_norm": 0.6157256364822388, "learning_rate": 1.2725950172758685e-05, "loss": 0.87, "step": 3500 }, { "epoch": 0.058925107824763276, "grad_norm": 0.6638494729995728, "learning_rate": 1.3089652664120751e-05, "loss": 0.8666, "step": 3600 }, { "epoch": 0.06056191637545114, "grad_norm": 0.6535647511482239, "learning_rate": 1.3453355155482817e-05, "loss": 0.8675, "step": 3700 }, { "epoch": 0.06219872492613902, "grad_norm": 0.7346630692481995, "learning_rate": 1.3817057646844883e-05, "loss": 0.8724, "step": 3800 }, { "epoch": 0.06383553347682688, "grad_norm": 0.7002882957458496, "learning_rate": 1.4180760138206948e-05, "loss": 0.8476, "step": 3900 }, { "epoch": 0.06547234202751476, "grad_norm": 0.6632655262947083, "learning_rate": 1.4544462629569014e-05, "loss": 0.8641, "step": 4000 }, { "epoch": 0.06710915057820262, "grad_norm": 0.7253566384315491, "learning_rate": 1.490816512093108e-05, "loss": 0.8611, "step": 4100 }, { "epoch": 0.06874595912889049, "grad_norm": 0.7651970386505127, "learning_rate": 1.5271867612293146e-05, "loss": 0.8597, "step": 4200 }, { "epoch": 0.07038276767957836, "grad_norm": 0.6781213879585266, "learning_rate": 1.563557010365521e-05, "loss": 0.844, "step": 4300 }, { "epoch": 0.07201957623026622, "grad_norm": 0.7465602159500122, "learning_rate": 1.5999272595017275e-05, "loss": 0.8558, "step": 4400 }, { "epoch": 0.0736563847809541, "grad_norm": 0.7796695828437805, "learning_rate": 1.6362975086379343e-05, "loss": 0.8533, "step": 4500 }, { "epoch": 0.07529319333164196, "grad_norm": 0.7622010111808777, "learning_rate": 1.6726677577741408e-05, "loss": 0.8414, "step": 4600 }, { "epoch": 0.07693000188232983, "grad_norm": 0.7499621510505676, "learning_rate": 1.7090380069103472e-05, "loss": 0.8459, "step": 4700 }, { "epoch": 0.0785668104330177, "grad_norm": 0.7822730541229248, "learning_rate": 1.745408256046554e-05, "loss": 0.8468, "step": 4800 }, { "epoch": 0.08020361898370557, "grad_norm": 0.7850978970527649, "learning_rate": 1.7817785051827608e-05, "loss": 0.8603, "step": 4900 }, { "epoch": 0.08184042753439344, "grad_norm": 0.8370286822319031, "learning_rate": 1.8181487543189672e-05, "loss": 0.837, "step": 5000 }, { "epoch": 0.08347723608508131, "grad_norm": 0.821024477481842, "learning_rate": 1.854519003455174e-05, "loss": 0.8464, "step": 5100 }, { "epoch": 0.08511404463576917, "grad_norm": 0.8516008257865906, "learning_rate": 1.8908892525913805e-05, "loss": 0.837, "step": 5200 }, { "epoch": 0.08675085318645705, "grad_norm": 0.7816336750984192, "learning_rate": 1.927259501727587e-05, "loss": 0.8471, "step": 5300 }, { "epoch": 0.08838766173714492, "grad_norm": 0.8347124457359314, "learning_rate": 1.9636297508637937e-05, "loss": 0.8333, "step": 5400 }, { "epoch": 0.09002447028783278, "grad_norm": 0.8995541334152222, "learning_rate": 2e-05, "loss": 0.8341, "step": 5500 }, { "epoch": 0.09166127883852065, "grad_norm": 0.9787241816520691, "learning_rate": 1.9999984387425675e-05, "loss": 0.8431, "step": 5600 }, { "epoch": 0.09329808738920851, "grad_norm": 0.8093689680099487, "learning_rate": 1.999993754975144e-05, "loss": 0.8325, "step": 5700 }, { "epoch": 0.09493489593989639, "grad_norm": 0.9042837023735046, "learning_rate": 1.999985948712355e-05, "loss": 0.828, "step": 5800 }, { "epoch": 0.09657170449058426, "grad_norm": 0.9188331961631775, "learning_rate": 1.999975019978576e-05, "loss": 0.8291, "step": 5900 }, { "epoch": 0.09820851304127212, "grad_norm": 0.8699648380279541, "learning_rate": 1.9999609688079316e-05, "loss": 0.8277, "step": 6000 }, { "epoch": 0.09984532159196, "grad_norm": 0.9138243794441223, "learning_rate": 1.999943795244297e-05, "loss": 0.8367, "step": 6100 }, { "epoch": 0.10148213014264787, "grad_norm": 0.9293233156204224, "learning_rate": 1.9999234993412973e-05, "loss": 0.8281, "step": 6200 }, { "epoch": 0.10311893869333573, "grad_norm": 0.9346773624420166, "learning_rate": 1.999900081162306e-05, "loss": 0.8323, "step": 6300 }, { "epoch": 0.1047557472440236, "grad_norm": 0.9332927465438843, "learning_rate": 1.999873540780447e-05, "loss": 0.8259, "step": 6400 }, { "epoch": 0.10639255579471148, "grad_norm": 0.8887437582015991, "learning_rate": 1.9998438782785937e-05, "loss": 0.8305, "step": 6500 }, { "epoch": 0.10802936434539934, "grad_norm": 0.9184074401855469, "learning_rate": 1.999811093749367e-05, "loss": 0.829, "step": 6600 }, { "epoch": 0.10966617289608721, "grad_norm": 0.8532683849334717, "learning_rate": 1.999775187295137e-05, "loss": 0.8275, "step": 6700 }, { "epoch": 0.11130298144677508, "grad_norm": 0.9298515915870667, "learning_rate": 1.9997361590280225e-05, "loss": 0.8192, "step": 6800 }, { "epoch": 0.11293978999746294, "grad_norm": 0.9617123603820801, "learning_rate": 1.9996940090698896e-05, "loss": 0.8198, "step": 6900 }, { "epoch": 0.11457659854815082, "grad_norm": 1.0112113952636719, "learning_rate": 1.9996487375523524e-05, "loss": 0.8239, "step": 7000 }, { "epoch": 0.11621340709883868, "grad_norm": 0.9226319193840027, "learning_rate": 1.9996003446167718e-05, "loss": 0.8281, "step": 7100 }, { "epoch": 0.11785021564952655, "grad_norm": 1.0199968814849854, "learning_rate": 1.999548830414255e-05, "loss": 0.82, "step": 7200 }, { "epoch": 0.11948702420021443, "grad_norm": 0.9594390988349915, "learning_rate": 1.999494195105657e-05, "loss": 0.8139, "step": 7300 }, { "epoch": 0.12112383275090229, "grad_norm": 0.9685386419296265, "learning_rate": 1.9994364388615763e-05, "loss": 0.8193, "step": 7400 }, { "epoch": 0.12276064130159016, "grad_norm": 0.9797342419624329, "learning_rate": 1.999375561862358e-05, "loss": 0.815, "step": 7500 }, { "epoch": 0.12439744985227803, "grad_norm": 1.0541061162948608, "learning_rate": 1.9993115642980912e-05, "loss": 0.8239, "step": 7600 }, { "epoch": 0.1260342584029659, "grad_norm": 0.9543519616127014, "learning_rate": 1.99924444636861e-05, "loss": 0.8145, "step": 7700 }, { "epoch": 0.12767106695365377, "grad_norm": 0.9379186630249023, "learning_rate": 1.99917420828349e-05, "loss": 0.817, "step": 7800 }, { "epoch": 0.12930787550434164, "grad_norm": 0.9919012188911438, "learning_rate": 1.9991008502620515e-05, "loss": 0.8208, "step": 7900 }, { "epoch": 0.13094468405502951, "grad_norm": 0.9344952702522278, "learning_rate": 1.999024372533356e-05, "loss": 0.8167, "step": 8000 }, { "epoch": 0.13258149260571736, "grad_norm": 0.9583950638771057, "learning_rate": 1.9989447753362058e-05, "loss": 0.8125, "step": 8100 }, { "epoch": 0.13421830115640523, "grad_norm": 0.9945580363273621, "learning_rate": 1.998862058919145e-05, "loss": 0.8225, "step": 8200 }, { "epoch": 0.1358551097070931, "grad_norm": 0.9583763480186462, "learning_rate": 1.9987762235404566e-05, "loss": 0.8105, "step": 8300 }, { "epoch": 0.13749191825778098, "grad_norm": 1.025468349456787, "learning_rate": 1.998687269468162e-05, "loss": 0.8107, "step": 8400 }, { "epoch": 0.13912872680846886, "grad_norm": 1.0057779550552368, "learning_rate": 1.998595196980023e-05, "loss": 0.8138, "step": 8500 }, { "epoch": 0.14076553535915673, "grad_norm": 0.9300206899642944, "learning_rate": 1.9985000063635365e-05, "loss": 0.8207, "step": 8600 }, { "epoch": 0.14240234390984458, "grad_norm": 1.0241742134094238, "learning_rate": 1.9984016979159368e-05, "loss": 0.8046, "step": 8700 }, { "epoch": 0.14403915246053245, "grad_norm": 0.9688097238540649, "learning_rate": 1.9983002719441935e-05, "loss": 0.8193, "step": 8800 }, { "epoch": 0.14567596101122032, "grad_norm": 0.9877735376358032, "learning_rate": 1.9981957287650107e-05, "loss": 0.8003, "step": 8900 }, { "epoch": 0.1473127695619082, "grad_norm": 0.9533541202545166, "learning_rate": 1.9980880687048257e-05, "loss": 0.8089, "step": 9000 }, { "epoch": 0.14894957811259607, "grad_norm": 1.0934607982635498, "learning_rate": 1.997977292099809e-05, "loss": 0.7971, "step": 9100 }, { "epoch": 0.15058638666328392, "grad_norm": 0.9715205430984497, "learning_rate": 1.9978633992958624e-05, "loss": 0.8194, "step": 9200 }, { "epoch": 0.1522231952139718, "grad_norm": 0.9527362585067749, "learning_rate": 1.9977463906486175e-05, "loss": 0.8095, "step": 9300 }, { "epoch": 0.15386000376465966, "grad_norm": 1.0439358949661255, "learning_rate": 1.9976262665234357e-05, "loss": 0.7997, "step": 9400 }, { "epoch": 0.15549681231534754, "grad_norm": 1.1087926626205444, "learning_rate": 1.9975030272954066e-05, "loss": 0.8012, "step": 9500 }, { "epoch": 0.1571336208660354, "grad_norm": 1.0532102584838867, "learning_rate": 1.9973766733493458e-05, "loss": 0.8006, "step": 9600 }, { "epoch": 0.15877042941672329, "grad_norm": 0.9958882331848145, "learning_rate": 1.997247205079796e-05, "loss": 0.8138, "step": 9700 }, { "epoch": 0.16040723796741113, "grad_norm": 1.0133436918258667, "learning_rate": 1.9971146228910236e-05, "loss": 0.7942, "step": 9800 }, { "epoch": 0.162044046518099, "grad_norm": 0.9266718029975891, "learning_rate": 1.9969789271970187e-05, "loss": 0.7917, "step": 9900 }, { "epoch": 0.16368085506878688, "grad_norm": 1.0468189716339111, "learning_rate": 1.9968401184214924e-05, "loss": 0.8012, "step": 10000 }, { "epoch": 0.16531766361947475, "grad_norm": 1.0444200038909912, "learning_rate": 1.9966981969978782e-05, "loss": 0.7979, "step": 10100 }, { "epoch": 0.16695447217016263, "grad_norm": 1.0317082405090332, "learning_rate": 1.9965531633693268e-05, "loss": 0.8209, "step": 10200 }, { "epoch": 0.16859128072085047, "grad_norm": 1.0699563026428223, "learning_rate": 1.9964050179887088e-05, "loss": 0.8035, "step": 10300 }, { "epoch": 0.17022808927153835, "grad_norm": 0.9806187748908997, "learning_rate": 1.9962537613186096e-05, "loss": 0.7957, "step": 10400 }, { "epoch": 0.17186489782222622, "grad_norm": 1.0728228092193604, "learning_rate": 1.996099393831331e-05, "loss": 0.791, "step": 10500 }, { "epoch": 0.1735017063729141, "grad_norm": 1.028189778327942, "learning_rate": 1.9959419160088874e-05, "loss": 0.7964, "step": 10600 }, { "epoch": 0.17513851492360197, "grad_norm": 1.0126999616622925, "learning_rate": 1.9957813283430054e-05, "loss": 0.799, "step": 10700 }, { "epoch": 0.17677532347428984, "grad_norm": 0.96955406665802, "learning_rate": 1.995617631335123e-05, "loss": 0.8118, "step": 10800 }, { "epoch": 0.1784121320249777, "grad_norm": 1.0654776096343994, "learning_rate": 1.9954508254963865e-05, "loss": 0.8084, "step": 10900 }, { "epoch": 0.18004894057566556, "grad_norm": 0.9537600874900818, "learning_rate": 1.9952809113476493e-05, "loss": 0.8011, "step": 11000 }, { "epoch": 0.18168574912635344, "grad_norm": 0.9695281982421875, "learning_rate": 1.9951078894194708e-05, "loss": 0.8054, "step": 11100 }, { "epoch": 0.1833225576770413, "grad_norm": 1.0722426176071167, "learning_rate": 1.9949317602521144e-05, "loss": 0.7917, "step": 11200 }, { "epoch": 0.18495936622772918, "grad_norm": 0.9706518054008484, "learning_rate": 1.9947525243955467e-05, "loss": 0.8055, "step": 11300 }, { "epoch": 0.18659617477841703, "grad_norm": 0.9769388437271118, "learning_rate": 1.994570182409434e-05, "loss": 0.7981, "step": 11400 }, { "epoch": 0.1882329833291049, "grad_norm": 0.9185972809791565, "learning_rate": 1.9943847348631415e-05, "loss": 0.7907, "step": 11500 }, { "epoch": 0.18986979187979278, "grad_norm": 1.0683258771896362, "learning_rate": 1.9941961823357322e-05, "loss": 0.8021, "step": 11600 }, { "epoch": 0.19150660043048065, "grad_norm": 0.9599470496177673, "learning_rate": 1.9940045254159644e-05, "loss": 0.7923, "step": 11700 }, { "epoch": 0.19314340898116852, "grad_norm": 0.9822320938110352, "learning_rate": 1.9938097647022895e-05, "loss": 0.7864, "step": 11800 }, { "epoch": 0.1947802175318564, "grad_norm": 1.180939793586731, "learning_rate": 1.9936119008028503e-05, "loss": 0.7841, "step": 11900 }, { "epoch": 0.19641702608254424, "grad_norm": 1.1611251831054688, "learning_rate": 1.9934109343354808e-05, "loss": 0.7855, "step": 12000 }, { "epoch": 0.19805383463323212, "grad_norm": 1.0176281929016113, "learning_rate": 1.9932068659277006e-05, "loss": 0.7936, "step": 12100 }, { "epoch": 0.19969064318392, "grad_norm": 1.05084228515625, "learning_rate": 1.992999696216717e-05, "loss": 0.7856, "step": 12200 }, { "epoch": 0.20132745173460787, "grad_norm": 1.1582859754562378, "learning_rate": 1.9927894258494204e-05, "loss": 0.8064, "step": 12300 }, { "epoch": 0.20296426028529574, "grad_norm": 0.9974379539489746, "learning_rate": 1.992576055482383e-05, "loss": 0.7923, "step": 12400 }, { "epoch": 0.2046010688359836, "grad_norm": 1.0076924562454224, "learning_rate": 1.9923595857818573e-05, "loss": 0.801, "step": 12500 }, { "epoch": 0.20623787738667146, "grad_norm": 1.104923129081726, "learning_rate": 1.9921400174237732e-05, "loss": 0.8053, "step": 12600 }, { "epoch": 0.20787468593735933, "grad_norm": 1.0884004831314087, "learning_rate": 1.9919173510937355e-05, "loss": 0.7948, "step": 12700 }, { "epoch": 0.2095114944880472, "grad_norm": 0.9803980588912964, "learning_rate": 1.9916915874870234e-05, "loss": 0.791, "step": 12800 }, { "epoch": 0.21114830303873508, "grad_norm": 1.0630168914794922, "learning_rate": 1.9914627273085876e-05, "loss": 0.7813, "step": 12900 }, { "epoch": 0.21278511158942295, "grad_norm": 1.0575711727142334, "learning_rate": 1.9912307712730468e-05, "loss": 0.7862, "step": 13000 }, { "epoch": 0.2144219201401108, "grad_norm": 1.0258235931396484, "learning_rate": 1.9909957201046875e-05, "loss": 0.7855, "step": 13100 }, { "epoch": 0.21605872869079867, "grad_norm": 0.970610499382019, "learning_rate": 1.9907575745374605e-05, "loss": 0.7845, "step": 13200 }, { "epoch": 0.21769553724148655, "grad_norm": 1.0707366466522217, "learning_rate": 1.9905163353149787e-05, "loss": 0.7986, "step": 13300 }, { "epoch": 0.21933234579217442, "grad_norm": 0.9396125674247742, "learning_rate": 1.9902720031905153e-05, "loss": 0.7798, "step": 13400 }, { "epoch": 0.2209691543428623, "grad_norm": 1.0123385190963745, "learning_rate": 1.9900245789270006e-05, "loss": 0.7866, "step": 13500 }, { "epoch": 0.22260596289355017, "grad_norm": 0.9208526015281677, "learning_rate": 1.989774063297021e-05, "loss": 0.79, "step": 13600 }, { "epoch": 0.22424277144423801, "grad_norm": 1.0145132541656494, "learning_rate": 1.989520457082815e-05, "loss": 0.7826, "step": 13700 }, { "epoch": 0.2258795799949259, "grad_norm": 0.9474859237670898, "learning_rate": 1.9892637610762723e-05, "loss": 0.7904, "step": 13800 }, { "epoch": 0.22751638854561376, "grad_norm": 0.997414767742157, "learning_rate": 1.9890039760789294e-05, "loss": 0.7863, "step": 13900 }, { "epoch": 0.22915319709630164, "grad_norm": 1.0312907695770264, "learning_rate": 1.9887411029019686e-05, "loss": 0.7825, "step": 14000 }, { "epoch": 0.2307900056469895, "grad_norm": 1.019665002822876, "learning_rate": 1.9884751423662162e-05, "loss": 0.7746, "step": 14100 }, { "epoch": 0.23242681419767736, "grad_norm": 0.9788889288902283, "learning_rate": 1.9882060953021375e-05, "loss": 0.7805, "step": 14200 }, { "epoch": 0.23406362274836523, "grad_norm": 1.1468379497528076, "learning_rate": 1.9879339625498356e-05, "loss": 0.7783, "step": 14300 }, { "epoch": 0.2357004312990531, "grad_norm": 0.9630206823348999, "learning_rate": 1.9876587449590496e-05, "loss": 0.7785, "step": 14400 }, { "epoch": 0.23733723984974098, "grad_norm": 1.0484507083892822, "learning_rate": 1.98738044338915e-05, "loss": 0.7577, "step": 14500 }, { "epoch": 0.23897404840042885, "grad_norm": 0.9262145161628723, "learning_rate": 1.987099058709138e-05, "loss": 0.7847, "step": 14600 }, { "epoch": 0.24061085695111672, "grad_norm": 1.0156426429748535, "learning_rate": 1.9868145917976412e-05, "loss": 0.7754, "step": 14700 }, { "epoch": 0.24224766550180457, "grad_norm": 1.0557153224945068, "learning_rate": 1.986527043542912e-05, "loss": 0.783, "step": 14800 }, { "epoch": 0.24388447405249244, "grad_norm": 0.9480391144752502, "learning_rate": 1.9862364148428243e-05, "loss": 0.7795, "step": 14900 }, { "epoch": 0.24552128260318032, "grad_norm": 1.1189950704574585, "learning_rate": 1.9859427066048694e-05, "loss": 0.773, "step": 15000 }, { "epoch": 0.2471580911538682, "grad_norm": 1.0406650304794312, "learning_rate": 1.985645919746157e-05, "loss": 0.7815, "step": 15100 }, { "epoch": 0.24879489970455607, "grad_norm": 1.0539467334747314, "learning_rate": 1.985346055193408e-05, "loss": 0.7832, "step": 15200 }, { "epoch": 0.2504317082552439, "grad_norm": 1.0707350969314575, "learning_rate": 1.9850431138829537e-05, "loss": 0.7775, "step": 15300 }, { "epoch": 0.2520685168059318, "grad_norm": 1.0518571138381958, "learning_rate": 1.9847370967607332e-05, "loss": 0.7692, "step": 15400 }, { "epoch": 0.25370532535661966, "grad_norm": 1.038328766822815, "learning_rate": 1.9844280047822892e-05, "loss": 0.7812, "step": 15500 }, { "epoch": 0.25534213390730753, "grad_norm": 1.0571229457855225, "learning_rate": 1.984115838912766e-05, "loss": 0.7773, "step": 15600 }, { "epoch": 0.2569789424579954, "grad_norm": 1.0450866222381592, "learning_rate": 1.9838006001269064e-05, "loss": 0.7789, "step": 15700 }, { "epoch": 0.2586157510086833, "grad_norm": 1.107710838317871, "learning_rate": 1.9834822894090478e-05, "loss": 0.7628, "step": 15800 }, { "epoch": 0.26025255955937115, "grad_norm": 1.0595227479934692, "learning_rate": 1.9831609077531205e-05, "loss": 0.7805, "step": 15900 }, { "epoch": 0.26188936811005903, "grad_norm": 1.0978327989578247, "learning_rate": 1.982836456162644e-05, "loss": 0.7779, "step": 16000 }, { "epoch": 0.2635261766607469, "grad_norm": 1.0871798992156982, "learning_rate": 1.982508935650722e-05, "loss": 0.7696, "step": 16100 }, { "epoch": 0.2651629852114347, "grad_norm": 1.0791369676589966, "learning_rate": 1.982178347240043e-05, "loss": 0.7701, "step": 16200 }, { "epoch": 0.2667997937621226, "grad_norm": 1.095301866531372, "learning_rate": 1.981844691962874e-05, "loss": 0.783, "step": 16300 }, { "epoch": 0.26843660231281047, "grad_norm": 1.1223257780075073, "learning_rate": 1.9815079708610588e-05, "loss": 0.7785, "step": 16400 }, { "epoch": 0.27007341086349834, "grad_norm": 1.0025781393051147, "learning_rate": 1.9811681849860137e-05, "loss": 0.7787, "step": 16500 }, { "epoch": 0.2717102194141862, "grad_norm": 1.1232304573059082, "learning_rate": 1.9808253353987252e-05, "loss": 0.7655, "step": 16600 }, { "epoch": 0.2733470279648741, "grad_norm": 0.9625865817070007, "learning_rate": 1.9804794231697464e-05, "loss": 0.785, "step": 16700 }, { "epoch": 0.27498383651556196, "grad_norm": 1.1022255420684814, "learning_rate": 1.980130449379193e-05, "loss": 0.7681, "step": 16800 }, { "epoch": 0.27662064506624984, "grad_norm": 1.0605260133743286, "learning_rate": 1.9797784151167417e-05, "loss": 0.7686, "step": 16900 }, { "epoch": 0.2782574536169377, "grad_norm": 1.0693503618240356, "learning_rate": 1.9794233214816237e-05, "loss": 0.7653, "step": 17000 }, { "epoch": 0.2798942621676256, "grad_norm": 1.0027199983596802, "learning_rate": 1.979065169582625e-05, "loss": 0.7802, "step": 17100 }, { "epoch": 0.28153107071831346, "grad_norm": 1.002388834953308, "learning_rate": 1.9787039605380792e-05, "loss": 0.7668, "step": 17200 }, { "epoch": 0.2831678792690013, "grad_norm": 1.0847641229629517, "learning_rate": 1.9783396954758682e-05, "loss": 0.7685, "step": 17300 }, { "epoch": 0.28480468781968915, "grad_norm": 1.1153062582015991, "learning_rate": 1.9779723755334142e-05, "loss": 0.7761, "step": 17400 }, { "epoch": 0.286441496370377, "grad_norm": 1.0675033330917358, "learning_rate": 1.9776020018576794e-05, "loss": 0.7637, "step": 17500 }, { "epoch": 0.2880783049210649, "grad_norm": 1.0875293016433716, "learning_rate": 1.9772285756051613e-05, "loss": 0.7689, "step": 17600 }, { "epoch": 0.28971511347175277, "grad_norm": 1.135380744934082, "learning_rate": 1.9768520979418885e-05, "loss": 0.7763, "step": 17700 }, { "epoch": 0.29135192202244065, "grad_norm": 1.0305795669555664, "learning_rate": 1.9764725700434183e-05, "loss": 0.7688, "step": 17800 }, { "epoch": 0.2929887305731285, "grad_norm": 1.0471090078353882, "learning_rate": 1.976089993094832e-05, "loss": 0.7573, "step": 17900 }, { "epoch": 0.2946255391238164, "grad_norm": 1.0096269845962524, "learning_rate": 1.9757043682907325e-05, "loss": 0.7622, "step": 18000 }, { "epoch": 0.29626234767450427, "grad_norm": 1.103242039680481, "learning_rate": 1.9753156968352388e-05, "loss": 0.7573, "step": 18100 }, { "epoch": 0.29789915622519214, "grad_norm": 1.1128453016281128, "learning_rate": 1.9749239799419827e-05, "loss": 0.7692, "step": 18200 }, { "epoch": 0.29953596477588, "grad_norm": 1.0762085914611816, "learning_rate": 1.974529218834106e-05, "loss": 0.7838, "step": 18300 }, { "epoch": 0.30117277332656783, "grad_norm": 1.0150110721588135, "learning_rate": 1.9741314147442573e-05, "loss": 0.773, "step": 18400 }, { "epoch": 0.3028095818772557, "grad_norm": 1.0824315547943115, "learning_rate": 1.9737305689145842e-05, "loss": 0.7636, "step": 18500 }, { "epoch": 0.3044463904279436, "grad_norm": 1.2597285509109497, "learning_rate": 1.973326682596735e-05, "loss": 0.7688, "step": 18600 }, { "epoch": 0.30608319897863145, "grad_norm": 1.112971544265747, "learning_rate": 1.97291975705185e-05, "loss": 0.762, "step": 18700 }, { "epoch": 0.30772000752931933, "grad_norm": 1.11709725856781, "learning_rate": 1.9725097935505607e-05, "loss": 0.7674, "step": 18800 }, { "epoch": 0.3093568160800072, "grad_norm": 1.0609350204467773, "learning_rate": 1.972096793372984e-05, "loss": 0.7603, "step": 18900 }, { "epoch": 0.3109936246306951, "grad_norm": 1.111243486404419, "learning_rate": 1.9716807578087193e-05, "loss": 0.7572, "step": 19000 }, { "epoch": 0.31263043318138295, "grad_norm": 0.9914565086364746, "learning_rate": 1.971261688156843e-05, "loss": 0.7558, "step": 19100 }, { "epoch": 0.3142672417320708, "grad_norm": 1.030030369758606, "learning_rate": 1.9708395857259077e-05, "loss": 0.7558, "step": 19200 }, { "epoch": 0.3159040502827587, "grad_norm": 1.1039714813232422, "learning_rate": 1.9704144518339336e-05, "loss": 0.7507, "step": 19300 }, { "epoch": 0.31754085883344657, "grad_norm": 1.0048165321350098, "learning_rate": 1.969986287808408e-05, "loss": 0.7806, "step": 19400 }, { "epoch": 0.3191776673841344, "grad_norm": 1.2964001893997192, "learning_rate": 1.969555094986279e-05, "loss": 0.7504, "step": 19500 }, { "epoch": 0.32081447593482226, "grad_norm": 1.198273777961731, "learning_rate": 1.9691208747139527e-05, "loss": 0.7597, "step": 19600 }, { "epoch": 0.32245128448551014, "grad_norm": 1.0260130167007446, "learning_rate": 1.968683628347289e-05, "loss": 0.7571, "step": 19700 }, { "epoch": 0.324088093036198, "grad_norm": 1.1643099784851074, "learning_rate": 1.9682433572515952e-05, "loss": 0.7712, "step": 19800 }, { "epoch": 0.3257249015868859, "grad_norm": 1.1653162240982056, "learning_rate": 1.9678000628016248e-05, "loss": 0.7599, "step": 19900 }, { "epoch": 0.32736171013757376, "grad_norm": 1.5513461828231812, "learning_rate": 1.9673537463815718e-05, "loss": 0.7673, "step": 20000 }, { "epoch": 0.32899851868826163, "grad_norm": 1.138498306274414, "learning_rate": 1.9669044093850652e-05, "loss": 0.7521, "step": 20100 }, { "epoch": 0.3306353272389495, "grad_norm": 1.0548768043518066, "learning_rate": 1.9664520532151664e-05, "loss": 0.7596, "step": 20200 }, { "epoch": 0.3322721357896374, "grad_norm": 1.0597394704818726, "learning_rate": 1.965996679284365e-05, "loss": 0.7586, "step": 20300 }, { "epoch": 0.33390894434032525, "grad_norm": 1.1359139680862427, "learning_rate": 1.965538289014572e-05, "loss": 0.7618, "step": 20400 }, { "epoch": 0.3355457528910131, "grad_norm": 1.1026830673217773, "learning_rate": 1.9650768838371182e-05, "loss": 0.7613, "step": 20500 }, { "epoch": 0.33718256144170095, "grad_norm": 1.0065330266952515, "learning_rate": 1.9646124651927484e-05, "loss": 0.7394, "step": 20600 }, { "epoch": 0.3388193699923888, "grad_norm": 0.9368694424629211, "learning_rate": 1.964145034531616e-05, "loss": 0.761, "step": 20700 }, { "epoch": 0.3404561785430767, "grad_norm": 0.9686558246612549, "learning_rate": 1.9636745933132807e-05, "loss": 0.7597, "step": 20800 }, { "epoch": 0.34209298709376457, "grad_norm": 1.114066243171692, "learning_rate": 1.9632011430067024e-05, "loss": 0.7675, "step": 20900 }, { "epoch": 0.34372979564445244, "grad_norm": 1.1572498083114624, "learning_rate": 1.9627246850902363e-05, "loss": 0.7576, "step": 21000 }, { "epoch": 0.3453666041951403, "grad_norm": 1.0342215299606323, "learning_rate": 1.9622452210516296e-05, "loss": 0.7629, "step": 21100 }, { "epoch": 0.3470034127458282, "grad_norm": 1.0652525424957275, "learning_rate": 1.9617627523880158e-05, "loss": 0.7636, "step": 21200 }, { "epoch": 0.34864022129651606, "grad_norm": 1.048869013786316, "learning_rate": 1.9612772806059104e-05, "loss": 0.7625, "step": 21300 }, { "epoch": 0.35027702984720394, "grad_norm": 1.1751947402954102, "learning_rate": 1.9607888072212062e-05, "loss": 0.7475, "step": 21400 }, { "epoch": 0.3519138383978918, "grad_norm": 1.2830709218978882, "learning_rate": 1.9602973337591688e-05, "loss": 0.7558, "step": 21500 }, { "epoch": 0.3535506469485797, "grad_norm": 1.1591740846633911, "learning_rate": 1.9598028617544313e-05, "loss": 0.7435, "step": 21600 }, { "epoch": 0.3551874554992675, "grad_norm": 0.9801552295684814, "learning_rate": 1.95930539275099e-05, "loss": 0.7621, "step": 21700 }, { "epoch": 0.3568242640499554, "grad_norm": 1.126760721206665, "learning_rate": 1.958804928302199e-05, "loss": 0.7672, "step": 21800 }, { "epoch": 0.35846107260064325, "grad_norm": 1.0655152797698975, "learning_rate": 1.958301469970766e-05, "loss": 0.7491, "step": 21900 }, { "epoch": 0.3600978811513311, "grad_norm": 1.1613372564315796, "learning_rate": 1.9577950193287475e-05, "loss": 0.7733, "step": 22000 }, { "epoch": 0.361734689702019, "grad_norm": 0.9363147020339966, "learning_rate": 1.9572855779575427e-05, "loss": 0.7522, "step": 22100 }, { "epoch": 0.36337149825270687, "grad_norm": 1.1021246910095215, "learning_rate": 1.9567731474478903e-05, "loss": 0.7539, "step": 22200 }, { "epoch": 0.36500830680339474, "grad_norm": 1.084695816040039, "learning_rate": 1.9562577293998616e-05, "loss": 0.7514, "step": 22300 }, { "epoch": 0.3666451153540826, "grad_norm": 1.1221933364868164, "learning_rate": 1.9557393254228575e-05, "loss": 0.7608, "step": 22400 }, { "epoch": 0.3682819239047705, "grad_norm": 1.073371410369873, "learning_rate": 1.9552179371356024e-05, "loss": 0.7509, "step": 22500 }, { "epoch": 0.36991873245545837, "grad_norm": 1.124243140220642, "learning_rate": 1.9546935661661382e-05, "loss": 0.7552, "step": 22600 }, { "epoch": 0.37155554100614624, "grad_norm": 1.0397138595581055, "learning_rate": 1.9541662141518222e-05, "loss": 0.7451, "step": 22700 }, { "epoch": 0.37319234955683406, "grad_norm": 1.0600475072860718, "learning_rate": 1.9536358827393177e-05, "loss": 0.7358, "step": 22800 }, { "epoch": 0.37482915810752193, "grad_norm": 1.1461478471755981, "learning_rate": 1.953102573584593e-05, "loss": 0.7513, "step": 22900 }, { "epoch": 0.3764659666582098, "grad_norm": 1.093103051185608, "learning_rate": 1.952566288352914e-05, "loss": 0.7369, "step": 23000 }, { "epoch": 0.3781027752088977, "grad_norm": 1.2357380390167236, "learning_rate": 1.952027028718839e-05, "loss": 0.7628, "step": 23100 }, { "epoch": 0.37973958375958555, "grad_norm": 0.9737277030944824, "learning_rate": 1.9514847963662144e-05, "loss": 0.7358, "step": 23200 }, { "epoch": 0.3813763923102734, "grad_norm": 1.0810784101486206, "learning_rate": 1.9509395929881683e-05, "loss": 0.7431, "step": 23300 }, { "epoch": 0.3830132008609613, "grad_norm": 1.0600659847259521, "learning_rate": 1.9503914202871072e-05, "loss": 0.7465, "step": 23400 }, { "epoch": 0.3846500094116492, "grad_norm": 1.129676342010498, "learning_rate": 1.9498402799747077e-05, "loss": 0.746, "step": 23500 }, { "epoch": 0.38628681796233705, "grad_norm": 1.0627739429473877, "learning_rate": 1.9492861737719145e-05, "loss": 0.7517, "step": 23600 }, { "epoch": 0.3879236265130249, "grad_norm": 1.0382601022720337, "learning_rate": 1.9487291034089316e-05, "loss": 0.7466, "step": 23700 }, { "epoch": 0.3895604350637128, "grad_norm": 1.0782064199447632, "learning_rate": 1.9481690706252198e-05, "loss": 0.7436, "step": 23800 }, { "epoch": 0.39119724361440067, "grad_norm": 1.052713394165039, "learning_rate": 1.94760607716949e-05, "loss": 0.7363, "step": 23900 }, { "epoch": 0.3928340521650885, "grad_norm": 1.0485634803771973, "learning_rate": 1.947040124799697e-05, "loss": 0.7491, "step": 24000 }, { "epoch": 0.39447086071577636, "grad_norm": 1.1206567287445068, "learning_rate": 1.9464712152830368e-05, "loss": 0.7372, "step": 24100 }, { "epoch": 0.39610766926646424, "grad_norm": 1.0319308042526245, "learning_rate": 1.9458993503959368e-05, "loss": 0.7493, "step": 24200 }, { "epoch": 0.3977444778171521, "grad_norm": 1.1401089429855347, "learning_rate": 1.9453245319240533e-05, "loss": 0.7693, "step": 24300 }, { "epoch": 0.39938128636784, "grad_norm": 1.2440853118896484, "learning_rate": 1.944746761662266e-05, "loss": 0.7477, "step": 24400 }, { "epoch": 0.40101809491852786, "grad_norm": 1.1666104793548584, "learning_rate": 1.9441660414146715e-05, "loss": 0.7364, "step": 24500 }, { "epoch": 0.40265490346921573, "grad_norm": 1.0812019109725952, "learning_rate": 1.9435823729945768e-05, "loss": 0.7278, "step": 24600 }, { "epoch": 0.4042917120199036, "grad_norm": 1.1338680982589722, "learning_rate": 1.9429957582244957e-05, "loss": 0.7396, "step": 24700 }, { "epoch": 0.4059285205705915, "grad_norm": 1.0170310735702515, "learning_rate": 1.942406198936141e-05, "loss": 0.7373, "step": 24800 }, { "epoch": 0.40756532912127935, "grad_norm": 1.0910414457321167, "learning_rate": 1.941813696970421e-05, "loss": 0.743, "step": 24900 }, { "epoch": 0.4092021376719672, "grad_norm": 0.9840279221534729, "learning_rate": 1.9412182541774312e-05, "loss": 0.7432, "step": 25000 }, { "epoch": 0.41083894622265504, "grad_norm": 1.1482113599777222, "learning_rate": 1.9406198724164515e-05, "loss": 0.7457, "step": 25100 }, { "epoch": 0.4124757547733429, "grad_norm": 0.9647344946861267, "learning_rate": 1.9400185535559366e-05, "loss": 0.7494, "step": 25200 }, { "epoch": 0.4141125633240308, "grad_norm": 1.1271613836288452, "learning_rate": 1.9394142994735147e-05, "loss": 0.7358, "step": 25300 }, { "epoch": 0.41574937187471867, "grad_norm": 1.1209514141082764, "learning_rate": 1.9388071120559774e-05, "loss": 0.7477, "step": 25400 }, { "epoch": 0.41738618042540654, "grad_norm": 1.1221638917922974, "learning_rate": 1.9381969931992768e-05, "loss": 0.7401, "step": 25500 }, { "epoch": 0.4190229889760944, "grad_norm": 1.1341800689697266, "learning_rate": 1.937583944808518e-05, "loss": 0.7341, "step": 25600 }, { "epoch": 0.4206597975267823, "grad_norm": 1.0561330318450928, "learning_rate": 1.9369679687979538e-05, "loss": 0.7427, "step": 25700 }, { "epoch": 0.42229660607747016, "grad_norm": 1.0445774793624878, "learning_rate": 1.9363490670909788e-05, "loss": 0.7485, "step": 25800 }, { "epoch": 0.42393341462815803, "grad_norm": 1.1463161706924438, "learning_rate": 1.9357272416201214e-05, "loss": 0.7345, "step": 25900 }, { "epoch": 0.4255702231788459, "grad_norm": 1.1426818370819092, "learning_rate": 1.9351024943270426e-05, "loss": 0.7369, "step": 26000 }, { "epoch": 0.4272070317295338, "grad_norm": 1.0911140441894531, "learning_rate": 1.934474827162524e-05, "loss": 0.7472, "step": 26100 }, { "epoch": 0.4288438402802216, "grad_norm": 1.0775692462921143, "learning_rate": 1.9338442420864663e-05, "loss": 0.7401, "step": 26200 }, { "epoch": 0.4304806488309095, "grad_norm": 1.136518955230713, "learning_rate": 1.9332107410678805e-05, "loss": 0.7355, "step": 26300 }, { "epoch": 0.43211745738159735, "grad_norm": 1.085319995880127, "learning_rate": 1.932574326084883e-05, "loss": 0.7485, "step": 26400 }, { "epoch": 0.4337542659322852, "grad_norm": 1.034986972808838, "learning_rate": 1.9319349991246887e-05, "loss": 0.7422, "step": 26500 }, { "epoch": 0.4353910744829731, "grad_norm": 1.1199235916137695, "learning_rate": 1.9312927621836058e-05, "loss": 0.7362, "step": 26600 }, { "epoch": 0.43702788303366097, "grad_norm": 1.1646606922149658, "learning_rate": 1.930647617267029e-05, "loss": 0.7274, "step": 26700 }, { "epoch": 0.43866469158434884, "grad_norm": 1.1620571613311768, "learning_rate": 1.9299995663894325e-05, "loss": 0.7351, "step": 26800 }, { "epoch": 0.4403015001350367, "grad_norm": 1.1194571256637573, "learning_rate": 1.9293486115743646e-05, "loss": 0.7309, "step": 26900 }, { "epoch": 0.4419383086857246, "grad_norm": 1.1805561780929565, "learning_rate": 1.928694754854442e-05, "loss": 0.7378, "step": 27000 }, { "epoch": 0.44357511723641246, "grad_norm": 1.1845600605010986, "learning_rate": 1.9280379982713417e-05, "loss": 0.7319, "step": 27100 }, { "epoch": 0.44521192578710034, "grad_norm": 1.2962830066680908, "learning_rate": 1.927378343875796e-05, "loss": 0.7305, "step": 27200 }, { "epoch": 0.44684873433778816, "grad_norm": 1.0655794143676758, "learning_rate": 1.9267157937275854e-05, "loss": 0.7236, "step": 27300 }, { "epoch": 0.44848554288847603, "grad_norm": 1.0807515382766724, "learning_rate": 1.9260503498955326e-05, "loss": 0.7326, "step": 27400 }, { "epoch": 0.4501223514391639, "grad_norm": 1.0515137910842896, "learning_rate": 1.9253820144574958e-05, "loss": 0.7293, "step": 27500 }, { "epoch": 0.4517591599898518, "grad_norm": 1.103508710861206, "learning_rate": 1.9247107895003628e-05, "loss": 0.7473, "step": 27600 }, { "epoch": 0.45339596854053965, "grad_norm": 1.1016185283660889, "learning_rate": 1.924036677120043e-05, "loss": 0.7264, "step": 27700 }, { "epoch": 0.4550327770912275, "grad_norm": 1.0213091373443604, "learning_rate": 1.9233596794214623e-05, "loss": 0.7325, "step": 27800 }, { "epoch": 0.4566695856419154, "grad_norm": 1.1028705835342407, "learning_rate": 1.9226797985185565e-05, "loss": 0.7381, "step": 27900 }, { "epoch": 0.4583063941926033, "grad_norm": 1.0844396352767944, "learning_rate": 1.9219970365342634e-05, "loss": 0.7279, "step": 28000 }, { "epoch": 0.45994320274329115, "grad_norm": 1.037714958190918, "learning_rate": 1.9213113956005176e-05, "loss": 0.7433, "step": 28100 }, { "epoch": 0.461580011293979, "grad_norm": 1.2123370170593262, "learning_rate": 1.9206228778582435e-05, "loss": 0.7341, "step": 28200 }, { "epoch": 0.4632168198446669, "grad_norm": 1.013845682144165, "learning_rate": 1.9199314854573474e-05, "loss": 0.7369, "step": 28300 }, { "epoch": 0.4648536283953547, "grad_norm": 1.0552864074707031, "learning_rate": 1.9192372205567123e-05, "loss": 0.7202, "step": 28400 }, { "epoch": 0.4664904369460426, "grad_norm": 1.049025058746338, "learning_rate": 1.9185400853241917e-05, "loss": 0.7246, "step": 28500 }, { "epoch": 0.46812724549673046, "grad_norm": 1.0877737998962402, "learning_rate": 1.9178400819365994e-05, "loss": 0.7261, "step": 28600 }, { "epoch": 0.46976405404741833, "grad_norm": 1.099348783493042, "learning_rate": 1.9171372125797072e-05, "loss": 0.7327, "step": 28700 }, { "epoch": 0.4714008625981062, "grad_norm": 1.1000944375991821, "learning_rate": 1.916431479448235e-05, "loss": 0.7305, "step": 28800 }, { "epoch": 0.4730376711487941, "grad_norm": 1.0979351997375488, "learning_rate": 1.9157228847458446e-05, "loss": 0.7279, "step": 28900 }, { "epoch": 0.47467447969948195, "grad_norm": 1.0918766260147095, "learning_rate": 1.9150114306851336e-05, "loss": 0.7215, "step": 29000 }, { "epoch": 0.47631128825016983, "grad_norm": 1.109971046447754, "learning_rate": 1.9142971194876284e-05, "loss": 0.7322, "step": 29100 }, { "epoch": 0.4779480968008577, "grad_norm": 1.1282057762145996, "learning_rate": 1.913579953383776e-05, "loss": 0.7257, "step": 29200 }, { "epoch": 0.4795849053515456, "grad_norm": 1.1076371669769287, "learning_rate": 1.912859934612938e-05, "loss": 0.7516, "step": 29300 }, { "epoch": 0.48122171390223345, "grad_norm": 1.1480896472930908, "learning_rate": 1.9121370654233843e-05, "loss": 0.728, "step": 29400 }, { "epoch": 0.48285852245292127, "grad_norm": 1.1083163022994995, "learning_rate": 1.911411348072284e-05, "loss": 0.7235, "step": 29500 }, { "epoch": 0.48449533100360914, "grad_norm": 1.2141623497009277, "learning_rate": 1.9106827848257007e-05, "loss": 0.7237, "step": 29600 }, { "epoch": 0.486132139554297, "grad_norm": 1.0334457159042358, "learning_rate": 1.9099513779585836e-05, "loss": 0.7306, "step": 29700 }, { "epoch": 0.4877689481049849, "grad_norm": 1.1086657047271729, "learning_rate": 1.909217129754762e-05, "loss": 0.7295, "step": 29800 }, { "epoch": 0.48940575665567276, "grad_norm": 1.0128360986709595, "learning_rate": 1.908480042506937e-05, "loss": 0.733, "step": 29900 }, { "epoch": 0.49104256520636064, "grad_norm": 1.1484946012496948, "learning_rate": 1.907740118516674e-05, "loss": 0.7396, "step": 30000 }, { "epoch": 0.4926793737570485, "grad_norm": 1.031750202178955, "learning_rate": 1.9069973600943962e-05, "loss": 0.7204, "step": 30100 }, { "epoch": 0.4943161823077364, "grad_norm": 1.1274133920669556, "learning_rate": 1.9062517695593792e-05, "loss": 0.7235, "step": 30200 }, { "epoch": 0.49595299085842426, "grad_norm": 1.1863317489624023, "learning_rate": 1.9055033492397396e-05, "loss": 0.7329, "step": 30300 }, { "epoch": 0.49758979940911213, "grad_norm": 1.0985053777694702, "learning_rate": 1.9047521014724303e-05, "loss": 0.7341, "step": 30400 }, { "epoch": 0.4992266079598, "grad_norm": 1.136760950088501, "learning_rate": 1.9039980286032353e-05, "loss": 0.7189, "step": 30500 }, { "epoch": 0.5008634165104878, "grad_norm": 1.0787100791931152, "learning_rate": 1.9032411329867573e-05, "loss": 0.7298, "step": 30600 }, { "epoch": 0.5025002250611758, "grad_norm": 1.3436377048492432, "learning_rate": 1.902481416986414e-05, "loss": 0.719, "step": 30700 }, { "epoch": 0.5041370336118636, "grad_norm": 1.1863504648208618, "learning_rate": 1.9017188829744305e-05, "loss": 0.7125, "step": 30800 }, { "epoch": 0.5057738421625515, "grad_norm": 1.0385360717773438, "learning_rate": 1.90095353333183e-05, "loss": 0.7297, "step": 30900 }, { "epoch": 0.5074106507132393, "grad_norm": 1.1736425161361694, "learning_rate": 1.9001853704484285e-05, "loss": 0.7205, "step": 31000 }, { "epoch": 0.5090474592639272, "grad_norm": 1.0939114093780518, "learning_rate": 1.899414396722826e-05, "loss": 0.741, "step": 31100 }, { "epoch": 0.5106842678146151, "grad_norm": 1.3368091583251953, "learning_rate": 1.8986406145623996e-05, "loss": 0.7277, "step": 31200 }, { "epoch": 0.5123210763653029, "grad_norm": 1.1556004285812378, "learning_rate": 1.897864026383295e-05, "loss": 0.7383, "step": 31300 }, { "epoch": 0.5139578849159908, "grad_norm": 1.2308059930801392, "learning_rate": 1.897084634610421e-05, "loss": 0.7188, "step": 31400 }, { "epoch": 0.5155946934666786, "grad_norm": 1.1211739778518677, "learning_rate": 1.8963024416774393e-05, "loss": 0.7241, "step": 31500 }, { "epoch": 0.5172315020173666, "grad_norm": 1.1302770376205444, "learning_rate": 1.8955174500267596e-05, "loss": 0.7207, "step": 31600 }, { "epoch": 0.5188683105680544, "grad_norm": 1.1893266439437866, "learning_rate": 1.8947296621095297e-05, "loss": 0.7088, "step": 31700 }, { "epoch": 0.5205051191187423, "grad_norm": 1.2034817934036255, "learning_rate": 1.893939080385629e-05, "loss": 0.7225, "step": 31800 }, { "epoch": 0.5221419276694301, "grad_norm": 1.0935208797454834, "learning_rate": 1.8931457073236612e-05, "loss": 0.7219, "step": 31900 }, { "epoch": 0.5237787362201181, "grad_norm": 1.2129491567611694, "learning_rate": 1.892349545400945e-05, "loss": 0.7323, "step": 32000 }, { "epoch": 0.5254155447708059, "grad_norm": 1.0750499963760376, "learning_rate": 1.8915505971035077e-05, "loss": 0.7213, "step": 32100 }, { "epoch": 0.5270523533214938, "grad_norm": 1.1311250925064087, "learning_rate": 1.8907488649260775e-05, "loss": 0.7265, "step": 32200 }, { "epoch": 0.5286891618721816, "grad_norm": 1.1503121852874756, "learning_rate": 1.889944351372075e-05, "loss": 0.7177, "step": 32300 }, { "epoch": 0.5303259704228694, "grad_norm": 1.3034614324569702, "learning_rate": 1.8891370589536058e-05, "loss": 0.7118, "step": 32400 }, { "epoch": 0.5319627789735574, "grad_norm": 1.0626057386398315, "learning_rate": 1.8883269901914524e-05, "loss": 0.7205, "step": 32500 }, { "epoch": 0.5335995875242452, "grad_norm": 1.2290301322937012, "learning_rate": 1.8875141476150664e-05, "loss": 0.73, "step": 32600 }, { "epoch": 0.5352363960749331, "grad_norm": 1.2172757387161255, "learning_rate": 1.8866985337625615e-05, "loss": 0.7234, "step": 32700 }, { "epoch": 0.5368732046256209, "grad_norm": 1.0496524572372437, "learning_rate": 1.885880151180703e-05, "loss": 0.7127, "step": 32800 }, { "epoch": 0.5385100131763089, "grad_norm": 0.9903925061225891, "learning_rate": 1.8850590024249037e-05, "loss": 0.728, "step": 32900 }, { "epoch": 0.5401468217269967, "grad_norm": 1.2562659978866577, "learning_rate": 1.8842350900592122e-05, "loss": 0.7188, "step": 33000 }, { "epoch": 0.5417836302776846, "grad_norm": 1.2212430238723755, "learning_rate": 1.8834084166563072e-05, "loss": 0.7086, "step": 33100 }, { "epoch": 0.5434204388283724, "grad_norm": 1.1504745483398438, "learning_rate": 1.882578984797489e-05, "loss": 0.7198, "step": 33200 }, { "epoch": 0.5450572473790604, "grad_norm": 1.1029900312423706, "learning_rate": 1.8817467970726704e-05, "loss": 0.729, "step": 33300 }, { "epoch": 0.5466940559297482, "grad_norm": 1.1274054050445557, "learning_rate": 1.8809118560803704e-05, "loss": 0.7249, "step": 33400 }, { "epoch": 0.548330864480436, "grad_norm": 1.093854546546936, "learning_rate": 1.880074164427704e-05, "loss": 0.704, "step": 33500 }, { "epoch": 0.5499676730311239, "grad_norm": 1.0846567153930664, "learning_rate": 1.879233724730377e-05, "loss": 0.7194, "step": 33600 }, { "epoch": 0.5516044815818117, "grad_norm": 1.35237455368042, "learning_rate": 1.8783905396126737e-05, "loss": 0.7205, "step": 33700 }, { "epoch": 0.5532412901324997, "grad_norm": 0.9714828133583069, "learning_rate": 1.8775446117074528e-05, "loss": 0.7334, "step": 33800 }, { "epoch": 0.5548780986831875, "grad_norm": 1.2619616985321045, "learning_rate": 1.8766959436561363e-05, "loss": 0.718, "step": 33900 }, { "epoch": 0.5565149072338754, "grad_norm": 1.036129355430603, "learning_rate": 1.8758445381087034e-05, "loss": 0.7191, "step": 34000 }, { "epoch": 0.5581517157845632, "grad_norm": 1.097095012664795, "learning_rate": 1.8749903977236802e-05, "loss": 0.7171, "step": 34100 }, { "epoch": 0.5597885243352512, "grad_norm": 1.1133558750152588, "learning_rate": 1.8741335251681328e-05, "loss": 0.7179, "step": 34200 }, { "epoch": 0.561425332885939, "grad_norm": 1.0562981367111206, "learning_rate": 1.8732739231176587e-05, "loss": 0.7201, "step": 34300 }, { "epoch": 0.5630621414366269, "grad_norm": 1.20978581905365, "learning_rate": 1.8724115942563773e-05, "loss": 0.7129, "step": 34400 }, { "epoch": 0.5646989499873147, "grad_norm": 1.0966860055923462, "learning_rate": 1.8715465412769243e-05, "loss": 0.715, "step": 34500 }, { "epoch": 0.5663357585380026, "grad_norm": 1.2173317670822144, "learning_rate": 1.87067876688044e-05, "loss": 0.7052, "step": 34600 }, { "epoch": 0.5679725670886905, "grad_norm": 1.126670241355896, "learning_rate": 1.869808273776563e-05, "loss": 0.7172, "step": 34700 }, { "epoch": 0.5696093756393783, "grad_norm": 1.0486496686935425, "learning_rate": 1.8689350646834207e-05, "loss": 0.7269, "step": 34800 }, { "epoch": 0.5712461841900662, "grad_norm": 1.1730561256408691, "learning_rate": 1.868059142327622e-05, "loss": 0.7191, "step": 34900 }, { "epoch": 0.572882992740754, "grad_norm": 1.1153805255889893, "learning_rate": 1.867180509444247e-05, "loss": 0.7124, "step": 35000 }, { "epoch": 0.574519801291442, "grad_norm": 1.200767159461975, "learning_rate": 1.8662991687768394e-05, "loss": 0.7342, "step": 35100 }, { "epoch": 0.5761566098421298, "grad_norm": 1.093985676765442, "learning_rate": 1.8654151230774e-05, "loss": 0.7073, "step": 35200 }, { "epoch": 0.5777934183928177, "grad_norm": 1.1902211904525757, "learning_rate": 1.8645283751063734e-05, "loss": 0.7147, "step": 35300 }, { "epoch": 0.5794302269435055, "grad_norm": 1.1363279819488525, "learning_rate": 1.863638927632644e-05, "loss": 0.7162, "step": 35400 }, { "epoch": 0.5810670354941935, "grad_norm": 1.2271382808685303, "learning_rate": 1.8627467834335243e-05, "loss": 0.7042, "step": 35500 }, { "epoch": 0.5827038440448813, "grad_norm": 1.1823738813400269, "learning_rate": 1.8618519452947484e-05, "loss": 0.7197, "step": 35600 }, { "epoch": 0.5843406525955691, "grad_norm": 1.042771577835083, "learning_rate": 1.8609544160104608e-05, "loss": 0.7103, "step": 35700 }, { "epoch": 0.585977461146257, "grad_norm": 1.2053323984146118, "learning_rate": 1.8600541983832114e-05, "loss": 0.7206, "step": 35800 }, { "epoch": 0.5876142696969449, "grad_norm": 1.2077679634094238, "learning_rate": 1.8591512952239416e-05, "loss": 0.7003, "step": 35900 }, { "epoch": 0.5892510782476328, "grad_norm": 1.2675883769989014, "learning_rate": 1.8582457093519806e-05, "loss": 0.7119, "step": 36000 }, { "epoch": 0.5908878867983206, "grad_norm": 1.102798342704773, "learning_rate": 1.857337443595034e-05, "loss": 0.7097, "step": 36100 }, { "epoch": 0.5925246953490085, "grad_norm": 1.0432052612304688, "learning_rate": 1.8564265007891747e-05, "loss": 0.7197, "step": 36200 }, { "epoch": 0.5941615038996964, "grad_norm": 1.1461999416351318, "learning_rate": 1.8555128837788356e-05, "loss": 0.7128, "step": 36300 }, { "epoch": 0.5957983124503843, "grad_norm": 1.1425740718841553, "learning_rate": 1.854596595416799e-05, "loss": 0.7221, "step": 36400 }, { "epoch": 0.5974351210010721, "grad_norm": 1.1499603986740112, "learning_rate": 1.8536776385641896e-05, "loss": 0.7118, "step": 36500 }, { "epoch": 0.59907192955176, "grad_norm": 1.1369038820266724, "learning_rate": 1.8527560160904628e-05, "loss": 0.7101, "step": 36600 }, { "epoch": 0.6007087381024478, "grad_norm": 1.3000248670578003, "learning_rate": 1.8518317308733987e-05, "loss": 0.7042, "step": 36700 }, { "epoch": 0.6023455466531357, "grad_norm": 1.193550944328308, "learning_rate": 1.8509047857990925e-05, "loss": 0.7143, "step": 36800 }, { "epoch": 0.6039823552038236, "grad_norm": 1.1038364171981812, "learning_rate": 1.849975183761943e-05, "loss": 0.6953, "step": 36900 }, { "epoch": 0.6056191637545114, "grad_norm": 1.2535215616226196, "learning_rate": 1.849042927664647e-05, "loss": 0.7021, "step": 37000 }, { "epoch": 0.6072559723051993, "grad_norm": 1.1770461797714233, "learning_rate": 1.848108020418188e-05, "loss": 0.6971, "step": 37100 }, { "epoch": 0.6088927808558872, "grad_norm": 1.3245750665664673, "learning_rate": 1.8471704649418272e-05, "loss": 0.7062, "step": 37200 }, { "epoch": 0.6105295894065751, "grad_norm": 1.064820408821106, "learning_rate": 1.8462302641630957e-05, "loss": 0.7247, "step": 37300 }, { "epoch": 0.6121663979572629, "grad_norm": 1.2426869869232178, "learning_rate": 1.8452874210177853e-05, "loss": 0.697, "step": 37400 }, { "epoch": 0.6138032065079508, "grad_norm": 1.0495688915252686, "learning_rate": 1.8443419384499367e-05, "loss": 0.7066, "step": 37500 }, { "epoch": 0.6154400150586387, "grad_norm": 1.0227185487747192, "learning_rate": 1.8433938194118332e-05, "loss": 0.6975, "step": 37600 }, { "epoch": 0.6170768236093266, "grad_norm": 1.1213784217834473, "learning_rate": 1.8424430668639916e-05, "loss": 0.7101, "step": 37700 }, { "epoch": 0.6187136321600144, "grad_norm": 1.3823000192642212, "learning_rate": 1.8414896837751497e-05, "loss": 0.7143, "step": 37800 }, { "epoch": 0.6203504407107022, "grad_norm": 1.280870795249939, "learning_rate": 1.8405336731222615e-05, "loss": 0.7137, "step": 37900 }, { "epoch": 0.6219872492613902, "grad_norm": 1.1578929424285889, "learning_rate": 1.839575037890483e-05, "loss": 0.7035, "step": 38000 }, { "epoch": 0.623624057812078, "grad_norm": 1.1784029006958008, "learning_rate": 1.838613781073169e-05, "loss": 0.7003, "step": 38100 }, { "epoch": 0.6252608663627659, "grad_norm": 1.5140550136566162, "learning_rate": 1.8376499056718563e-05, "loss": 0.7182, "step": 38200 }, { "epoch": 0.6268976749134537, "grad_norm": 1.1795947551727295, "learning_rate": 1.8366834146962613e-05, "loss": 0.707, "step": 38300 }, { "epoch": 0.6285344834641416, "grad_norm": 1.2156872749328613, "learning_rate": 1.8357143111642658e-05, "loss": 0.7041, "step": 38400 }, { "epoch": 0.6301712920148295, "grad_norm": 1.120609164237976, "learning_rate": 1.8347425981019104e-05, "loss": 0.7087, "step": 38500 }, { "epoch": 0.6318081005655174, "grad_norm": 1.0960373878479004, "learning_rate": 1.8337682785433838e-05, "loss": 0.7136, "step": 38600 }, { "epoch": 0.6334449091162052, "grad_norm": 1.2065433263778687, "learning_rate": 1.8327913555310125e-05, "loss": 0.7077, "step": 38700 }, { "epoch": 0.6350817176668931, "grad_norm": 1.158570647239685, "learning_rate": 1.8318118321152534e-05, "loss": 0.7199, "step": 38800 }, { "epoch": 0.636718526217581, "grad_norm": 1.1315112113952637, "learning_rate": 1.8308297113546834e-05, "loss": 0.7157, "step": 38900 }, { "epoch": 0.6383553347682688, "grad_norm": 1.567763328552246, "learning_rate": 1.829844996315989e-05, "loss": 0.7024, "step": 39000 }, { "epoch": 0.6399921433189567, "grad_norm": 1.3154592514038086, "learning_rate": 1.8288576900739573e-05, "loss": 0.7093, "step": 39100 }, { "epoch": 0.6416289518696445, "grad_norm": 1.2426626682281494, "learning_rate": 1.8278677957114666e-05, "loss": 0.7108, "step": 39200 }, { "epoch": 0.6432657604203325, "grad_norm": 1.2186305522918701, "learning_rate": 1.8268753163194773e-05, "loss": 0.704, "step": 39300 }, { "epoch": 0.6449025689710203, "grad_norm": 1.049307942390442, "learning_rate": 1.8258802549970206e-05, "loss": 0.7057, "step": 39400 }, { "epoch": 0.6465393775217082, "grad_norm": 1.3523504734039307, "learning_rate": 1.8248826148511908e-05, "loss": 0.6965, "step": 39500 }, { "epoch": 0.648176186072396, "grad_norm": 1.2402653694152832, "learning_rate": 1.823882398997133e-05, "loss": 0.704, "step": 39600 }, { "epoch": 0.649812994623084, "grad_norm": 1.3009974956512451, "learning_rate": 1.8228796105580373e-05, "loss": 0.6892, "step": 39700 }, { "epoch": 0.6514498031737718, "grad_norm": 1.161328673362732, "learning_rate": 1.821874252665125e-05, "loss": 0.7099, "step": 39800 }, { "epoch": 0.6530866117244597, "grad_norm": 1.5753206014633179, "learning_rate": 1.820866328457641e-05, "loss": 0.6958, "step": 39900 }, { "epoch": 0.6547234202751475, "grad_norm": 1.1261160373687744, "learning_rate": 1.8198558410828436e-05, "loss": 0.7048, "step": 40000 }, { "epoch": 0.6563602288258353, "grad_norm": 1.2303427457809448, "learning_rate": 1.818842793695995e-05, "loss": 0.7024, "step": 40100 }, { "epoch": 0.6579970373765233, "grad_norm": 1.2187303304672241, "learning_rate": 1.8178271894603502e-05, "loss": 0.696, "step": 40200 }, { "epoch": 0.6596338459272111, "grad_norm": 1.1081221103668213, "learning_rate": 1.8168090315471488e-05, "loss": 0.7082, "step": 40300 }, { "epoch": 0.661270654477899, "grad_norm": 1.1961265802383423, "learning_rate": 1.8157883231356036e-05, "loss": 0.6875, "step": 40400 }, { "epoch": 0.6629074630285868, "grad_norm": 1.1577361822128296, "learning_rate": 1.8147650674128927e-05, "loss": 0.7004, "step": 40500 }, { "epoch": 0.6645442715792748, "grad_norm": 1.1837248802185059, "learning_rate": 1.813739267574147e-05, "loss": 0.7084, "step": 40600 }, { "epoch": 0.6661810801299626, "grad_norm": 1.140136957168579, "learning_rate": 1.8127109268224414e-05, "loss": 0.6897, "step": 40700 }, { "epoch": 0.6678178886806505, "grad_norm": 1.132994532585144, "learning_rate": 1.811680048368785e-05, "loss": 0.6999, "step": 40800 }, { "epoch": 0.6694546972313383, "grad_norm": 1.184187889099121, "learning_rate": 1.8106466354321113e-05, "loss": 0.6994, "step": 40900 }, { "epoch": 0.6710915057820263, "grad_norm": 1.1196414232254028, "learning_rate": 1.809610691239268e-05, "loss": 0.7008, "step": 41000 }, { "epoch": 0.6727283143327141, "grad_norm": 1.1688846349716187, "learning_rate": 1.808572219025006e-05, "loss": 0.6954, "step": 41100 }, { "epoch": 0.6743651228834019, "grad_norm": 1.222205638885498, "learning_rate": 1.80753122203197e-05, "loss": 0.6918, "step": 41200 }, { "epoch": 0.6760019314340898, "grad_norm": 1.1374167203903198, "learning_rate": 1.8064877035106887e-05, "loss": 0.6906, "step": 41300 }, { "epoch": 0.6776387399847776, "grad_norm": 1.0707694292068481, "learning_rate": 1.8054416667195643e-05, "loss": 0.6943, "step": 41400 }, { "epoch": 0.6792755485354656, "grad_norm": 1.1394332647323608, "learning_rate": 1.8043931149248625e-05, "loss": 0.7073, "step": 41500 }, { "epoch": 0.6809123570861534, "grad_norm": 1.118058443069458, "learning_rate": 1.803342051400701e-05, "loss": 0.6983, "step": 41600 }, { "epoch": 0.6825491656368413, "grad_norm": 1.3730331659317017, "learning_rate": 1.8022884794290417e-05, "loss": 0.6924, "step": 41700 }, { "epoch": 0.6841859741875291, "grad_norm": 1.1573492288589478, "learning_rate": 1.801232402299679e-05, "loss": 0.6964, "step": 41800 }, { "epoch": 0.6858227827382171, "grad_norm": 1.1315394639968872, "learning_rate": 1.80017382331023e-05, "loss": 0.693, "step": 41900 }, { "epoch": 0.6874595912889049, "grad_norm": 1.1479718685150146, "learning_rate": 1.799112745766122e-05, "loss": 0.6985, "step": 42000 }, { "epoch": 0.6890963998395928, "grad_norm": 1.1869304180145264, "learning_rate": 1.7980491729805858e-05, "loss": 0.7132, "step": 42100 }, { "epoch": 0.6907332083902806, "grad_norm": 1.322792887687683, "learning_rate": 1.796983108274644e-05, "loss": 0.7085, "step": 42200 }, { "epoch": 0.6923700169409684, "grad_norm": 1.1635984182357788, "learning_rate": 1.7959145549770985e-05, "loss": 0.7117, "step": 42300 }, { "epoch": 0.6940068254916564, "grad_norm": 1.1490191221237183, "learning_rate": 1.7948435164245236e-05, "loss": 0.697, "step": 42400 }, { "epoch": 0.6956436340423442, "grad_norm": 1.2376859188079834, "learning_rate": 1.7937699959612523e-05, "loss": 0.7079, "step": 42500 }, { "epoch": 0.6972804425930321, "grad_norm": 1.2555029392242432, "learning_rate": 1.7926939969393693e-05, "loss": 0.6895, "step": 42600 }, { "epoch": 0.6989172511437199, "grad_norm": 1.1793533563613892, "learning_rate": 1.7916155227186966e-05, "loss": 0.6784, "step": 42700 }, { "epoch": 0.7005540596944079, "grad_norm": 1.0882368087768555, "learning_rate": 1.7905345766667867e-05, "loss": 0.6875, "step": 42800 }, { "epoch": 0.7021908682450957, "grad_norm": 1.2925825119018555, "learning_rate": 1.789451162158909e-05, "loss": 0.7072, "step": 42900 }, { "epoch": 0.7038276767957836, "grad_norm": 1.2188570499420166, "learning_rate": 1.7883652825780418e-05, "loss": 0.7084, "step": 43000 }, { "epoch": 0.7054644853464714, "grad_norm": 1.2425892353057861, "learning_rate": 1.7872769413148602e-05, "loss": 0.7059, "step": 43100 }, { "epoch": 0.7071012938971594, "grad_norm": 1.3490030765533447, "learning_rate": 1.786186141767726e-05, "loss": 0.6861, "step": 43200 }, { "epoch": 0.7087381024478472, "grad_norm": 1.2493983507156372, "learning_rate": 1.785092887342677e-05, "loss": 0.6862, "step": 43300 }, { "epoch": 0.710374910998535, "grad_norm": 1.1606495380401611, "learning_rate": 1.7839971814534163e-05, "loss": 0.6959, "step": 43400 }, { "epoch": 0.7120117195492229, "grad_norm": 1.0867750644683838, "learning_rate": 1.7828990275213023e-05, "loss": 0.6838, "step": 43500 }, { "epoch": 0.7136485280999108, "grad_norm": 1.4481595754623413, "learning_rate": 1.781798428975336e-05, "loss": 0.6877, "step": 43600 }, { "epoch": 0.7152853366505987, "grad_norm": 1.0603893995285034, "learning_rate": 1.7806953892521536e-05, "loss": 0.6922, "step": 43700 }, { "epoch": 0.7169221452012865, "grad_norm": 1.1686676740646362, "learning_rate": 1.7795899117960126e-05, "loss": 0.6933, "step": 43800 }, { "epoch": 0.7185589537519744, "grad_norm": 1.423593282699585, "learning_rate": 1.7784820000587828e-05, "loss": 0.6947, "step": 43900 }, { "epoch": 0.7201957623026622, "grad_norm": 1.2158969640731812, "learning_rate": 1.7773716574999354e-05, "loss": 0.6832, "step": 44000 }, { "epoch": 0.7218325708533502, "grad_norm": 1.3259363174438477, "learning_rate": 1.776258887586531e-05, "loss": 0.6836, "step": 44100 }, { "epoch": 0.723469379404038, "grad_norm": 1.2114306688308716, "learning_rate": 1.775143693793211e-05, "loss": 0.6934, "step": 44200 }, { "epoch": 0.7251061879547259, "grad_norm": 1.0769015550613403, "learning_rate": 1.774026079602184e-05, "loss": 0.692, "step": 44300 }, { "epoch": 0.7267429965054137, "grad_norm": 1.098381519317627, "learning_rate": 1.7729060485032167e-05, "loss": 0.6929, "step": 44400 }, { "epoch": 0.7283798050561016, "grad_norm": 1.1960115432739258, "learning_rate": 1.7717836039936235e-05, "loss": 0.6895, "step": 44500 }, { "epoch": 0.7300166136067895, "grad_norm": 1.2899237871170044, "learning_rate": 1.7706587495782538e-05, "loss": 0.6891, "step": 44600 }, { "epoch": 0.7316534221574773, "grad_norm": 1.1849106550216675, "learning_rate": 1.769531488769482e-05, "loss": 0.6994, "step": 44700 }, { "epoch": 0.7332902307081652, "grad_norm": 1.0840647220611572, "learning_rate": 1.7684018250871967e-05, "loss": 0.6902, "step": 44800 }, { "epoch": 0.734927039258853, "grad_norm": 1.1262308359146118, "learning_rate": 1.7672697620587904e-05, "loss": 0.686, "step": 44900 }, { "epoch": 0.736563847809541, "grad_norm": 1.2281126976013184, "learning_rate": 1.7661353032191458e-05, "loss": 0.6971, "step": 45000 }, { "epoch": 0.7382006563602288, "grad_norm": 1.0803622007369995, "learning_rate": 1.7649984521106282e-05, "loss": 0.694, "step": 45100 }, { "epoch": 0.7398374649109167, "grad_norm": 1.4072610139846802, "learning_rate": 1.763859212283071e-05, "loss": 0.704, "step": 45200 }, { "epoch": 0.7414742734616045, "grad_norm": 1.2351950407028198, "learning_rate": 1.7627175872937686e-05, "loss": 0.6991, "step": 45300 }, { "epoch": 0.7431110820122925, "grad_norm": 1.1985889673233032, "learning_rate": 1.7615735807074616e-05, "loss": 0.6947, "step": 45400 }, { "epoch": 0.7447478905629803, "grad_norm": 1.1948813199996948, "learning_rate": 1.7604271960963274e-05, "loss": 0.6986, "step": 45500 }, { "epoch": 0.7463846991136681, "grad_norm": 1.2745295763015747, "learning_rate": 1.759278437039969e-05, "loss": 0.6989, "step": 45600 }, { "epoch": 0.748021507664356, "grad_norm": 1.1414821147918701, "learning_rate": 1.7581273071254038e-05, "loss": 0.6883, "step": 45700 }, { "epoch": 0.7496583162150439, "grad_norm": 1.1246697902679443, "learning_rate": 1.7569738099470524e-05, "loss": 0.6818, "step": 45800 }, { "epoch": 0.7512951247657318, "grad_norm": 1.1820296049118042, "learning_rate": 1.7558179491067263e-05, "loss": 0.7079, "step": 45900 }, { "epoch": 0.7529319333164196, "grad_norm": 1.1293789148330688, "learning_rate": 1.7546597282136186e-05, "loss": 0.696, "step": 46000 }, { "epoch": 0.7545687418671075, "grad_norm": 1.2405450344085693, "learning_rate": 1.753499150884291e-05, "loss": 0.6912, "step": 46100 }, { "epoch": 0.7562055504177954, "grad_norm": 1.2177417278289795, "learning_rate": 1.7523362207426634e-05, "loss": 0.6824, "step": 46200 }, { "epoch": 0.7578423589684833, "grad_norm": 1.124414086341858, "learning_rate": 1.7511709414200024e-05, "loss": 0.6868, "step": 46300 }, { "epoch": 0.7594791675191711, "grad_norm": 1.1439573764801025, "learning_rate": 1.7500033165549105e-05, "loss": 0.6882, "step": 46400 }, { "epoch": 0.761115976069859, "grad_norm": 1.1549428701400757, "learning_rate": 1.7488333497933133e-05, "loss": 0.681, "step": 46500 }, { "epoch": 0.7627527846205469, "grad_norm": 1.3092726469039917, "learning_rate": 1.7476610447884492e-05, "loss": 0.6973, "step": 46600 }, { "epoch": 0.7643895931712347, "grad_norm": 1.5812910795211792, "learning_rate": 1.7464864052008586e-05, "loss": 0.6855, "step": 46700 }, { "epoch": 0.7660264017219226, "grad_norm": 1.189775824546814, "learning_rate": 1.7453094346983707e-05, "loss": 0.6983, "step": 46800 }, { "epoch": 0.7676632102726104, "grad_norm": 1.3100470304489136, "learning_rate": 1.7441301369560934e-05, "loss": 0.6938, "step": 46900 }, { "epoch": 0.7693000188232983, "grad_norm": 1.227925419807434, "learning_rate": 1.7429485156564014e-05, "loss": 0.6762, "step": 47000 }, { "epoch": 0.7709368273739862, "grad_norm": 1.3295223712921143, "learning_rate": 1.7417645744889248e-05, "loss": 0.6823, "step": 47100 }, { "epoch": 0.7725736359246741, "grad_norm": 1.1091123819351196, "learning_rate": 1.740578317150538e-05, "loss": 0.6978, "step": 47200 }, { "epoch": 0.7742104444753619, "grad_norm": 1.2926867008209229, "learning_rate": 1.7393897473453462e-05, "loss": 0.6853, "step": 47300 }, { "epoch": 0.7758472530260498, "grad_norm": 1.279630422592163, "learning_rate": 1.738198868784677e-05, "loss": 0.6911, "step": 47400 }, { "epoch": 0.7774840615767377, "grad_norm": 1.1175949573516846, "learning_rate": 1.7370056851870665e-05, "loss": 0.687, "step": 47500 }, { "epoch": 0.7791208701274256, "grad_norm": 1.0889476537704468, "learning_rate": 1.7358102002782477e-05, "loss": 0.689, "step": 47600 }, { "epoch": 0.7807576786781134, "grad_norm": 1.1944537162780762, "learning_rate": 1.7346124177911402e-05, "loss": 0.6841, "step": 47700 }, { "epoch": 0.7823944872288013, "grad_norm": 1.208275556564331, "learning_rate": 1.7334123414658376e-05, "loss": 0.6777, "step": 47800 }, { "epoch": 0.7840312957794892, "grad_norm": 1.1608806848526, "learning_rate": 1.7322099750495964e-05, "loss": 0.6841, "step": 47900 }, { "epoch": 0.785668104330177, "grad_norm": 1.0674712657928467, "learning_rate": 1.731005322296823e-05, "loss": 0.6765, "step": 48000 }, { "epoch": 0.7873049128808649, "grad_norm": 1.1852935552597046, "learning_rate": 1.729798386969064e-05, "loss": 0.6968, "step": 48100 }, { "epoch": 0.7889417214315527, "grad_norm": 1.1918047666549683, "learning_rate": 1.728589172834993e-05, "loss": 0.6815, "step": 48200 }, { "epoch": 0.7905785299822407, "grad_norm": 1.3117504119873047, "learning_rate": 1.7273776836703985e-05, "loss": 0.6799, "step": 48300 }, { "epoch": 0.7922153385329285, "grad_norm": 1.2398260831832886, "learning_rate": 1.726163923258174e-05, "loss": 0.6869, "step": 48400 }, { "epoch": 0.7938521470836164, "grad_norm": 1.2091760635375977, "learning_rate": 1.724947895388304e-05, "loss": 0.6679, "step": 48500 }, { "epoch": 0.7954889556343042, "grad_norm": 1.1533339023590088, "learning_rate": 1.723729603857854e-05, "loss": 0.6877, "step": 48600 }, { "epoch": 0.7971257641849921, "grad_norm": 1.2629398107528687, "learning_rate": 1.7225090524709577e-05, "loss": 0.6878, "step": 48700 }, { "epoch": 0.79876257273568, "grad_norm": 1.202531099319458, "learning_rate": 1.7212862450388037e-05, "loss": 0.6911, "step": 48800 }, { "epoch": 0.8003993812863679, "grad_norm": 1.189326286315918, "learning_rate": 1.7200611853796278e-05, "loss": 0.6966, "step": 48900 }, { "epoch": 0.8020361898370557, "grad_norm": 1.2614778280258179, "learning_rate": 1.718833877318696e-05, "loss": 0.6952, "step": 49000 }, { "epoch": 0.8036729983877435, "grad_norm": 1.1864616870880127, "learning_rate": 1.7176043246882966e-05, "loss": 0.6756, "step": 49100 }, { "epoch": 0.8053098069384315, "grad_norm": 1.205569863319397, "learning_rate": 1.7163725313277255e-05, "loss": 0.6748, "step": 49200 }, { "epoch": 0.8069466154891193, "grad_norm": 1.2782241106033325, "learning_rate": 1.715138501083276e-05, "loss": 0.6903, "step": 49300 }, { "epoch": 0.8085834240398072, "grad_norm": 1.0571094751358032, "learning_rate": 1.7139022378082256e-05, "loss": 0.6871, "step": 49400 }, { "epoch": 0.810220232590495, "grad_norm": 1.3369005918502808, "learning_rate": 1.712663745362826e-05, "loss": 0.6746, "step": 49500 }, { "epoch": 0.811857041141183, "grad_norm": 1.2506871223449707, "learning_rate": 1.7114230276142866e-05, "loss": 0.6935, "step": 49600 }, { "epoch": 0.8134938496918708, "grad_norm": 1.3436931371688843, "learning_rate": 1.7101800884367676e-05, "loss": 0.6859, "step": 49700 }, { "epoch": 0.8151306582425587, "grad_norm": 1.3217076063156128, "learning_rate": 1.708934931711365e-05, "loss": 0.6766, "step": 49800 }, { "epoch": 0.8167674667932465, "grad_norm": 1.3521711826324463, "learning_rate": 1.7076875613261e-05, "loss": 0.6828, "step": 49900 }, { "epoch": 0.8184042753439345, "grad_norm": 1.1544018983840942, "learning_rate": 1.706437981175904e-05, "loss": 0.6866, "step": 50000 }, { "epoch": 0.8200410838946223, "grad_norm": 1.3795074224472046, "learning_rate": 1.7051861951626105e-05, "loss": 0.6893, "step": 50100 }, { "epoch": 0.8216778924453101, "grad_norm": 1.2545524835586548, "learning_rate": 1.7039322071949396e-05, "loss": 0.6865, "step": 50200 }, { "epoch": 0.823314700995998, "grad_norm": 1.3663312196731567, "learning_rate": 1.702676021188487e-05, "loss": 0.6858, "step": 50300 }, { "epoch": 0.8249515095466858, "grad_norm": 1.4371784925460815, "learning_rate": 1.701417641065713e-05, "loss": 0.6827, "step": 50400 }, { "epoch": 0.8265883180973738, "grad_norm": 1.465648889541626, "learning_rate": 1.7001570707559274e-05, "loss": 0.6813, "step": 50500 }, { "epoch": 0.8282251266480616, "grad_norm": 1.1045328378677368, "learning_rate": 1.69889431419528e-05, "loss": 0.6858, "step": 50600 }, { "epoch": 0.8298619351987495, "grad_norm": 1.1676952838897705, "learning_rate": 1.6976293753267467e-05, "loss": 0.662, "step": 50700 }, { "epoch": 0.8314987437494373, "grad_norm": 1.2377560138702393, "learning_rate": 1.6963622581001188e-05, "loss": 0.6853, "step": 50800 }, { "epoch": 0.8331355523001253, "grad_norm": 1.2052476406097412, "learning_rate": 1.6950929664719883e-05, "loss": 0.6898, "step": 50900 }, { "epoch": 0.8347723608508131, "grad_norm": 1.400944709777832, "learning_rate": 1.6938215044057363e-05, "loss": 0.6905, "step": 51000 }, { "epoch": 0.836409169401501, "grad_norm": 1.2622673511505127, "learning_rate": 1.6925478758715226e-05, "loss": 0.6651, "step": 51100 }, { "epoch": 0.8380459779521888, "grad_norm": 1.1664501428604126, "learning_rate": 1.691272084846272e-05, "loss": 0.6851, "step": 51200 }, { "epoch": 0.8396827865028766, "grad_norm": 1.2591482400894165, "learning_rate": 1.68999413531366e-05, "loss": 0.6936, "step": 51300 }, { "epoch": 0.8413195950535646, "grad_norm": 1.163874864578247, "learning_rate": 1.6887140312641036e-05, "loss": 0.6886, "step": 51400 }, { "epoch": 0.8429564036042524, "grad_norm": 1.2441082000732422, "learning_rate": 1.6874317766947458e-05, "loss": 0.6761, "step": 51500 }, { "epoch": 0.8445932121549403, "grad_norm": 1.1966642141342163, "learning_rate": 1.6861473756094464e-05, "loss": 0.6758, "step": 51600 }, { "epoch": 0.8462300207056281, "grad_norm": 1.1858773231506348, "learning_rate": 1.6848608320187668e-05, "loss": 0.6806, "step": 51700 }, { "epoch": 0.8478668292563161, "grad_norm": 1.1656018495559692, "learning_rate": 1.6835721499399583e-05, "loss": 0.6768, "step": 51800 }, { "epoch": 0.8495036378070039, "grad_norm": 1.2097491025924683, "learning_rate": 1.6822813333969495e-05, "loss": 0.6936, "step": 51900 }, { "epoch": 0.8511404463576918, "grad_norm": 1.4976009130477905, "learning_rate": 1.6809883864203352e-05, "loss": 0.6721, "step": 52000 }, { "epoch": 0.8527772549083796, "grad_norm": 1.3640004396438599, "learning_rate": 1.6796933130473606e-05, "loss": 0.6738, "step": 52100 }, { "epoch": 0.8544140634590676, "grad_norm": 1.2159740924835205, "learning_rate": 1.6783961173219116e-05, "loss": 0.6755, "step": 52200 }, { "epoch": 0.8560508720097554, "grad_norm": 1.23357355594635, "learning_rate": 1.677096803294502e-05, "loss": 0.6789, "step": 52300 }, { "epoch": 0.8576876805604432, "grad_norm": 1.2574186325073242, "learning_rate": 1.6757953750222586e-05, "loss": 0.6892, "step": 52400 }, { "epoch": 0.8593244891111311, "grad_norm": 1.2394073009490967, "learning_rate": 1.6744918365689106e-05, "loss": 0.6726, "step": 52500 }, { "epoch": 0.860961297661819, "grad_norm": 1.2098554372787476, "learning_rate": 1.6731861920047758e-05, "loss": 0.6714, "step": 52600 }, { "epoch": 0.8625981062125069, "grad_norm": 1.3548126220703125, "learning_rate": 1.6718784454067495e-05, "loss": 0.6849, "step": 52700 }, { "epoch": 0.8642349147631947, "grad_norm": 1.5218019485473633, "learning_rate": 1.670568600858289e-05, "loss": 0.6744, "step": 52800 }, { "epoch": 0.8658717233138826, "grad_norm": 1.3826264142990112, "learning_rate": 1.669256662449404e-05, "loss": 0.6762, "step": 52900 }, { "epoch": 0.8675085318645704, "grad_norm": 1.2154985666275024, "learning_rate": 1.667942634276642e-05, "loss": 0.6711, "step": 53000 }, { "epoch": 0.8691453404152584, "grad_norm": 1.3120452165603638, "learning_rate": 1.666626520443075e-05, "loss": 0.6788, "step": 53100 }, { "epoch": 0.8707821489659462, "grad_norm": 1.2221883535385132, "learning_rate": 1.665308325058288e-05, "loss": 0.6661, "step": 53200 }, { "epoch": 0.8724189575166341, "grad_norm": 1.385396957397461, "learning_rate": 1.6639880522383655e-05, "loss": 0.6714, "step": 53300 }, { "epoch": 0.8740557660673219, "grad_norm": 1.2685418128967285, "learning_rate": 1.6626657061058797e-05, "loss": 0.668, "step": 53400 }, { "epoch": 0.8756925746180098, "grad_norm": 1.513152837753296, "learning_rate": 1.661341290789875e-05, "loss": 0.6706, "step": 53500 }, { "epoch": 0.8773293831686977, "grad_norm": 1.2810958623886108, "learning_rate": 1.6600148104258594e-05, "loss": 0.6904, "step": 53600 }, { "epoch": 0.8789661917193855, "grad_norm": 1.2695286273956299, "learning_rate": 1.6586862691557863e-05, "loss": 0.6733, "step": 53700 }, { "epoch": 0.8806030002700734, "grad_norm": 1.0760889053344727, "learning_rate": 1.6573556711280457e-05, "loss": 0.6743, "step": 53800 }, { "epoch": 0.8822398088207613, "grad_norm": 1.3402081727981567, "learning_rate": 1.6560230204974502e-05, "loss": 0.6706, "step": 53900 }, { "epoch": 0.8838766173714492, "grad_norm": 1.191873550415039, "learning_rate": 1.654688321425221e-05, "loss": 0.6764, "step": 54000 }, { "epoch": 0.885513425922137, "grad_norm": 1.1215344667434692, "learning_rate": 1.6533515780789758e-05, "loss": 0.6857, "step": 54100 }, { "epoch": 0.8871502344728249, "grad_norm": 1.1322293281555176, "learning_rate": 1.6520127946327155e-05, "loss": 0.6723, "step": 54200 }, { "epoch": 0.8887870430235127, "grad_norm": 1.7162648439407349, "learning_rate": 1.6506719752668115e-05, "loss": 0.679, "step": 54300 }, { "epoch": 0.8904238515742007, "grad_norm": 1.5632336139678955, "learning_rate": 1.6493291241679922e-05, "loss": 0.6807, "step": 54400 }, { "epoch": 0.8920606601248885, "grad_norm": 1.0530614852905273, "learning_rate": 1.6479842455293297e-05, "loss": 0.6681, "step": 54500 }, { "epoch": 0.8936974686755763, "grad_norm": 1.2179269790649414, "learning_rate": 1.6466373435502276e-05, "loss": 0.6614, "step": 54600 }, { "epoch": 0.8953342772262642, "grad_norm": 1.3225027322769165, "learning_rate": 1.6452884224364082e-05, "loss": 0.671, "step": 54700 }, { "epoch": 0.8969710857769521, "grad_norm": 1.3610303401947021, "learning_rate": 1.6439374863998966e-05, "loss": 0.6801, "step": 54800 }, { "epoch": 0.89860789432764, "grad_norm": 1.3277727365493774, "learning_rate": 1.6425845396590114e-05, "loss": 0.6746, "step": 54900 }, { "epoch": 0.9002447028783278, "grad_norm": 1.2963169813156128, "learning_rate": 1.6412295864383487e-05, "loss": 0.6817, "step": 55000 }, { "epoch": 0.9018815114290157, "grad_norm": 1.475885033607483, "learning_rate": 1.6398726309687704e-05, "loss": 0.6891, "step": 55100 }, { "epoch": 0.9035183199797036, "grad_norm": 1.2722758054733276, "learning_rate": 1.638513677487389e-05, "loss": 0.6709, "step": 55200 }, { "epoch": 0.9051551285303915, "grad_norm": 1.3521857261657715, "learning_rate": 1.637152730237558e-05, "loss": 0.6812, "step": 55300 }, { "epoch": 0.9067919370810793, "grad_norm": 1.2276744842529297, "learning_rate": 1.6357897934688555e-05, "loss": 0.6644, "step": 55400 }, { "epoch": 0.9084287456317672, "grad_norm": 1.5432332754135132, "learning_rate": 1.634424871437071e-05, "loss": 0.6817, "step": 55500 }, { "epoch": 0.910065554182455, "grad_norm": 1.2314627170562744, "learning_rate": 1.6330579684041946e-05, "loss": 0.6761, "step": 55600 }, { "epoch": 0.9117023627331429, "grad_norm": 1.473347544670105, "learning_rate": 1.631689088638401e-05, "loss": 0.6587, "step": 55700 }, { "epoch": 0.9133391712838308, "grad_norm": 1.4029542207717896, "learning_rate": 1.6303182364140376e-05, "loss": 0.6863, "step": 55800 }, { "epoch": 0.9149759798345186, "grad_norm": 1.1235482692718506, "learning_rate": 1.628945416011611e-05, "loss": 0.6717, "step": 55900 }, { "epoch": 0.9166127883852065, "grad_norm": 1.1514254808425903, "learning_rate": 1.6275706317177732e-05, "loss": 0.6815, "step": 56000 }, { "epoch": 0.9182495969358944, "grad_norm": 1.388074517250061, "learning_rate": 1.6261938878253086e-05, "loss": 0.6849, "step": 56100 }, { "epoch": 0.9198864054865823, "grad_norm": 1.1814851760864258, "learning_rate": 1.6248151886331208e-05, "loss": 0.6641, "step": 56200 }, { "epoch": 0.9215232140372701, "grad_norm": 1.4052802324295044, "learning_rate": 1.6234345384462174e-05, "loss": 0.6787, "step": 56300 }, { "epoch": 0.923160022587958, "grad_norm": 1.5508378744125366, "learning_rate": 1.6220519415757005e-05, "loss": 0.6808, "step": 56400 }, { "epoch": 0.9247968311386459, "grad_norm": 1.3127562999725342, "learning_rate": 1.620667402338749e-05, "loss": 0.6663, "step": 56500 }, { "epoch": 0.9264336396893338, "grad_norm": 1.2677356004714966, "learning_rate": 1.619280925058607e-05, "loss": 0.6723, "step": 56600 }, { "epoch": 0.9280704482400216, "grad_norm": 1.2480475902557373, "learning_rate": 1.61789251406457e-05, "loss": 0.6583, "step": 56700 }, { "epoch": 0.9297072567907094, "grad_norm": 1.1523864269256592, "learning_rate": 1.616502173691973e-05, "loss": 0.6858, "step": 56800 }, { "epoch": 0.9313440653413974, "grad_norm": 1.2443100214004517, "learning_rate": 1.615109908282174e-05, "loss": 0.6842, "step": 56900 }, { "epoch": 0.9329808738920852, "grad_norm": 1.172663927078247, "learning_rate": 1.6137157221825418e-05, "loss": 0.6708, "step": 57000 }, { "epoch": 0.9346176824427731, "grad_norm": 1.2049202919006348, "learning_rate": 1.6123196197464445e-05, "loss": 0.6665, "step": 57100 }, { "epoch": 0.9362544909934609, "grad_norm": 1.3395051956176758, "learning_rate": 1.6109216053332313e-05, "loss": 0.6593, "step": 57200 }, { "epoch": 0.9378912995441488, "grad_norm": 1.4670510292053223, "learning_rate": 1.6095216833082242e-05, "loss": 0.6715, "step": 57300 }, { "epoch": 0.9395281080948367, "grad_norm": 1.349523663520813, "learning_rate": 1.6081198580427e-05, "loss": 0.6724, "step": 57400 }, { "epoch": 0.9411649166455246, "grad_norm": 1.5846613645553589, "learning_rate": 1.606716133913879e-05, "loss": 0.6716, "step": 57500 }, { "epoch": 0.9428017251962124, "grad_norm": 1.1905144453048706, "learning_rate": 1.6053105153049103e-05, "loss": 0.6702, "step": 57600 }, { "epoch": 0.9444385337469003, "grad_norm": 1.4006574153900146, "learning_rate": 1.6039030066048592e-05, "loss": 0.6665, "step": 57700 }, { "epoch": 0.9460753422975882, "grad_norm": 1.3038159608840942, "learning_rate": 1.602493612208693e-05, "loss": 0.665, "step": 57800 }, { "epoch": 0.947712150848276, "grad_norm": 1.336591124534607, "learning_rate": 1.601082336517266e-05, "loss": 0.6572, "step": 57900 }, { "epoch": 0.9493489593989639, "grad_norm": 1.3096286058425903, "learning_rate": 1.5996691839373077e-05, "loss": 0.6651, "step": 58000 }, { "epoch": 0.9509857679496517, "grad_norm": 1.3385711908340454, "learning_rate": 1.5982541588814083e-05, "loss": 0.6708, "step": 58100 }, { "epoch": 0.9526225765003397, "grad_norm": 1.2425600290298462, "learning_rate": 1.596837265768004e-05, "loss": 0.6629, "step": 58200 }, { "epoch": 0.9542593850510275, "grad_norm": 1.1755977869033813, "learning_rate": 1.5954185090213653e-05, "loss": 0.6618, "step": 58300 }, { "epoch": 0.9558961936017154, "grad_norm": 1.5241588354110718, "learning_rate": 1.5939978930715808e-05, "loss": 0.6747, "step": 58400 }, { "epoch": 0.9575330021524032, "grad_norm": 1.113451361656189, "learning_rate": 1.5925754223545452e-05, "loss": 0.6779, "step": 58500 }, { "epoch": 0.9591698107030912, "grad_norm": 1.2721067667007446, "learning_rate": 1.5911511013119438e-05, "loss": 0.6586, "step": 58600 }, { "epoch": 0.960806619253779, "grad_norm": 1.5037124156951904, "learning_rate": 1.589724934391241e-05, "loss": 0.6646, "step": 58700 }, { "epoch": 0.9624434278044669, "grad_norm": 1.2813490629196167, "learning_rate": 1.588296926045664e-05, "loss": 0.6644, "step": 58800 }, { "epoch": 0.9640802363551547, "grad_norm": 1.2610142230987549, "learning_rate": 1.58686708073419e-05, "loss": 0.6717, "step": 58900 }, { "epoch": 0.9657170449058425, "grad_norm": 1.2408130168914795, "learning_rate": 1.585435402921532e-05, "loss": 0.6695, "step": 59000 }, { "epoch": 0.9673538534565305, "grad_norm": 1.4657983779907227, "learning_rate": 1.584001897078126e-05, "loss": 0.6777, "step": 59100 }, { "epoch": 0.9689906620072183, "grad_norm": 1.370548129081726, "learning_rate": 1.5825665676801145e-05, "loss": 0.6881, "step": 59200 }, { "epoch": 0.9706274705579062, "grad_norm": 1.3695186376571655, "learning_rate": 1.5811294192093353e-05, "loss": 0.6594, "step": 59300 }, { "epoch": 0.972264279108594, "grad_norm": 1.2767751216888428, "learning_rate": 1.5796904561533054e-05, "loss": 0.6661, "step": 59400 }, { "epoch": 0.973901087659282, "grad_norm": 1.293419361114502, "learning_rate": 1.578249683005209e-05, "loss": 0.6781, "step": 59500 }, { "epoch": 0.9755378962099698, "grad_norm": 1.5075045824050903, "learning_rate": 1.576807104263881e-05, "loss": 0.6706, "step": 59600 }, { "epoch": 0.9771747047606577, "grad_norm": 1.1597870588302612, "learning_rate": 1.5753627244337958e-05, "loss": 0.6709, "step": 59700 }, { "epoch": 0.9788115133113455, "grad_norm": 1.5488371849060059, "learning_rate": 1.5739165480250504e-05, "loss": 0.6611, "step": 59800 }, { "epoch": 0.9804483218620335, "grad_norm": 1.3339688777923584, "learning_rate": 1.5724685795533518e-05, "loss": 0.679, "step": 59900 }, { "epoch": 0.9820851304127213, "grad_norm": 1.3151462078094482, "learning_rate": 1.571018823540004e-05, "loss": 0.6636, "step": 60000 }, { "epoch": 0.9837219389634091, "grad_norm": 1.3205444812774658, "learning_rate": 1.5695672845118903e-05, "loss": 0.6623, "step": 60100 }, { "epoch": 0.985358747514097, "grad_norm": 1.294420599937439, "learning_rate": 1.5681139670014643e-05, "loss": 0.6666, "step": 60200 }, { "epoch": 0.9869955560647848, "grad_norm": 1.3142366409301758, "learning_rate": 1.566658875546731e-05, "loss": 0.6629, "step": 60300 }, { "epoch": 0.9886323646154728, "grad_norm": 1.3516416549682617, "learning_rate": 1.565202014691235e-05, "loss": 0.6664, "step": 60400 }, { "epoch": 0.9902691731661606, "grad_norm": 1.2360502481460571, "learning_rate": 1.5637433889840455e-05, "loss": 0.6608, "step": 60500 }, { "epoch": 0.9919059817168485, "grad_norm": 1.155104398727417, "learning_rate": 1.562283002979744e-05, "loss": 0.6676, "step": 60600 }, { "epoch": 0.9935427902675363, "grad_norm": 1.2880823612213135, "learning_rate": 1.560820861238407e-05, "loss": 0.6632, "step": 60700 }, { "epoch": 0.9951795988182243, "grad_norm": 1.2748744487762451, "learning_rate": 1.5593569683255936e-05, "loss": 0.6723, "step": 60800 }, { "epoch": 0.9968164073689121, "grad_norm": 1.2065379619598389, "learning_rate": 1.557891328812332e-05, "loss": 0.6831, "step": 60900 }, { "epoch": 0.9984532159196, "grad_norm": 1.143071174621582, "learning_rate": 1.5564239472751022e-05, "loss": 0.6656, "step": 61000 }, { "epoch": 1.0000818404275345, "grad_norm": 1.1476441621780396, "learning_rate": 1.5549548282958253e-05, "loss": 0.6591, "step": 61100 }, { "epoch": 1.0017186489782222, "grad_norm": 1.210295557975769, "learning_rate": 1.5534839764618477e-05, "loss": 0.6559, "step": 61200 }, { "epoch": 1.00335545752891, "grad_norm": 1.5003302097320557, "learning_rate": 1.5520113963659257e-05, "loss": 0.6615, "step": 61300 }, { "epoch": 1.004992266079598, "grad_norm": 1.235449194908142, "learning_rate": 1.550537092606212e-05, "loss": 0.6709, "step": 61400 }, { "epoch": 1.006629074630286, "grad_norm": 1.1739157438278198, "learning_rate": 1.549061069786243e-05, "loss": 0.668, "step": 61500 }, { "epoch": 1.0082658831809737, "grad_norm": 1.2646570205688477, "learning_rate": 1.5475833325149215e-05, "loss": 0.6553, "step": 61600 }, { "epoch": 1.0099026917316616, "grad_norm": 1.2951397895812988, "learning_rate": 1.546103885406504e-05, "loss": 0.6584, "step": 61700 }, { "epoch": 1.0115395002823495, "grad_norm": 1.2838189601898193, "learning_rate": 1.544622733080586e-05, "loss": 0.6518, "step": 61800 }, { "epoch": 1.0131763088330374, "grad_norm": 1.3708552122116089, "learning_rate": 1.543139880162088e-05, "loss": 0.6628, "step": 61900 }, { "epoch": 1.0148131173837251, "grad_norm": 1.301353931427002, "learning_rate": 1.54165533128124e-05, "loss": 0.6478, "step": 62000 }, { "epoch": 1.016449925934413, "grad_norm": 1.3044975996017456, "learning_rate": 1.5401690910735677e-05, "loss": 0.6439, "step": 62100 }, { "epoch": 1.018086734485101, "grad_norm": 1.4568370580673218, "learning_rate": 1.5386811641798785e-05, "loss": 0.6482, "step": 62200 }, { "epoch": 1.0197235430357887, "grad_norm": 1.3758224248886108, "learning_rate": 1.5371915552462466e-05, "loss": 0.663, "step": 62300 }, { "epoch": 1.0213603515864766, "grad_norm": 1.6428395509719849, "learning_rate": 1.535700268923998e-05, "loss": 0.6533, "step": 62400 }, { "epoch": 1.0229971601371646, "grad_norm": 1.3830885887145996, "learning_rate": 1.5342073098696956e-05, "loss": 0.6632, "step": 62500 }, { "epoch": 1.0246339686878525, "grad_norm": 1.426006555557251, "learning_rate": 1.5327126827451272e-05, "loss": 0.6491, "step": 62600 }, { "epoch": 1.0262707772385402, "grad_norm": 1.4166696071624756, "learning_rate": 1.531216392217288e-05, "loss": 0.6465, "step": 62700 }, { "epoch": 1.0279075857892281, "grad_norm": 1.224443793296814, "learning_rate": 1.529718442958367e-05, "loss": 0.6642, "step": 62800 }, { "epoch": 1.029544394339916, "grad_norm": 1.250406265258789, "learning_rate": 1.528218839645733e-05, "loss": 0.6516, "step": 62900 }, { "epoch": 1.031181202890604, "grad_norm": 1.2630037069320679, "learning_rate": 1.52671758696192e-05, "loss": 0.6649, "step": 63000 } ], "logging_steps": 100, "max_steps": 183285, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.34907099427588e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null }