diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4444 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.031181202890604, + "eval_steps": 500, + "global_step": 63000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016368085506878688, + "grad_norm": 0.5328027606010437, + "learning_rate": 3.600654664484452e-07, + "loss": 1.6968, + "step": 100 + }, + { + "epoch": 0.0032736171013757376, + "grad_norm": 0.5594077706336975, + "learning_rate": 7.237679578105111e-07, + "loss": 1.6883, + "step": 200 + }, + { + "epoch": 0.004910425652063607, + "grad_norm": 0.6636043787002563, + "learning_rate": 1.087470449172577e-06, + "loss": 1.6196, + "step": 300 + }, + { + "epoch": 0.006547234202751475, + "grad_norm": 0.6200364828109741, + "learning_rate": 1.4511729405346428e-06, + "loss": 1.511, + "step": 400 + }, + { + "epoch": 0.008184042753439345, + "grad_norm": 0.4777531623840332, + "learning_rate": 1.8148754318967086e-06, + "loss": 1.342, + "step": 500 + }, + { + "epoch": 0.009820851304127213, + "grad_norm": 0.3041970133781433, + "learning_rate": 2.1785779232587743e-06, + "loss": 1.2154, + "step": 600 + }, + { + "epoch": 0.011457659854815082, + "grad_norm": 0.21760690212249756, + "learning_rate": 2.54228041462084e-06, + "loss": 1.1427, + "step": 700 + }, + { + "epoch": 0.01309446840550295, + "grad_norm": 0.22987280786037445, + "learning_rate": 2.9059829059829063e-06, + "loss": 1.0943, + "step": 800 + }, + { + "epoch": 0.014731276956190819, + "grad_norm": 0.24943482875823975, + "learning_rate": 3.269685397344972e-06, + "loss": 1.0696, + "step": 900 + }, + { + "epoch": 0.01636808550687869, + "grad_norm": 0.2619542181491852, + "learning_rate": 3.633387888707038e-06, + "loss": 1.0318, + "step": 1000 + }, + { + "epoch": 0.018004894057566556, + "grad_norm": 0.2811136841773987, + "learning_rate": 3.997090380069103e-06, + "loss": 1.0035, + "step": 1100 + }, + { + "epoch": 0.019641702608254426, + "grad_norm": 0.3045084476470947, + "learning_rate": 4.36079287143117e-06, + "loss": 0.9726, + "step": 1200 + }, + { + "epoch": 0.021278511158942293, + "grad_norm": 0.3168332278728485, + "learning_rate": 4.7244953627932355e-06, + "loss": 0.971, + "step": 1300 + }, + { + "epoch": 0.022915319709630164, + "grad_norm": 0.33685848116874695, + "learning_rate": 5.088197854155301e-06, + "loss": 0.952, + "step": 1400 + }, + { + "epoch": 0.02455212826031803, + "grad_norm": 0.3198516368865967, + "learning_rate": 5.451900345517367e-06, + "loss": 0.9385, + "step": 1500 + }, + { + "epoch": 0.0261889368110059, + "grad_norm": 0.3457159101963043, + "learning_rate": 5.815602836879432e-06, + "loss": 0.9291, + "step": 1600 + }, + { + "epoch": 0.02782574536169377, + "grad_norm": 0.3343696594238281, + "learning_rate": 6.179305328241499e-06, + "loss": 0.9251, + "step": 1700 + }, + { + "epoch": 0.029462553912381638, + "grad_norm": 0.4662475287914276, + "learning_rate": 6.543007819603565e-06, + "loss": 0.9328, + "step": 1800 + }, + { + "epoch": 0.03109936246306951, + "grad_norm": 0.3559871017932892, + "learning_rate": 6.906710310965631e-06, + "loss": 0.9126, + "step": 1900 + }, + { + "epoch": 0.03273617101375738, + "grad_norm": 0.3852447271347046, + "learning_rate": 7.270412802327696e-06, + "loss": 0.9024, + "step": 2000 + }, + { + "epoch": 0.034372979564445245, + "grad_norm": 0.36482807993888855, + "learning_rate": 7.634115293689762e-06, + "loss": 0.9086, + "step": 2100 + }, + { + "epoch": 0.03600978811513311, + "grad_norm": 0.39493420720100403, + "learning_rate": 7.997817785051828e-06, + "loss": 0.9144, + "step": 2200 + }, + { + "epoch": 0.03764659666582098, + "grad_norm": 0.4406372010707855, + "learning_rate": 8.361520276413894e-06, + "loss": 0.9067, + "step": 2300 + }, + { + "epoch": 0.03928340521650885, + "grad_norm": 0.43684300780296326, + "learning_rate": 8.72522276777596e-06, + "loss": 0.898, + "step": 2400 + }, + { + "epoch": 0.04092021376719672, + "grad_norm": 0.4949699342250824, + "learning_rate": 9.088925259138026e-06, + "loss": 0.8893, + "step": 2500 + }, + { + "epoch": 0.04255702231788459, + "grad_norm": 0.4759005308151245, + "learning_rate": 9.452627750500092e-06, + "loss": 0.9036, + "step": 2600 + }, + { + "epoch": 0.04419383086857246, + "grad_norm": 0.4733336567878723, + "learning_rate": 9.816330241862157e-06, + "loss": 0.9046, + "step": 2700 + }, + { + "epoch": 0.04583063941926033, + "grad_norm": 0.5515408515930176, + "learning_rate": 1.0180032733224223e-05, + "loss": 0.8899, + "step": 2800 + }, + { + "epoch": 0.047467447969948194, + "grad_norm": 0.5026727318763733, + "learning_rate": 1.054373522458629e-05, + "loss": 0.8868, + "step": 2900 + }, + { + "epoch": 0.04910425652063606, + "grad_norm": 0.5517929196357727, + "learning_rate": 1.0907437715948354e-05, + "loss": 0.8905, + "step": 3000 + }, + { + "epoch": 0.050741065071323935, + "grad_norm": 0.5139409899711609, + "learning_rate": 1.127114020731042e-05, + "loss": 0.8711, + "step": 3100 + }, + { + "epoch": 0.0523778736220118, + "grad_norm": 0.5762068033218384, + "learning_rate": 1.1634842698672486e-05, + "loss": 0.9, + "step": 3200 + }, + { + "epoch": 0.05401468217269967, + "grad_norm": 0.5540242791175842, + "learning_rate": 1.1998545190034552e-05, + "loss": 0.8854, + "step": 3300 + }, + { + "epoch": 0.05565149072338754, + "grad_norm": 0.6651942133903503, + "learning_rate": 1.236224768139662e-05, + "loss": 0.875, + "step": 3400 + }, + { + "epoch": 0.05728829927407541, + "grad_norm": 0.6157256364822388, + "learning_rate": 1.2725950172758685e-05, + "loss": 0.87, + "step": 3500 + }, + { + "epoch": 0.058925107824763276, + "grad_norm": 0.6638494729995728, + "learning_rate": 1.3089652664120751e-05, + "loss": 0.8666, + "step": 3600 + }, + { + "epoch": 0.06056191637545114, + "grad_norm": 0.6535647511482239, + "learning_rate": 1.3453355155482817e-05, + "loss": 0.8675, + "step": 3700 + }, + { + "epoch": 0.06219872492613902, + "grad_norm": 0.7346630692481995, + "learning_rate": 1.3817057646844883e-05, + "loss": 0.8724, + "step": 3800 + }, + { + "epoch": 0.06383553347682688, + "grad_norm": 0.7002882957458496, + "learning_rate": 1.4180760138206948e-05, + "loss": 0.8476, + "step": 3900 + }, + { + "epoch": 0.06547234202751476, + "grad_norm": 0.6632655262947083, + "learning_rate": 1.4544462629569014e-05, + "loss": 0.8641, + "step": 4000 + }, + { + "epoch": 0.06710915057820262, + "grad_norm": 0.7253566384315491, + "learning_rate": 1.490816512093108e-05, + "loss": 0.8611, + "step": 4100 + }, + { + "epoch": 0.06874595912889049, + "grad_norm": 0.7651970386505127, + "learning_rate": 1.5271867612293146e-05, + "loss": 0.8597, + "step": 4200 + }, + { + "epoch": 0.07038276767957836, + "grad_norm": 0.6781213879585266, + "learning_rate": 1.563557010365521e-05, + "loss": 0.844, + "step": 4300 + }, + { + "epoch": 0.07201957623026622, + "grad_norm": 0.7465602159500122, + "learning_rate": 1.5999272595017275e-05, + "loss": 0.8558, + "step": 4400 + }, + { + "epoch": 0.0736563847809541, + "grad_norm": 0.7796695828437805, + "learning_rate": 1.6362975086379343e-05, + "loss": 0.8533, + "step": 4500 + }, + { + "epoch": 0.07529319333164196, + "grad_norm": 0.7622010111808777, + "learning_rate": 1.6726677577741408e-05, + "loss": 0.8414, + "step": 4600 + }, + { + "epoch": 0.07693000188232983, + "grad_norm": 0.7499621510505676, + "learning_rate": 1.7090380069103472e-05, + "loss": 0.8459, + "step": 4700 + }, + { + "epoch": 0.0785668104330177, + "grad_norm": 0.7822730541229248, + "learning_rate": 1.745408256046554e-05, + "loss": 0.8468, + "step": 4800 + }, + { + "epoch": 0.08020361898370557, + "grad_norm": 0.7850978970527649, + "learning_rate": 1.7817785051827608e-05, + "loss": 0.8603, + "step": 4900 + }, + { + "epoch": 0.08184042753439344, + "grad_norm": 0.8370286822319031, + "learning_rate": 1.8181487543189672e-05, + "loss": 0.837, + "step": 5000 + }, + { + "epoch": 0.08347723608508131, + "grad_norm": 0.821024477481842, + "learning_rate": 1.854519003455174e-05, + "loss": 0.8464, + "step": 5100 + }, + { + "epoch": 0.08511404463576917, + "grad_norm": 0.8516008257865906, + "learning_rate": 1.8908892525913805e-05, + "loss": 0.837, + "step": 5200 + }, + { + "epoch": 0.08675085318645705, + "grad_norm": 0.7816336750984192, + "learning_rate": 1.927259501727587e-05, + "loss": 0.8471, + "step": 5300 + }, + { + "epoch": 0.08838766173714492, + "grad_norm": 0.8347124457359314, + "learning_rate": 1.9636297508637937e-05, + "loss": 0.8333, + "step": 5400 + }, + { + "epoch": 0.09002447028783278, + "grad_norm": 0.8995541334152222, + "learning_rate": 2e-05, + "loss": 0.8341, + "step": 5500 + }, + { + "epoch": 0.09166127883852065, + "grad_norm": 0.9787241816520691, + "learning_rate": 1.9999984387425675e-05, + "loss": 0.8431, + "step": 5600 + }, + { + "epoch": 0.09329808738920851, + "grad_norm": 0.8093689680099487, + "learning_rate": 1.999993754975144e-05, + "loss": 0.8325, + "step": 5700 + }, + { + "epoch": 0.09493489593989639, + "grad_norm": 0.9042837023735046, + "learning_rate": 1.999985948712355e-05, + "loss": 0.828, + "step": 5800 + }, + { + "epoch": 0.09657170449058426, + "grad_norm": 0.9188331961631775, + "learning_rate": 1.999975019978576e-05, + "loss": 0.8291, + "step": 5900 + }, + { + "epoch": 0.09820851304127212, + "grad_norm": 0.8699648380279541, + "learning_rate": 1.9999609688079316e-05, + "loss": 0.8277, + "step": 6000 + }, + { + "epoch": 0.09984532159196, + "grad_norm": 0.9138243794441223, + "learning_rate": 1.999943795244297e-05, + "loss": 0.8367, + "step": 6100 + }, + { + "epoch": 0.10148213014264787, + "grad_norm": 0.9293233156204224, + "learning_rate": 1.9999234993412973e-05, + "loss": 0.8281, + "step": 6200 + }, + { + "epoch": 0.10311893869333573, + "grad_norm": 0.9346773624420166, + "learning_rate": 1.999900081162306e-05, + "loss": 0.8323, + "step": 6300 + }, + { + "epoch": 0.1047557472440236, + "grad_norm": 0.9332927465438843, + "learning_rate": 1.999873540780447e-05, + "loss": 0.8259, + "step": 6400 + }, + { + "epoch": 0.10639255579471148, + "grad_norm": 0.8887437582015991, + "learning_rate": 1.9998438782785937e-05, + "loss": 0.8305, + "step": 6500 + }, + { + "epoch": 0.10802936434539934, + "grad_norm": 0.9184074401855469, + "learning_rate": 1.999811093749367e-05, + "loss": 0.829, + "step": 6600 + }, + { + "epoch": 0.10966617289608721, + "grad_norm": 0.8532683849334717, + "learning_rate": 1.999775187295137e-05, + "loss": 0.8275, + "step": 6700 + }, + { + "epoch": 0.11130298144677508, + "grad_norm": 0.9298515915870667, + "learning_rate": 1.9997361590280225e-05, + "loss": 0.8192, + "step": 6800 + }, + { + "epoch": 0.11293978999746294, + "grad_norm": 0.9617123603820801, + "learning_rate": 1.9996940090698896e-05, + "loss": 0.8198, + "step": 6900 + }, + { + "epoch": 0.11457659854815082, + "grad_norm": 1.0112113952636719, + "learning_rate": 1.9996487375523524e-05, + "loss": 0.8239, + "step": 7000 + }, + { + "epoch": 0.11621340709883868, + "grad_norm": 0.9226319193840027, + "learning_rate": 1.9996003446167718e-05, + "loss": 0.8281, + "step": 7100 + }, + { + "epoch": 0.11785021564952655, + "grad_norm": 1.0199968814849854, + "learning_rate": 1.999548830414255e-05, + "loss": 0.82, + "step": 7200 + }, + { + "epoch": 0.11948702420021443, + "grad_norm": 0.9594390988349915, + "learning_rate": 1.999494195105657e-05, + "loss": 0.8139, + "step": 7300 + }, + { + "epoch": 0.12112383275090229, + "grad_norm": 0.9685386419296265, + "learning_rate": 1.9994364388615763e-05, + "loss": 0.8193, + "step": 7400 + }, + { + "epoch": 0.12276064130159016, + "grad_norm": 0.9797342419624329, + "learning_rate": 1.999375561862358e-05, + "loss": 0.815, + "step": 7500 + }, + { + "epoch": 0.12439744985227803, + "grad_norm": 1.0541061162948608, + "learning_rate": 1.9993115642980912e-05, + "loss": 0.8239, + "step": 7600 + }, + { + "epoch": 0.1260342584029659, + "grad_norm": 0.9543519616127014, + "learning_rate": 1.99924444636861e-05, + "loss": 0.8145, + "step": 7700 + }, + { + "epoch": 0.12767106695365377, + "grad_norm": 0.9379186630249023, + "learning_rate": 1.99917420828349e-05, + "loss": 0.817, + "step": 7800 + }, + { + "epoch": 0.12930787550434164, + "grad_norm": 0.9919012188911438, + "learning_rate": 1.9991008502620515e-05, + "loss": 0.8208, + "step": 7900 + }, + { + "epoch": 0.13094468405502951, + "grad_norm": 0.9344952702522278, + "learning_rate": 1.999024372533356e-05, + "loss": 0.8167, + "step": 8000 + }, + { + "epoch": 0.13258149260571736, + "grad_norm": 0.9583950638771057, + "learning_rate": 1.9989447753362058e-05, + "loss": 0.8125, + "step": 8100 + }, + { + "epoch": 0.13421830115640523, + "grad_norm": 0.9945580363273621, + "learning_rate": 1.998862058919145e-05, + "loss": 0.8225, + "step": 8200 + }, + { + "epoch": 0.1358551097070931, + "grad_norm": 0.9583763480186462, + "learning_rate": 1.9987762235404566e-05, + "loss": 0.8105, + "step": 8300 + }, + { + "epoch": 0.13749191825778098, + "grad_norm": 1.025468349456787, + "learning_rate": 1.998687269468162e-05, + "loss": 0.8107, + "step": 8400 + }, + { + "epoch": 0.13912872680846886, + "grad_norm": 1.0057779550552368, + "learning_rate": 1.998595196980023e-05, + "loss": 0.8138, + "step": 8500 + }, + { + "epoch": 0.14076553535915673, + "grad_norm": 0.9300206899642944, + "learning_rate": 1.9985000063635365e-05, + "loss": 0.8207, + "step": 8600 + }, + { + "epoch": 0.14240234390984458, + "grad_norm": 1.0241742134094238, + "learning_rate": 1.9984016979159368e-05, + "loss": 0.8046, + "step": 8700 + }, + { + "epoch": 0.14403915246053245, + "grad_norm": 0.9688097238540649, + "learning_rate": 1.9983002719441935e-05, + "loss": 0.8193, + "step": 8800 + }, + { + "epoch": 0.14567596101122032, + "grad_norm": 0.9877735376358032, + "learning_rate": 1.9981957287650107e-05, + "loss": 0.8003, + "step": 8900 + }, + { + "epoch": 0.1473127695619082, + "grad_norm": 0.9533541202545166, + "learning_rate": 1.9980880687048257e-05, + "loss": 0.8089, + "step": 9000 + }, + { + "epoch": 0.14894957811259607, + "grad_norm": 1.0934607982635498, + "learning_rate": 1.997977292099809e-05, + "loss": 0.7971, + "step": 9100 + }, + { + "epoch": 0.15058638666328392, + "grad_norm": 0.9715205430984497, + "learning_rate": 1.9978633992958624e-05, + "loss": 0.8194, + "step": 9200 + }, + { + "epoch": 0.1522231952139718, + "grad_norm": 0.9527362585067749, + "learning_rate": 1.9977463906486175e-05, + "loss": 0.8095, + "step": 9300 + }, + { + "epoch": 0.15386000376465966, + "grad_norm": 1.0439358949661255, + "learning_rate": 1.9976262665234357e-05, + "loss": 0.7997, + "step": 9400 + }, + { + "epoch": 0.15549681231534754, + "grad_norm": 1.1087926626205444, + "learning_rate": 1.9975030272954066e-05, + "loss": 0.8012, + "step": 9500 + }, + { + "epoch": 0.1571336208660354, + "grad_norm": 1.0532102584838867, + "learning_rate": 1.9973766733493458e-05, + "loss": 0.8006, + "step": 9600 + }, + { + "epoch": 0.15877042941672329, + "grad_norm": 0.9958882331848145, + "learning_rate": 1.997247205079796e-05, + "loss": 0.8138, + "step": 9700 + }, + { + "epoch": 0.16040723796741113, + "grad_norm": 1.0133436918258667, + "learning_rate": 1.9971146228910236e-05, + "loss": 0.7942, + "step": 9800 + }, + { + "epoch": 0.162044046518099, + "grad_norm": 0.9266718029975891, + "learning_rate": 1.9969789271970187e-05, + "loss": 0.7917, + "step": 9900 + }, + { + "epoch": 0.16368085506878688, + "grad_norm": 1.0468189716339111, + "learning_rate": 1.9968401184214924e-05, + "loss": 0.8012, + "step": 10000 + }, + { + "epoch": 0.16531766361947475, + "grad_norm": 1.0444200038909912, + "learning_rate": 1.9966981969978782e-05, + "loss": 0.7979, + "step": 10100 + }, + { + "epoch": 0.16695447217016263, + "grad_norm": 1.0317082405090332, + "learning_rate": 1.9965531633693268e-05, + "loss": 0.8209, + "step": 10200 + }, + { + "epoch": 0.16859128072085047, + "grad_norm": 1.0699563026428223, + "learning_rate": 1.9964050179887088e-05, + "loss": 0.8035, + "step": 10300 + }, + { + "epoch": 0.17022808927153835, + "grad_norm": 0.9806187748908997, + "learning_rate": 1.9962537613186096e-05, + "loss": 0.7957, + "step": 10400 + }, + { + "epoch": 0.17186489782222622, + "grad_norm": 1.0728228092193604, + "learning_rate": 1.996099393831331e-05, + "loss": 0.791, + "step": 10500 + }, + { + "epoch": 0.1735017063729141, + "grad_norm": 1.028189778327942, + "learning_rate": 1.9959419160088874e-05, + "loss": 0.7964, + "step": 10600 + }, + { + "epoch": 0.17513851492360197, + "grad_norm": 1.0126999616622925, + "learning_rate": 1.9957813283430054e-05, + "loss": 0.799, + "step": 10700 + }, + { + "epoch": 0.17677532347428984, + "grad_norm": 0.96955406665802, + "learning_rate": 1.995617631335123e-05, + "loss": 0.8118, + "step": 10800 + }, + { + "epoch": 0.1784121320249777, + "grad_norm": 1.0654776096343994, + "learning_rate": 1.9954508254963865e-05, + "loss": 0.8084, + "step": 10900 + }, + { + "epoch": 0.18004894057566556, + "grad_norm": 0.9537600874900818, + "learning_rate": 1.9952809113476493e-05, + "loss": 0.8011, + "step": 11000 + }, + { + "epoch": 0.18168574912635344, + "grad_norm": 0.9695281982421875, + "learning_rate": 1.9951078894194708e-05, + "loss": 0.8054, + "step": 11100 + }, + { + "epoch": 0.1833225576770413, + "grad_norm": 1.0722426176071167, + "learning_rate": 1.9949317602521144e-05, + "loss": 0.7917, + "step": 11200 + }, + { + "epoch": 0.18495936622772918, + "grad_norm": 0.9706518054008484, + "learning_rate": 1.9947525243955467e-05, + "loss": 0.8055, + "step": 11300 + }, + { + "epoch": 0.18659617477841703, + "grad_norm": 0.9769388437271118, + "learning_rate": 1.994570182409434e-05, + "loss": 0.7981, + "step": 11400 + }, + { + "epoch": 0.1882329833291049, + "grad_norm": 0.9185972809791565, + "learning_rate": 1.9943847348631415e-05, + "loss": 0.7907, + "step": 11500 + }, + { + "epoch": 0.18986979187979278, + "grad_norm": 1.0683258771896362, + "learning_rate": 1.9941961823357322e-05, + "loss": 0.8021, + "step": 11600 + }, + { + "epoch": 0.19150660043048065, + "grad_norm": 0.9599470496177673, + "learning_rate": 1.9940045254159644e-05, + "loss": 0.7923, + "step": 11700 + }, + { + "epoch": 0.19314340898116852, + "grad_norm": 0.9822320938110352, + "learning_rate": 1.9938097647022895e-05, + "loss": 0.7864, + "step": 11800 + }, + { + "epoch": 0.1947802175318564, + "grad_norm": 1.180939793586731, + "learning_rate": 1.9936119008028503e-05, + "loss": 0.7841, + "step": 11900 + }, + { + "epoch": 0.19641702608254424, + "grad_norm": 1.1611251831054688, + "learning_rate": 1.9934109343354808e-05, + "loss": 0.7855, + "step": 12000 + }, + { + "epoch": 0.19805383463323212, + "grad_norm": 1.0176281929016113, + "learning_rate": 1.9932068659277006e-05, + "loss": 0.7936, + "step": 12100 + }, + { + "epoch": 0.19969064318392, + "grad_norm": 1.05084228515625, + "learning_rate": 1.992999696216717e-05, + "loss": 0.7856, + "step": 12200 + }, + { + "epoch": 0.20132745173460787, + "grad_norm": 1.1582859754562378, + "learning_rate": 1.9927894258494204e-05, + "loss": 0.8064, + "step": 12300 + }, + { + "epoch": 0.20296426028529574, + "grad_norm": 0.9974379539489746, + "learning_rate": 1.992576055482383e-05, + "loss": 0.7923, + "step": 12400 + }, + { + "epoch": 0.2046010688359836, + "grad_norm": 1.0076924562454224, + "learning_rate": 1.9923595857818573e-05, + "loss": 0.801, + "step": 12500 + }, + { + "epoch": 0.20623787738667146, + "grad_norm": 1.104923129081726, + "learning_rate": 1.9921400174237732e-05, + "loss": 0.8053, + "step": 12600 + }, + { + "epoch": 0.20787468593735933, + "grad_norm": 1.0884004831314087, + "learning_rate": 1.9919173510937355e-05, + "loss": 0.7948, + "step": 12700 + }, + { + "epoch": 0.2095114944880472, + "grad_norm": 0.9803980588912964, + "learning_rate": 1.9916915874870234e-05, + "loss": 0.791, + "step": 12800 + }, + { + "epoch": 0.21114830303873508, + "grad_norm": 1.0630168914794922, + "learning_rate": 1.9914627273085876e-05, + "loss": 0.7813, + "step": 12900 + }, + { + "epoch": 0.21278511158942295, + "grad_norm": 1.0575711727142334, + "learning_rate": 1.9912307712730468e-05, + "loss": 0.7862, + "step": 13000 + }, + { + "epoch": 0.2144219201401108, + "grad_norm": 1.0258235931396484, + "learning_rate": 1.9909957201046875e-05, + "loss": 0.7855, + "step": 13100 + }, + { + "epoch": 0.21605872869079867, + "grad_norm": 0.970610499382019, + "learning_rate": 1.9907575745374605e-05, + "loss": 0.7845, + "step": 13200 + }, + { + "epoch": 0.21769553724148655, + "grad_norm": 1.0707366466522217, + "learning_rate": 1.9905163353149787e-05, + "loss": 0.7986, + "step": 13300 + }, + { + "epoch": 0.21933234579217442, + "grad_norm": 0.9396125674247742, + "learning_rate": 1.9902720031905153e-05, + "loss": 0.7798, + "step": 13400 + }, + { + "epoch": 0.2209691543428623, + "grad_norm": 1.0123385190963745, + "learning_rate": 1.9900245789270006e-05, + "loss": 0.7866, + "step": 13500 + }, + { + "epoch": 0.22260596289355017, + "grad_norm": 0.9208526015281677, + "learning_rate": 1.989774063297021e-05, + "loss": 0.79, + "step": 13600 + }, + { + "epoch": 0.22424277144423801, + "grad_norm": 1.0145132541656494, + "learning_rate": 1.989520457082815e-05, + "loss": 0.7826, + "step": 13700 + }, + { + "epoch": 0.2258795799949259, + "grad_norm": 0.9474859237670898, + "learning_rate": 1.9892637610762723e-05, + "loss": 0.7904, + "step": 13800 + }, + { + "epoch": 0.22751638854561376, + "grad_norm": 0.997414767742157, + "learning_rate": 1.9890039760789294e-05, + "loss": 0.7863, + "step": 13900 + }, + { + "epoch": 0.22915319709630164, + "grad_norm": 1.0312907695770264, + "learning_rate": 1.9887411029019686e-05, + "loss": 0.7825, + "step": 14000 + }, + { + "epoch": 0.2307900056469895, + "grad_norm": 1.019665002822876, + "learning_rate": 1.9884751423662162e-05, + "loss": 0.7746, + "step": 14100 + }, + { + "epoch": 0.23242681419767736, + "grad_norm": 0.9788889288902283, + "learning_rate": 1.9882060953021375e-05, + "loss": 0.7805, + "step": 14200 + }, + { + "epoch": 0.23406362274836523, + "grad_norm": 1.1468379497528076, + "learning_rate": 1.9879339625498356e-05, + "loss": 0.7783, + "step": 14300 + }, + { + "epoch": 0.2357004312990531, + "grad_norm": 0.9630206823348999, + "learning_rate": 1.9876587449590496e-05, + "loss": 0.7785, + "step": 14400 + }, + { + "epoch": 0.23733723984974098, + "grad_norm": 1.0484507083892822, + "learning_rate": 1.98738044338915e-05, + "loss": 0.7577, + "step": 14500 + }, + { + "epoch": 0.23897404840042885, + "grad_norm": 0.9262145161628723, + "learning_rate": 1.987099058709138e-05, + "loss": 0.7847, + "step": 14600 + }, + { + "epoch": 0.24061085695111672, + "grad_norm": 1.0156426429748535, + "learning_rate": 1.9868145917976412e-05, + "loss": 0.7754, + "step": 14700 + }, + { + "epoch": 0.24224766550180457, + "grad_norm": 1.0557153224945068, + "learning_rate": 1.986527043542912e-05, + "loss": 0.783, + "step": 14800 + }, + { + "epoch": 0.24388447405249244, + "grad_norm": 0.9480391144752502, + "learning_rate": 1.9862364148428243e-05, + "loss": 0.7795, + "step": 14900 + }, + { + "epoch": 0.24552128260318032, + "grad_norm": 1.1189950704574585, + "learning_rate": 1.9859427066048694e-05, + "loss": 0.773, + "step": 15000 + }, + { + "epoch": 0.2471580911538682, + "grad_norm": 1.0406650304794312, + "learning_rate": 1.985645919746157e-05, + "loss": 0.7815, + "step": 15100 + }, + { + "epoch": 0.24879489970455607, + "grad_norm": 1.0539467334747314, + "learning_rate": 1.985346055193408e-05, + "loss": 0.7832, + "step": 15200 + }, + { + "epoch": 0.2504317082552439, + "grad_norm": 1.0707350969314575, + "learning_rate": 1.9850431138829537e-05, + "loss": 0.7775, + "step": 15300 + }, + { + "epoch": 0.2520685168059318, + "grad_norm": 1.0518571138381958, + "learning_rate": 1.9847370967607332e-05, + "loss": 0.7692, + "step": 15400 + }, + { + "epoch": 0.25370532535661966, + "grad_norm": 1.038328766822815, + "learning_rate": 1.9844280047822892e-05, + "loss": 0.7812, + "step": 15500 + }, + { + "epoch": 0.25534213390730753, + "grad_norm": 1.0571229457855225, + "learning_rate": 1.984115838912766e-05, + "loss": 0.7773, + "step": 15600 + }, + { + "epoch": 0.2569789424579954, + "grad_norm": 1.0450866222381592, + "learning_rate": 1.9838006001269064e-05, + "loss": 0.7789, + "step": 15700 + }, + { + "epoch": 0.2586157510086833, + "grad_norm": 1.107710838317871, + "learning_rate": 1.9834822894090478e-05, + "loss": 0.7628, + "step": 15800 + }, + { + "epoch": 0.26025255955937115, + "grad_norm": 1.0595227479934692, + "learning_rate": 1.9831609077531205e-05, + "loss": 0.7805, + "step": 15900 + }, + { + "epoch": 0.26188936811005903, + "grad_norm": 1.0978327989578247, + "learning_rate": 1.982836456162644e-05, + "loss": 0.7779, + "step": 16000 + }, + { + "epoch": 0.2635261766607469, + "grad_norm": 1.0871798992156982, + "learning_rate": 1.982508935650722e-05, + "loss": 0.7696, + "step": 16100 + }, + { + "epoch": 0.2651629852114347, + "grad_norm": 1.0791369676589966, + "learning_rate": 1.982178347240043e-05, + "loss": 0.7701, + "step": 16200 + }, + { + "epoch": 0.2667997937621226, + "grad_norm": 1.095301866531372, + "learning_rate": 1.981844691962874e-05, + "loss": 0.783, + "step": 16300 + }, + { + "epoch": 0.26843660231281047, + "grad_norm": 1.1223257780075073, + "learning_rate": 1.9815079708610588e-05, + "loss": 0.7785, + "step": 16400 + }, + { + "epoch": 0.27007341086349834, + "grad_norm": 1.0025781393051147, + "learning_rate": 1.9811681849860137e-05, + "loss": 0.7787, + "step": 16500 + }, + { + "epoch": 0.2717102194141862, + "grad_norm": 1.1232304573059082, + "learning_rate": 1.9808253353987252e-05, + "loss": 0.7655, + "step": 16600 + }, + { + "epoch": 0.2733470279648741, + "grad_norm": 0.9625865817070007, + "learning_rate": 1.9804794231697464e-05, + "loss": 0.785, + "step": 16700 + }, + { + "epoch": 0.27498383651556196, + "grad_norm": 1.1022255420684814, + "learning_rate": 1.980130449379193e-05, + "loss": 0.7681, + "step": 16800 + }, + { + "epoch": 0.27662064506624984, + "grad_norm": 1.0605260133743286, + "learning_rate": 1.9797784151167417e-05, + "loss": 0.7686, + "step": 16900 + }, + { + "epoch": 0.2782574536169377, + "grad_norm": 1.0693503618240356, + "learning_rate": 1.9794233214816237e-05, + "loss": 0.7653, + "step": 17000 + }, + { + "epoch": 0.2798942621676256, + "grad_norm": 1.0027199983596802, + "learning_rate": 1.979065169582625e-05, + "loss": 0.7802, + "step": 17100 + }, + { + "epoch": 0.28153107071831346, + "grad_norm": 1.002388834953308, + "learning_rate": 1.9787039605380792e-05, + "loss": 0.7668, + "step": 17200 + }, + { + "epoch": 0.2831678792690013, + "grad_norm": 1.0847641229629517, + "learning_rate": 1.9783396954758682e-05, + "loss": 0.7685, + "step": 17300 + }, + { + "epoch": 0.28480468781968915, + "grad_norm": 1.1153062582015991, + "learning_rate": 1.9779723755334142e-05, + "loss": 0.7761, + "step": 17400 + }, + { + "epoch": 0.286441496370377, + "grad_norm": 1.0675033330917358, + "learning_rate": 1.9776020018576794e-05, + "loss": 0.7637, + "step": 17500 + }, + { + "epoch": 0.2880783049210649, + "grad_norm": 1.0875293016433716, + "learning_rate": 1.9772285756051613e-05, + "loss": 0.7689, + "step": 17600 + }, + { + "epoch": 0.28971511347175277, + "grad_norm": 1.135380744934082, + "learning_rate": 1.9768520979418885e-05, + "loss": 0.7763, + "step": 17700 + }, + { + "epoch": 0.29135192202244065, + "grad_norm": 1.0305795669555664, + "learning_rate": 1.9764725700434183e-05, + "loss": 0.7688, + "step": 17800 + }, + { + "epoch": 0.2929887305731285, + "grad_norm": 1.0471090078353882, + "learning_rate": 1.976089993094832e-05, + "loss": 0.7573, + "step": 17900 + }, + { + "epoch": 0.2946255391238164, + "grad_norm": 1.0096269845962524, + "learning_rate": 1.9757043682907325e-05, + "loss": 0.7622, + "step": 18000 + }, + { + "epoch": 0.29626234767450427, + "grad_norm": 1.103242039680481, + "learning_rate": 1.9753156968352388e-05, + "loss": 0.7573, + "step": 18100 + }, + { + "epoch": 0.29789915622519214, + "grad_norm": 1.1128453016281128, + "learning_rate": 1.9749239799419827e-05, + "loss": 0.7692, + "step": 18200 + }, + { + "epoch": 0.29953596477588, + "grad_norm": 1.0762085914611816, + "learning_rate": 1.974529218834106e-05, + "loss": 0.7838, + "step": 18300 + }, + { + "epoch": 0.30117277332656783, + "grad_norm": 1.0150110721588135, + "learning_rate": 1.9741314147442573e-05, + "loss": 0.773, + "step": 18400 + }, + { + "epoch": 0.3028095818772557, + "grad_norm": 1.0824315547943115, + "learning_rate": 1.9737305689145842e-05, + "loss": 0.7636, + "step": 18500 + }, + { + "epoch": 0.3044463904279436, + "grad_norm": 1.2597285509109497, + "learning_rate": 1.973326682596735e-05, + "loss": 0.7688, + "step": 18600 + }, + { + "epoch": 0.30608319897863145, + "grad_norm": 1.112971544265747, + "learning_rate": 1.97291975705185e-05, + "loss": 0.762, + "step": 18700 + }, + { + "epoch": 0.30772000752931933, + "grad_norm": 1.11709725856781, + "learning_rate": 1.9725097935505607e-05, + "loss": 0.7674, + "step": 18800 + }, + { + "epoch": 0.3093568160800072, + "grad_norm": 1.0609350204467773, + "learning_rate": 1.972096793372984e-05, + "loss": 0.7603, + "step": 18900 + }, + { + "epoch": 0.3109936246306951, + "grad_norm": 1.111243486404419, + "learning_rate": 1.9716807578087193e-05, + "loss": 0.7572, + "step": 19000 + }, + { + "epoch": 0.31263043318138295, + "grad_norm": 0.9914565086364746, + "learning_rate": 1.971261688156843e-05, + "loss": 0.7558, + "step": 19100 + }, + { + "epoch": 0.3142672417320708, + "grad_norm": 1.030030369758606, + "learning_rate": 1.9708395857259077e-05, + "loss": 0.7558, + "step": 19200 + }, + { + "epoch": 0.3159040502827587, + "grad_norm": 1.1039714813232422, + "learning_rate": 1.9704144518339336e-05, + "loss": 0.7507, + "step": 19300 + }, + { + "epoch": 0.31754085883344657, + "grad_norm": 1.0048165321350098, + "learning_rate": 1.969986287808408e-05, + "loss": 0.7806, + "step": 19400 + }, + { + "epoch": 0.3191776673841344, + "grad_norm": 1.2964001893997192, + "learning_rate": 1.969555094986279e-05, + "loss": 0.7504, + "step": 19500 + }, + { + "epoch": 0.32081447593482226, + "grad_norm": 1.198273777961731, + "learning_rate": 1.9691208747139527e-05, + "loss": 0.7597, + "step": 19600 + }, + { + "epoch": 0.32245128448551014, + "grad_norm": 1.0260130167007446, + "learning_rate": 1.968683628347289e-05, + "loss": 0.7571, + "step": 19700 + }, + { + "epoch": 0.324088093036198, + "grad_norm": 1.1643099784851074, + "learning_rate": 1.9682433572515952e-05, + "loss": 0.7712, + "step": 19800 + }, + { + "epoch": 0.3257249015868859, + "grad_norm": 1.1653162240982056, + "learning_rate": 1.9678000628016248e-05, + "loss": 0.7599, + "step": 19900 + }, + { + "epoch": 0.32736171013757376, + "grad_norm": 1.5513461828231812, + "learning_rate": 1.9673537463815718e-05, + "loss": 0.7673, + "step": 20000 + }, + { + "epoch": 0.32899851868826163, + "grad_norm": 1.138498306274414, + "learning_rate": 1.9669044093850652e-05, + "loss": 0.7521, + "step": 20100 + }, + { + "epoch": 0.3306353272389495, + "grad_norm": 1.0548768043518066, + "learning_rate": 1.9664520532151664e-05, + "loss": 0.7596, + "step": 20200 + }, + { + "epoch": 0.3322721357896374, + "grad_norm": 1.0597394704818726, + "learning_rate": 1.965996679284365e-05, + "loss": 0.7586, + "step": 20300 + }, + { + "epoch": 0.33390894434032525, + "grad_norm": 1.1359139680862427, + "learning_rate": 1.965538289014572e-05, + "loss": 0.7618, + "step": 20400 + }, + { + "epoch": 0.3355457528910131, + "grad_norm": 1.1026830673217773, + "learning_rate": 1.9650768838371182e-05, + "loss": 0.7613, + "step": 20500 + }, + { + "epoch": 0.33718256144170095, + "grad_norm": 1.0065330266952515, + "learning_rate": 1.9646124651927484e-05, + "loss": 0.7394, + "step": 20600 + }, + { + "epoch": 0.3388193699923888, + "grad_norm": 0.9368694424629211, + "learning_rate": 1.964145034531616e-05, + "loss": 0.761, + "step": 20700 + }, + { + "epoch": 0.3404561785430767, + "grad_norm": 0.9686558246612549, + "learning_rate": 1.9636745933132807e-05, + "loss": 0.7597, + "step": 20800 + }, + { + "epoch": 0.34209298709376457, + "grad_norm": 1.114066243171692, + "learning_rate": 1.9632011430067024e-05, + "loss": 0.7675, + "step": 20900 + }, + { + "epoch": 0.34372979564445244, + "grad_norm": 1.1572498083114624, + "learning_rate": 1.9627246850902363e-05, + "loss": 0.7576, + "step": 21000 + }, + { + "epoch": 0.3453666041951403, + "grad_norm": 1.0342215299606323, + "learning_rate": 1.9622452210516296e-05, + "loss": 0.7629, + "step": 21100 + }, + { + "epoch": 0.3470034127458282, + "grad_norm": 1.0652525424957275, + "learning_rate": 1.9617627523880158e-05, + "loss": 0.7636, + "step": 21200 + }, + { + "epoch": 0.34864022129651606, + "grad_norm": 1.048869013786316, + "learning_rate": 1.9612772806059104e-05, + "loss": 0.7625, + "step": 21300 + }, + { + "epoch": 0.35027702984720394, + "grad_norm": 1.1751947402954102, + "learning_rate": 1.9607888072212062e-05, + "loss": 0.7475, + "step": 21400 + }, + { + "epoch": 0.3519138383978918, + "grad_norm": 1.2830709218978882, + "learning_rate": 1.9602973337591688e-05, + "loss": 0.7558, + "step": 21500 + }, + { + "epoch": 0.3535506469485797, + "grad_norm": 1.1591740846633911, + "learning_rate": 1.9598028617544313e-05, + "loss": 0.7435, + "step": 21600 + }, + { + "epoch": 0.3551874554992675, + "grad_norm": 0.9801552295684814, + "learning_rate": 1.95930539275099e-05, + "loss": 0.7621, + "step": 21700 + }, + { + "epoch": 0.3568242640499554, + "grad_norm": 1.126760721206665, + "learning_rate": 1.958804928302199e-05, + "loss": 0.7672, + "step": 21800 + }, + { + "epoch": 0.35846107260064325, + "grad_norm": 1.0655152797698975, + "learning_rate": 1.958301469970766e-05, + "loss": 0.7491, + "step": 21900 + }, + { + "epoch": 0.3600978811513311, + "grad_norm": 1.1613372564315796, + "learning_rate": 1.9577950193287475e-05, + "loss": 0.7733, + "step": 22000 + }, + { + "epoch": 0.361734689702019, + "grad_norm": 0.9363147020339966, + "learning_rate": 1.9572855779575427e-05, + "loss": 0.7522, + "step": 22100 + }, + { + "epoch": 0.36337149825270687, + "grad_norm": 1.1021246910095215, + "learning_rate": 1.9567731474478903e-05, + "loss": 0.7539, + "step": 22200 + }, + { + "epoch": 0.36500830680339474, + "grad_norm": 1.084695816040039, + "learning_rate": 1.9562577293998616e-05, + "loss": 0.7514, + "step": 22300 + }, + { + "epoch": 0.3666451153540826, + "grad_norm": 1.1221933364868164, + "learning_rate": 1.9557393254228575e-05, + "loss": 0.7608, + "step": 22400 + }, + { + "epoch": 0.3682819239047705, + "grad_norm": 1.073371410369873, + "learning_rate": 1.9552179371356024e-05, + "loss": 0.7509, + "step": 22500 + }, + { + "epoch": 0.36991873245545837, + "grad_norm": 1.124243140220642, + "learning_rate": 1.9546935661661382e-05, + "loss": 0.7552, + "step": 22600 + }, + { + "epoch": 0.37155554100614624, + "grad_norm": 1.0397138595581055, + "learning_rate": 1.9541662141518222e-05, + "loss": 0.7451, + "step": 22700 + }, + { + "epoch": 0.37319234955683406, + "grad_norm": 1.0600475072860718, + "learning_rate": 1.9536358827393177e-05, + "loss": 0.7358, + "step": 22800 + }, + { + "epoch": 0.37482915810752193, + "grad_norm": 1.1461478471755981, + "learning_rate": 1.953102573584593e-05, + "loss": 0.7513, + "step": 22900 + }, + { + "epoch": 0.3764659666582098, + "grad_norm": 1.093103051185608, + "learning_rate": 1.952566288352914e-05, + "loss": 0.7369, + "step": 23000 + }, + { + "epoch": 0.3781027752088977, + "grad_norm": 1.2357380390167236, + "learning_rate": 1.952027028718839e-05, + "loss": 0.7628, + "step": 23100 + }, + { + "epoch": 0.37973958375958555, + "grad_norm": 0.9737277030944824, + "learning_rate": 1.9514847963662144e-05, + "loss": 0.7358, + "step": 23200 + }, + { + "epoch": 0.3813763923102734, + "grad_norm": 1.0810784101486206, + "learning_rate": 1.9509395929881683e-05, + "loss": 0.7431, + "step": 23300 + }, + { + "epoch": 0.3830132008609613, + "grad_norm": 1.0600659847259521, + "learning_rate": 1.9503914202871072e-05, + "loss": 0.7465, + "step": 23400 + }, + { + "epoch": 0.3846500094116492, + "grad_norm": 1.129676342010498, + "learning_rate": 1.9498402799747077e-05, + "loss": 0.746, + "step": 23500 + }, + { + "epoch": 0.38628681796233705, + "grad_norm": 1.0627739429473877, + "learning_rate": 1.9492861737719145e-05, + "loss": 0.7517, + "step": 23600 + }, + { + "epoch": 0.3879236265130249, + "grad_norm": 1.0382601022720337, + "learning_rate": 1.9487291034089316e-05, + "loss": 0.7466, + "step": 23700 + }, + { + "epoch": 0.3895604350637128, + "grad_norm": 1.0782064199447632, + "learning_rate": 1.9481690706252198e-05, + "loss": 0.7436, + "step": 23800 + }, + { + "epoch": 0.39119724361440067, + "grad_norm": 1.052713394165039, + "learning_rate": 1.94760607716949e-05, + "loss": 0.7363, + "step": 23900 + }, + { + "epoch": 0.3928340521650885, + "grad_norm": 1.0485634803771973, + "learning_rate": 1.947040124799697e-05, + "loss": 0.7491, + "step": 24000 + }, + { + "epoch": 0.39447086071577636, + "grad_norm": 1.1206567287445068, + "learning_rate": 1.9464712152830368e-05, + "loss": 0.7372, + "step": 24100 + }, + { + "epoch": 0.39610766926646424, + "grad_norm": 1.0319308042526245, + "learning_rate": 1.9458993503959368e-05, + "loss": 0.7493, + "step": 24200 + }, + { + "epoch": 0.3977444778171521, + "grad_norm": 1.1401089429855347, + "learning_rate": 1.9453245319240533e-05, + "loss": 0.7693, + "step": 24300 + }, + { + "epoch": 0.39938128636784, + "grad_norm": 1.2440853118896484, + "learning_rate": 1.944746761662266e-05, + "loss": 0.7477, + "step": 24400 + }, + { + "epoch": 0.40101809491852786, + "grad_norm": 1.1666104793548584, + "learning_rate": 1.9441660414146715e-05, + "loss": 0.7364, + "step": 24500 + }, + { + "epoch": 0.40265490346921573, + "grad_norm": 1.0812019109725952, + "learning_rate": 1.9435823729945768e-05, + "loss": 0.7278, + "step": 24600 + }, + { + "epoch": 0.4042917120199036, + "grad_norm": 1.1338680982589722, + "learning_rate": 1.9429957582244957e-05, + "loss": 0.7396, + "step": 24700 + }, + { + "epoch": 0.4059285205705915, + "grad_norm": 1.0170310735702515, + "learning_rate": 1.942406198936141e-05, + "loss": 0.7373, + "step": 24800 + }, + { + "epoch": 0.40756532912127935, + "grad_norm": 1.0910414457321167, + "learning_rate": 1.941813696970421e-05, + "loss": 0.743, + "step": 24900 + }, + { + "epoch": 0.4092021376719672, + "grad_norm": 0.9840279221534729, + "learning_rate": 1.9412182541774312e-05, + "loss": 0.7432, + "step": 25000 + }, + { + "epoch": 0.41083894622265504, + "grad_norm": 1.1482113599777222, + "learning_rate": 1.9406198724164515e-05, + "loss": 0.7457, + "step": 25100 + }, + { + "epoch": 0.4124757547733429, + "grad_norm": 0.9647344946861267, + "learning_rate": 1.9400185535559366e-05, + "loss": 0.7494, + "step": 25200 + }, + { + "epoch": 0.4141125633240308, + "grad_norm": 1.1271613836288452, + "learning_rate": 1.9394142994735147e-05, + "loss": 0.7358, + "step": 25300 + }, + { + "epoch": 0.41574937187471867, + "grad_norm": 1.1209514141082764, + "learning_rate": 1.9388071120559774e-05, + "loss": 0.7477, + "step": 25400 + }, + { + "epoch": 0.41738618042540654, + "grad_norm": 1.1221638917922974, + "learning_rate": 1.9381969931992768e-05, + "loss": 0.7401, + "step": 25500 + }, + { + "epoch": 0.4190229889760944, + "grad_norm": 1.1341800689697266, + "learning_rate": 1.937583944808518e-05, + "loss": 0.7341, + "step": 25600 + }, + { + "epoch": 0.4206597975267823, + "grad_norm": 1.0561330318450928, + "learning_rate": 1.9369679687979538e-05, + "loss": 0.7427, + "step": 25700 + }, + { + "epoch": 0.42229660607747016, + "grad_norm": 1.0445774793624878, + "learning_rate": 1.9363490670909788e-05, + "loss": 0.7485, + "step": 25800 + }, + { + "epoch": 0.42393341462815803, + "grad_norm": 1.1463161706924438, + "learning_rate": 1.9357272416201214e-05, + "loss": 0.7345, + "step": 25900 + }, + { + "epoch": 0.4255702231788459, + "grad_norm": 1.1426818370819092, + "learning_rate": 1.9351024943270426e-05, + "loss": 0.7369, + "step": 26000 + }, + { + "epoch": 0.4272070317295338, + "grad_norm": 1.0911140441894531, + "learning_rate": 1.934474827162524e-05, + "loss": 0.7472, + "step": 26100 + }, + { + "epoch": 0.4288438402802216, + "grad_norm": 1.0775692462921143, + "learning_rate": 1.9338442420864663e-05, + "loss": 0.7401, + "step": 26200 + }, + { + "epoch": 0.4304806488309095, + "grad_norm": 1.136518955230713, + "learning_rate": 1.9332107410678805e-05, + "loss": 0.7355, + "step": 26300 + }, + { + "epoch": 0.43211745738159735, + "grad_norm": 1.085319995880127, + "learning_rate": 1.932574326084883e-05, + "loss": 0.7485, + "step": 26400 + }, + { + "epoch": 0.4337542659322852, + "grad_norm": 1.034986972808838, + "learning_rate": 1.9319349991246887e-05, + "loss": 0.7422, + "step": 26500 + }, + { + "epoch": 0.4353910744829731, + "grad_norm": 1.1199235916137695, + "learning_rate": 1.9312927621836058e-05, + "loss": 0.7362, + "step": 26600 + }, + { + "epoch": 0.43702788303366097, + "grad_norm": 1.1646606922149658, + "learning_rate": 1.930647617267029e-05, + "loss": 0.7274, + "step": 26700 + }, + { + "epoch": 0.43866469158434884, + "grad_norm": 1.1620571613311768, + "learning_rate": 1.9299995663894325e-05, + "loss": 0.7351, + "step": 26800 + }, + { + "epoch": 0.4403015001350367, + "grad_norm": 1.1194571256637573, + "learning_rate": 1.9293486115743646e-05, + "loss": 0.7309, + "step": 26900 + }, + { + "epoch": 0.4419383086857246, + "grad_norm": 1.1805561780929565, + "learning_rate": 1.928694754854442e-05, + "loss": 0.7378, + "step": 27000 + }, + { + "epoch": 0.44357511723641246, + "grad_norm": 1.1845600605010986, + "learning_rate": 1.9280379982713417e-05, + "loss": 0.7319, + "step": 27100 + }, + { + "epoch": 0.44521192578710034, + "grad_norm": 1.2962830066680908, + "learning_rate": 1.927378343875796e-05, + "loss": 0.7305, + "step": 27200 + }, + { + "epoch": 0.44684873433778816, + "grad_norm": 1.0655794143676758, + "learning_rate": 1.9267157937275854e-05, + "loss": 0.7236, + "step": 27300 + }, + { + "epoch": 0.44848554288847603, + "grad_norm": 1.0807515382766724, + "learning_rate": 1.9260503498955326e-05, + "loss": 0.7326, + "step": 27400 + }, + { + "epoch": 0.4501223514391639, + "grad_norm": 1.0515137910842896, + "learning_rate": 1.9253820144574958e-05, + "loss": 0.7293, + "step": 27500 + }, + { + "epoch": 0.4517591599898518, + "grad_norm": 1.103508710861206, + "learning_rate": 1.9247107895003628e-05, + "loss": 0.7473, + "step": 27600 + }, + { + "epoch": 0.45339596854053965, + "grad_norm": 1.1016185283660889, + "learning_rate": 1.924036677120043e-05, + "loss": 0.7264, + "step": 27700 + }, + { + "epoch": 0.4550327770912275, + "grad_norm": 1.0213091373443604, + "learning_rate": 1.9233596794214623e-05, + "loss": 0.7325, + "step": 27800 + }, + { + "epoch": 0.4566695856419154, + "grad_norm": 1.1028705835342407, + "learning_rate": 1.9226797985185565e-05, + "loss": 0.7381, + "step": 27900 + }, + { + "epoch": 0.4583063941926033, + "grad_norm": 1.0844396352767944, + "learning_rate": 1.9219970365342634e-05, + "loss": 0.7279, + "step": 28000 + }, + { + "epoch": 0.45994320274329115, + "grad_norm": 1.037714958190918, + "learning_rate": 1.9213113956005176e-05, + "loss": 0.7433, + "step": 28100 + }, + { + "epoch": 0.461580011293979, + "grad_norm": 1.2123370170593262, + "learning_rate": 1.9206228778582435e-05, + "loss": 0.7341, + "step": 28200 + }, + { + "epoch": 0.4632168198446669, + "grad_norm": 1.013845682144165, + "learning_rate": 1.9199314854573474e-05, + "loss": 0.7369, + "step": 28300 + }, + { + "epoch": 0.4648536283953547, + "grad_norm": 1.0552864074707031, + "learning_rate": 1.9192372205567123e-05, + "loss": 0.7202, + "step": 28400 + }, + { + "epoch": 0.4664904369460426, + "grad_norm": 1.049025058746338, + "learning_rate": 1.9185400853241917e-05, + "loss": 0.7246, + "step": 28500 + }, + { + "epoch": 0.46812724549673046, + "grad_norm": 1.0877737998962402, + "learning_rate": 1.9178400819365994e-05, + "loss": 0.7261, + "step": 28600 + }, + { + "epoch": 0.46976405404741833, + "grad_norm": 1.099348783493042, + "learning_rate": 1.9171372125797072e-05, + "loss": 0.7327, + "step": 28700 + }, + { + "epoch": 0.4714008625981062, + "grad_norm": 1.1000944375991821, + "learning_rate": 1.916431479448235e-05, + "loss": 0.7305, + "step": 28800 + }, + { + "epoch": 0.4730376711487941, + "grad_norm": 1.0979351997375488, + "learning_rate": 1.9157228847458446e-05, + "loss": 0.7279, + "step": 28900 + }, + { + "epoch": 0.47467447969948195, + "grad_norm": 1.0918766260147095, + "learning_rate": 1.9150114306851336e-05, + "loss": 0.7215, + "step": 29000 + }, + { + "epoch": 0.47631128825016983, + "grad_norm": 1.109971046447754, + "learning_rate": 1.9142971194876284e-05, + "loss": 0.7322, + "step": 29100 + }, + { + "epoch": 0.4779480968008577, + "grad_norm": 1.1282057762145996, + "learning_rate": 1.913579953383776e-05, + "loss": 0.7257, + "step": 29200 + }, + { + "epoch": 0.4795849053515456, + "grad_norm": 1.1076371669769287, + "learning_rate": 1.912859934612938e-05, + "loss": 0.7516, + "step": 29300 + }, + { + "epoch": 0.48122171390223345, + "grad_norm": 1.1480896472930908, + "learning_rate": 1.9121370654233843e-05, + "loss": 0.728, + "step": 29400 + }, + { + "epoch": 0.48285852245292127, + "grad_norm": 1.1083163022994995, + "learning_rate": 1.911411348072284e-05, + "loss": 0.7235, + "step": 29500 + }, + { + "epoch": 0.48449533100360914, + "grad_norm": 1.2141623497009277, + "learning_rate": 1.9106827848257007e-05, + "loss": 0.7237, + "step": 29600 + }, + { + "epoch": 0.486132139554297, + "grad_norm": 1.0334457159042358, + "learning_rate": 1.9099513779585836e-05, + "loss": 0.7306, + "step": 29700 + }, + { + "epoch": 0.4877689481049849, + "grad_norm": 1.1086657047271729, + "learning_rate": 1.909217129754762e-05, + "loss": 0.7295, + "step": 29800 + }, + { + "epoch": 0.48940575665567276, + "grad_norm": 1.0128360986709595, + "learning_rate": 1.908480042506937e-05, + "loss": 0.733, + "step": 29900 + }, + { + "epoch": 0.49104256520636064, + "grad_norm": 1.1484946012496948, + "learning_rate": 1.907740118516674e-05, + "loss": 0.7396, + "step": 30000 + }, + { + "epoch": 0.4926793737570485, + "grad_norm": 1.031750202178955, + "learning_rate": 1.9069973600943962e-05, + "loss": 0.7204, + "step": 30100 + }, + { + "epoch": 0.4943161823077364, + "grad_norm": 1.1274133920669556, + "learning_rate": 1.9062517695593792e-05, + "loss": 0.7235, + "step": 30200 + }, + { + "epoch": 0.49595299085842426, + "grad_norm": 1.1863317489624023, + "learning_rate": 1.9055033492397396e-05, + "loss": 0.7329, + "step": 30300 + }, + { + "epoch": 0.49758979940911213, + "grad_norm": 1.0985053777694702, + "learning_rate": 1.9047521014724303e-05, + "loss": 0.7341, + "step": 30400 + }, + { + "epoch": 0.4992266079598, + "grad_norm": 1.136760950088501, + "learning_rate": 1.9039980286032353e-05, + "loss": 0.7189, + "step": 30500 + }, + { + "epoch": 0.5008634165104878, + "grad_norm": 1.0787100791931152, + "learning_rate": 1.9032411329867573e-05, + "loss": 0.7298, + "step": 30600 + }, + { + "epoch": 0.5025002250611758, + "grad_norm": 1.3436377048492432, + "learning_rate": 1.902481416986414e-05, + "loss": 0.719, + "step": 30700 + }, + { + "epoch": 0.5041370336118636, + "grad_norm": 1.1863504648208618, + "learning_rate": 1.9017188829744305e-05, + "loss": 0.7125, + "step": 30800 + }, + { + "epoch": 0.5057738421625515, + "grad_norm": 1.0385360717773438, + "learning_rate": 1.90095353333183e-05, + "loss": 0.7297, + "step": 30900 + }, + { + "epoch": 0.5074106507132393, + "grad_norm": 1.1736425161361694, + "learning_rate": 1.9001853704484285e-05, + "loss": 0.7205, + "step": 31000 + }, + { + "epoch": 0.5090474592639272, + "grad_norm": 1.0939114093780518, + "learning_rate": 1.899414396722826e-05, + "loss": 0.741, + "step": 31100 + }, + { + "epoch": 0.5106842678146151, + "grad_norm": 1.3368091583251953, + "learning_rate": 1.8986406145623996e-05, + "loss": 0.7277, + "step": 31200 + }, + { + "epoch": 0.5123210763653029, + "grad_norm": 1.1556004285812378, + "learning_rate": 1.897864026383295e-05, + "loss": 0.7383, + "step": 31300 + }, + { + "epoch": 0.5139578849159908, + "grad_norm": 1.2308059930801392, + "learning_rate": 1.897084634610421e-05, + "loss": 0.7188, + "step": 31400 + }, + { + "epoch": 0.5155946934666786, + "grad_norm": 1.1211739778518677, + "learning_rate": 1.8963024416774393e-05, + "loss": 0.7241, + "step": 31500 + }, + { + "epoch": 0.5172315020173666, + "grad_norm": 1.1302770376205444, + "learning_rate": 1.8955174500267596e-05, + "loss": 0.7207, + "step": 31600 + }, + { + "epoch": 0.5188683105680544, + "grad_norm": 1.1893266439437866, + "learning_rate": 1.8947296621095297e-05, + "loss": 0.7088, + "step": 31700 + }, + { + "epoch": 0.5205051191187423, + "grad_norm": 1.2034817934036255, + "learning_rate": 1.893939080385629e-05, + "loss": 0.7225, + "step": 31800 + }, + { + "epoch": 0.5221419276694301, + "grad_norm": 1.0935208797454834, + "learning_rate": 1.8931457073236612e-05, + "loss": 0.7219, + "step": 31900 + }, + { + "epoch": 0.5237787362201181, + "grad_norm": 1.2129491567611694, + "learning_rate": 1.892349545400945e-05, + "loss": 0.7323, + "step": 32000 + }, + { + "epoch": 0.5254155447708059, + "grad_norm": 1.0750499963760376, + "learning_rate": 1.8915505971035077e-05, + "loss": 0.7213, + "step": 32100 + }, + { + "epoch": 0.5270523533214938, + "grad_norm": 1.1311250925064087, + "learning_rate": 1.8907488649260775e-05, + "loss": 0.7265, + "step": 32200 + }, + { + "epoch": 0.5286891618721816, + "grad_norm": 1.1503121852874756, + "learning_rate": 1.889944351372075e-05, + "loss": 0.7177, + "step": 32300 + }, + { + "epoch": 0.5303259704228694, + "grad_norm": 1.3034614324569702, + "learning_rate": 1.8891370589536058e-05, + "loss": 0.7118, + "step": 32400 + }, + { + "epoch": 0.5319627789735574, + "grad_norm": 1.0626057386398315, + "learning_rate": 1.8883269901914524e-05, + "loss": 0.7205, + "step": 32500 + }, + { + "epoch": 0.5335995875242452, + "grad_norm": 1.2290301322937012, + "learning_rate": 1.8875141476150664e-05, + "loss": 0.73, + "step": 32600 + }, + { + "epoch": 0.5352363960749331, + "grad_norm": 1.2172757387161255, + "learning_rate": 1.8866985337625615e-05, + "loss": 0.7234, + "step": 32700 + }, + { + "epoch": 0.5368732046256209, + "grad_norm": 1.0496524572372437, + "learning_rate": 1.885880151180703e-05, + "loss": 0.7127, + "step": 32800 + }, + { + "epoch": 0.5385100131763089, + "grad_norm": 0.9903925061225891, + "learning_rate": 1.8850590024249037e-05, + "loss": 0.728, + "step": 32900 + }, + { + "epoch": 0.5401468217269967, + "grad_norm": 1.2562659978866577, + "learning_rate": 1.8842350900592122e-05, + "loss": 0.7188, + "step": 33000 + }, + { + "epoch": 0.5417836302776846, + "grad_norm": 1.2212430238723755, + "learning_rate": 1.8834084166563072e-05, + "loss": 0.7086, + "step": 33100 + }, + { + "epoch": 0.5434204388283724, + "grad_norm": 1.1504745483398438, + "learning_rate": 1.882578984797489e-05, + "loss": 0.7198, + "step": 33200 + }, + { + "epoch": 0.5450572473790604, + "grad_norm": 1.1029900312423706, + "learning_rate": 1.8817467970726704e-05, + "loss": 0.729, + "step": 33300 + }, + { + "epoch": 0.5466940559297482, + "grad_norm": 1.1274054050445557, + "learning_rate": 1.8809118560803704e-05, + "loss": 0.7249, + "step": 33400 + }, + { + "epoch": 0.548330864480436, + "grad_norm": 1.093854546546936, + "learning_rate": 1.880074164427704e-05, + "loss": 0.704, + "step": 33500 + }, + { + "epoch": 0.5499676730311239, + "grad_norm": 1.0846567153930664, + "learning_rate": 1.879233724730377e-05, + "loss": 0.7194, + "step": 33600 + }, + { + "epoch": 0.5516044815818117, + "grad_norm": 1.35237455368042, + "learning_rate": 1.8783905396126737e-05, + "loss": 0.7205, + "step": 33700 + }, + { + "epoch": 0.5532412901324997, + "grad_norm": 0.9714828133583069, + "learning_rate": 1.8775446117074528e-05, + "loss": 0.7334, + "step": 33800 + }, + { + "epoch": 0.5548780986831875, + "grad_norm": 1.2619616985321045, + "learning_rate": 1.8766959436561363e-05, + "loss": 0.718, + "step": 33900 + }, + { + "epoch": 0.5565149072338754, + "grad_norm": 1.036129355430603, + "learning_rate": 1.8758445381087034e-05, + "loss": 0.7191, + "step": 34000 + }, + { + "epoch": 0.5581517157845632, + "grad_norm": 1.097095012664795, + "learning_rate": 1.8749903977236802e-05, + "loss": 0.7171, + "step": 34100 + }, + { + "epoch": 0.5597885243352512, + "grad_norm": 1.1133558750152588, + "learning_rate": 1.8741335251681328e-05, + "loss": 0.7179, + "step": 34200 + }, + { + "epoch": 0.561425332885939, + "grad_norm": 1.0562981367111206, + "learning_rate": 1.8732739231176587e-05, + "loss": 0.7201, + "step": 34300 + }, + { + "epoch": 0.5630621414366269, + "grad_norm": 1.20978581905365, + "learning_rate": 1.8724115942563773e-05, + "loss": 0.7129, + "step": 34400 + }, + { + "epoch": 0.5646989499873147, + "grad_norm": 1.0966860055923462, + "learning_rate": 1.8715465412769243e-05, + "loss": 0.715, + "step": 34500 + }, + { + "epoch": 0.5663357585380026, + "grad_norm": 1.2173317670822144, + "learning_rate": 1.87067876688044e-05, + "loss": 0.7052, + "step": 34600 + }, + { + "epoch": 0.5679725670886905, + "grad_norm": 1.126670241355896, + "learning_rate": 1.869808273776563e-05, + "loss": 0.7172, + "step": 34700 + }, + { + "epoch": 0.5696093756393783, + "grad_norm": 1.0486496686935425, + "learning_rate": 1.8689350646834207e-05, + "loss": 0.7269, + "step": 34800 + }, + { + "epoch": 0.5712461841900662, + "grad_norm": 1.1730561256408691, + "learning_rate": 1.868059142327622e-05, + "loss": 0.7191, + "step": 34900 + }, + { + "epoch": 0.572882992740754, + "grad_norm": 1.1153805255889893, + "learning_rate": 1.867180509444247e-05, + "loss": 0.7124, + "step": 35000 + }, + { + "epoch": 0.574519801291442, + "grad_norm": 1.200767159461975, + "learning_rate": 1.8662991687768394e-05, + "loss": 0.7342, + "step": 35100 + }, + { + "epoch": 0.5761566098421298, + "grad_norm": 1.093985676765442, + "learning_rate": 1.8654151230774e-05, + "loss": 0.7073, + "step": 35200 + }, + { + "epoch": 0.5777934183928177, + "grad_norm": 1.1902211904525757, + "learning_rate": 1.8645283751063734e-05, + "loss": 0.7147, + "step": 35300 + }, + { + "epoch": 0.5794302269435055, + "grad_norm": 1.1363279819488525, + "learning_rate": 1.863638927632644e-05, + "loss": 0.7162, + "step": 35400 + }, + { + "epoch": 0.5810670354941935, + "grad_norm": 1.2271382808685303, + "learning_rate": 1.8627467834335243e-05, + "loss": 0.7042, + "step": 35500 + }, + { + "epoch": 0.5827038440448813, + "grad_norm": 1.1823738813400269, + "learning_rate": 1.8618519452947484e-05, + "loss": 0.7197, + "step": 35600 + }, + { + "epoch": 0.5843406525955691, + "grad_norm": 1.042771577835083, + "learning_rate": 1.8609544160104608e-05, + "loss": 0.7103, + "step": 35700 + }, + { + "epoch": 0.585977461146257, + "grad_norm": 1.2053323984146118, + "learning_rate": 1.8600541983832114e-05, + "loss": 0.7206, + "step": 35800 + }, + { + "epoch": 0.5876142696969449, + "grad_norm": 1.2077679634094238, + "learning_rate": 1.8591512952239416e-05, + "loss": 0.7003, + "step": 35900 + }, + { + "epoch": 0.5892510782476328, + "grad_norm": 1.2675883769989014, + "learning_rate": 1.8582457093519806e-05, + "loss": 0.7119, + "step": 36000 + }, + { + "epoch": 0.5908878867983206, + "grad_norm": 1.102798342704773, + "learning_rate": 1.857337443595034e-05, + "loss": 0.7097, + "step": 36100 + }, + { + "epoch": 0.5925246953490085, + "grad_norm": 1.0432052612304688, + "learning_rate": 1.8564265007891747e-05, + "loss": 0.7197, + "step": 36200 + }, + { + "epoch": 0.5941615038996964, + "grad_norm": 1.1461999416351318, + "learning_rate": 1.8555128837788356e-05, + "loss": 0.7128, + "step": 36300 + }, + { + "epoch": 0.5957983124503843, + "grad_norm": 1.1425740718841553, + "learning_rate": 1.854596595416799e-05, + "loss": 0.7221, + "step": 36400 + }, + { + "epoch": 0.5974351210010721, + "grad_norm": 1.1499603986740112, + "learning_rate": 1.8536776385641896e-05, + "loss": 0.7118, + "step": 36500 + }, + { + "epoch": 0.59907192955176, + "grad_norm": 1.1369038820266724, + "learning_rate": 1.8527560160904628e-05, + "loss": 0.7101, + "step": 36600 + }, + { + "epoch": 0.6007087381024478, + "grad_norm": 1.3000248670578003, + "learning_rate": 1.8518317308733987e-05, + "loss": 0.7042, + "step": 36700 + }, + { + "epoch": 0.6023455466531357, + "grad_norm": 1.193550944328308, + "learning_rate": 1.8509047857990925e-05, + "loss": 0.7143, + "step": 36800 + }, + { + "epoch": 0.6039823552038236, + "grad_norm": 1.1038364171981812, + "learning_rate": 1.849975183761943e-05, + "loss": 0.6953, + "step": 36900 + }, + { + "epoch": 0.6056191637545114, + "grad_norm": 1.2535215616226196, + "learning_rate": 1.849042927664647e-05, + "loss": 0.7021, + "step": 37000 + }, + { + "epoch": 0.6072559723051993, + "grad_norm": 1.1770461797714233, + "learning_rate": 1.848108020418188e-05, + "loss": 0.6971, + "step": 37100 + }, + { + "epoch": 0.6088927808558872, + "grad_norm": 1.3245750665664673, + "learning_rate": 1.8471704649418272e-05, + "loss": 0.7062, + "step": 37200 + }, + { + "epoch": 0.6105295894065751, + "grad_norm": 1.064820408821106, + "learning_rate": 1.8462302641630957e-05, + "loss": 0.7247, + "step": 37300 + }, + { + "epoch": 0.6121663979572629, + "grad_norm": 1.2426869869232178, + "learning_rate": 1.8452874210177853e-05, + "loss": 0.697, + "step": 37400 + }, + { + "epoch": 0.6138032065079508, + "grad_norm": 1.0495688915252686, + "learning_rate": 1.8443419384499367e-05, + "loss": 0.7066, + "step": 37500 + }, + { + "epoch": 0.6154400150586387, + "grad_norm": 1.0227185487747192, + "learning_rate": 1.8433938194118332e-05, + "loss": 0.6975, + "step": 37600 + }, + { + "epoch": 0.6170768236093266, + "grad_norm": 1.1213784217834473, + "learning_rate": 1.8424430668639916e-05, + "loss": 0.7101, + "step": 37700 + }, + { + "epoch": 0.6187136321600144, + "grad_norm": 1.3823000192642212, + "learning_rate": 1.8414896837751497e-05, + "loss": 0.7143, + "step": 37800 + }, + { + "epoch": 0.6203504407107022, + "grad_norm": 1.280870795249939, + "learning_rate": 1.8405336731222615e-05, + "loss": 0.7137, + "step": 37900 + }, + { + "epoch": 0.6219872492613902, + "grad_norm": 1.1578929424285889, + "learning_rate": 1.839575037890483e-05, + "loss": 0.7035, + "step": 38000 + }, + { + "epoch": 0.623624057812078, + "grad_norm": 1.1784029006958008, + "learning_rate": 1.838613781073169e-05, + "loss": 0.7003, + "step": 38100 + }, + { + "epoch": 0.6252608663627659, + "grad_norm": 1.5140550136566162, + "learning_rate": 1.8376499056718563e-05, + "loss": 0.7182, + "step": 38200 + }, + { + "epoch": 0.6268976749134537, + "grad_norm": 1.1795947551727295, + "learning_rate": 1.8366834146962613e-05, + "loss": 0.707, + "step": 38300 + }, + { + "epoch": 0.6285344834641416, + "grad_norm": 1.2156872749328613, + "learning_rate": 1.8357143111642658e-05, + "loss": 0.7041, + "step": 38400 + }, + { + "epoch": 0.6301712920148295, + "grad_norm": 1.120609164237976, + "learning_rate": 1.8347425981019104e-05, + "loss": 0.7087, + "step": 38500 + }, + { + "epoch": 0.6318081005655174, + "grad_norm": 1.0960373878479004, + "learning_rate": 1.8337682785433838e-05, + "loss": 0.7136, + "step": 38600 + }, + { + "epoch": 0.6334449091162052, + "grad_norm": 1.2065433263778687, + "learning_rate": 1.8327913555310125e-05, + "loss": 0.7077, + "step": 38700 + }, + { + "epoch": 0.6350817176668931, + "grad_norm": 1.158570647239685, + "learning_rate": 1.8318118321152534e-05, + "loss": 0.7199, + "step": 38800 + }, + { + "epoch": 0.636718526217581, + "grad_norm": 1.1315112113952637, + "learning_rate": 1.8308297113546834e-05, + "loss": 0.7157, + "step": 38900 + }, + { + "epoch": 0.6383553347682688, + "grad_norm": 1.567763328552246, + "learning_rate": 1.829844996315989e-05, + "loss": 0.7024, + "step": 39000 + }, + { + "epoch": 0.6399921433189567, + "grad_norm": 1.3154592514038086, + "learning_rate": 1.8288576900739573e-05, + "loss": 0.7093, + "step": 39100 + }, + { + "epoch": 0.6416289518696445, + "grad_norm": 1.2426626682281494, + "learning_rate": 1.8278677957114666e-05, + "loss": 0.7108, + "step": 39200 + }, + { + "epoch": 0.6432657604203325, + "grad_norm": 1.2186305522918701, + "learning_rate": 1.8268753163194773e-05, + "loss": 0.704, + "step": 39300 + }, + { + "epoch": 0.6449025689710203, + "grad_norm": 1.049307942390442, + "learning_rate": 1.8258802549970206e-05, + "loss": 0.7057, + "step": 39400 + }, + { + "epoch": 0.6465393775217082, + "grad_norm": 1.3523504734039307, + "learning_rate": 1.8248826148511908e-05, + "loss": 0.6965, + "step": 39500 + }, + { + "epoch": 0.648176186072396, + "grad_norm": 1.2402653694152832, + "learning_rate": 1.823882398997133e-05, + "loss": 0.704, + "step": 39600 + }, + { + "epoch": 0.649812994623084, + "grad_norm": 1.3009974956512451, + "learning_rate": 1.8228796105580373e-05, + "loss": 0.6892, + "step": 39700 + }, + { + "epoch": 0.6514498031737718, + "grad_norm": 1.161328673362732, + "learning_rate": 1.821874252665125e-05, + "loss": 0.7099, + "step": 39800 + }, + { + "epoch": 0.6530866117244597, + "grad_norm": 1.5753206014633179, + "learning_rate": 1.820866328457641e-05, + "loss": 0.6958, + "step": 39900 + }, + { + "epoch": 0.6547234202751475, + "grad_norm": 1.1261160373687744, + "learning_rate": 1.8198558410828436e-05, + "loss": 0.7048, + "step": 40000 + }, + { + "epoch": 0.6563602288258353, + "grad_norm": 1.2303427457809448, + "learning_rate": 1.818842793695995e-05, + "loss": 0.7024, + "step": 40100 + }, + { + "epoch": 0.6579970373765233, + "grad_norm": 1.2187303304672241, + "learning_rate": 1.8178271894603502e-05, + "loss": 0.696, + "step": 40200 + }, + { + "epoch": 0.6596338459272111, + "grad_norm": 1.1081221103668213, + "learning_rate": 1.8168090315471488e-05, + "loss": 0.7082, + "step": 40300 + }, + { + "epoch": 0.661270654477899, + "grad_norm": 1.1961265802383423, + "learning_rate": 1.8157883231356036e-05, + "loss": 0.6875, + "step": 40400 + }, + { + "epoch": 0.6629074630285868, + "grad_norm": 1.1577361822128296, + "learning_rate": 1.8147650674128927e-05, + "loss": 0.7004, + "step": 40500 + }, + { + "epoch": 0.6645442715792748, + "grad_norm": 1.1837248802185059, + "learning_rate": 1.813739267574147e-05, + "loss": 0.7084, + "step": 40600 + }, + { + "epoch": 0.6661810801299626, + "grad_norm": 1.140136957168579, + "learning_rate": 1.8127109268224414e-05, + "loss": 0.6897, + "step": 40700 + }, + { + "epoch": 0.6678178886806505, + "grad_norm": 1.132994532585144, + "learning_rate": 1.811680048368785e-05, + "loss": 0.6999, + "step": 40800 + }, + { + "epoch": 0.6694546972313383, + "grad_norm": 1.184187889099121, + "learning_rate": 1.8106466354321113e-05, + "loss": 0.6994, + "step": 40900 + }, + { + "epoch": 0.6710915057820263, + "grad_norm": 1.1196414232254028, + "learning_rate": 1.809610691239268e-05, + "loss": 0.7008, + "step": 41000 + }, + { + "epoch": 0.6727283143327141, + "grad_norm": 1.1688846349716187, + "learning_rate": 1.808572219025006e-05, + "loss": 0.6954, + "step": 41100 + }, + { + "epoch": 0.6743651228834019, + "grad_norm": 1.222205638885498, + "learning_rate": 1.80753122203197e-05, + "loss": 0.6918, + "step": 41200 + }, + { + "epoch": 0.6760019314340898, + "grad_norm": 1.1374167203903198, + "learning_rate": 1.8064877035106887e-05, + "loss": 0.6906, + "step": 41300 + }, + { + "epoch": 0.6776387399847776, + "grad_norm": 1.0707694292068481, + "learning_rate": 1.8054416667195643e-05, + "loss": 0.6943, + "step": 41400 + }, + { + "epoch": 0.6792755485354656, + "grad_norm": 1.1394332647323608, + "learning_rate": 1.8043931149248625e-05, + "loss": 0.7073, + "step": 41500 + }, + { + "epoch": 0.6809123570861534, + "grad_norm": 1.118058443069458, + "learning_rate": 1.803342051400701e-05, + "loss": 0.6983, + "step": 41600 + }, + { + "epoch": 0.6825491656368413, + "grad_norm": 1.3730331659317017, + "learning_rate": 1.8022884794290417e-05, + "loss": 0.6924, + "step": 41700 + }, + { + "epoch": 0.6841859741875291, + "grad_norm": 1.1573492288589478, + "learning_rate": 1.801232402299679e-05, + "loss": 0.6964, + "step": 41800 + }, + { + "epoch": 0.6858227827382171, + "grad_norm": 1.1315394639968872, + "learning_rate": 1.80017382331023e-05, + "loss": 0.693, + "step": 41900 + }, + { + "epoch": 0.6874595912889049, + "grad_norm": 1.1479718685150146, + "learning_rate": 1.799112745766122e-05, + "loss": 0.6985, + "step": 42000 + }, + { + "epoch": 0.6890963998395928, + "grad_norm": 1.1869304180145264, + "learning_rate": 1.7980491729805858e-05, + "loss": 0.7132, + "step": 42100 + }, + { + "epoch": 0.6907332083902806, + "grad_norm": 1.322792887687683, + "learning_rate": 1.796983108274644e-05, + "loss": 0.7085, + "step": 42200 + }, + { + "epoch": 0.6923700169409684, + "grad_norm": 1.1635984182357788, + "learning_rate": 1.7959145549770985e-05, + "loss": 0.7117, + "step": 42300 + }, + { + "epoch": 0.6940068254916564, + "grad_norm": 1.1490191221237183, + "learning_rate": 1.7948435164245236e-05, + "loss": 0.697, + "step": 42400 + }, + { + "epoch": 0.6956436340423442, + "grad_norm": 1.2376859188079834, + "learning_rate": 1.7937699959612523e-05, + "loss": 0.7079, + "step": 42500 + }, + { + "epoch": 0.6972804425930321, + "grad_norm": 1.2555029392242432, + "learning_rate": 1.7926939969393693e-05, + "loss": 0.6895, + "step": 42600 + }, + { + "epoch": 0.6989172511437199, + "grad_norm": 1.1793533563613892, + "learning_rate": 1.7916155227186966e-05, + "loss": 0.6784, + "step": 42700 + }, + { + "epoch": 0.7005540596944079, + "grad_norm": 1.0882368087768555, + "learning_rate": 1.7905345766667867e-05, + "loss": 0.6875, + "step": 42800 + }, + { + "epoch": 0.7021908682450957, + "grad_norm": 1.2925825119018555, + "learning_rate": 1.789451162158909e-05, + "loss": 0.7072, + "step": 42900 + }, + { + "epoch": 0.7038276767957836, + "grad_norm": 1.2188570499420166, + "learning_rate": 1.7883652825780418e-05, + "loss": 0.7084, + "step": 43000 + }, + { + "epoch": 0.7054644853464714, + "grad_norm": 1.2425892353057861, + "learning_rate": 1.7872769413148602e-05, + "loss": 0.7059, + "step": 43100 + }, + { + "epoch": 0.7071012938971594, + "grad_norm": 1.3490030765533447, + "learning_rate": 1.786186141767726e-05, + "loss": 0.6861, + "step": 43200 + }, + { + "epoch": 0.7087381024478472, + "grad_norm": 1.2493983507156372, + "learning_rate": 1.785092887342677e-05, + "loss": 0.6862, + "step": 43300 + }, + { + "epoch": 0.710374910998535, + "grad_norm": 1.1606495380401611, + "learning_rate": 1.7839971814534163e-05, + "loss": 0.6959, + "step": 43400 + }, + { + "epoch": 0.7120117195492229, + "grad_norm": 1.0867750644683838, + "learning_rate": 1.7828990275213023e-05, + "loss": 0.6838, + "step": 43500 + }, + { + "epoch": 0.7136485280999108, + "grad_norm": 1.4481595754623413, + "learning_rate": 1.781798428975336e-05, + "loss": 0.6877, + "step": 43600 + }, + { + "epoch": 0.7152853366505987, + "grad_norm": 1.0603893995285034, + "learning_rate": 1.7806953892521536e-05, + "loss": 0.6922, + "step": 43700 + }, + { + "epoch": 0.7169221452012865, + "grad_norm": 1.1686676740646362, + "learning_rate": 1.7795899117960126e-05, + "loss": 0.6933, + "step": 43800 + }, + { + "epoch": 0.7185589537519744, + "grad_norm": 1.423593282699585, + "learning_rate": 1.7784820000587828e-05, + "loss": 0.6947, + "step": 43900 + }, + { + "epoch": 0.7201957623026622, + "grad_norm": 1.2158969640731812, + "learning_rate": 1.7773716574999354e-05, + "loss": 0.6832, + "step": 44000 + }, + { + "epoch": 0.7218325708533502, + "grad_norm": 1.3259363174438477, + "learning_rate": 1.776258887586531e-05, + "loss": 0.6836, + "step": 44100 + }, + { + "epoch": 0.723469379404038, + "grad_norm": 1.2114306688308716, + "learning_rate": 1.775143693793211e-05, + "loss": 0.6934, + "step": 44200 + }, + { + "epoch": 0.7251061879547259, + "grad_norm": 1.0769015550613403, + "learning_rate": 1.774026079602184e-05, + "loss": 0.692, + "step": 44300 + }, + { + "epoch": 0.7267429965054137, + "grad_norm": 1.098381519317627, + "learning_rate": 1.7729060485032167e-05, + "loss": 0.6929, + "step": 44400 + }, + { + "epoch": 0.7283798050561016, + "grad_norm": 1.1960115432739258, + "learning_rate": 1.7717836039936235e-05, + "loss": 0.6895, + "step": 44500 + }, + { + "epoch": 0.7300166136067895, + "grad_norm": 1.2899237871170044, + "learning_rate": 1.7706587495782538e-05, + "loss": 0.6891, + "step": 44600 + }, + { + "epoch": 0.7316534221574773, + "grad_norm": 1.1849106550216675, + "learning_rate": 1.769531488769482e-05, + "loss": 0.6994, + "step": 44700 + }, + { + "epoch": 0.7332902307081652, + "grad_norm": 1.0840647220611572, + "learning_rate": 1.7684018250871967e-05, + "loss": 0.6902, + "step": 44800 + }, + { + "epoch": 0.734927039258853, + "grad_norm": 1.1262308359146118, + "learning_rate": 1.7672697620587904e-05, + "loss": 0.686, + "step": 44900 + }, + { + "epoch": 0.736563847809541, + "grad_norm": 1.2281126976013184, + "learning_rate": 1.7661353032191458e-05, + "loss": 0.6971, + "step": 45000 + }, + { + "epoch": 0.7382006563602288, + "grad_norm": 1.0803622007369995, + "learning_rate": 1.7649984521106282e-05, + "loss": 0.694, + "step": 45100 + }, + { + "epoch": 0.7398374649109167, + "grad_norm": 1.4072610139846802, + "learning_rate": 1.763859212283071e-05, + "loss": 0.704, + "step": 45200 + }, + { + "epoch": 0.7414742734616045, + "grad_norm": 1.2351950407028198, + "learning_rate": 1.7627175872937686e-05, + "loss": 0.6991, + "step": 45300 + }, + { + "epoch": 0.7431110820122925, + "grad_norm": 1.1985889673233032, + "learning_rate": 1.7615735807074616e-05, + "loss": 0.6947, + "step": 45400 + }, + { + "epoch": 0.7447478905629803, + "grad_norm": 1.1948813199996948, + "learning_rate": 1.7604271960963274e-05, + "loss": 0.6986, + "step": 45500 + }, + { + "epoch": 0.7463846991136681, + "grad_norm": 1.2745295763015747, + "learning_rate": 1.759278437039969e-05, + "loss": 0.6989, + "step": 45600 + }, + { + "epoch": 0.748021507664356, + "grad_norm": 1.1414821147918701, + "learning_rate": 1.7581273071254038e-05, + "loss": 0.6883, + "step": 45700 + }, + { + "epoch": 0.7496583162150439, + "grad_norm": 1.1246697902679443, + "learning_rate": 1.7569738099470524e-05, + "loss": 0.6818, + "step": 45800 + }, + { + "epoch": 0.7512951247657318, + "grad_norm": 1.1820296049118042, + "learning_rate": 1.7558179491067263e-05, + "loss": 0.7079, + "step": 45900 + }, + { + "epoch": 0.7529319333164196, + "grad_norm": 1.1293789148330688, + "learning_rate": 1.7546597282136186e-05, + "loss": 0.696, + "step": 46000 + }, + { + "epoch": 0.7545687418671075, + "grad_norm": 1.2405450344085693, + "learning_rate": 1.753499150884291e-05, + "loss": 0.6912, + "step": 46100 + }, + { + "epoch": 0.7562055504177954, + "grad_norm": 1.2177417278289795, + "learning_rate": 1.7523362207426634e-05, + "loss": 0.6824, + "step": 46200 + }, + { + "epoch": 0.7578423589684833, + "grad_norm": 1.124414086341858, + "learning_rate": 1.7511709414200024e-05, + "loss": 0.6868, + "step": 46300 + }, + { + "epoch": 0.7594791675191711, + "grad_norm": 1.1439573764801025, + "learning_rate": 1.7500033165549105e-05, + "loss": 0.6882, + "step": 46400 + }, + { + "epoch": 0.761115976069859, + "grad_norm": 1.1549428701400757, + "learning_rate": 1.7488333497933133e-05, + "loss": 0.681, + "step": 46500 + }, + { + "epoch": 0.7627527846205469, + "grad_norm": 1.3092726469039917, + "learning_rate": 1.7476610447884492e-05, + "loss": 0.6973, + "step": 46600 + }, + { + "epoch": 0.7643895931712347, + "grad_norm": 1.5812910795211792, + "learning_rate": 1.7464864052008586e-05, + "loss": 0.6855, + "step": 46700 + }, + { + "epoch": 0.7660264017219226, + "grad_norm": 1.189775824546814, + "learning_rate": 1.7453094346983707e-05, + "loss": 0.6983, + "step": 46800 + }, + { + "epoch": 0.7676632102726104, + "grad_norm": 1.3100470304489136, + "learning_rate": 1.7441301369560934e-05, + "loss": 0.6938, + "step": 46900 + }, + { + "epoch": 0.7693000188232983, + "grad_norm": 1.227925419807434, + "learning_rate": 1.7429485156564014e-05, + "loss": 0.6762, + "step": 47000 + }, + { + "epoch": 0.7709368273739862, + "grad_norm": 1.3295223712921143, + "learning_rate": 1.7417645744889248e-05, + "loss": 0.6823, + "step": 47100 + }, + { + "epoch": 0.7725736359246741, + "grad_norm": 1.1091123819351196, + "learning_rate": 1.740578317150538e-05, + "loss": 0.6978, + "step": 47200 + }, + { + "epoch": 0.7742104444753619, + "grad_norm": 1.2926867008209229, + "learning_rate": 1.7393897473453462e-05, + "loss": 0.6853, + "step": 47300 + }, + { + "epoch": 0.7758472530260498, + "grad_norm": 1.279630422592163, + "learning_rate": 1.738198868784677e-05, + "loss": 0.6911, + "step": 47400 + }, + { + "epoch": 0.7774840615767377, + "grad_norm": 1.1175949573516846, + "learning_rate": 1.7370056851870665e-05, + "loss": 0.687, + "step": 47500 + }, + { + "epoch": 0.7791208701274256, + "grad_norm": 1.0889476537704468, + "learning_rate": 1.7358102002782477e-05, + "loss": 0.689, + "step": 47600 + }, + { + "epoch": 0.7807576786781134, + "grad_norm": 1.1944537162780762, + "learning_rate": 1.7346124177911402e-05, + "loss": 0.6841, + "step": 47700 + }, + { + "epoch": 0.7823944872288013, + "grad_norm": 1.208275556564331, + "learning_rate": 1.7334123414658376e-05, + "loss": 0.6777, + "step": 47800 + }, + { + "epoch": 0.7840312957794892, + "grad_norm": 1.1608806848526, + "learning_rate": 1.7322099750495964e-05, + "loss": 0.6841, + "step": 47900 + }, + { + "epoch": 0.785668104330177, + "grad_norm": 1.0674712657928467, + "learning_rate": 1.731005322296823e-05, + "loss": 0.6765, + "step": 48000 + }, + { + "epoch": 0.7873049128808649, + "grad_norm": 1.1852935552597046, + "learning_rate": 1.729798386969064e-05, + "loss": 0.6968, + "step": 48100 + }, + { + "epoch": 0.7889417214315527, + "grad_norm": 1.1918047666549683, + "learning_rate": 1.728589172834993e-05, + "loss": 0.6815, + "step": 48200 + }, + { + "epoch": 0.7905785299822407, + "grad_norm": 1.3117504119873047, + "learning_rate": 1.7273776836703985e-05, + "loss": 0.6799, + "step": 48300 + }, + { + "epoch": 0.7922153385329285, + "grad_norm": 1.2398260831832886, + "learning_rate": 1.726163923258174e-05, + "loss": 0.6869, + "step": 48400 + }, + { + "epoch": 0.7938521470836164, + "grad_norm": 1.2091760635375977, + "learning_rate": 1.724947895388304e-05, + "loss": 0.6679, + "step": 48500 + }, + { + "epoch": 0.7954889556343042, + "grad_norm": 1.1533339023590088, + "learning_rate": 1.723729603857854e-05, + "loss": 0.6877, + "step": 48600 + }, + { + "epoch": 0.7971257641849921, + "grad_norm": 1.2629398107528687, + "learning_rate": 1.7225090524709577e-05, + "loss": 0.6878, + "step": 48700 + }, + { + "epoch": 0.79876257273568, + "grad_norm": 1.202531099319458, + "learning_rate": 1.7212862450388037e-05, + "loss": 0.6911, + "step": 48800 + }, + { + "epoch": 0.8003993812863679, + "grad_norm": 1.189326286315918, + "learning_rate": 1.7200611853796278e-05, + "loss": 0.6966, + "step": 48900 + }, + { + "epoch": 0.8020361898370557, + "grad_norm": 1.2614778280258179, + "learning_rate": 1.718833877318696e-05, + "loss": 0.6952, + "step": 49000 + }, + { + "epoch": 0.8036729983877435, + "grad_norm": 1.1864616870880127, + "learning_rate": 1.7176043246882966e-05, + "loss": 0.6756, + "step": 49100 + }, + { + "epoch": 0.8053098069384315, + "grad_norm": 1.205569863319397, + "learning_rate": 1.7163725313277255e-05, + "loss": 0.6748, + "step": 49200 + }, + { + "epoch": 0.8069466154891193, + "grad_norm": 1.2782241106033325, + "learning_rate": 1.715138501083276e-05, + "loss": 0.6903, + "step": 49300 + }, + { + "epoch": 0.8085834240398072, + "grad_norm": 1.0571094751358032, + "learning_rate": 1.7139022378082256e-05, + "loss": 0.6871, + "step": 49400 + }, + { + "epoch": 0.810220232590495, + "grad_norm": 1.3369005918502808, + "learning_rate": 1.712663745362826e-05, + "loss": 0.6746, + "step": 49500 + }, + { + "epoch": 0.811857041141183, + "grad_norm": 1.2506871223449707, + "learning_rate": 1.7114230276142866e-05, + "loss": 0.6935, + "step": 49600 + }, + { + "epoch": 0.8134938496918708, + "grad_norm": 1.3436931371688843, + "learning_rate": 1.7101800884367676e-05, + "loss": 0.6859, + "step": 49700 + }, + { + "epoch": 0.8151306582425587, + "grad_norm": 1.3217076063156128, + "learning_rate": 1.708934931711365e-05, + "loss": 0.6766, + "step": 49800 + }, + { + "epoch": 0.8167674667932465, + "grad_norm": 1.3521711826324463, + "learning_rate": 1.7076875613261e-05, + "loss": 0.6828, + "step": 49900 + }, + { + "epoch": 0.8184042753439345, + "grad_norm": 1.1544018983840942, + "learning_rate": 1.706437981175904e-05, + "loss": 0.6866, + "step": 50000 + }, + { + "epoch": 0.8200410838946223, + "grad_norm": 1.3795074224472046, + "learning_rate": 1.7051861951626105e-05, + "loss": 0.6893, + "step": 50100 + }, + { + "epoch": 0.8216778924453101, + "grad_norm": 1.2545524835586548, + "learning_rate": 1.7039322071949396e-05, + "loss": 0.6865, + "step": 50200 + }, + { + "epoch": 0.823314700995998, + "grad_norm": 1.3663312196731567, + "learning_rate": 1.702676021188487e-05, + "loss": 0.6858, + "step": 50300 + }, + { + "epoch": 0.8249515095466858, + "grad_norm": 1.4371784925460815, + "learning_rate": 1.701417641065713e-05, + "loss": 0.6827, + "step": 50400 + }, + { + "epoch": 0.8265883180973738, + "grad_norm": 1.465648889541626, + "learning_rate": 1.7001570707559274e-05, + "loss": 0.6813, + "step": 50500 + }, + { + "epoch": 0.8282251266480616, + "grad_norm": 1.1045328378677368, + "learning_rate": 1.69889431419528e-05, + "loss": 0.6858, + "step": 50600 + }, + { + "epoch": 0.8298619351987495, + "grad_norm": 1.1676952838897705, + "learning_rate": 1.6976293753267467e-05, + "loss": 0.662, + "step": 50700 + }, + { + "epoch": 0.8314987437494373, + "grad_norm": 1.2377560138702393, + "learning_rate": 1.6963622581001188e-05, + "loss": 0.6853, + "step": 50800 + }, + { + "epoch": 0.8331355523001253, + "grad_norm": 1.2052476406097412, + "learning_rate": 1.6950929664719883e-05, + "loss": 0.6898, + "step": 50900 + }, + { + "epoch": 0.8347723608508131, + "grad_norm": 1.400944709777832, + "learning_rate": 1.6938215044057363e-05, + "loss": 0.6905, + "step": 51000 + }, + { + "epoch": 0.836409169401501, + "grad_norm": 1.2622673511505127, + "learning_rate": 1.6925478758715226e-05, + "loss": 0.6651, + "step": 51100 + }, + { + "epoch": 0.8380459779521888, + "grad_norm": 1.1664501428604126, + "learning_rate": 1.691272084846272e-05, + "loss": 0.6851, + "step": 51200 + }, + { + "epoch": 0.8396827865028766, + "grad_norm": 1.2591482400894165, + "learning_rate": 1.68999413531366e-05, + "loss": 0.6936, + "step": 51300 + }, + { + "epoch": 0.8413195950535646, + "grad_norm": 1.163874864578247, + "learning_rate": 1.6887140312641036e-05, + "loss": 0.6886, + "step": 51400 + }, + { + "epoch": 0.8429564036042524, + "grad_norm": 1.2441082000732422, + "learning_rate": 1.6874317766947458e-05, + "loss": 0.6761, + "step": 51500 + }, + { + "epoch": 0.8445932121549403, + "grad_norm": 1.1966642141342163, + "learning_rate": 1.6861473756094464e-05, + "loss": 0.6758, + "step": 51600 + }, + { + "epoch": 0.8462300207056281, + "grad_norm": 1.1858773231506348, + "learning_rate": 1.6848608320187668e-05, + "loss": 0.6806, + "step": 51700 + }, + { + "epoch": 0.8478668292563161, + "grad_norm": 1.1656018495559692, + "learning_rate": 1.6835721499399583e-05, + "loss": 0.6768, + "step": 51800 + }, + { + "epoch": 0.8495036378070039, + "grad_norm": 1.2097491025924683, + "learning_rate": 1.6822813333969495e-05, + "loss": 0.6936, + "step": 51900 + }, + { + "epoch": 0.8511404463576918, + "grad_norm": 1.4976009130477905, + "learning_rate": 1.6809883864203352e-05, + "loss": 0.6721, + "step": 52000 + }, + { + "epoch": 0.8527772549083796, + "grad_norm": 1.3640004396438599, + "learning_rate": 1.6796933130473606e-05, + "loss": 0.6738, + "step": 52100 + }, + { + "epoch": 0.8544140634590676, + "grad_norm": 1.2159740924835205, + "learning_rate": 1.6783961173219116e-05, + "loss": 0.6755, + "step": 52200 + }, + { + "epoch": 0.8560508720097554, + "grad_norm": 1.23357355594635, + "learning_rate": 1.677096803294502e-05, + "loss": 0.6789, + "step": 52300 + }, + { + "epoch": 0.8576876805604432, + "grad_norm": 1.2574186325073242, + "learning_rate": 1.6757953750222586e-05, + "loss": 0.6892, + "step": 52400 + }, + { + "epoch": 0.8593244891111311, + "grad_norm": 1.2394073009490967, + "learning_rate": 1.6744918365689106e-05, + "loss": 0.6726, + "step": 52500 + }, + { + "epoch": 0.860961297661819, + "grad_norm": 1.2098554372787476, + "learning_rate": 1.6731861920047758e-05, + "loss": 0.6714, + "step": 52600 + }, + { + "epoch": 0.8625981062125069, + "grad_norm": 1.3548126220703125, + "learning_rate": 1.6718784454067495e-05, + "loss": 0.6849, + "step": 52700 + }, + { + "epoch": 0.8642349147631947, + "grad_norm": 1.5218019485473633, + "learning_rate": 1.670568600858289e-05, + "loss": 0.6744, + "step": 52800 + }, + { + "epoch": 0.8658717233138826, + "grad_norm": 1.3826264142990112, + "learning_rate": 1.669256662449404e-05, + "loss": 0.6762, + "step": 52900 + }, + { + "epoch": 0.8675085318645704, + "grad_norm": 1.2154985666275024, + "learning_rate": 1.667942634276642e-05, + "loss": 0.6711, + "step": 53000 + }, + { + "epoch": 0.8691453404152584, + "grad_norm": 1.3120452165603638, + "learning_rate": 1.666626520443075e-05, + "loss": 0.6788, + "step": 53100 + }, + { + "epoch": 0.8707821489659462, + "grad_norm": 1.2221883535385132, + "learning_rate": 1.665308325058288e-05, + "loss": 0.6661, + "step": 53200 + }, + { + "epoch": 0.8724189575166341, + "grad_norm": 1.385396957397461, + "learning_rate": 1.6639880522383655e-05, + "loss": 0.6714, + "step": 53300 + }, + { + "epoch": 0.8740557660673219, + "grad_norm": 1.2685418128967285, + "learning_rate": 1.6626657061058797e-05, + "loss": 0.668, + "step": 53400 + }, + { + "epoch": 0.8756925746180098, + "grad_norm": 1.513152837753296, + "learning_rate": 1.661341290789875e-05, + "loss": 0.6706, + "step": 53500 + }, + { + "epoch": 0.8773293831686977, + "grad_norm": 1.2810958623886108, + "learning_rate": 1.6600148104258594e-05, + "loss": 0.6904, + "step": 53600 + }, + { + "epoch": 0.8789661917193855, + "grad_norm": 1.2695286273956299, + "learning_rate": 1.6586862691557863e-05, + "loss": 0.6733, + "step": 53700 + }, + { + "epoch": 0.8806030002700734, + "grad_norm": 1.0760889053344727, + "learning_rate": 1.6573556711280457e-05, + "loss": 0.6743, + "step": 53800 + }, + { + "epoch": 0.8822398088207613, + "grad_norm": 1.3402081727981567, + "learning_rate": 1.6560230204974502e-05, + "loss": 0.6706, + "step": 53900 + }, + { + "epoch": 0.8838766173714492, + "grad_norm": 1.191873550415039, + "learning_rate": 1.654688321425221e-05, + "loss": 0.6764, + "step": 54000 + }, + { + "epoch": 0.885513425922137, + "grad_norm": 1.1215344667434692, + "learning_rate": 1.6533515780789758e-05, + "loss": 0.6857, + "step": 54100 + }, + { + "epoch": 0.8871502344728249, + "grad_norm": 1.1322293281555176, + "learning_rate": 1.6520127946327155e-05, + "loss": 0.6723, + "step": 54200 + }, + { + "epoch": 0.8887870430235127, + "grad_norm": 1.7162648439407349, + "learning_rate": 1.6506719752668115e-05, + "loss": 0.679, + "step": 54300 + }, + { + "epoch": 0.8904238515742007, + "grad_norm": 1.5632336139678955, + "learning_rate": 1.6493291241679922e-05, + "loss": 0.6807, + "step": 54400 + }, + { + "epoch": 0.8920606601248885, + "grad_norm": 1.0530614852905273, + "learning_rate": 1.6479842455293297e-05, + "loss": 0.6681, + "step": 54500 + }, + { + "epoch": 0.8936974686755763, + "grad_norm": 1.2179269790649414, + "learning_rate": 1.6466373435502276e-05, + "loss": 0.6614, + "step": 54600 + }, + { + "epoch": 0.8953342772262642, + "grad_norm": 1.3225027322769165, + "learning_rate": 1.6452884224364082e-05, + "loss": 0.671, + "step": 54700 + }, + { + "epoch": 0.8969710857769521, + "grad_norm": 1.3610303401947021, + "learning_rate": 1.6439374863998966e-05, + "loss": 0.6801, + "step": 54800 + }, + { + "epoch": 0.89860789432764, + "grad_norm": 1.3277727365493774, + "learning_rate": 1.6425845396590114e-05, + "loss": 0.6746, + "step": 54900 + }, + { + "epoch": 0.9002447028783278, + "grad_norm": 1.2963169813156128, + "learning_rate": 1.6412295864383487e-05, + "loss": 0.6817, + "step": 55000 + }, + { + "epoch": 0.9018815114290157, + "grad_norm": 1.475885033607483, + "learning_rate": 1.6398726309687704e-05, + "loss": 0.6891, + "step": 55100 + }, + { + "epoch": 0.9035183199797036, + "grad_norm": 1.2722758054733276, + "learning_rate": 1.638513677487389e-05, + "loss": 0.6709, + "step": 55200 + }, + { + "epoch": 0.9051551285303915, + "grad_norm": 1.3521857261657715, + "learning_rate": 1.637152730237558e-05, + "loss": 0.6812, + "step": 55300 + }, + { + "epoch": 0.9067919370810793, + "grad_norm": 1.2276744842529297, + "learning_rate": 1.6357897934688555e-05, + "loss": 0.6644, + "step": 55400 + }, + { + "epoch": 0.9084287456317672, + "grad_norm": 1.5432332754135132, + "learning_rate": 1.634424871437071e-05, + "loss": 0.6817, + "step": 55500 + }, + { + "epoch": 0.910065554182455, + "grad_norm": 1.2314627170562744, + "learning_rate": 1.6330579684041946e-05, + "loss": 0.6761, + "step": 55600 + }, + { + "epoch": 0.9117023627331429, + "grad_norm": 1.473347544670105, + "learning_rate": 1.631689088638401e-05, + "loss": 0.6587, + "step": 55700 + }, + { + "epoch": 0.9133391712838308, + "grad_norm": 1.4029542207717896, + "learning_rate": 1.6303182364140376e-05, + "loss": 0.6863, + "step": 55800 + }, + { + "epoch": 0.9149759798345186, + "grad_norm": 1.1235482692718506, + "learning_rate": 1.628945416011611e-05, + "loss": 0.6717, + "step": 55900 + }, + { + "epoch": 0.9166127883852065, + "grad_norm": 1.1514254808425903, + "learning_rate": 1.6275706317177732e-05, + "loss": 0.6815, + "step": 56000 + }, + { + "epoch": 0.9182495969358944, + "grad_norm": 1.388074517250061, + "learning_rate": 1.6261938878253086e-05, + "loss": 0.6849, + "step": 56100 + }, + { + "epoch": 0.9198864054865823, + "grad_norm": 1.1814851760864258, + "learning_rate": 1.6248151886331208e-05, + "loss": 0.6641, + "step": 56200 + }, + { + "epoch": 0.9215232140372701, + "grad_norm": 1.4052802324295044, + "learning_rate": 1.6234345384462174e-05, + "loss": 0.6787, + "step": 56300 + }, + { + "epoch": 0.923160022587958, + "grad_norm": 1.5508378744125366, + "learning_rate": 1.6220519415757005e-05, + "loss": 0.6808, + "step": 56400 + }, + { + "epoch": 0.9247968311386459, + "grad_norm": 1.3127562999725342, + "learning_rate": 1.620667402338749e-05, + "loss": 0.6663, + "step": 56500 + }, + { + "epoch": 0.9264336396893338, + "grad_norm": 1.2677356004714966, + "learning_rate": 1.619280925058607e-05, + "loss": 0.6723, + "step": 56600 + }, + { + "epoch": 0.9280704482400216, + "grad_norm": 1.2480475902557373, + "learning_rate": 1.61789251406457e-05, + "loss": 0.6583, + "step": 56700 + }, + { + "epoch": 0.9297072567907094, + "grad_norm": 1.1523864269256592, + "learning_rate": 1.616502173691973e-05, + "loss": 0.6858, + "step": 56800 + }, + { + "epoch": 0.9313440653413974, + "grad_norm": 1.2443100214004517, + "learning_rate": 1.615109908282174e-05, + "loss": 0.6842, + "step": 56900 + }, + { + "epoch": 0.9329808738920852, + "grad_norm": 1.172663927078247, + "learning_rate": 1.6137157221825418e-05, + "loss": 0.6708, + "step": 57000 + }, + { + "epoch": 0.9346176824427731, + "grad_norm": 1.2049202919006348, + "learning_rate": 1.6123196197464445e-05, + "loss": 0.6665, + "step": 57100 + }, + { + "epoch": 0.9362544909934609, + "grad_norm": 1.3395051956176758, + "learning_rate": 1.6109216053332313e-05, + "loss": 0.6593, + "step": 57200 + }, + { + "epoch": 0.9378912995441488, + "grad_norm": 1.4670510292053223, + "learning_rate": 1.6095216833082242e-05, + "loss": 0.6715, + "step": 57300 + }, + { + "epoch": 0.9395281080948367, + "grad_norm": 1.349523663520813, + "learning_rate": 1.6081198580427e-05, + "loss": 0.6724, + "step": 57400 + }, + { + "epoch": 0.9411649166455246, + "grad_norm": 1.5846613645553589, + "learning_rate": 1.606716133913879e-05, + "loss": 0.6716, + "step": 57500 + }, + { + "epoch": 0.9428017251962124, + "grad_norm": 1.1905144453048706, + "learning_rate": 1.6053105153049103e-05, + "loss": 0.6702, + "step": 57600 + }, + { + "epoch": 0.9444385337469003, + "grad_norm": 1.4006574153900146, + "learning_rate": 1.6039030066048592e-05, + "loss": 0.6665, + "step": 57700 + }, + { + "epoch": 0.9460753422975882, + "grad_norm": 1.3038159608840942, + "learning_rate": 1.602493612208693e-05, + "loss": 0.665, + "step": 57800 + }, + { + "epoch": 0.947712150848276, + "grad_norm": 1.336591124534607, + "learning_rate": 1.601082336517266e-05, + "loss": 0.6572, + "step": 57900 + }, + { + "epoch": 0.9493489593989639, + "grad_norm": 1.3096286058425903, + "learning_rate": 1.5996691839373077e-05, + "loss": 0.6651, + "step": 58000 + }, + { + "epoch": 0.9509857679496517, + "grad_norm": 1.3385711908340454, + "learning_rate": 1.5982541588814083e-05, + "loss": 0.6708, + "step": 58100 + }, + { + "epoch": 0.9526225765003397, + "grad_norm": 1.2425600290298462, + "learning_rate": 1.596837265768004e-05, + "loss": 0.6629, + "step": 58200 + }, + { + "epoch": 0.9542593850510275, + "grad_norm": 1.1755977869033813, + "learning_rate": 1.5954185090213653e-05, + "loss": 0.6618, + "step": 58300 + }, + { + "epoch": 0.9558961936017154, + "grad_norm": 1.5241588354110718, + "learning_rate": 1.5939978930715808e-05, + "loss": 0.6747, + "step": 58400 + }, + { + "epoch": 0.9575330021524032, + "grad_norm": 1.113451361656189, + "learning_rate": 1.5925754223545452e-05, + "loss": 0.6779, + "step": 58500 + }, + { + "epoch": 0.9591698107030912, + "grad_norm": 1.2721067667007446, + "learning_rate": 1.5911511013119438e-05, + "loss": 0.6586, + "step": 58600 + }, + { + "epoch": 0.960806619253779, + "grad_norm": 1.5037124156951904, + "learning_rate": 1.589724934391241e-05, + "loss": 0.6646, + "step": 58700 + }, + { + "epoch": 0.9624434278044669, + "grad_norm": 1.2813490629196167, + "learning_rate": 1.588296926045664e-05, + "loss": 0.6644, + "step": 58800 + }, + { + "epoch": 0.9640802363551547, + "grad_norm": 1.2610142230987549, + "learning_rate": 1.58686708073419e-05, + "loss": 0.6717, + "step": 58900 + }, + { + "epoch": 0.9657170449058425, + "grad_norm": 1.2408130168914795, + "learning_rate": 1.585435402921532e-05, + "loss": 0.6695, + "step": 59000 + }, + { + "epoch": 0.9673538534565305, + "grad_norm": 1.4657983779907227, + "learning_rate": 1.584001897078126e-05, + "loss": 0.6777, + "step": 59100 + }, + { + "epoch": 0.9689906620072183, + "grad_norm": 1.370548129081726, + "learning_rate": 1.5825665676801145e-05, + "loss": 0.6881, + "step": 59200 + }, + { + "epoch": 0.9706274705579062, + "grad_norm": 1.3695186376571655, + "learning_rate": 1.5811294192093353e-05, + "loss": 0.6594, + "step": 59300 + }, + { + "epoch": 0.972264279108594, + "grad_norm": 1.2767751216888428, + "learning_rate": 1.5796904561533054e-05, + "loss": 0.6661, + "step": 59400 + }, + { + "epoch": 0.973901087659282, + "grad_norm": 1.293419361114502, + "learning_rate": 1.578249683005209e-05, + "loss": 0.6781, + "step": 59500 + }, + { + "epoch": 0.9755378962099698, + "grad_norm": 1.5075045824050903, + "learning_rate": 1.576807104263881e-05, + "loss": 0.6706, + "step": 59600 + }, + { + "epoch": 0.9771747047606577, + "grad_norm": 1.1597870588302612, + "learning_rate": 1.5753627244337958e-05, + "loss": 0.6709, + "step": 59700 + }, + { + "epoch": 0.9788115133113455, + "grad_norm": 1.5488371849060059, + "learning_rate": 1.5739165480250504e-05, + "loss": 0.6611, + "step": 59800 + }, + { + "epoch": 0.9804483218620335, + "grad_norm": 1.3339688777923584, + "learning_rate": 1.5724685795533518e-05, + "loss": 0.679, + "step": 59900 + }, + { + "epoch": 0.9820851304127213, + "grad_norm": 1.3151462078094482, + "learning_rate": 1.571018823540004e-05, + "loss": 0.6636, + "step": 60000 + }, + { + "epoch": 0.9837219389634091, + "grad_norm": 1.3205444812774658, + "learning_rate": 1.5695672845118903e-05, + "loss": 0.6623, + "step": 60100 + }, + { + "epoch": 0.985358747514097, + "grad_norm": 1.294420599937439, + "learning_rate": 1.5681139670014643e-05, + "loss": 0.6666, + "step": 60200 + }, + { + "epoch": 0.9869955560647848, + "grad_norm": 1.3142366409301758, + "learning_rate": 1.566658875546731e-05, + "loss": 0.6629, + "step": 60300 + }, + { + "epoch": 0.9886323646154728, + "grad_norm": 1.3516416549682617, + "learning_rate": 1.565202014691235e-05, + "loss": 0.6664, + "step": 60400 + }, + { + "epoch": 0.9902691731661606, + "grad_norm": 1.2360502481460571, + "learning_rate": 1.5637433889840455e-05, + "loss": 0.6608, + "step": 60500 + }, + { + "epoch": 0.9919059817168485, + "grad_norm": 1.155104398727417, + "learning_rate": 1.562283002979744e-05, + "loss": 0.6676, + "step": 60600 + }, + { + "epoch": 0.9935427902675363, + "grad_norm": 1.2880823612213135, + "learning_rate": 1.560820861238407e-05, + "loss": 0.6632, + "step": 60700 + }, + { + "epoch": 0.9951795988182243, + "grad_norm": 1.2748744487762451, + "learning_rate": 1.5593569683255936e-05, + "loss": 0.6723, + "step": 60800 + }, + { + "epoch": 0.9968164073689121, + "grad_norm": 1.2065379619598389, + "learning_rate": 1.557891328812332e-05, + "loss": 0.6831, + "step": 60900 + }, + { + "epoch": 0.9984532159196, + "grad_norm": 1.143071174621582, + "learning_rate": 1.5564239472751022e-05, + "loss": 0.6656, + "step": 61000 + }, + { + "epoch": 1.0000818404275345, + "grad_norm": 1.1476441621780396, + "learning_rate": 1.5549548282958253e-05, + "loss": 0.6591, + "step": 61100 + }, + { + "epoch": 1.0017186489782222, + "grad_norm": 1.210295557975769, + "learning_rate": 1.5534839764618477e-05, + "loss": 0.6559, + "step": 61200 + }, + { + "epoch": 1.00335545752891, + "grad_norm": 1.5003302097320557, + "learning_rate": 1.5520113963659257e-05, + "loss": 0.6615, + "step": 61300 + }, + { + "epoch": 1.004992266079598, + "grad_norm": 1.235449194908142, + "learning_rate": 1.550537092606212e-05, + "loss": 0.6709, + "step": 61400 + }, + { + "epoch": 1.006629074630286, + "grad_norm": 1.1739157438278198, + "learning_rate": 1.549061069786243e-05, + "loss": 0.668, + "step": 61500 + }, + { + "epoch": 1.0082658831809737, + "grad_norm": 1.2646570205688477, + "learning_rate": 1.5475833325149215e-05, + "loss": 0.6553, + "step": 61600 + }, + { + "epoch": 1.0099026917316616, + "grad_norm": 1.2951397895812988, + "learning_rate": 1.546103885406504e-05, + "loss": 0.6584, + "step": 61700 + }, + { + "epoch": 1.0115395002823495, + "grad_norm": 1.2838189601898193, + "learning_rate": 1.544622733080586e-05, + "loss": 0.6518, + "step": 61800 + }, + { + "epoch": 1.0131763088330374, + "grad_norm": 1.3708552122116089, + "learning_rate": 1.543139880162088e-05, + "loss": 0.6628, + "step": 61900 + }, + { + "epoch": 1.0148131173837251, + "grad_norm": 1.301353931427002, + "learning_rate": 1.54165533128124e-05, + "loss": 0.6478, + "step": 62000 + }, + { + "epoch": 1.016449925934413, + "grad_norm": 1.3044975996017456, + "learning_rate": 1.5401690910735677e-05, + "loss": 0.6439, + "step": 62100 + }, + { + "epoch": 1.018086734485101, + "grad_norm": 1.4568370580673218, + "learning_rate": 1.5386811641798785e-05, + "loss": 0.6482, + "step": 62200 + }, + { + "epoch": 1.0197235430357887, + "grad_norm": 1.3758224248886108, + "learning_rate": 1.5371915552462466e-05, + "loss": 0.663, + "step": 62300 + }, + { + "epoch": 1.0213603515864766, + "grad_norm": 1.6428395509719849, + "learning_rate": 1.535700268923998e-05, + "loss": 0.6533, + "step": 62400 + }, + { + "epoch": 1.0229971601371646, + "grad_norm": 1.3830885887145996, + "learning_rate": 1.5342073098696956e-05, + "loss": 0.6632, + "step": 62500 + }, + { + "epoch": 1.0246339686878525, + "grad_norm": 1.426006555557251, + "learning_rate": 1.5327126827451272e-05, + "loss": 0.6491, + "step": 62600 + }, + { + "epoch": 1.0262707772385402, + "grad_norm": 1.4166696071624756, + "learning_rate": 1.531216392217288e-05, + "loss": 0.6465, + "step": 62700 + }, + { + "epoch": 1.0279075857892281, + "grad_norm": 1.224443793296814, + "learning_rate": 1.529718442958367e-05, + "loss": 0.6642, + "step": 62800 + }, + { + "epoch": 1.029544394339916, + "grad_norm": 1.250406265258789, + "learning_rate": 1.528218839645733e-05, + "loss": 0.6516, + "step": 62900 + }, + { + "epoch": 1.031181202890604, + "grad_norm": 1.2630037069320679, + "learning_rate": 1.52671758696192e-05, + "loss": 0.6649, + "step": 63000 + } + ], + "logging_steps": 100, + "max_steps": 183285, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2.34907099427588e+19, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}