diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,30941 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.999715245742924, + "eval_steps": 2000, + "global_step": 43895, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_loss": 9.391961097717285, + "eval_runtime": 9.2079, + "eval_samples_per_second": 1.629, + "eval_steps_per_second": 0.217, + "step": 0 + }, + { + "epoch": 0.001139017028304573, + "grad_norm": 33.790897369384766, + "learning_rate": 3.5000000000000004e-06, + "loss": 8.9613, + "step": 10 + }, + { + "epoch": 0.002278034056609146, + "grad_norm": 35.671512603759766, + "learning_rate": 8.000000000000001e-06, + "loss": 8.507, + "step": 20 + }, + { + "epoch": 0.0034170510849137197, + "grad_norm": 24.397680282592773, + "learning_rate": 1.3000000000000001e-05, + "loss": 7.9465, + "step": 30 + }, + { + "epoch": 0.004556068113218292, + "grad_norm": 15.802092552185059, + "learning_rate": 1.8e-05, + "loss": 7.3319, + "step": 40 + }, + { + "epoch": 0.005695085141522865, + "grad_norm": 14.520373344421387, + "learning_rate": 2.3000000000000003e-05, + "loss": 7.3491, + "step": 50 + }, + { + "epoch": 0.006834102169827439, + "grad_norm": 14.86996841430664, + "learning_rate": 2.8000000000000003e-05, + "loss": 6.8979, + "step": 60 + }, + { + "epoch": 0.007973119198132012, + "grad_norm": 14.0488920211792, + "learning_rate": 3.3e-05, + "loss": 6.9052, + "step": 70 + }, + { + "epoch": 0.009112136226436585, + "grad_norm": 13.32886028289795, + "learning_rate": 3.8e-05, + "loss": 7.3002, + "step": 80 + }, + { + "epoch": 0.010251153254741158, + "grad_norm": 11.834951400756836, + "learning_rate": 4.3e-05, + "loss": 6.9231, + "step": 90 + }, + { + "epoch": 0.01139017028304573, + "grad_norm": 17.501251220703125, + "learning_rate": 4.8e-05, + "loss": 6.6756, + "step": 100 + }, + { + 
"epoch": 0.012529187311350304, + "grad_norm": 10.640851020812988, + "learning_rate": 4.999314990295696e-05, + "loss": 6.861, + "step": 110 + }, + { + "epoch": 0.013668204339654879, + "grad_norm": 58.233848571777344, + "learning_rate": 4.998173307455189e-05, + "loss": 7.1125, + "step": 120 + }, + { + "epoch": 0.014807221367959452, + "grad_norm": 15.781530380249023, + "learning_rate": 4.997031624614683e-05, + "loss": 7.0402, + "step": 130 + }, + { + "epoch": 0.015946238396264023, + "grad_norm": 10.17588996887207, + "learning_rate": 4.995889941774175e-05, + "loss": 6.9334, + "step": 140 + }, + { + "epoch": 0.017085255424568596, + "grad_norm": 14.133599281311035, + "learning_rate": 4.9947482589336685e-05, + "loss": 6.7971, + "step": 150 + }, + { + "epoch": 0.01822427245287317, + "grad_norm": 10.439510345458984, + "learning_rate": 4.993606576093162e-05, + "loss": 7.2431, + "step": 160 + }, + { + "epoch": 0.019363289481177742, + "grad_norm": 10.945755004882812, + "learning_rate": 4.992464893252655e-05, + "loss": 6.794, + "step": 170 + }, + { + "epoch": 0.020502306509482315, + "grad_norm": 14.820952415466309, + "learning_rate": 4.9913232104121474e-05, + "loss": 6.6915, + "step": 180 + }, + { + "epoch": 0.02164132353778689, + "grad_norm": 13.331741333007812, + "learning_rate": 4.9901815275716405e-05, + "loss": 6.9742, + "step": 190 + }, + { + "epoch": 0.02278034056609146, + "grad_norm": 10.086663246154785, + "learning_rate": 4.989039844731134e-05, + "loss": 6.8214, + "step": 200 + }, + { + "epoch": 0.023919357594396035, + "grad_norm": 8.214897155761719, + "learning_rate": 4.987898161890627e-05, + "loss": 6.899, + "step": 210 + }, + { + "epoch": 0.025058374622700608, + "grad_norm": 9.853551864624023, + "learning_rate": 4.98675647905012e-05, + "loss": 6.893, + "step": 220 + }, + { + "epoch": 0.026197391651005184, + "grad_norm": 8.813733100891113, + "learning_rate": 4.985614796209613e-05, + "loss": 6.6315, + "step": 230 + }, + { + "epoch": 0.027336408679309757, + "grad_norm": 
15.19206714630127, + "learning_rate": 4.9844731133691065e-05, + "loss": 6.2331, + "step": 240 + }, + { + "epoch": 0.02847542570761433, + "grad_norm": 11.389547348022461, + "learning_rate": 4.9833314305285996e-05, + "loss": 6.5654, + "step": 250 + }, + { + "epoch": 0.029614442735918903, + "grad_norm": 99.71858978271484, + "learning_rate": 4.982418084256194e-05, + "loss": 7.0021, + "step": 260 + }, + { + "epoch": 0.030753459764223477, + "grad_norm": 8.42121696472168, + "learning_rate": 4.981276401415687e-05, + "loss": 7.3346, + "step": 270 + }, + { + "epoch": 0.031892476792528046, + "grad_norm": 8.211444854736328, + "learning_rate": 4.98013471857518e-05, + "loss": 6.4807, + "step": 280 + }, + { + "epoch": 0.03303149382083262, + "grad_norm": 32.79572296142578, + "learning_rate": 4.978993035734673e-05, + "loss": 6.1654, + "step": 290 + }, + { + "epoch": 0.03417051084913719, + "grad_norm": 11.948236465454102, + "learning_rate": 4.9778513528941664e-05, + "loss": 6.9469, + "step": 300 + }, + { + "epoch": 0.035309527877441765, + "grad_norm": 8.02177619934082, + "learning_rate": 4.976709670053659e-05, + "loss": 6.8169, + "step": 310 + }, + { + "epoch": 0.03644854490574634, + "grad_norm": 7.713070392608643, + "learning_rate": 4.975567987213152e-05, + "loss": 6.528, + "step": 320 + }, + { + "epoch": 0.03758756193405091, + "grad_norm": 8.871659278869629, + "learning_rate": 4.974426304372646e-05, + "loss": 6.445, + "step": 330 + }, + { + "epoch": 0.038726578962355485, + "grad_norm": 9.975062370300293, + "learning_rate": 4.973284621532139e-05, + "loss": 6.7275, + "step": 340 + }, + { + "epoch": 0.03986559599066006, + "grad_norm": 9.243513107299805, + "learning_rate": 4.9721429386916316e-05, + "loss": 6.3645, + "step": 350 + }, + { + "epoch": 0.04100461301896463, + "grad_norm": 7.769169807434082, + "learning_rate": 4.971001255851125e-05, + "loss": 6.568, + "step": 360 + }, + { + "epoch": 0.042143630047269204, + "grad_norm": 11.605182647705078, + "learning_rate": 
4.969859573010618e-05, + "loss": 6.6352, + "step": 370 + }, + { + "epoch": 0.04328264707557378, + "grad_norm": 8.109936714172363, + "learning_rate": 4.968717890170111e-05, + "loss": 6.7942, + "step": 380 + }, + { + "epoch": 0.04442166410387835, + "grad_norm": 11.485625267028809, + "learning_rate": 4.967576207329604e-05, + "loss": 6.6572, + "step": 390 + }, + { + "epoch": 0.04556068113218292, + "grad_norm": 10.854982376098633, + "learning_rate": 4.966434524489097e-05, + "loss": 6.4459, + "step": 400 + }, + { + "epoch": 0.046699698160487496, + "grad_norm": 13.574692726135254, + "learning_rate": 4.96529284164859e-05, + "loss": 6.4014, + "step": 410 + }, + { + "epoch": 0.04783871518879207, + "grad_norm": 10.07735824584961, + "learning_rate": 4.964151158808084e-05, + "loss": 6.5581, + "step": 420 + }, + { + "epoch": 0.04897773221709664, + "grad_norm": 11.299982070922852, + "learning_rate": 4.9630094759675764e-05, + "loss": 6.6429, + "step": 430 + }, + { + "epoch": 0.050116749245401215, + "grad_norm": 12.302263259887695, + "learning_rate": 4.9618677931270696e-05, + "loss": 6.9176, + "step": 440 + }, + { + "epoch": 0.051255766273705795, + "grad_norm": 280.8349304199219, + "learning_rate": 4.960726110286563e-05, + "loss": 6.115, + "step": 450 + }, + { + "epoch": 0.05239478330201037, + "grad_norm": 11.660781860351562, + "learning_rate": 4.959584427446056e-05, + "loss": 6.8566, + "step": 460 + }, + { + "epoch": 0.05353380033031494, + "grad_norm": 9.093023300170898, + "learning_rate": 4.9584427446055485e-05, + "loss": 6.8361, + "step": 470 + }, + { + "epoch": 0.054672817358619515, + "grad_norm": 11.05521297454834, + "learning_rate": 4.9573010617650417e-05, + "loss": 6.4291, + "step": 480 + }, + { + "epoch": 0.05581183438692409, + "grad_norm": 21.002239227294922, + "learning_rate": 4.956159378924535e-05, + "loss": 6.4612, + "step": 490 + }, + { + "epoch": 0.05695085141522866, + "grad_norm": 10.657567024230957, + "learning_rate": 4.955017696084028e-05, + "loss": 6.3579, + 
"step": 500 + }, + { + "epoch": 0.058089868443533234, + "grad_norm": 11.03357219696045, + "learning_rate": 4.953876013243521e-05, + "loss": 6.4549, + "step": 510 + }, + { + "epoch": 0.05922888547183781, + "grad_norm": 9.404083251953125, + "learning_rate": 4.9527343304030144e-05, + "loss": 6.0746, + "step": 520 + }, + { + "epoch": 0.06036790250014238, + "grad_norm": 7.945953845977783, + "learning_rate": 4.9515926475625076e-05, + "loss": 6.3596, + "step": 530 + }, + { + "epoch": 0.06150691952844695, + "grad_norm": 7.898467063903809, + "learning_rate": 4.950450964722001e-05, + "loss": 5.6804, + "step": 540 + }, + { + "epoch": 0.06264593655675152, + "grad_norm": 15.345819473266602, + "learning_rate": 4.949309281881493e-05, + "loss": 6.6344, + "step": 550 + }, + { + "epoch": 0.06378495358505609, + "grad_norm": 10.704774856567383, + "learning_rate": 4.9481675990409864e-05, + "loss": 6.2971, + "step": 560 + }, + { + "epoch": 0.06492397061336067, + "grad_norm": 18.340755462646484, + "learning_rate": 4.9470259162004796e-05, + "loss": 6.5508, + "step": 570 + }, + { + "epoch": 0.06606298764166524, + "grad_norm": 15.063714981079102, + "learning_rate": 4.945884233359973e-05, + "loss": 6.291, + "step": 580 + }, + { + "epoch": 0.06720200466996981, + "grad_norm": 11.307854652404785, + "learning_rate": 4.944742550519466e-05, + "loss": 6.2575, + "step": 590 + }, + { + "epoch": 0.06834102169827438, + "grad_norm": 9.267541885375977, + "learning_rate": 4.943600867678959e-05, + "loss": 6.1397, + "step": 600 + }, + { + "epoch": 0.06948003872657896, + "grad_norm": 8.097043991088867, + "learning_rate": 4.9424591848384524e-05, + "loss": 6.4305, + "step": 610 + }, + { + "epoch": 0.07061905575488353, + "grad_norm": 14.195074081420898, + "learning_rate": 4.941317501997945e-05, + "loss": 6.1038, + "step": 620 + }, + { + "epoch": 0.0717580727831881, + "grad_norm": 7.2899980545043945, + "learning_rate": 4.940175819157438e-05, + "loss": 6.5986, + "step": 630 + }, + { + "epoch": 
0.07289708981149268, + "grad_norm": 7.332784175872803, + "learning_rate": 4.939034136316931e-05, + "loss": 6.5741, + "step": 640 + }, + { + "epoch": 0.07403610683979725, + "grad_norm": 17.372455596923828, + "learning_rate": 4.9378924534764244e-05, + "loss": 6.4233, + "step": 650 + }, + { + "epoch": 0.07517512386810182, + "grad_norm": 6.290081024169922, + "learning_rate": 4.9367507706359176e-05, + "loss": 6.1218, + "step": 660 + }, + { + "epoch": 0.0763141408964064, + "grad_norm": 11.476178169250488, + "learning_rate": 4.935609087795411e-05, + "loss": 6.3903, + "step": 670 + }, + { + "epoch": 0.07745315792471097, + "grad_norm": 9.72933578491211, + "learning_rate": 4.934467404954904e-05, + "loss": 6.2056, + "step": 680 + }, + { + "epoch": 0.07859217495301554, + "grad_norm": 13.388916015625, + "learning_rate": 4.933325722114397e-05, + "loss": 6.3394, + "step": 690 + }, + { + "epoch": 0.07973119198132012, + "grad_norm": 9.48354434967041, + "learning_rate": 4.93218403927389e-05, + "loss": 6.3088, + "step": 700 + }, + { + "epoch": 0.08087020900962469, + "grad_norm": 8.708009719848633, + "learning_rate": 4.931042356433383e-05, + "loss": 5.4808, + "step": 710 + }, + { + "epoch": 0.08200922603792926, + "grad_norm": 10.767496109008789, + "learning_rate": 4.929900673592876e-05, + "loss": 6.7952, + "step": 720 + }, + { + "epoch": 0.08314824306623383, + "grad_norm": 20.067209243774414, + "learning_rate": 4.928758990752369e-05, + "loss": 6.3908, + "step": 730 + }, + { + "epoch": 0.08428726009453841, + "grad_norm": 9.885820388793945, + "learning_rate": 4.927617307911862e-05, + "loss": 6.3423, + "step": 740 + }, + { + "epoch": 0.08542627712284298, + "grad_norm": 19.079978942871094, + "learning_rate": 4.9264756250713556e-05, + "loss": 6.742, + "step": 750 + }, + { + "epoch": 0.08656529415114755, + "grad_norm": 8.339012145996094, + "learning_rate": 4.925333942230849e-05, + "loss": 6.5489, + "step": 760 + }, + { + "epoch": 0.08770431117945213, + "grad_norm": 6.8558125495910645, + 
"learning_rate": 4.924192259390342e-05, + "loss": 6.2821, + "step": 770 + }, + { + "epoch": 0.0888433282077567, + "grad_norm": 9.564237594604492, + "learning_rate": 4.9230505765498345e-05, + "loss": 6.4705, + "step": 780 + }, + { + "epoch": 0.08998234523606127, + "grad_norm": 8.423482894897461, + "learning_rate": 4.9219088937093276e-05, + "loss": 6.277, + "step": 790 + }, + { + "epoch": 0.09112136226436585, + "grad_norm": 11.994951248168945, + "learning_rate": 4.920767210868821e-05, + "loss": 6.2951, + "step": 800 + }, + { + "epoch": 0.09226037929267042, + "grad_norm": 6.857611179351807, + "learning_rate": 4.919625528028314e-05, + "loss": 6.2825, + "step": 810 + }, + { + "epoch": 0.09339939632097499, + "grad_norm": 10.049439430236816, + "learning_rate": 4.9184838451878065e-05, + "loss": 6.6446, + "step": 820 + }, + { + "epoch": 0.09453841334927957, + "grad_norm": 10.375741958618164, + "learning_rate": 4.9173421623473004e-05, + "loss": 6.4659, + "step": 830 + }, + { + "epoch": 0.09567743037758414, + "grad_norm": 16.795923233032227, + "learning_rate": 4.9162004795067936e-05, + "loss": 6.0484, + "step": 840 + }, + { + "epoch": 0.09681644740588871, + "grad_norm": 9.408246994018555, + "learning_rate": 4.915058796666287e-05, + "loss": 6.2876, + "step": 850 + }, + { + "epoch": 0.09795546443419328, + "grad_norm": 9.622159004211426, + "learning_rate": 4.913917113825779e-05, + "loss": 6.2234, + "step": 860 + }, + { + "epoch": 0.09909448146249786, + "grad_norm": 6.719747543334961, + "learning_rate": 4.9127754309852724e-05, + "loss": 6.6025, + "step": 870 + }, + { + "epoch": 0.10023349849080243, + "grad_norm": 7.316214561462402, + "learning_rate": 4.9116337481447656e-05, + "loss": 6.0127, + "step": 880 + }, + { + "epoch": 0.101372515519107, + "grad_norm": 8.370978355407715, + "learning_rate": 4.910492065304259e-05, + "loss": 6.3505, + "step": 890 + }, + { + "epoch": 0.10251153254741159, + "grad_norm": 9.227510452270508, + "learning_rate": 4.909350382463751e-05, + "loss": 
6.6696, + "step": 900 + }, + { + "epoch": 0.10365054957571616, + "grad_norm": 8.848276138305664, + "learning_rate": 4.908208699623245e-05, + "loss": 6.5989, + "step": 910 + }, + { + "epoch": 0.10478956660402074, + "grad_norm": 6.87146520614624, + "learning_rate": 4.9070670167827383e-05, + "loss": 6.1757, + "step": 920 + }, + { + "epoch": 0.10592858363232531, + "grad_norm": 12.845023155212402, + "learning_rate": 4.9059253339422315e-05, + "loss": 6.155, + "step": 930 + }, + { + "epoch": 0.10706760066062988, + "grad_norm": 8.775230407714844, + "learning_rate": 4.904783651101724e-05, + "loss": 6.6463, + "step": 940 + }, + { + "epoch": 0.10820661768893446, + "grad_norm": 6.301644802093506, + "learning_rate": 4.903641968261217e-05, + "loss": 6.3834, + "step": 950 + }, + { + "epoch": 0.10934563471723903, + "grad_norm": 6.351837635040283, + "learning_rate": 4.9025002854207104e-05, + "loss": 6.0652, + "step": 960 + }, + { + "epoch": 0.1104846517455436, + "grad_norm": 12.989243507385254, + "learning_rate": 4.9013586025802036e-05, + "loss": 6.133, + "step": 970 + }, + { + "epoch": 0.11162366877384818, + "grad_norm": 10.382040977478027, + "learning_rate": 4.900216919739696e-05, + "loss": 6.0956, + "step": 980 + }, + { + "epoch": 0.11276268580215275, + "grad_norm": 10.52625560760498, + "learning_rate": 4.89907523689919e-05, + "loss": 6.7882, + "step": 990 + }, + { + "epoch": 0.11390170283045732, + "grad_norm": 7.049230098724365, + "learning_rate": 4.897933554058683e-05, + "loss": 6.3438, + "step": 1000 + }, + { + "epoch": 0.1150407198587619, + "grad_norm": 6.671297073364258, + "learning_rate": 4.8967918712181756e-05, + "loss": 6.4511, + "step": 1010 + }, + { + "epoch": 0.11617973688706647, + "grad_norm": 6.536120891571045, + "learning_rate": 4.895650188377669e-05, + "loss": 6.1108, + "step": 1020 + }, + { + "epoch": 0.11731875391537104, + "grad_norm": 13.030671119689941, + "learning_rate": 4.894508505537162e-05, + "loss": 6.251, + "step": 1030 + }, + { + "epoch": 
0.11845777094367561, + "grad_norm": 6.829502582550049, + "learning_rate": 4.893366822696655e-05, + "loss": 6.0828, + "step": 1040 + }, + { + "epoch": 0.11959678797198019, + "grad_norm": 7.369673252105713, + "learning_rate": 4.8922251398561484e-05, + "loss": 6.0556, + "step": 1050 + }, + { + "epoch": 0.12073580500028476, + "grad_norm": 8.547041893005371, + "learning_rate": 4.891083457015641e-05, + "loss": 6.1014, + "step": 1060 + }, + { + "epoch": 0.12187482202858933, + "grad_norm": 15.62175464630127, + "learning_rate": 4.889941774175134e-05, + "loss": 6.2186, + "step": 1070 + }, + { + "epoch": 0.1230138390568939, + "grad_norm": 12.673507690429688, + "learning_rate": 4.888800091334628e-05, + "loss": 6.6656, + "step": 1080 + }, + { + "epoch": 0.12415285608519848, + "grad_norm": 6.221761226654053, + "learning_rate": 4.8876584084941204e-05, + "loss": 6.1597, + "step": 1090 + }, + { + "epoch": 0.12529187311350304, + "grad_norm": 6.281069755554199, + "learning_rate": 4.8865167256536136e-05, + "loss": 6.8511, + "step": 1100 + }, + { + "epoch": 0.1264308901418076, + "grad_norm": 10.124175071716309, + "learning_rate": 4.885375042813107e-05, + "loss": 6.1205, + "step": 1110 + }, + { + "epoch": 0.12756990717011218, + "grad_norm": 12.35534381866455, + "learning_rate": 4.8842333599726e-05, + "loss": 6.2648, + "step": 1120 + }, + { + "epoch": 0.12870892419841676, + "grad_norm": 8.898721694946289, + "learning_rate": 4.8830916771320925e-05, + "loss": 6.4631, + "step": 1130 + }, + { + "epoch": 0.12984794122672133, + "grad_norm": 5.725305557250977, + "learning_rate": 4.881949994291586e-05, + "loss": 6.5257, + "step": 1140 + }, + { + "epoch": 0.1309869582550259, + "grad_norm": 6.875019550323486, + "learning_rate": 4.880808311451079e-05, + "loss": 6.4072, + "step": 1150 + }, + { + "epoch": 0.13212597528333048, + "grad_norm": 6.568986415863037, + "learning_rate": 4.879666628610573e-05, + "loss": 6.2429, + "step": 1160 + }, + { + "epoch": 0.13326499231163505, + "grad_norm": 
6.279566287994385, + "learning_rate": 4.878524945770065e-05, + "loss": 6.1154, + "step": 1170 + }, + { + "epoch": 0.13440400933993962, + "grad_norm": 6.139650344848633, + "learning_rate": 4.8773832629295584e-05, + "loss": 6.4415, + "step": 1180 + }, + { + "epoch": 0.1355430263682442, + "grad_norm": 15.926244735717773, + "learning_rate": 4.8762415800890516e-05, + "loss": 5.7659, + "step": 1190 + }, + { + "epoch": 0.13668204339654877, + "grad_norm": 8.014056205749512, + "learning_rate": 4.875099897248545e-05, + "loss": 6.0903, + "step": 1200 + }, + { + "epoch": 0.13782106042485334, + "grad_norm": 8.850829124450684, + "learning_rate": 4.873958214408037e-05, + "loss": 6.1817, + "step": 1210 + }, + { + "epoch": 0.13896007745315792, + "grad_norm": 38.34330368041992, + "learning_rate": 4.8728165315675305e-05, + "loss": 6.6666, + "step": 1220 + }, + { + "epoch": 0.1400990944814625, + "grad_norm": 11.398346900939941, + "learning_rate": 4.8716748487270237e-05, + "loss": 6.2435, + "step": 1230 + }, + { + "epoch": 0.14123811150976706, + "grad_norm": 9.657853126525879, + "learning_rate": 4.8705331658865175e-05, + "loss": 5.9659, + "step": 1240 + }, + { + "epoch": 0.14237712853807163, + "grad_norm": 16.08867645263672, + "learning_rate": 4.86939148304601e-05, + "loss": 6.1505, + "step": 1250 + }, + { + "epoch": 0.1435161455663762, + "grad_norm": 6.016851425170898, + "learning_rate": 4.868249800205503e-05, + "loss": 6.1347, + "step": 1260 + }, + { + "epoch": 0.14465516259468078, + "grad_norm": 7.4131855964660645, + "learning_rate": 4.8671081173649964e-05, + "loss": 6.1454, + "step": 1270 + }, + { + "epoch": 0.14579417962298535, + "grad_norm": 27.70649528503418, + "learning_rate": 4.8659664345244896e-05, + "loss": 6.2655, + "step": 1280 + }, + { + "epoch": 0.14693319665128993, + "grad_norm": 23.359214782714844, + "learning_rate": 4.864824751683982e-05, + "loss": 5.9975, + "step": 1290 + }, + { + "epoch": 0.1480722136795945, + "grad_norm": 7.8880510330200195, + "learning_rate": 
4.863683068843475e-05, + "loss": 6.373, + "step": 1300 + }, + { + "epoch": 0.14921123070789907, + "grad_norm": 8.129782676696777, + "learning_rate": 4.8625413860029684e-05, + "loss": 6.0534, + "step": 1310 + }, + { + "epoch": 0.15035024773620365, + "grad_norm": 5.611935615539551, + "learning_rate": 4.8613997031624616e-05, + "loss": 6.4988, + "step": 1320 + }, + { + "epoch": 0.15148926476450822, + "grad_norm": 6.0331597328186035, + "learning_rate": 4.860258020321955e-05, + "loss": 5.8687, + "step": 1330 + }, + { + "epoch": 0.1526282817928128, + "grad_norm": 6.858291149139404, + "learning_rate": 4.859116337481448e-05, + "loss": 6.0291, + "step": 1340 + }, + { + "epoch": 0.15376729882111737, + "grad_norm": 6.007782936096191, + "learning_rate": 4.857974654640941e-05, + "loss": 5.9691, + "step": 1350 + }, + { + "epoch": 0.15490631584942194, + "grad_norm": 5.932456970214844, + "learning_rate": 4.8568329718004344e-05, + "loss": 6.3551, + "step": 1360 + }, + { + "epoch": 0.1560453328777265, + "grad_norm": 6.164700984954834, + "learning_rate": 4.855691288959927e-05, + "loss": 6.0607, + "step": 1370 + }, + { + "epoch": 0.15718434990603108, + "grad_norm": 6.078372478485107, + "learning_rate": 4.85454960611942e-05, + "loss": 6.1052, + "step": 1380 + }, + { + "epoch": 0.15832336693433566, + "grad_norm": 9.424004554748535, + "learning_rate": 4.853407923278913e-05, + "loss": 6.2598, + "step": 1390 + }, + { + "epoch": 0.15946238396264023, + "grad_norm": 14.353684425354004, + "learning_rate": 4.8522662404384064e-05, + "loss": 6.1377, + "step": 1400 + }, + { + "epoch": 0.1606014009909448, + "grad_norm": 8.278711318969727, + "learning_rate": 4.8511245575978996e-05, + "loss": 6.2278, + "step": 1410 + }, + { + "epoch": 0.16174041801924938, + "grad_norm": 9.254868507385254, + "learning_rate": 4.849982874757393e-05, + "loss": 6.0705, + "step": 1420 + }, + { + "epoch": 0.16287943504755395, + "grad_norm": 9.009742736816406, + "learning_rate": 4.848841191916886e-05, + "loss": 6.3962, + 
"step": 1430 + }, + { + "epoch": 0.16401845207585852, + "grad_norm": 9.298644065856934, + "learning_rate": 4.8476995090763785e-05, + "loss": 6.2293, + "step": 1440 + }, + { + "epoch": 0.1651574691041631, + "grad_norm": 10.836934089660645, + "learning_rate": 4.846557826235872e-05, + "loss": 6.0921, + "step": 1450 + }, + { + "epoch": 0.16629648613246767, + "grad_norm": 19.63772201538086, + "learning_rate": 4.845416143395365e-05, + "loss": 6.4046, + "step": 1460 + }, + { + "epoch": 0.16743550316077224, + "grad_norm": 15.014579772949219, + "learning_rate": 4.844274460554858e-05, + "loss": 6.4525, + "step": 1470 + }, + { + "epoch": 0.16857452018907682, + "grad_norm": 7.850244522094727, + "learning_rate": 4.843132777714351e-05, + "loss": 6.0932, + "step": 1480 + }, + { + "epoch": 0.1697135372173814, + "grad_norm": 7.637505531311035, + "learning_rate": 4.8419910948738444e-05, + "loss": 6.0807, + "step": 1490 + }, + { + "epoch": 0.17085255424568596, + "grad_norm": 5.155618667602539, + "learning_rate": 4.8408494120333376e-05, + "loss": 6.2082, + "step": 1500 + }, + { + "epoch": 0.17199157127399053, + "grad_norm": 8.438528060913086, + "learning_rate": 4.839707729192831e-05, + "loss": 6.0389, + "step": 1510 + }, + { + "epoch": 0.1731305883022951, + "grad_norm": 31.926239013671875, + "learning_rate": 4.838566046352323e-05, + "loss": 5.9333, + "step": 1520 + }, + { + "epoch": 0.17426960533059968, + "grad_norm": 5.836406707763672, + "learning_rate": 4.8374243635118165e-05, + "loss": 6.0942, + "step": 1530 + }, + { + "epoch": 0.17540862235890425, + "grad_norm": 8.088964462280273, + "learning_rate": 4.8362826806713096e-05, + "loss": 6.2754, + "step": 1540 + }, + { + "epoch": 0.17654763938720883, + "grad_norm": 6.764881610870361, + "learning_rate": 4.835140997830803e-05, + "loss": 5.8569, + "step": 1550 + }, + { + "epoch": 0.1776866564155134, + "grad_norm": 4.88889217376709, + "learning_rate": 4.833999314990296e-05, + "loss": 6.378, + "step": 1560 + }, + { + "epoch": 
0.17882567344381797, + "grad_norm": 8.175009727478027, + "learning_rate": 4.832857632149789e-05, + "loss": 6.2938, + "step": 1570 + }, + { + "epoch": 0.17996469047212255, + "grad_norm": 7.189762115478516, + "learning_rate": 4.8317159493092824e-05, + "loss": 6.1562, + "step": 1580 + }, + { + "epoch": 0.18110370750042712, + "grad_norm": 6.389930248260498, + "learning_rate": 4.8305742664687756e-05, + "loss": 6.2245, + "step": 1590 + }, + { + "epoch": 0.1822427245287317, + "grad_norm": 9.998733520507812, + "learning_rate": 4.829432583628268e-05, + "loss": 5.7088, + "step": 1600 + }, + { + "epoch": 0.18338174155703627, + "grad_norm": 9.267230033874512, + "learning_rate": 4.828290900787761e-05, + "loss": 5.9327, + "step": 1610 + }, + { + "epoch": 0.18452075858534084, + "grad_norm": 7.547935485839844, + "learning_rate": 4.8271492179472544e-05, + "loss": 5.8324, + "step": 1620 + }, + { + "epoch": 0.1856597756136454, + "grad_norm": 10.875968933105469, + "learning_rate": 4.8260075351067476e-05, + "loss": 6.0532, + "step": 1630 + }, + { + "epoch": 0.18679879264194998, + "grad_norm": 7.93349027633667, + "learning_rate": 4.82486585226624e-05, + "loss": 6.4827, + "step": 1640 + }, + { + "epoch": 0.18793780967025456, + "grad_norm": 7.426550388336182, + "learning_rate": 4.823724169425734e-05, + "loss": 6.1161, + "step": 1650 + }, + { + "epoch": 0.18907682669855913, + "grad_norm": 5.887362003326416, + "learning_rate": 4.822582486585227e-05, + "loss": 6.1616, + "step": 1660 + }, + { + "epoch": 0.1902158437268637, + "grad_norm": 9.981441497802734, + "learning_rate": 4.8214408037447204e-05, + "loss": 6.0732, + "step": 1670 + }, + { + "epoch": 0.19135486075516828, + "grad_norm": 7.465058326721191, + "learning_rate": 4.820299120904213e-05, + "loss": 6.1012, + "step": 1680 + }, + { + "epoch": 0.19249387778347285, + "grad_norm": 14.853341102600098, + "learning_rate": 4.819157438063706e-05, + "loss": 6.6688, + "step": 1690 + }, + { + "epoch": 0.19363289481177742, + "grad_norm": 
9.4944486618042, + "learning_rate": 4.818015755223199e-05, + "loss": 6.0464, + "step": 1700 + }, + { + "epoch": 0.194771911840082, + "grad_norm": 6.113044261932373, + "learning_rate": 4.8168740723826924e-05, + "loss": 6.0747, + "step": 1710 + }, + { + "epoch": 0.19591092886838657, + "grad_norm": 11.900188446044922, + "learning_rate": 4.815732389542185e-05, + "loss": 6.1851, + "step": 1720 + }, + { + "epoch": 0.19704994589669114, + "grad_norm": 8.415531158447266, + "learning_rate": 4.814590706701679e-05, + "loss": 5.9874, + "step": 1730 + }, + { + "epoch": 0.19818896292499572, + "grad_norm": 5.007725238800049, + "learning_rate": 4.813449023861172e-05, + "loss": 6.3246, + "step": 1740 + }, + { + "epoch": 0.1993279799533003, + "grad_norm": 7.706382751464844, + "learning_rate": 4.812307341020665e-05, + "loss": 6.3739, + "step": 1750 + }, + { + "epoch": 0.20046699698160486, + "grad_norm": 6.697893142700195, + "learning_rate": 4.8111656581801577e-05, + "loss": 6.3933, + "step": 1760 + }, + { + "epoch": 0.20160601400990943, + "grad_norm": 8.579668998718262, + "learning_rate": 4.810023975339651e-05, + "loss": 5.9253, + "step": 1770 + }, + { + "epoch": 0.202745031038214, + "grad_norm": 7.312700271606445, + "learning_rate": 4.808882292499144e-05, + "loss": 5.995, + "step": 1780 + }, + { + "epoch": 0.20388404806651858, + "grad_norm": 12.99907112121582, + "learning_rate": 4.807740609658637e-05, + "loss": 6.023, + "step": 1790 + }, + { + "epoch": 0.20502306509482318, + "grad_norm": 7.628302097320557, + "learning_rate": 4.80659892681813e-05, + "loss": 6.1802, + "step": 1800 + }, + { + "epoch": 0.20616208212312775, + "grad_norm": 8.317530632019043, + "learning_rate": 4.805457243977623e-05, + "loss": 6.2685, + "step": 1810 + }, + { + "epoch": 0.20730109915143233, + "grad_norm": 14.762248039245605, + "learning_rate": 4.804315561137117e-05, + "loss": 6.1036, + "step": 1820 + }, + { + "epoch": 0.2084401161797369, + "grad_norm": 6.367677688598633, + "learning_rate": 
4.803173878296609e-05, + "loss": 5.9443, + "step": 1830 + }, + { + "epoch": 0.20957913320804147, + "grad_norm": 8.055495262145996, + "learning_rate": 4.8020321954561024e-05, + "loss": 6.0813, + "step": 1840 + }, + { + "epoch": 0.21071815023634605, + "grad_norm": 8.737330436706543, + "learning_rate": 4.8008905126155956e-05, + "loss": 6.2417, + "step": 1850 + }, + { + "epoch": 0.21185716726465062, + "grad_norm": 7.428030490875244, + "learning_rate": 4.799748829775089e-05, + "loss": 5.9609, + "step": 1860 + }, + { + "epoch": 0.2129961842929552, + "grad_norm": 6.4950480461120605, + "learning_rate": 4.798607146934582e-05, + "loss": 6.1071, + "step": 1870 + }, + { + "epoch": 0.21413520132125977, + "grad_norm": 13.422469139099121, + "learning_rate": 4.7974654640940745e-05, + "loss": 6.174, + "step": 1880 + }, + { + "epoch": 0.21527421834956434, + "grad_norm": 5.788820743560791, + "learning_rate": 4.796323781253568e-05, + "loss": 6.2645, + "step": 1890 + }, + { + "epoch": 0.2164132353778689, + "grad_norm": 8.139408111572266, + "learning_rate": 4.7951820984130615e-05, + "loss": 6.165, + "step": 1900 + }, + { + "epoch": 0.21755225240617349, + "grad_norm": 5.652318954467773, + "learning_rate": 4.794040415572554e-05, + "loss": 6.0668, + "step": 1910 + }, + { + "epoch": 0.21869126943447806, + "grad_norm": 7.127692699432373, + "learning_rate": 4.792898732732047e-05, + "loss": 6.0703, + "step": 1920 + }, + { + "epoch": 0.21983028646278263, + "grad_norm": 8.72547435760498, + "learning_rate": 4.7917570498915404e-05, + "loss": 5.8692, + "step": 1930 + }, + { + "epoch": 0.2209693034910872, + "grad_norm": 11.749885559082031, + "learning_rate": 4.7906153670510336e-05, + "loss": 6.355, + "step": 1940 + }, + { + "epoch": 0.22210832051939178, + "grad_norm": 9.093997955322266, + "learning_rate": 4.789473684210526e-05, + "loss": 6.0883, + "step": 1950 + }, + { + "epoch": 0.22324733754769635, + "grad_norm": 7.588223457336426, + "learning_rate": 4.788332001370019e-05, + "loss": 6.3683, + 
"step": 1960 + }, + { + "epoch": 0.22438635457600092, + "grad_norm": 7.856176376342773, + "learning_rate": 4.7871903185295125e-05, + "loss": 5.9866, + "step": 1970 + }, + { + "epoch": 0.2255253716043055, + "grad_norm": 7.1797404289245605, + "learning_rate": 4.7860486356890063e-05, + "loss": 6.1741, + "step": 1980 + }, + { + "epoch": 0.22666438863261007, + "grad_norm": 7.769150733947754, + "learning_rate": 4.784906952848499e-05, + "loss": 6.0692, + "step": 1990 + }, + { + "epoch": 0.22780340566091464, + "grad_norm": 5.899435997009277, + "learning_rate": 4.783765270007992e-05, + "loss": 5.9925, + "step": 2000 + }, + { + "epoch": 0.22780340566091464, + "eval_loss": 6.29394006729126, + "eval_runtime": 11.0939, + "eval_samples_per_second": 1.352, + "eval_steps_per_second": 0.18, + "step": 2000 + }, + { + "epoch": 0.22894242268921922, + "grad_norm": 7.498287677764893, + "learning_rate": 4.782623587167485e-05, + "loss": 5.915, + "step": 2010 + }, + { + "epoch": 0.2300814397175238, + "grad_norm": 8.568222045898438, + "learning_rate": 4.7814819043269784e-05, + "loss": 6.0853, + "step": 2020 + }, + { + "epoch": 0.23122045674582836, + "grad_norm": 6.620724201202393, + "learning_rate": 4.780340221486471e-05, + "loss": 5.7249, + "step": 2030 + }, + { + "epoch": 0.23235947377413294, + "grad_norm": 10.718255996704102, + "learning_rate": 4.779198538645964e-05, + "loss": 6.023, + "step": 2040 + }, + { + "epoch": 0.2334984908024375, + "grad_norm": 5.444962024688721, + "learning_rate": 4.778056855805457e-05, + "loss": 6.0138, + "step": 2050 + }, + { + "epoch": 0.23463750783074208, + "grad_norm": 7.1495041847229, + "learning_rate": 4.776915172964951e-05, + "loss": 6.0932, + "step": 2060 + }, + { + "epoch": 0.23577652485904665, + "grad_norm": 8.905622482299805, + "learning_rate": 4.7757734901244436e-05, + "loss": 6.1093, + "step": 2070 + }, + { + "epoch": 0.23691554188735123, + "grad_norm": 6.666223526000977, + "learning_rate": 4.774631807283937e-05, + "loss": 5.9837, + "step": 2080 + 
}, + { + "epoch": 0.2380545589156558, + "grad_norm": 11.08031940460205, + "learning_rate": 4.77349012444343e-05, + "loss": 6.0752, + "step": 2090 + }, + { + "epoch": 0.23919357594396037, + "grad_norm": 7.527054786682129, + "learning_rate": 4.772348441602923e-05, + "loss": 5.9818, + "step": 2100 + }, + { + "epoch": 0.24033259297226495, + "grad_norm": 6.239260673522949, + "learning_rate": 4.771206758762416e-05, + "loss": 6.0345, + "step": 2110 + }, + { + "epoch": 0.24147161000056952, + "grad_norm": 13.21486759185791, + "learning_rate": 4.770065075921909e-05, + "loss": 5.9158, + "step": 2120 + }, + { + "epoch": 0.2426106270288741, + "grad_norm": 6.5013322830200195, + "learning_rate": 4.768923393081402e-05, + "loss": 6.1012, + "step": 2130 + }, + { + "epoch": 0.24374964405717867, + "grad_norm": 7.801065444946289, + "learning_rate": 4.767781710240895e-05, + "loss": 5.8506, + "step": 2140 + }, + { + "epoch": 0.24488866108548324, + "grad_norm": 11.960895538330078, + "learning_rate": 4.7666400274003884e-05, + "loss": 6.1912, + "step": 2150 + }, + { + "epoch": 0.2460276781137878, + "grad_norm": 5.73373556137085, + "learning_rate": 4.7654983445598816e-05, + "loss": 6.1602, + "step": 2160 + }, + { + "epoch": 0.24716669514209239, + "grad_norm": 6.799802303314209, + "learning_rate": 4.764356661719375e-05, + "loss": 6.1822, + "step": 2170 + }, + { + "epoch": 0.24830571217039696, + "grad_norm": 10.62204360961914, + "learning_rate": 4.763214978878868e-05, + "loss": 6.0969, + "step": 2180 + }, + { + "epoch": 0.24944472919870153, + "grad_norm": 16.851665496826172, + "learning_rate": 4.7620732960383605e-05, + "loss": 6.0222, + "step": 2190 + }, + { + "epoch": 0.2505837462270061, + "grad_norm": 12.263028144836426, + "learning_rate": 4.760931613197854e-05, + "loss": 6.2636, + "step": 2200 + }, + { + "epoch": 0.2517227632553107, + "grad_norm": 5.9286370277404785, + "learning_rate": 4.759789930357347e-05, + "loss": 6.2338, + "step": 2210 + }, + { + "epoch": 0.2528617802836152, + 
"grad_norm": 4.939316272735596, + "learning_rate": 4.75864824751684e-05, + "loss": 6.1357, + "step": 2220 + }, + { + "epoch": 0.2540007973119198, + "grad_norm": 6.26075553894043, + "learning_rate": 4.757506564676333e-05, + "loss": 6.0128, + "step": 2230 + }, + { + "epoch": 0.25513981434022437, + "grad_norm": 6.404390811920166, + "learning_rate": 4.7563648818358264e-05, + "loss": 6.0365, + "step": 2240 + }, + { + "epoch": 0.25627883136852897, + "grad_norm": 8.060712814331055, + "learning_rate": 4.7552231989953196e-05, + "loss": 5.8035, + "step": 2250 + }, + { + "epoch": 0.2574178483968335, + "grad_norm": 7.4588751792907715, + "learning_rate": 4.754081516154813e-05, + "loss": 6.0485, + "step": 2260 + }, + { + "epoch": 0.2585568654251381, + "grad_norm": 9.267732620239258, + "learning_rate": 4.752939833314305e-05, + "loss": 6.0969, + "step": 2270 + }, + { + "epoch": 0.25969588245344266, + "grad_norm": 6.651428699493408, + "learning_rate": 4.7517981504737985e-05, + "loss": 6.4401, + "step": 2280 + }, + { + "epoch": 0.26083489948174726, + "grad_norm": 13.987489700317383, + "learning_rate": 4.7506564676332916e-05, + "loss": 6.0228, + "step": 2290 + }, + { + "epoch": 0.2619739165100518, + "grad_norm": 8.124069213867188, + "learning_rate": 4.749514784792785e-05, + "loss": 6.1741, + "step": 2300 + }, + { + "epoch": 0.2631129335383564, + "grad_norm": 5.7567524909973145, + "learning_rate": 4.748373101952278e-05, + "loss": 6.6153, + "step": 2310 + }, + { + "epoch": 0.26425195056666095, + "grad_norm": 8.237354278564453, + "learning_rate": 4.747231419111771e-05, + "loss": 6.9511, + "step": 2320 + }, + { + "epoch": 0.26539096759496555, + "grad_norm": 9.20639705657959, + "learning_rate": 4.7460897362712644e-05, + "loss": 6.3056, + "step": 2330 + }, + { + "epoch": 0.2665299846232701, + "grad_norm": 5.040071964263916, + "learning_rate": 4.744948053430757e-05, + "loss": 6.1363, + "step": 2340 + }, + { + "epoch": 0.2676690016515747, + "grad_norm": 6.3060808181762695, + "learning_rate": 
4.74380637059025e-05, + "loss": 6.3329, + "step": 2350 + }, + { + "epoch": 0.26880801867987925, + "grad_norm": 5.8059306144714355, + "learning_rate": 4.742664687749743e-05, + "loss": 6.1995, + "step": 2360 + }, + { + "epoch": 0.26994703570818385, + "grad_norm": 6.453045845031738, + "learning_rate": 4.7415230049092364e-05, + "loss": 5.8708, + "step": 2370 + }, + { + "epoch": 0.2710860527364884, + "grad_norm": 10.587589263916016, + "learning_rate": 4.7403813220687296e-05, + "loss": 5.8575, + "step": 2380 + }, + { + "epoch": 0.272225069764793, + "grad_norm": 9.980673789978027, + "learning_rate": 4.739239639228223e-05, + "loss": 6.0367, + "step": 2390 + }, + { + "epoch": 0.27336408679309754, + "grad_norm": 7.341554164886475, + "learning_rate": 4.738097956387716e-05, + "loss": 6.0386, + "step": 2400 + }, + { + "epoch": 0.27450310382140214, + "grad_norm": 6.8661627769470215, + "learning_rate": 4.736956273547209e-05, + "loss": 6.1036, + "step": 2410 + }, + { + "epoch": 0.2756421208497067, + "grad_norm": 5.913212299346924, + "learning_rate": 4.735814590706702e-05, + "loss": 5.9632, + "step": 2420 + }, + { + "epoch": 0.2767811378780113, + "grad_norm": 6.647164821624756, + "learning_rate": 4.734672907866195e-05, + "loss": 5.5648, + "step": 2430 + }, + { + "epoch": 0.27792015490631583, + "grad_norm": 10.95175552368164, + "learning_rate": 4.733531225025688e-05, + "loss": 5.9986, + "step": 2440 + }, + { + "epoch": 0.27905917193462043, + "grad_norm": 6.893737316131592, + "learning_rate": 4.732389542185181e-05, + "loss": 6.2376, + "step": 2450 + }, + { + "epoch": 0.280198188962925, + "grad_norm": 7.816811561584473, + "learning_rate": 4.731247859344674e-05, + "loss": 5.9885, + "step": 2460 + }, + { + "epoch": 0.2813372059912296, + "grad_norm": 6.8919782638549805, + "learning_rate": 4.730106176504167e-05, + "loss": 5.9962, + "step": 2470 + }, + { + "epoch": 0.2824762230195341, + "grad_norm": 5.464484214782715, + "learning_rate": 4.728964493663661e-05, + "loss": 6.105, + "step": 
2480 + }, + { + "epoch": 0.2836152400478387, + "grad_norm": 7.527377605438232, + "learning_rate": 4.727822810823154e-05, + "loss": 6.2536, + "step": 2490 + }, + { + "epoch": 0.28475425707614327, + "grad_norm": 6.423839569091797, + "learning_rate": 4.7266811279826465e-05, + "loss": 5.8717, + "step": 2500 + }, + { + "epoch": 0.28589327410444787, + "grad_norm": 6.446660041809082, + "learning_rate": 4.7255394451421397e-05, + "loss": 6.3294, + "step": 2510 + }, + { + "epoch": 0.2870322911327524, + "grad_norm": 14.388496398925781, + "learning_rate": 4.724397762301633e-05, + "loss": 6.1426, + "step": 2520 + }, + { + "epoch": 0.288171308161057, + "grad_norm": 6.65110445022583, + "learning_rate": 4.723256079461126e-05, + "loss": 5.8097, + "step": 2530 + }, + { + "epoch": 0.28931032518936156, + "grad_norm": 10.555778503417969, + "learning_rate": 4.7221143966206185e-05, + "loss": 6.128, + "step": 2540 + }, + { + "epoch": 0.29044934221766616, + "grad_norm": 11.9190092086792, + "learning_rate": 4.720972713780112e-05, + "loss": 6.2016, + "step": 2550 + }, + { + "epoch": 0.2915883592459707, + "grad_norm": 7.404531955718994, + "learning_rate": 4.7198310309396056e-05, + "loss": 5.8386, + "step": 2560 + }, + { + "epoch": 0.2927273762742753, + "grad_norm": 14.28420639038086, + "learning_rate": 4.718689348099099e-05, + "loss": 5.8982, + "step": 2570 + }, + { + "epoch": 0.29386639330257985, + "grad_norm": 8.460867881774902, + "learning_rate": 4.717547665258591e-05, + "loss": 6.0927, + "step": 2580 + }, + { + "epoch": 0.29500541033088445, + "grad_norm": 6.630770683288574, + "learning_rate": 4.7164059824180844e-05, + "loss": 6.0046, + "step": 2590 + }, + { + "epoch": 0.296144427359189, + "grad_norm": 15.487954139709473, + "learning_rate": 4.7152642995775776e-05, + "loss": 6.0306, + "step": 2600 + }, + { + "epoch": 0.2972834443874936, + "grad_norm": 8.283817291259766, + "learning_rate": 4.714122616737071e-05, + "loss": 5.7432, + "step": 2610 + }, + { + "epoch": 0.29842246141579815, + 
"grad_norm": 6.828223705291748, + "learning_rate": 4.712980933896563e-05, + "loss": 6.3253, + "step": 2620 + }, + { + "epoch": 0.29956147844410275, + "grad_norm": 5.298694610595703, + "learning_rate": 4.7118392510560565e-05, + "loss": 6.0055, + "step": 2630 + }, + { + "epoch": 0.3007004954724073, + "grad_norm": 14.596810340881348, + "learning_rate": 4.7106975682155504e-05, + "loss": 6.2159, + "step": 2640 + }, + { + "epoch": 0.3018395125007119, + "grad_norm": 10.004966735839844, + "learning_rate": 4.7095558853750436e-05, + "loss": 5.9085, + "step": 2650 + }, + { + "epoch": 0.30297852952901644, + "grad_norm": 23.831050872802734, + "learning_rate": 4.708414202534536e-05, + "loss": 6.4296, + "step": 2660 + }, + { + "epoch": 0.30411754655732104, + "grad_norm": 13.838797569274902, + "learning_rate": 4.707272519694029e-05, + "loss": 6.0649, + "step": 2670 + }, + { + "epoch": 0.3052565635856256, + "grad_norm": 6.441380023956299, + "learning_rate": 4.7061308368535224e-05, + "loss": 5.6616, + "step": 2680 + }, + { + "epoch": 0.3063955806139302, + "grad_norm": 7.748492240905762, + "learning_rate": 4.7049891540130156e-05, + "loss": 5.9879, + "step": 2690 + }, + { + "epoch": 0.30753459764223473, + "grad_norm": 6.974376201629639, + "learning_rate": 4.703847471172508e-05, + "loss": 5.8898, + "step": 2700 + }, + { + "epoch": 0.30867361467053933, + "grad_norm": 9.72148323059082, + "learning_rate": 4.702705788332001e-05, + "loss": 5.9921, + "step": 2710 + }, + { + "epoch": 0.3098126316988439, + "grad_norm": 6.207435131072998, + "learning_rate": 4.701564105491495e-05, + "loss": 5.8311, + "step": 2720 + }, + { + "epoch": 0.3109516487271485, + "grad_norm": 8.65808391571045, + "learning_rate": 4.700422422650988e-05, + "loss": 5.8449, + "step": 2730 + }, + { + "epoch": 0.312090665755453, + "grad_norm": 5.72158145904541, + "learning_rate": 4.699280739810481e-05, + "loss": 5.8683, + "step": 2740 + }, + { + "epoch": 0.3132296827837576, + "grad_norm": 10.032368659973145, + "learning_rate": 
4.698139056969974e-05, + "loss": 5.9892, + "step": 2750 + }, + { + "epoch": 0.31436869981206217, + "grad_norm": 4.539613723754883, + "learning_rate": 4.696997374129467e-05, + "loss": 6.9231, + "step": 2760 + }, + { + "epoch": 0.31550771684036677, + "grad_norm": 6.816250801086426, + "learning_rate": 4.6958556912889604e-05, + "loss": 5.9322, + "step": 2770 + }, + { + "epoch": 0.3166467338686713, + "grad_norm": 7.319843292236328, + "learning_rate": 4.694714008448453e-05, + "loss": 6.1575, + "step": 2780 + }, + { + "epoch": 0.3177857508969759, + "grad_norm": 8.64138126373291, + "learning_rate": 4.693572325607946e-05, + "loss": 6.1458, + "step": 2790 + }, + { + "epoch": 0.31892476792528046, + "grad_norm": 25.99393081665039, + "learning_rate": 4.692430642767439e-05, + "loss": 5.9521, + "step": 2800 + }, + { + "epoch": 0.32006378495358506, + "grad_norm": 5.670660018920898, + "learning_rate": 4.6912889599269325e-05, + "loss": 5.9455, + "step": 2810 + }, + { + "epoch": 0.3212028019818896, + "grad_norm": 5.817745208740234, + "learning_rate": 4.6901472770864256e-05, + "loss": 5.7616, + "step": 2820 + }, + { + "epoch": 0.3223418190101942, + "grad_norm": 6.353646755218506, + "learning_rate": 4.689005594245919e-05, + "loss": 6.0848, + "step": 2830 + }, + { + "epoch": 0.32348083603849875, + "grad_norm": 6.609086513519287, + "learning_rate": 4.687863911405412e-05, + "loss": 6.3497, + "step": 2840 + }, + { + "epoch": 0.32461985306680335, + "grad_norm": 6.302474498748779, + "learning_rate": 4.6867222285649045e-05, + "loss": 5.9555, + "step": 2850 + }, + { + "epoch": 0.3257588700951079, + "grad_norm": 4.505662441253662, + "learning_rate": 4.685580545724398e-05, + "loss": 6.1585, + "step": 2860 + }, + { + "epoch": 0.3268978871234125, + "grad_norm": 7.946392059326172, + "learning_rate": 4.684438862883891e-05, + "loss": 5.8707, + "step": 2870 + }, + { + "epoch": 0.32803690415171705, + "grad_norm": 5.582015037536621, + "learning_rate": 4.683297180043384e-05, + "loss": 6.0129, + "step": 
2880 + }, + { + "epoch": 0.32917592118002165, + "grad_norm": 11.491278648376465, + "learning_rate": 4.682155497202877e-05, + "loss": 6.0328, + "step": 2890 + }, + { + "epoch": 0.3303149382083262, + "grad_norm": 5.488585948944092, + "learning_rate": 4.6810138143623704e-05, + "loss": 6.3148, + "step": 2900 + }, + { + "epoch": 0.3314539552366308, + "grad_norm": 7.975997447967529, + "learning_rate": 4.6798721315218636e-05, + "loss": 5.7636, + "step": 2910 + }, + { + "epoch": 0.33259297226493534, + "grad_norm": 7.250860214233398, + "learning_rate": 4.678730448681357e-05, + "loss": 5.8292, + "step": 2920 + }, + { + "epoch": 0.33373198929323994, + "grad_norm": 5.324447154998779, + "learning_rate": 4.677588765840849e-05, + "loss": 5.905, + "step": 2930 + }, + { + "epoch": 0.3348710063215445, + "grad_norm": 9.724185943603516, + "learning_rate": 4.6764470830003425e-05, + "loss": 6.2338, + "step": 2940 + }, + { + "epoch": 0.3360100233498491, + "grad_norm": 6.259258270263672, + "learning_rate": 4.675305400159836e-05, + "loss": 5.9365, + "step": 2950 + }, + { + "epoch": 0.33714904037815363, + "grad_norm": 13.341094017028809, + "learning_rate": 4.674163717319329e-05, + "loss": 5.9414, + "step": 2960 + }, + { + "epoch": 0.33828805740645823, + "grad_norm": 8.9069185256958, + "learning_rate": 4.673022034478822e-05, + "loss": 5.9498, + "step": 2970 + }, + { + "epoch": 0.3394270744347628, + "grad_norm": 8.157875061035156, + "learning_rate": 4.671880351638315e-05, + "loss": 5.8834, + "step": 2980 + }, + { + "epoch": 0.3405660914630674, + "grad_norm": 15.065528869628906, + "learning_rate": 4.6707386687978084e-05, + "loss": 5.9062, + "step": 2990 + }, + { + "epoch": 0.3417051084913719, + "grad_norm": 7.683790683746338, + "learning_rate": 4.6695969859573016e-05, + "loss": 5.7754, + "step": 3000 + }, + { + "epoch": 0.3428441255196765, + "grad_norm": 6.353583335876465, + "learning_rate": 4.668455303116794e-05, + "loss": 6.0532, + "step": 3010 + }, + { + "epoch": 0.34398314254798107, + 
"grad_norm": 5.4052629470825195, + "learning_rate": 4.667313620276287e-05, + "loss": 5.9156, + "step": 3020 + }, + { + "epoch": 0.34512215957628567, + "grad_norm": 9.244784355163574, + "learning_rate": 4.6661719374357805e-05, + "loss": 6.04, + "step": 3030 + }, + { + "epoch": 0.3462611766045902, + "grad_norm": 5.430369853973389, + "learning_rate": 4.6650302545952737e-05, + "loss": 5.9407, + "step": 3040 + }, + { + "epoch": 0.3474001936328948, + "grad_norm": 12.265975952148438, + "learning_rate": 4.663888571754767e-05, + "loss": 6.3226, + "step": 3050 + }, + { + "epoch": 0.34853921066119936, + "grad_norm": 6.474460601806641, + "learning_rate": 4.66274688891426e-05, + "loss": 6.2523, + "step": 3060 + }, + { + "epoch": 0.34967822768950396, + "grad_norm": 13.527132034301758, + "learning_rate": 4.661605206073753e-05, + "loss": 6.0565, + "step": 3070 + }, + { + "epoch": 0.3508172447178085, + "grad_norm": 6.607095241546631, + "learning_rate": 4.6604635232332464e-05, + "loss": 6.5046, + "step": 3080 + }, + { + "epoch": 0.3519562617461131, + "grad_norm": 6.4882354736328125, + "learning_rate": 4.659321840392739e-05, + "loss": 6.1059, + "step": 3090 + }, + { + "epoch": 0.35309527877441765, + "grad_norm": 10.648322105407715, + "learning_rate": 4.658180157552232e-05, + "loss": 5.9012, + "step": 3100 + }, + { + "epoch": 0.35423429580272225, + "grad_norm": 7.89484977722168, + "learning_rate": 4.657038474711725e-05, + "loss": 6.2262, + "step": 3110 + }, + { + "epoch": 0.3553733128310268, + "grad_norm": 6.660143852233887, + "learning_rate": 4.6558967918712184e-05, + "loss": 5.8987, + "step": 3120 + }, + { + "epoch": 0.3565123298593314, + "grad_norm": 6.835406303405762, + "learning_rate": 4.654755109030711e-05, + "loss": 5.7315, + "step": 3130 + }, + { + "epoch": 0.35765134688763595, + "grad_norm": 10.808911323547363, + "learning_rate": 4.653613426190205e-05, + "loss": 5.8631, + "step": 3140 + }, + { + "epoch": 0.35879036391594055, + "grad_norm": 6.554528713226318, + 
"learning_rate": 4.652471743349698e-05, + "loss": 6.1735, + "step": 3150 + }, + { + "epoch": 0.3599293809442451, + "grad_norm": 6.966010570526123, + "learning_rate": 4.651330060509191e-05, + "loss": 6.2185, + "step": 3160 + }, + { + "epoch": 0.3610683979725497, + "grad_norm": 5.658500671386719, + "learning_rate": 4.650188377668684e-05, + "loss": 6.3764, + "step": 3170 + }, + { + "epoch": 0.36220741500085424, + "grad_norm": 6.941939353942871, + "learning_rate": 4.649046694828177e-05, + "loss": 5.8302, + "step": 3180 + }, + { + "epoch": 0.36334643202915884, + "grad_norm": 10.871665954589844, + "learning_rate": 4.64790501198767e-05, + "loss": 5.7199, + "step": 3190 + }, + { + "epoch": 0.3644854490574634, + "grad_norm": 9.147170066833496, + "learning_rate": 4.646763329147163e-05, + "loss": 5.851, + "step": 3200 + }, + { + "epoch": 0.365624466085768, + "grad_norm": 5.908941268920898, + "learning_rate": 4.645621646306656e-05, + "loss": 6.2582, + "step": 3210 + }, + { + "epoch": 0.36676348311407253, + "grad_norm": 5.497340202331543, + "learning_rate": 4.6444799634661496e-05, + "loss": 6.0959, + "step": 3220 + }, + { + "epoch": 0.36790250014237713, + "grad_norm": 7.915394306182861, + "learning_rate": 4.643338280625643e-05, + "loss": 6.0134, + "step": 3230 + }, + { + "epoch": 0.3690415171706817, + "grad_norm": 6.310519218444824, + "learning_rate": 4.642196597785135e-05, + "loss": 5.519, + "step": 3240 + }, + { + "epoch": 0.3701805341989863, + "grad_norm": 6.008285045623779, + "learning_rate": 4.6410549149446285e-05, + "loss": 6.2746, + "step": 3250 + }, + { + "epoch": 0.3713195512272908, + "grad_norm": 6.441627502441406, + "learning_rate": 4.6399132321041217e-05, + "loss": 6.4277, + "step": 3260 + }, + { + "epoch": 0.3724585682555954, + "grad_norm": 5.518830299377441, + "learning_rate": 4.638771549263615e-05, + "loss": 5.963, + "step": 3270 + }, + { + "epoch": 0.37359758528389997, + "grad_norm": 5.965517997741699, + "learning_rate": 4.637629866423108e-05, + "loss": 5.7866, 
+ "step": 3280 + }, + { + "epoch": 0.37473660231220457, + "grad_norm": 6.320878028869629, + "learning_rate": 4.6364881835826005e-05, + "loss": 6.1882, + "step": 3290 + }, + { + "epoch": 0.3758756193405091, + "grad_norm": 13.0669584274292, + "learning_rate": 4.6353465007420944e-05, + "loss": 6.1091, + "step": 3300 + }, + { + "epoch": 0.3770146363688137, + "grad_norm": 5.964664459228516, + "learning_rate": 4.6342048179015876e-05, + "loss": 6.0334, + "step": 3310 + }, + { + "epoch": 0.37815365339711826, + "grad_norm": 7.827390670776367, + "learning_rate": 4.63306313506108e-05, + "loss": 6.022, + "step": 3320 + }, + { + "epoch": 0.37929267042542286, + "grad_norm": 12.887112617492676, + "learning_rate": 4.631921452220573e-05, + "loss": 5.9171, + "step": 3330 + }, + { + "epoch": 0.3804316874537274, + "grad_norm": 8.092065811157227, + "learning_rate": 4.6307797693800665e-05, + "loss": 5.986, + "step": 3340 + }, + { + "epoch": 0.381570704482032, + "grad_norm": 6.128257751464844, + "learning_rate": 4.6296380865395596e-05, + "loss": 5.9844, + "step": 3350 + }, + { + "epoch": 0.38270972151033655, + "grad_norm": 8.12193775177002, + "learning_rate": 4.628496403699052e-05, + "loss": 5.9676, + "step": 3360 + }, + { + "epoch": 0.38384873853864115, + "grad_norm": 8.535385131835938, + "learning_rate": 4.627354720858545e-05, + "loss": 5.8463, + "step": 3370 + }, + { + "epoch": 0.3849877555669457, + "grad_norm": 8.127837181091309, + "learning_rate": 4.626213038018039e-05, + "loss": 6.1955, + "step": 3380 + }, + { + "epoch": 0.3861267725952503, + "grad_norm": 7.3196330070495605, + "learning_rate": 4.6250713551775324e-05, + "loss": 6.3398, + "step": 3390 + }, + { + "epoch": 0.38726578962355485, + "grad_norm": 9.154827117919922, + "learning_rate": 4.623929672337025e-05, + "loss": 5.9425, + "step": 3400 + }, + { + "epoch": 0.38840480665185945, + "grad_norm": 16.89331817626953, + "learning_rate": 4.622787989496518e-05, + "loss": 5.8076, + "step": 3410 + }, + { + "epoch": 0.389543823680164, 
+ "grad_norm": 7.123322010040283, + "learning_rate": 4.621646306656011e-05, + "loss": 5.8576, + "step": 3420 + }, + { + "epoch": 0.3906828407084686, + "grad_norm": 10.912338256835938, + "learning_rate": 4.6205046238155044e-05, + "loss": 6.2962, + "step": 3430 + }, + { + "epoch": 0.39182185773677314, + "grad_norm": 37.93599319458008, + "learning_rate": 4.619362940974997e-05, + "loss": 5.8181, + "step": 3440 + }, + { + "epoch": 0.39296087476507774, + "grad_norm": 8.63291072845459, + "learning_rate": 4.61822125813449e-05, + "loss": 5.85, + "step": 3450 + }, + { + "epoch": 0.3940998917933823, + "grad_norm": 7.839759349822998, + "learning_rate": 4.617079575293984e-05, + "loss": 6.1772, + "step": 3460 + }, + { + "epoch": 0.3952389088216869, + "grad_norm": 11.179443359375, + "learning_rate": 4.615937892453477e-05, + "loss": 6.0357, + "step": 3470 + }, + { + "epoch": 0.39637792584999143, + "grad_norm": 5.794097423553467, + "learning_rate": 4.61479620961297e-05, + "loss": 6.2638, + "step": 3480 + }, + { + "epoch": 0.39751694287829603, + "grad_norm": 6.903919696807861, + "learning_rate": 4.613654526772463e-05, + "loss": 5.9867, + "step": 3490 + }, + { + "epoch": 0.3986559599066006, + "grad_norm": 7.100025177001953, + "learning_rate": 4.612512843931956e-05, + "loss": 5.8719, + "step": 3500 + }, + { + "epoch": 0.3997949769349052, + "grad_norm": 8.104240417480469, + "learning_rate": 4.611371161091449e-05, + "loss": 5.8909, + "step": 3510 + }, + { + "epoch": 0.4009339939632097, + "grad_norm": 6.986795902252197, + "learning_rate": 4.610229478250942e-05, + "loss": 6.195, + "step": 3520 + }, + { + "epoch": 0.4020730109915143, + "grad_norm": 7.941359043121338, + "learning_rate": 4.609087795410435e-05, + "loss": 6.2972, + "step": 3530 + }, + { + "epoch": 0.40321202801981887, + "grad_norm": 8.151006698608398, + "learning_rate": 4.607946112569928e-05, + "loss": 6.0135, + "step": 3540 + }, + { + "epoch": 0.40435104504812347, + "grad_norm": 4.726245403289795, + "learning_rate": 
4.606804429729421e-05, + "loss": 6.3568, + "step": 3550 + }, + { + "epoch": 0.405490062076428, + "grad_norm": 5.867665767669678, + "learning_rate": 4.6056627468889145e-05, + "loss": 6.1011, + "step": 3560 + }, + { + "epoch": 0.4066290791047326, + "grad_norm": 8.400825500488281, + "learning_rate": 4.6045210640484076e-05, + "loss": 6.172, + "step": 3570 + }, + { + "epoch": 0.40776809613303716, + "grad_norm": 4.860127925872803, + "learning_rate": 4.603379381207901e-05, + "loss": 6.6524, + "step": 3580 + }, + { + "epoch": 0.40890711316134176, + "grad_norm": 8.082280158996582, + "learning_rate": 4.602237698367394e-05, + "loss": 5.8083, + "step": 3590 + }, + { + "epoch": 0.41004613018964636, + "grad_norm": 6.055807113647461, + "learning_rate": 4.6010960155268865e-05, + "loss": 6.0123, + "step": 3600 + }, + { + "epoch": 0.4111851472179509, + "grad_norm": 10.244037628173828, + "learning_rate": 4.59995433268638e-05, + "loss": 6.172, + "step": 3610 + }, + { + "epoch": 0.4123241642462555, + "grad_norm": 4.998210906982422, + "learning_rate": 4.598812649845873e-05, + "loss": 5.9164, + "step": 3620 + }, + { + "epoch": 0.41346318127456005, + "grad_norm": 5.7088823318481445, + "learning_rate": 4.597670967005366e-05, + "loss": 5.7632, + "step": 3630 + }, + { + "epoch": 0.41460219830286466, + "grad_norm": 3.9780211448669434, + "learning_rate": 4.596529284164859e-05, + "loss": 6.2047, + "step": 3640 + }, + { + "epoch": 0.4157412153311692, + "grad_norm": 7.240362644195557, + "learning_rate": 4.5953876013243524e-05, + "loss": 5.8884, + "step": 3650 + }, + { + "epoch": 0.4168802323594738, + "grad_norm": 8.113978385925293, + "learning_rate": 4.5942459184838456e-05, + "loss": 5.9598, + "step": 3660 + }, + { + "epoch": 0.41801924938777835, + "grad_norm": 4.712223052978516, + "learning_rate": 4.593104235643339e-05, + "loss": 6.242, + "step": 3670 + }, + { + "epoch": 0.41915826641608295, + "grad_norm": 4.892892360687256, + "learning_rate": 4.591962552802831e-05, + "loss": 5.8129, + "step": 
3680 + }, + { + "epoch": 0.4202972834443875, + "grad_norm": 5.622137546539307, + "learning_rate": 4.5908208699623245e-05, + "loss": 5.8886, + "step": 3690 + }, + { + "epoch": 0.4214363004726921, + "grad_norm": 5.635571479797363, + "learning_rate": 4.589679187121818e-05, + "loss": 6.3696, + "step": 3700 + }, + { + "epoch": 0.42257531750099664, + "grad_norm": 10.80083179473877, + "learning_rate": 4.588537504281311e-05, + "loss": 5.5764, + "step": 3710 + }, + { + "epoch": 0.42371433452930124, + "grad_norm": 9.474900245666504, + "learning_rate": 4.587395821440804e-05, + "loss": 6.0212, + "step": 3720 + }, + { + "epoch": 0.4248533515576058, + "grad_norm": 5.499372959136963, + "learning_rate": 4.586254138600297e-05, + "loss": 5.8542, + "step": 3730 + }, + { + "epoch": 0.4259923685859104, + "grad_norm": 6.5461554527282715, + "learning_rate": 4.5851124557597904e-05, + "loss": 6.0441, + "step": 3740 + }, + { + "epoch": 0.42713138561421493, + "grad_norm": 15.075446128845215, + "learning_rate": 4.583970772919283e-05, + "loss": 5.9956, + "step": 3750 + }, + { + "epoch": 0.42827040264251953, + "grad_norm": 6.579662322998047, + "learning_rate": 4.582829090078776e-05, + "loss": 5.7716, + "step": 3760 + }, + { + "epoch": 0.4294094196708241, + "grad_norm": 4.892009258270264, + "learning_rate": 4.581687407238269e-05, + "loss": 5.5447, + "step": 3770 + }, + { + "epoch": 0.4305484366991287, + "grad_norm": 5.967264175415039, + "learning_rate": 4.5805457243977625e-05, + "loss": 5.9216, + "step": 3780 + }, + { + "epoch": 0.4316874537274332, + "grad_norm": 7.942027568817139, + "learning_rate": 4.5794040415572557e-05, + "loss": 5.809, + "step": 3790 + }, + { + "epoch": 0.4328264707557378, + "grad_norm": 5.483458042144775, + "learning_rate": 4.578262358716749e-05, + "loss": 5.7505, + "step": 3800 + }, + { + "epoch": 0.43396548778404237, + "grad_norm": 6.87880277633667, + "learning_rate": 4.577120675876242e-05, + "loss": 5.8504, + "step": 3810 + }, + { + "epoch": 0.43510450481234697, + 
"grad_norm": 6.670889854431152, + "learning_rate": 4.575978993035735e-05, + "loss": 6.173, + "step": 3820 + }, + { + "epoch": 0.4362435218406515, + "grad_norm": 6.609583377838135, + "learning_rate": 4.574837310195228e-05, + "loss": 5.7217, + "step": 3830 + }, + { + "epoch": 0.4373825388689561, + "grad_norm": 7.475062370300293, + "learning_rate": 4.573695627354721e-05, + "loss": 5.9074, + "step": 3840 + }, + { + "epoch": 0.43852155589726066, + "grad_norm": 8.09111499786377, + "learning_rate": 4.572553944514214e-05, + "loss": 6.1935, + "step": 3850 + }, + { + "epoch": 0.43966057292556526, + "grad_norm": 6.0317559242248535, + "learning_rate": 4.571412261673707e-05, + "loss": 5.9718, + "step": 3860 + }, + { + "epoch": 0.4407995899538698, + "grad_norm": 5.886181831359863, + "learning_rate": 4.5702705788332e-05, + "loss": 5.9667, + "step": 3870 + }, + { + "epoch": 0.4419386069821744, + "grad_norm": 7.264452934265137, + "learning_rate": 4.5691288959926936e-05, + "loss": 5.6558, + "step": 3880 + }, + { + "epoch": 0.44307762401047895, + "grad_norm": 7.599318504333496, + "learning_rate": 4.567987213152187e-05, + "loss": 6.0186, + "step": 3890 + }, + { + "epoch": 0.44421664103878356, + "grad_norm": 10.22423267364502, + "learning_rate": 4.56684553031168e-05, + "loss": 6.089, + "step": 3900 + }, + { + "epoch": 0.4453556580670881, + "grad_norm": 8.207062721252441, + "learning_rate": 4.5657038474711725e-05, + "loss": 5.8384, + "step": 3910 + }, + { + "epoch": 0.4464946750953927, + "grad_norm": 13.894556999206543, + "learning_rate": 4.564562164630666e-05, + "loss": 5.4603, + "step": 3920 + }, + { + "epoch": 0.44763369212369725, + "grad_norm": 5.356435775756836, + "learning_rate": 4.563420481790159e-05, + "loss": 6.0264, + "step": 3930 + }, + { + "epoch": 0.44877270915200185, + "grad_norm": 5.479897499084473, + "learning_rate": 4.562278798949652e-05, + "loss": 6.7273, + "step": 3940 + }, + { + "epoch": 0.4499117261803064, + "grad_norm": 5.720917701721191, + "learning_rate": 
4.5611371161091446e-05, + "loss": 5.7317, + "step": 3950 + }, + { + "epoch": 0.451050743208611, + "grad_norm": 19.18890380859375, + "learning_rate": 4.5599954332686384e-05, + "loss": 5.9488, + "step": 3960 + }, + { + "epoch": 0.45218976023691554, + "grad_norm": 8.365082740783691, + "learning_rate": 4.5588537504281316e-05, + "loss": 5.7803, + "step": 3970 + }, + { + "epoch": 0.45332877726522014, + "grad_norm": 10.296485900878906, + "learning_rate": 4.557712067587625e-05, + "loss": 5.9118, + "step": 3980 + }, + { + "epoch": 0.4544677942935247, + "grad_norm": 9.0343656539917, + "learning_rate": 4.556570384747117e-05, + "loss": 5.6937, + "step": 3990 + }, + { + "epoch": 0.4556068113218293, + "grad_norm": 6.224181175231934, + "learning_rate": 4.5554287019066105e-05, + "loss": 6.0464, + "step": 4000 + }, + { + "epoch": 0.4556068113218293, + "eval_loss": 6.078274250030518, + "eval_runtime": 11.2491, + "eval_samples_per_second": 1.333, + "eval_steps_per_second": 0.178, + "step": 4000 + }, + { + "epoch": 0.45674582835013383, + "grad_norm": 5.356078624725342, + "learning_rate": 4.554287019066104e-05, + "loss": 5.8924, + "step": 4010 + }, + { + "epoch": 0.45788484537843843, + "grad_norm": 8.025609970092773, + "learning_rate": 4.553145336225597e-05, + "loss": 5.9475, + "step": 4020 + }, + { + "epoch": 0.459023862406743, + "grad_norm": 5.004885196685791, + "learning_rate": 4.5520036533850894e-05, + "loss": 5.6254, + "step": 4030 + }, + { + "epoch": 0.4601628794350476, + "grad_norm": 5.545079231262207, + "learning_rate": 4.550861970544583e-05, + "loss": 6.1783, + "step": 4040 + }, + { + "epoch": 0.4613018964633521, + "grad_norm": 6.245349884033203, + "learning_rate": 4.5497202877040764e-05, + "loss": 6.0623, + "step": 4050 + }, + { + "epoch": 0.4624409134916567, + "grad_norm": 6.350508213043213, + "learning_rate": 4.548578604863569e-05, + "loss": 6.0522, + "step": 4060 + }, + { + "epoch": 0.46357993051996127, + "grad_norm": 9.772076606750488, + "learning_rate": 
4.547436922023062e-05, + "loss": 6.064, + "step": 4070 + }, + { + "epoch": 0.46471894754826587, + "grad_norm": 7.155641078948975, + "learning_rate": 4.546295239182555e-05, + "loss": 6.0838, + "step": 4080 + }, + { + "epoch": 0.4658579645765704, + "grad_norm": 5.971468925476074, + "learning_rate": 4.5451535563420485e-05, + "loss": 6.037, + "step": 4090 + }, + { + "epoch": 0.466996981604875, + "grad_norm": 8.513012886047363, + "learning_rate": 4.5440118735015416e-05, + "loss": 6.2869, + "step": 4100 + }, + { + "epoch": 0.46813599863317956, + "grad_norm": 7.186007022857666, + "learning_rate": 4.542870190661034e-05, + "loss": 6.1033, + "step": 4110 + }, + { + "epoch": 0.46927501566148416, + "grad_norm": 8.911818504333496, + "learning_rate": 4.541728507820528e-05, + "loss": 5.9273, + "step": 4120 + }, + { + "epoch": 0.4704140326897887, + "grad_norm": 6.835193634033203, + "learning_rate": 4.540586824980021e-05, + "loss": 5.9101, + "step": 4130 + }, + { + "epoch": 0.4715530497180933, + "grad_norm": 8.617008209228516, + "learning_rate": 4.539445142139514e-05, + "loss": 5.8752, + "step": 4140 + }, + { + "epoch": 0.47269206674639785, + "grad_norm": 6.705082893371582, + "learning_rate": 4.538303459299007e-05, + "loss": 5.8514, + "step": 4150 + }, + { + "epoch": 0.47383108377470246, + "grad_norm": 4.137232303619385, + "learning_rate": 4.5371617764585e-05, + "loss": 6.1345, + "step": 4160 + }, + { + "epoch": 0.474970100803007, + "grad_norm": 5.951109886169434, + "learning_rate": 4.536020093617993e-05, + "loss": 5.8639, + "step": 4170 + }, + { + "epoch": 0.4761091178313116, + "grad_norm": 9.148282051086426, + "learning_rate": 4.5348784107774864e-05, + "loss": 5.7085, + "step": 4180 + }, + { + "epoch": 0.47724813485961615, + "grad_norm": 6.89546537399292, + "learning_rate": 4.533736727936979e-05, + "loss": 5.8127, + "step": 4190 + }, + { + "epoch": 0.47838715188792075, + "grad_norm": 6.99493932723999, + "learning_rate": 4.532595045096472e-05, + "loss": 6.6861, + "step": 4200 + }, 
+ { + "epoch": 0.4795261689162253, + "grad_norm": 5.6303019523620605, + "learning_rate": 4.531453362255966e-05, + "loss": 6.3338, + "step": 4210 + }, + { + "epoch": 0.4806651859445299, + "grad_norm": 6.7159953117370605, + "learning_rate": 4.5303116794154585e-05, + "loss": 5.8867, + "step": 4220 + }, + { + "epoch": 0.48180420297283444, + "grad_norm": 5.279120922088623, + "learning_rate": 4.529169996574952e-05, + "loss": 5.9261, + "step": 4230 + }, + { + "epoch": 0.48294322000113904, + "grad_norm": 6.267137050628662, + "learning_rate": 4.528028313734445e-05, + "loss": 5.7475, + "step": 4240 + }, + { + "epoch": 0.4840822370294436, + "grad_norm": 11.94001293182373, + "learning_rate": 4.526886630893938e-05, + "loss": 5.7601, + "step": 4250 + }, + { + "epoch": 0.4852212540577482, + "grad_norm": 7.740771293640137, + "learning_rate": 4.5257449480534305e-05, + "loss": 5.8693, + "step": 4260 + }, + { + "epoch": 0.48636027108605273, + "grad_norm": 5.90120792388916, + "learning_rate": 4.524603265212924e-05, + "loss": 5.894, + "step": 4270 + }, + { + "epoch": 0.48749928811435733, + "grad_norm": 5.867739677429199, + "learning_rate": 4.523461582372417e-05, + "loss": 5.8287, + "step": 4280 + }, + { + "epoch": 0.4886383051426619, + "grad_norm": 5.900161266326904, + "learning_rate": 4.522319899531911e-05, + "loss": 6.0611, + "step": 4290 + }, + { + "epoch": 0.4897773221709665, + "grad_norm": 5.506997585296631, + "learning_rate": 4.521178216691403e-05, + "loss": 5.8383, + "step": 4300 + }, + { + "epoch": 0.490916339199271, + "grad_norm": 5.639462947845459, + "learning_rate": 4.5200365338508965e-05, + "loss": 5.7852, + "step": 4310 + }, + { + "epoch": 0.4920553562275756, + "grad_norm": 15.822317123413086, + "learning_rate": 4.5188948510103897e-05, + "loss": 5.77, + "step": 4320 + }, + { + "epoch": 0.49319437325588017, + "grad_norm": 6.525705337524414, + "learning_rate": 4.517753168169883e-05, + "loss": 6.3061, + "step": 4330 + }, + { + "epoch": 0.49433339028418477, + "grad_norm": 
5.3117876052856445, + "learning_rate": 4.5166114853293753e-05, + "loss": 5.7736, + "step": 4340 + }, + { + "epoch": 0.4954724073124893, + "grad_norm": 5.8776397705078125, + "learning_rate": 4.5154698024888685e-05, + "loss": 5.8458, + "step": 4350 + }, + { + "epoch": 0.4966114243407939, + "grad_norm": 11.251127243041992, + "learning_rate": 4.514328119648362e-05, + "loss": 5.9588, + "step": 4360 + }, + { + "epoch": 0.49775044136909846, + "grad_norm": 22.83697509765625, + "learning_rate": 4.5131864368078556e-05, + "loss": 5.963, + "step": 4370 + }, + { + "epoch": 0.49888945839740306, + "grad_norm": 6.44933557510376, + "learning_rate": 4.512044753967348e-05, + "loss": 5.9385, + "step": 4380 + }, + { + "epoch": 0.5000284754257076, + "grad_norm": 7.210160732269287, + "learning_rate": 4.510903071126841e-05, + "loss": 5.9247, + "step": 4390 + }, + { + "epoch": 0.5011674924540122, + "grad_norm": 6.42350959777832, + "learning_rate": 4.5097613882863344e-05, + "loss": 6.1627, + "step": 4400 + }, + { + "epoch": 0.5023065094823168, + "grad_norm": 8.11135196685791, + "learning_rate": 4.5086197054458276e-05, + "loss": 6.4086, + "step": 4410 + }, + { + "epoch": 0.5034455265106214, + "grad_norm": 7.448008060455322, + "learning_rate": 4.50747802260532e-05, + "loss": 6.0661, + "step": 4420 + }, + { + "epoch": 0.5045845435389259, + "grad_norm": 7.212705135345459, + "learning_rate": 4.506336339764813e-05, + "loss": 5.766, + "step": 4430 + }, + { + "epoch": 0.5057235605672304, + "grad_norm": 6.085208892822266, + "learning_rate": 4.5051946569243065e-05, + "loss": 5.7313, + "step": 4440 + }, + { + "epoch": 0.5068625775955351, + "grad_norm": 8.44282054901123, + "learning_rate": 4.5040529740838e-05, + "loss": 5.7597, + "step": 4450 + }, + { + "epoch": 0.5080015946238396, + "grad_norm": 5.492551326751709, + "learning_rate": 4.502911291243293e-05, + "loss": 5.7822, + "step": 4460 + }, + { + "epoch": 0.5091406116521442, + "grad_norm": 8.480428695678711, + "learning_rate": 4.501769608402786e-05, 
+ "loss": 5.6644, + "step": 4470 + }, + { + "epoch": 0.5102796286804487, + "grad_norm": 9.871870040893555, + "learning_rate": 4.500627925562279e-05, + "loss": 5.8488, + "step": 4480 + }, + { + "epoch": 0.5114186457087534, + "grad_norm": 22.278566360473633, + "learning_rate": 4.4994862427217724e-05, + "loss": 5.7774, + "step": 4490 + }, + { + "epoch": 0.5125576627370579, + "grad_norm": 6.218201160430908, + "learning_rate": 4.498344559881265e-05, + "loss": 5.6653, + "step": 4500 + }, + { + "epoch": 0.5136966797653625, + "grad_norm": 7.463952541351318, + "learning_rate": 4.497202877040758e-05, + "loss": 5.8319, + "step": 4510 + }, + { + "epoch": 0.514835696793667, + "grad_norm": 7.079387664794922, + "learning_rate": 4.496061194200251e-05, + "loss": 5.7116, + "step": 4520 + }, + { + "epoch": 0.5159747138219717, + "grad_norm": 6.294861793518066, + "learning_rate": 4.4949195113597445e-05, + "loss": 5.8526, + "step": 4530 + }, + { + "epoch": 0.5171137308502762, + "grad_norm": 7.084829807281494, + "learning_rate": 4.4937778285192377e-05, + "loss": 5.6832, + "step": 4540 + }, + { + "epoch": 0.5182527478785808, + "grad_norm": 4.318167686462402, + "learning_rate": 4.492636145678731e-05, + "loss": 6.1293, + "step": 4550 + }, + { + "epoch": 0.5193917649068853, + "grad_norm": 9.80423641204834, + "learning_rate": 4.491494462838224e-05, + "loss": 5.8818, + "step": 4560 + }, + { + "epoch": 0.52053078193519, + "grad_norm": 11.214982032775879, + "learning_rate": 4.4903527799977165e-05, + "loss": 5.8895, + "step": 4570 + }, + { + "epoch": 0.5216697989634945, + "grad_norm": 8.01582145690918, + "learning_rate": 4.48921109715721e-05, + "loss": 6.233, + "step": 4580 + }, + { + "epoch": 0.5228088159917991, + "grad_norm": 6.428025722503662, + "learning_rate": 4.488069414316703e-05, + "loss": 6.1618, + "step": 4590 + }, + { + "epoch": 0.5239478330201036, + "grad_norm": 4.905664443969727, + "learning_rate": 4.486927731476196e-05, + "loss": 5.9563, + "step": 4600 + }, + { + "epoch": 
0.5250868500484083, + "grad_norm": 5.810403823852539, + "learning_rate": 4.485786048635689e-05, + "loss": 5.7832, + "step": 4610 + }, + { + "epoch": 0.5262258670767128, + "grad_norm": 5.713348388671875, + "learning_rate": 4.4846443657951825e-05, + "loss": 6.0745, + "step": 4620 + }, + { + "epoch": 0.5273648841050174, + "grad_norm": 5.19990348815918, + "learning_rate": 4.4835026829546756e-05, + "loss": 5.8194, + "step": 4630 + }, + { + "epoch": 0.5285039011333219, + "grad_norm": 5.478442668914795, + "learning_rate": 4.482361000114169e-05, + "loss": 5.7365, + "step": 4640 + }, + { + "epoch": 0.5296429181616266, + "grad_norm": 6.2245378494262695, + "learning_rate": 4.481219317273661e-05, + "loss": 6.3179, + "step": 4650 + }, + { + "epoch": 0.5307819351899311, + "grad_norm": 6.621606826782227, + "learning_rate": 4.4800776344331545e-05, + "loss": 5.7806, + "step": 4660 + }, + { + "epoch": 0.5319209522182357, + "grad_norm": 5.53688907623291, + "learning_rate": 4.478935951592648e-05, + "loss": 6.0046, + "step": 4670 + }, + { + "epoch": 0.5330599692465402, + "grad_norm": 5.000534534454346, + "learning_rate": 4.477794268752141e-05, + "loss": 5.8763, + "step": 4680 + }, + { + "epoch": 0.5341989862748449, + "grad_norm": 4.869490623474121, + "learning_rate": 4.476652585911634e-05, + "loss": 6.3533, + "step": 4690 + }, + { + "epoch": 0.5353380033031494, + "grad_norm": 6.470453262329102, + "learning_rate": 4.475510903071127e-05, + "loss": 6.0611, + "step": 4700 + }, + { + "epoch": 0.536477020331454, + "grad_norm": 5.445845127105713, + "learning_rate": 4.4743692202306204e-05, + "loss": 5.8835, + "step": 4710 + }, + { + "epoch": 0.5376160373597585, + "grad_norm": 6.2387237548828125, + "learning_rate": 4.4732275373901136e-05, + "loss": 5.8729, + "step": 4720 + }, + { + "epoch": 0.5387550543880631, + "grad_norm": 6.605109691619873, + "learning_rate": 4.472085854549606e-05, + "loss": 6.14, + "step": 4730 + }, + { + "epoch": 0.5398940714163677, + "grad_norm": 5.047166347503662, + 
"learning_rate": 4.470944171709099e-05, + "loss": 5.8988, + "step": 4740 + }, + { + "epoch": 0.5410330884446722, + "grad_norm": 5.175670623779297, + "learning_rate": 4.4698024888685925e-05, + "loss": 5.9424, + "step": 4750 + }, + { + "epoch": 0.5421721054729768, + "grad_norm": 5.9300079345703125, + "learning_rate": 4.468660806028086e-05, + "loss": 6.2061, + "step": 4760 + }, + { + "epoch": 0.5433111225012814, + "grad_norm": 6.683525085449219, + "learning_rate": 4.467519123187578e-05, + "loss": 5.7376, + "step": 4770 + }, + { + "epoch": 0.544450139529586, + "grad_norm": 14.139119148254395, + "learning_rate": 4.466377440347072e-05, + "loss": 6.0151, + "step": 4780 + }, + { + "epoch": 0.5455891565578905, + "grad_norm": 4.422637939453125, + "learning_rate": 4.465235757506565e-05, + "loss": 5.7256, + "step": 4790 + }, + { + "epoch": 0.5467281735861951, + "grad_norm": 4.97141170501709, + "learning_rate": 4.4640940746660584e-05, + "loss": 5.7168, + "step": 4800 + }, + { + "epoch": 0.5478671906144997, + "grad_norm": 4.695651531219482, + "learning_rate": 4.462952391825551e-05, + "loss": 5.7933, + "step": 4810 + }, + { + "epoch": 0.5490062076428043, + "grad_norm": 5.186215877532959, + "learning_rate": 4.461810708985044e-05, + "loss": 6.041, + "step": 4820 + }, + { + "epoch": 0.5501452246711088, + "grad_norm": 6.2940545082092285, + "learning_rate": 4.460669026144537e-05, + "loss": 6.0442, + "step": 4830 + }, + { + "epoch": 0.5512842416994134, + "grad_norm": 9.648452758789062, + "learning_rate": 4.4595273433040305e-05, + "loss": 6.0486, + "step": 4840 + }, + { + "epoch": 0.552423258727718, + "grad_norm": 6.279506206512451, + "learning_rate": 4.458385660463523e-05, + "loss": 5.9373, + "step": 4850 + }, + { + "epoch": 0.5535622757560226, + "grad_norm": 11.7589750289917, + "learning_rate": 4.457243977623016e-05, + "loss": 5.8724, + "step": 4860 + }, + { + "epoch": 0.5547012927843271, + "grad_norm": 5.745841979980469, + "learning_rate": 4.45610229478251e-05, + "loss": 6.1764, + 
"step": 4870 + }, + { + "epoch": 0.5558403098126317, + "grad_norm": 10.151200294494629, + "learning_rate": 4.454960611942003e-05, + "loss": 5.812, + "step": 4880 + }, + { + "epoch": 0.5569793268409363, + "grad_norm": 6.251079082489014, + "learning_rate": 4.453818929101496e-05, + "loss": 5.699, + "step": 4890 + }, + { + "epoch": 0.5581183438692409, + "grad_norm": 7.862876892089844, + "learning_rate": 4.452677246260989e-05, + "loss": 6.012, + "step": 4900 + }, + { + "epoch": 0.5592573608975454, + "grad_norm": 6.622585296630859, + "learning_rate": 4.451535563420482e-05, + "loss": 5.964, + "step": 4910 + }, + { + "epoch": 0.56039637792585, + "grad_norm": 7.121626377105713, + "learning_rate": 4.450393880579975e-05, + "loss": 5.8795, + "step": 4920 + }, + { + "epoch": 0.5615353949541546, + "grad_norm": 7.391700267791748, + "learning_rate": 4.449252197739468e-05, + "loss": 6.24, + "step": 4930 + }, + { + "epoch": 0.5626744119824592, + "grad_norm": 5.3456549644470215, + "learning_rate": 4.448110514898961e-05, + "loss": 6.1187, + "step": 4940 + }, + { + "epoch": 0.5638134290107637, + "grad_norm": 5.86979866027832, + "learning_rate": 4.446968832058455e-05, + "loss": 6.0705, + "step": 4950 + }, + { + "epoch": 0.5649524460390682, + "grad_norm": 17.474056243896484, + "learning_rate": 4.445827149217947e-05, + "loss": 5.9015, + "step": 4960 + }, + { + "epoch": 0.5660914630673729, + "grad_norm": 6.048776149749756, + "learning_rate": 4.4446854663774405e-05, + "loss": 5.78, + "step": 4970 + }, + { + "epoch": 0.5672304800956774, + "grad_norm": 6.961145877838135, + "learning_rate": 4.443543783536934e-05, + "loss": 6.045, + "step": 4980 + }, + { + "epoch": 0.568369497123982, + "grad_norm": 8.503111839294434, + "learning_rate": 4.442402100696427e-05, + "loss": 5.932, + "step": 4990 + }, + { + "epoch": 0.5695085141522865, + "grad_norm": 5.485167503356934, + "learning_rate": 4.44126041785592e-05, + "loss": 5.7518, + "step": 5000 + }, + { + "epoch": 0.5706475311805912, + "grad_norm": 
6.443273544311523, + "learning_rate": 4.4401187350154126e-05, + "loss": 5.9175, + "step": 5010 + }, + { + "epoch": 0.5717865482088957, + "grad_norm": 6.290263652801514, + "learning_rate": 4.438977052174906e-05, + "loss": 5.6413, + "step": 5020 + }, + { + "epoch": 0.5729255652372003, + "grad_norm": 6.004288196563721, + "learning_rate": 4.4378353693343996e-05, + "loss": 5.9629, + "step": 5030 + }, + { + "epoch": 0.5740645822655048, + "grad_norm": 5.7330145835876465, + "learning_rate": 4.436693686493892e-05, + "loss": 5.9903, + "step": 5040 + }, + { + "epoch": 0.5752035992938095, + "grad_norm": 6.661035060882568, + "learning_rate": 4.435552003653385e-05, + "loss": 5.8785, + "step": 5050 + }, + { + "epoch": 0.576342616322114, + "grad_norm": 8.097799301147461, + "learning_rate": 4.4344103208128785e-05, + "loss": 5.7706, + "step": 5060 + }, + { + "epoch": 0.5774816333504186, + "grad_norm": 6.515287399291992, + "learning_rate": 4.4332686379723717e-05, + "loss": 5.9882, + "step": 5070 + }, + { + "epoch": 0.5786206503787231, + "grad_norm": 16.282241821289062, + "learning_rate": 4.432126955131864e-05, + "loss": 5.8521, + "step": 5080 + }, + { + "epoch": 0.5797596674070278, + "grad_norm": 4.604179382324219, + "learning_rate": 4.4309852722913573e-05, + "loss": 6.2113, + "step": 5090 + }, + { + "epoch": 0.5808986844353323, + "grad_norm": 5.025040626525879, + "learning_rate": 4.4298435894508505e-05, + "loss": 6.7721, + "step": 5100 + }, + { + "epoch": 0.5820377014636369, + "grad_norm": 7.755475997924805, + "learning_rate": 4.4287019066103444e-05, + "loss": 5.8835, + "step": 5110 + }, + { + "epoch": 0.5831767184919414, + "grad_norm": 6.028952598571777, + "learning_rate": 4.427560223769837e-05, + "loss": 5.909, + "step": 5120 + }, + { + "epoch": 0.5843157355202461, + "grad_norm": 6.185368537902832, + "learning_rate": 4.42641854092933e-05, + "loss": 5.6625, + "step": 5130 + }, + { + "epoch": 0.5854547525485506, + "grad_norm": 8.831610679626465, + "learning_rate": 
4.425276858088823e-05, + "loss": 6.078, + "step": 5140 + }, + { + "epoch": 0.5865937695768552, + "grad_norm": 7.380201816558838, + "learning_rate": 4.4241351752483164e-05, + "loss": 5.996, + "step": 5150 + }, + { + "epoch": 0.5877327866051597, + "grad_norm": 14.59043025970459, + "learning_rate": 4.422993492407809e-05, + "loss": 6.0636, + "step": 5160 + }, + { + "epoch": 0.5888718036334644, + "grad_norm": 8.660425186157227, + "learning_rate": 4.421851809567302e-05, + "loss": 6.1363, + "step": 5170 + }, + { + "epoch": 0.5900108206617689, + "grad_norm": 6.343860626220703, + "learning_rate": 4.420710126726795e-05, + "loss": 6.0228, + "step": 5180 + }, + { + "epoch": 0.5911498376900735, + "grad_norm": 8.988828659057617, + "learning_rate": 4.4195684438862885e-05, + "loss": 6.1271, + "step": 5190 + }, + { + "epoch": 0.592288854718378, + "grad_norm": 4.080019474029541, + "learning_rate": 4.418426761045782e-05, + "loss": 5.717, + "step": 5200 + }, + { + "epoch": 0.5934278717466827, + "grad_norm": 8.057381629943848, + "learning_rate": 4.417285078205275e-05, + "loss": 5.9801, + "step": 5210 + }, + { + "epoch": 0.5945668887749872, + "grad_norm": 6.668858528137207, + "learning_rate": 4.416143395364768e-05, + "loss": 5.9643, + "step": 5220 + }, + { + "epoch": 0.5957059058032917, + "grad_norm": 15.661836624145508, + "learning_rate": 4.415001712524261e-05, + "loss": 5.9624, + "step": 5230 + }, + { + "epoch": 0.5968449228315963, + "grad_norm": 7.527164936065674, + "learning_rate": 4.413860029683754e-05, + "loss": 6.7508, + "step": 5240 + }, + { + "epoch": 0.597983939859901, + "grad_norm": 7.868592262268066, + "learning_rate": 4.412718346843247e-05, + "loss": 5.7929, + "step": 5250 + }, + { + "epoch": 0.5991229568882055, + "grad_norm": 9.12926197052002, + "learning_rate": 4.41157666400274e-05, + "loss": 5.7517, + "step": 5260 + }, + { + "epoch": 0.60026197391651, + "grad_norm": 7.015052318572998, + "learning_rate": 4.410434981162233e-05, + "loss": 5.7492, + "step": 5270 + }, + { + 
"epoch": 0.6014009909448146, + "grad_norm": 6.547445774078369, + "learning_rate": 4.4092932983217265e-05, + "loss": 5.6785, + "step": 5280 + }, + { + "epoch": 0.6025400079731192, + "grad_norm": 6.565978527069092, + "learning_rate": 4.40815161548122e-05, + "loss": 5.8961, + "step": 5290 + }, + { + "epoch": 0.6036790250014238, + "grad_norm": 6.1269097328186035, + "learning_rate": 4.407009932640713e-05, + "loss": 6.0278, + "step": 5300 + }, + { + "epoch": 0.6048180420297283, + "grad_norm": 4.784152507781982, + "learning_rate": 4.405868249800206e-05, + "loss": 5.7792, + "step": 5310 + }, + { + "epoch": 0.6059570590580329, + "grad_norm": 4.875153541564941, + "learning_rate": 4.4047265669596985e-05, + "loss": 6.0277, + "step": 5320 + }, + { + "epoch": 0.6070960760863375, + "grad_norm": 4.904726982116699, + "learning_rate": 4.403584884119192e-05, + "loss": 5.5104, + "step": 5330 + }, + { + "epoch": 0.6082350931146421, + "grad_norm": 16.596614837646484, + "learning_rate": 4.402443201278685e-05, + "loss": 5.7808, + "step": 5340 + }, + { + "epoch": 0.6093741101429466, + "grad_norm": 11.208609580993652, + "learning_rate": 4.401301518438178e-05, + "loss": 5.7724, + "step": 5350 + }, + { + "epoch": 0.6105131271712512, + "grad_norm": 7.170653820037842, + "learning_rate": 4.400159835597671e-05, + "loss": 5.8673, + "step": 5360 + }, + { + "epoch": 0.6116521441995558, + "grad_norm": 9.264379501342773, + "learning_rate": 4.3990181527571645e-05, + "loss": 6.0324, + "step": 5370 + }, + { + "epoch": 0.6127911612278604, + "grad_norm": 5.418370246887207, + "learning_rate": 4.3978764699166576e-05, + "loss": 5.8417, + "step": 5380 + }, + { + "epoch": 0.6139301782561649, + "grad_norm": 10.881325721740723, + "learning_rate": 4.396734787076151e-05, + "loss": 5.6967, + "step": 5390 + }, + { + "epoch": 0.6150691952844695, + "grad_norm": 21.59162712097168, + "learning_rate": 4.395593104235643e-05, + "loss": 5.9382, + "step": 5400 + }, + { + "epoch": 0.6162082123127741, + "grad_norm": 
26.581167221069336, + "learning_rate": 4.3944514213951365e-05, + "loss": 6.0368, + "step": 5410 + }, + { + "epoch": 0.6173472293410787, + "grad_norm": 9.81440544128418, + "learning_rate": 4.39330973855463e-05, + "loss": 5.8343, + "step": 5420 + }, + { + "epoch": 0.6184862463693832, + "grad_norm": 7.034490585327148, + "learning_rate": 4.392168055714123e-05, + "loss": 5.8832, + "step": 5430 + }, + { + "epoch": 0.6196252633976878, + "grad_norm": 12.727472305297852, + "learning_rate": 4.391026372873616e-05, + "loss": 5.4909, + "step": 5440 + }, + { + "epoch": 0.6207642804259924, + "grad_norm": 5.399456024169922, + "learning_rate": 4.389884690033109e-05, + "loss": 6.0521, + "step": 5450 + }, + { + "epoch": 0.621903297454297, + "grad_norm": 15.499351501464844, + "learning_rate": 4.3887430071926024e-05, + "loss": 6.1447, + "step": 5460 + }, + { + "epoch": 0.6230423144826015, + "grad_norm": 6.282449722290039, + "learning_rate": 4.387601324352095e-05, + "loss": 6.1408, + "step": 5470 + }, + { + "epoch": 0.624181331510906, + "grad_norm": 5.330649375915527, + "learning_rate": 4.386459641511588e-05, + "loss": 5.7458, + "step": 5480 + }, + { + "epoch": 0.6253203485392107, + "grad_norm": 5.244607448577881, + "learning_rate": 4.385317958671081e-05, + "loss": 5.878, + "step": 5490 + }, + { + "epoch": 0.6264593655675152, + "grad_norm": 6.8278489112854, + "learning_rate": 4.3841762758305745e-05, + "loss": 5.7342, + "step": 5500 + }, + { + "epoch": 0.6275983825958198, + "grad_norm": 5.953369617462158, + "learning_rate": 4.383034592990068e-05, + "loss": 6.0004, + "step": 5510 + }, + { + "epoch": 0.6287373996241243, + "grad_norm": 4.974438667297363, + "learning_rate": 4.381892910149561e-05, + "loss": 5.9253, + "step": 5520 + }, + { + "epoch": 0.629876416652429, + "grad_norm": 4.46335506439209, + "learning_rate": 4.380751227309054e-05, + "loss": 5.7667, + "step": 5530 + }, + { + "epoch": 0.6310154336807335, + "grad_norm": 7.2519211769104, + "learning_rate": 4.379609544468547e-05, + 
"loss": 5.8392, + "step": 5540 + }, + { + "epoch": 0.6321544507090381, + "grad_norm": 5.6725053787231445, + "learning_rate": 4.37846786162804e-05, + "loss": 6.4619, + "step": 5550 + }, + { + "epoch": 0.6332934677373426, + "grad_norm": 6.412236213684082, + "learning_rate": 4.377326178787533e-05, + "loss": 6.3128, + "step": 5560 + }, + { + "epoch": 0.6344324847656473, + "grad_norm": 7.52258825302124, + "learning_rate": 4.376184495947026e-05, + "loss": 5.7141, + "step": 5570 + }, + { + "epoch": 0.6355715017939518, + "grad_norm": 6.420504570007324, + "learning_rate": 4.375042813106519e-05, + "loss": 5.9488, + "step": 5580 + }, + { + "epoch": 0.6367105188222564, + "grad_norm": 7.359799861907959, + "learning_rate": 4.373901130266012e-05, + "loss": 5.8208, + "step": 5590 + }, + { + "epoch": 0.6378495358505609, + "grad_norm": 6.431293487548828, + "learning_rate": 4.372759447425505e-05, + "loss": 6.0522, + "step": 5600 + }, + { + "epoch": 0.6389885528788656, + "grad_norm": 7.792017936706543, + "learning_rate": 4.371617764584999e-05, + "loss": 5.8313, + "step": 5610 + }, + { + "epoch": 0.6401275699071701, + "grad_norm": 5.352195739746094, + "learning_rate": 4.370476081744492e-05, + "loss": 5.8553, + "step": 5620 + }, + { + "epoch": 0.6412665869354747, + "grad_norm": 5.225295066833496, + "learning_rate": 4.3693343989039845e-05, + "loss": 5.6983, + "step": 5630 + }, + { + "epoch": 0.6424056039637792, + "grad_norm": 7.8425493240356445, + "learning_rate": 4.368192716063478e-05, + "loss": 6.0586, + "step": 5640 + }, + { + "epoch": 0.6435446209920839, + "grad_norm": 5.940848350524902, + "learning_rate": 4.367051033222971e-05, + "loss": 5.8525, + "step": 5650 + }, + { + "epoch": 0.6446836380203884, + "grad_norm": 5.690766334533691, + "learning_rate": 4.365909350382464e-05, + "loss": 5.6485, + "step": 5660 + }, + { + "epoch": 0.645822655048693, + "grad_norm": 6.9173102378845215, + "learning_rate": 4.3647676675419566e-05, + "loss": 5.6241, + "step": 5670 + }, + { + "epoch": 
0.6469616720769975, + "grad_norm": 10.341431617736816, + "learning_rate": 4.36362598470145e-05, + "loss": 5.6291, + "step": 5680 + }, + { + "epoch": 0.6481006891053022, + "grad_norm": 6.144728660583496, + "learning_rate": 4.3624843018609436e-05, + "loss": 5.5985, + "step": 5690 + }, + { + "epoch": 0.6492397061336067, + "grad_norm": 10.256707191467285, + "learning_rate": 4.361342619020437e-05, + "loss": 5.7659, + "step": 5700 + }, + { + "epoch": 0.6503787231619113, + "grad_norm": 8.072504997253418, + "learning_rate": 4.360200936179929e-05, + "loss": 5.7809, + "step": 5710 + }, + { + "epoch": 0.6515177401902158, + "grad_norm": 6.1178178787231445, + "learning_rate": 4.3590592533394225e-05, + "loss": 5.7802, + "step": 5720 + }, + { + "epoch": 0.6526567572185205, + "grad_norm": 10.861801147460938, + "learning_rate": 4.357917570498916e-05, + "loss": 6.0257, + "step": 5730 + }, + { + "epoch": 0.653795774246825, + "grad_norm": 7.259403705596924, + "learning_rate": 4.356775887658409e-05, + "loss": 5.8374, + "step": 5740 + }, + { + "epoch": 0.6549347912751295, + "grad_norm": 5.754521369934082, + "learning_rate": 4.3556342048179014e-05, + "loss": 6.006, + "step": 5750 + }, + { + "epoch": 0.6560738083034341, + "grad_norm": 10.971464157104492, + "learning_rate": 4.3544925219773946e-05, + "loss": 6.0546, + "step": 5760 + }, + { + "epoch": 0.6572128253317387, + "grad_norm": 4.653876781463623, + "learning_rate": 4.3533508391368884e-05, + "loss": 5.6189, + "step": 5770 + }, + { + "epoch": 0.6583518423600433, + "grad_norm": 14.550445556640625, + "learning_rate": 4.352209156296381e-05, + "loss": 6.1425, + "step": 5780 + }, + { + "epoch": 0.6594908593883478, + "grad_norm": 6.240013599395752, + "learning_rate": 4.351067473455874e-05, + "loss": 5.9321, + "step": 5790 + }, + { + "epoch": 0.6606298764166524, + "grad_norm": 10.119818687438965, + "learning_rate": 4.349925790615367e-05, + "loss": 5.9247, + "step": 5800 + }, + { + "epoch": 0.661768893444957, + "grad_norm": 8.448205947875977, 
+ "learning_rate": 4.3487841077748605e-05, + "loss": 6.3629, + "step": 5810 + }, + { + "epoch": 0.6629079104732616, + "grad_norm": 8.628769874572754, + "learning_rate": 4.3476424249343537e-05, + "loss": 6.2197, + "step": 5820 + }, + { + "epoch": 0.6640469275015661, + "grad_norm": 5.994671821594238, + "learning_rate": 4.346500742093846e-05, + "loss": 6.1132, + "step": 5830 + }, + { + "epoch": 0.6651859445298707, + "grad_norm": 26.20197868347168, + "learning_rate": 4.3453590592533393e-05, + "loss": 5.7933, + "step": 5840 + }, + { + "epoch": 0.6663249615581753, + "grad_norm": 10.80951976776123, + "learning_rate": 4.344217376412833e-05, + "loss": 5.7691, + "step": 5850 + }, + { + "epoch": 0.6674639785864799, + "grad_norm": 20.93010139465332, + "learning_rate": 4.343075693572326e-05, + "loss": 5.7721, + "step": 5860 + }, + { + "epoch": 0.6686029956147844, + "grad_norm": 7.294173240661621, + "learning_rate": 4.341934010731819e-05, + "loss": 5.8856, + "step": 5870 + }, + { + "epoch": 0.669742012643089, + "grad_norm": 7.660623073577881, + "learning_rate": 4.340792327891312e-05, + "loss": 5.8282, + "step": 5880 + }, + { + "epoch": 0.6708810296713936, + "grad_norm": 7.517411708831787, + "learning_rate": 4.339650645050805e-05, + "loss": 5.9608, + "step": 5890 + }, + { + "epoch": 0.6720200466996982, + "grad_norm": 8.141441345214844, + "learning_rate": 4.3385089622102985e-05, + "loss": 6.053, + "step": 5900 + }, + { + "epoch": 0.6731590637280027, + "grad_norm": 5.962698936462402, + "learning_rate": 4.337367279369791e-05, + "loss": 5.9031, + "step": 5910 + }, + { + "epoch": 0.6742980807563073, + "grad_norm": 5.588809013366699, + "learning_rate": 4.336225596529284e-05, + "loss": 5.7685, + "step": 5920 + }, + { + "epoch": 0.6754370977846119, + "grad_norm": 9.669516563415527, + "learning_rate": 4.335083913688777e-05, + "loss": 5.9723, + "step": 5930 + }, + { + "epoch": 0.6765761148129165, + "grad_norm": 7.2432756423950195, + "learning_rate": 4.3339422308482705e-05, + "loss": 
5.8062, + "step": 5940 + }, + { + "epoch": 0.677715131841221, + "grad_norm": 8.40849494934082, + "learning_rate": 4.332800548007764e-05, + "loss": 5.6209, + "step": 5950 + }, + { + "epoch": 0.6788541488695256, + "grad_norm": 17.449857711791992, + "learning_rate": 4.331658865167257e-05, + "loss": 5.8675, + "step": 5960 + }, + { + "epoch": 0.6799931658978302, + "grad_norm": 4.8293328285217285, + "learning_rate": 4.33051718232675e-05, + "loss": 5.8157, + "step": 5970 + }, + { + "epoch": 0.6811321829261348, + "grad_norm": 11.033587455749512, + "learning_rate": 4.3293754994862426e-05, + "loss": 5.847, + "step": 5980 + }, + { + "epoch": 0.6822711999544393, + "grad_norm": 6.28988790512085, + "learning_rate": 4.328233816645736e-05, + "loss": 5.7486, + "step": 5990 + }, + { + "epoch": 0.6834102169827438, + "grad_norm": 6.424103736877441, + "learning_rate": 4.327092133805229e-05, + "loss": 5.9153, + "step": 6000 + }, + { + "epoch": 0.6834102169827438, + "eval_loss": 6.000249862670898, + "eval_runtime": 11.8864, + "eval_samples_per_second": 1.262, + "eval_steps_per_second": 0.168, + "step": 6000 + }, + { + "epoch": 0.6845492340110485, + "grad_norm": 6.374187469482422, + "learning_rate": 4.325950450964722e-05, + "loss": 5.7432, + "step": 6010 + }, + { + "epoch": 0.685688251039353, + "grad_norm": 8.024246215820312, + "learning_rate": 4.324808768124215e-05, + "loss": 6.0022, + "step": 6020 + }, + { + "epoch": 0.6868272680676576, + "grad_norm": 9.164938926696777, + "learning_rate": 4.3236670852837085e-05, + "loss": 6.0697, + "step": 6030 + }, + { + "epoch": 0.6879662850959621, + "grad_norm": 11.665236473083496, + "learning_rate": 4.322525402443202e-05, + "loss": 5.997, + "step": 6040 + }, + { + "epoch": 0.6891053021242668, + "grad_norm": 6.842959403991699, + "learning_rate": 4.321383719602695e-05, + "loss": 5.3779, + "step": 6050 + }, + { + "epoch": 0.6902443191525713, + "grad_norm": 6.97825288772583, + "learning_rate": 4.3202420367621874e-05, + "loss": 5.7943, + "step": 6060 + 
}, + { + "epoch": 0.6913833361808759, + "grad_norm": 7.083444118499756, + "learning_rate": 4.3191003539216805e-05, + "loss": 5.707, + "step": 6070 + }, + { + "epoch": 0.6925223532091804, + "grad_norm": 6.394343852996826, + "learning_rate": 4.317958671081174e-05, + "loss": 6.012, + "step": 6080 + }, + { + "epoch": 0.6936613702374851, + "grad_norm": 6.833849906921387, + "learning_rate": 4.316816988240667e-05, + "loss": 6.1103, + "step": 6090 + }, + { + "epoch": 0.6948003872657896, + "grad_norm": 5.809621810913086, + "learning_rate": 4.31567530540016e-05, + "loss": 5.968, + "step": 6100 + }, + { + "epoch": 0.6959394042940942, + "grad_norm": 6.108304023742676, + "learning_rate": 4.314533622559653e-05, + "loss": 5.9533, + "step": 6110 + }, + { + "epoch": 0.6970784213223987, + "grad_norm": 8.843610763549805, + "learning_rate": 4.3133919397191465e-05, + "loss": 5.7717, + "step": 6120 + }, + { + "epoch": 0.6982174383507034, + "grad_norm": 6.577777862548828, + "learning_rate": 4.3122502568786396e-05, + "loss": 5.9296, + "step": 6130 + }, + { + "epoch": 0.6993564553790079, + "grad_norm": 6.981563091278076, + "learning_rate": 4.311108574038132e-05, + "loss": 6.0679, + "step": 6140 + }, + { + "epoch": 0.7004954724073125, + "grad_norm": 7.8999433517456055, + "learning_rate": 4.309966891197625e-05, + "loss": 5.8404, + "step": 6150 + }, + { + "epoch": 0.701634489435617, + "grad_norm": 5.662416934967041, + "learning_rate": 4.3088252083571185e-05, + "loss": 6.1544, + "step": 6160 + }, + { + "epoch": 0.7027735064639217, + "grad_norm": 12.454471588134766, + "learning_rate": 4.307683525516612e-05, + "loss": 5.7012, + "step": 6170 + }, + { + "epoch": 0.7039125234922262, + "grad_norm": 11.734405517578125, + "learning_rate": 4.306541842676105e-05, + "loss": 5.7161, + "step": 6180 + }, + { + "epoch": 0.7050515405205308, + "grad_norm": 7.174385070800781, + "learning_rate": 4.305400159835598e-05, + "loss": 5.6746, + "step": 6190 + }, + { + "epoch": 0.7061905575488353, + "grad_norm": 
5.351472854614258, + "learning_rate": 4.304258476995091e-05, + "loss": 5.8431, + "step": 6200 + }, + { + "epoch": 0.70732957457714, + "grad_norm": 5.916141986846924, + "learning_rate": 4.3031167941545844e-05, + "loss": 5.9033, + "step": 6210 + }, + { + "epoch": 0.7084685916054445, + "grad_norm": 7.05497932434082, + "learning_rate": 4.301975111314077e-05, + "loss": 5.6192, + "step": 6220 + }, + { + "epoch": 0.709607608633749, + "grad_norm": 14.515453338623047, + "learning_rate": 4.30083342847357e-05, + "loss": 6.2751, + "step": 6230 + }, + { + "epoch": 0.7107466256620536, + "grad_norm": 9.84923267364502, + "learning_rate": 4.299691745633063e-05, + "loss": 5.9913, + "step": 6240 + }, + { + "epoch": 0.7118856426903583, + "grad_norm": 8.738007545471191, + "learning_rate": 4.2985500627925565e-05, + "loss": 5.8785, + "step": 6250 + }, + { + "epoch": 0.7130246597186628, + "grad_norm": 5.9236860275268555, + "learning_rate": 4.297408379952049e-05, + "loss": 5.8098, + "step": 6260 + }, + { + "epoch": 0.7141636767469673, + "grad_norm": 6.191288471221924, + "learning_rate": 4.296266697111543e-05, + "loss": 6.0675, + "step": 6270 + }, + { + "epoch": 0.7153026937752719, + "grad_norm": 5.791018486022949, + "learning_rate": 4.295125014271036e-05, + "loss": 6.1648, + "step": 6280 + }, + { + "epoch": 0.7164417108035765, + "grad_norm": 10.756135940551758, + "learning_rate": 4.2939833314305286e-05, + "loss": 6.0626, + "step": 6290 + }, + { + "epoch": 0.7175807278318811, + "grad_norm": 7.2194976806640625, + "learning_rate": 4.292841648590022e-05, + "loss": 5.7357, + "step": 6300 + }, + { + "epoch": 0.7187197448601856, + "grad_norm": 6.4318318367004395, + "learning_rate": 4.291699965749515e-05, + "loss": 5.7417, + "step": 6310 + }, + { + "epoch": 0.7198587618884902, + "grad_norm": 12.609630584716797, + "learning_rate": 4.290558282909008e-05, + "loss": 5.7873, + "step": 6320 + }, + { + "epoch": 0.7209977789167948, + "grad_norm": 6.773333549499512, + "learning_rate": 
4.289416600068501e-05, + "loss": 6.0641, + "step": 6330 + }, + { + "epoch": 0.7221367959450994, + "grad_norm": 7.28901481628418, + "learning_rate": 4.288274917227994e-05, + "loss": 5.7901, + "step": 6340 + }, + { + "epoch": 0.7232758129734039, + "grad_norm": 7.068140029907227, + "learning_rate": 4.2871332343874877e-05, + "loss": 5.9178, + "step": 6350 + }, + { + "epoch": 0.7244148300017085, + "grad_norm": 6.572689533233643, + "learning_rate": 4.285991551546981e-05, + "loss": 5.7508, + "step": 6360 + }, + { + "epoch": 0.7255538470300131, + "grad_norm": 7.898672580718994, + "learning_rate": 4.2848498687064733e-05, + "loss": 6.1836, + "step": 6370 + }, + { + "epoch": 0.7266928640583177, + "grad_norm": 5.110747337341309, + "learning_rate": 4.2837081858659665e-05, + "loss": 5.6782, + "step": 6380 + }, + { + "epoch": 0.7278318810866222, + "grad_norm": 10.442312240600586, + "learning_rate": 4.28256650302546e-05, + "loss": 6.0124, + "step": 6390 + }, + { + "epoch": 0.7289708981149268, + "grad_norm": 9.792623519897461, + "learning_rate": 4.281424820184953e-05, + "loss": 5.7531, + "step": 6400 + }, + { + "epoch": 0.7301099151432314, + "grad_norm": 12.85150146484375, + "learning_rate": 4.280283137344446e-05, + "loss": 5.7829, + "step": 6410 + }, + { + "epoch": 0.731248932171536, + "grad_norm": 9.494611740112305, + "learning_rate": 4.2791414545039386e-05, + "loss": 6.0707, + "step": 6420 + }, + { + "epoch": 0.7323879491998405, + "grad_norm": 8.019498825073242, + "learning_rate": 4.2779997716634324e-05, + "loss": 5.9736, + "step": 6430 + }, + { + "epoch": 0.7335269662281451, + "grad_norm": 6.833261013031006, + "learning_rate": 4.2768580888229256e-05, + "loss": 5.6632, + "step": 6440 + }, + { + "epoch": 0.7346659832564497, + "grad_norm": 6.260386943817139, + "learning_rate": 4.275716405982418e-05, + "loss": 5.7989, + "step": 6450 + }, + { + "epoch": 0.7358050002847543, + "grad_norm": 11.511929512023926, + "learning_rate": 4.274574723141911e-05, + "loss": 6.0158, + "step": 6460 + 
}, + { + "epoch": 0.7369440173130588, + "grad_norm": 6.045806407928467, + "learning_rate": 4.2734330403014045e-05, + "loss": 5.6529, + "step": 6470 + }, + { + "epoch": 0.7380830343413634, + "grad_norm": 5.465932369232178, + "learning_rate": 4.272291357460898e-05, + "loss": 6.1358, + "step": 6480 + }, + { + "epoch": 0.739222051369668, + "grad_norm": 4.64259147644043, + "learning_rate": 4.27114967462039e-05, + "loss": 5.8446, + "step": 6490 + }, + { + "epoch": 0.7403610683979726, + "grad_norm": 11.950940132141113, + "learning_rate": 4.2700079917798834e-05, + "loss": 5.5124, + "step": 6500 + }, + { + "epoch": 0.7415000854262771, + "grad_norm": 11.80821418762207, + "learning_rate": 4.268866308939377e-05, + "loss": 5.3712, + "step": 6510 + }, + { + "epoch": 0.7426391024545816, + "grad_norm": 7.256740093231201, + "learning_rate": 4.2677246260988704e-05, + "loss": 5.6585, + "step": 6520 + }, + { + "epoch": 0.7437781194828863, + "grad_norm": 9.645147323608398, + "learning_rate": 4.266582943258363e-05, + "loss": 6.0998, + "step": 6530 + }, + { + "epoch": 0.7449171365111908, + "grad_norm": 6.811032295227051, + "learning_rate": 4.265441260417856e-05, + "loss": 5.983, + "step": 6540 + }, + { + "epoch": 0.7460561535394954, + "grad_norm": 7.274852752685547, + "learning_rate": 4.264299577577349e-05, + "loss": 5.8492, + "step": 6550 + }, + { + "epoch": 0.7471951705677999, + "grad_norm": 14.114151954650879, + "learning_rate": 4.2631578947368425e-05, + "loss": 5.8024, + "step": 6560 + }, + { + "epoch": 0.7483341875961046, + "grad_norm": 5.180044651031494, + "learning_rate": 4.262016211896335e-05, + "loss": 5.8085, + "step": 6570 + }, + { + "epoch": 0.7494732046244091, + "grad_norm": 7.047656536102295, + "learning_rate": 4.260874529055828e-05, + "loss": 5.9068, + "step": 6580 + }, + { + "epoch": 0.7506122216527137, + "grad_norm": 5.658132553100586, + "learning_rate": 4.2597328462153214e-05, + "loss": 5.7752, + "step": 6590 + }, + { + "epoch": 0.7517512386810182, + "grad_norm": 
7.832466125488281, + "learning_rate": 4.258591163374815e-05, + "loss": 5.9246, + "step": 6600 + }, + { + "epoch": 0.7528902557093229, + "grad_norm": 7.179642200469971, + "learning_rate": 4.257449480534308e-05, + "loss": 5.8667, + "step": 6610 + }, + { + "epoch": 0.7540292727376274, + "grad_norm": 7.623144149780273, + "learning_rate": 4.256307797693801e-05, + "loss": 6.0546, + "step": 6620 + }, + { + "epoch": 0.755168289765932, + "grad_norm": 8.365649223327637, + "learning_rate": 4.255166114853294e-05, + "loss": 5.8384, + "step": 6630 + }, + { + "epoch": 0.7563073067942365, + "grad_norm": 5.382843494415283, + "learning_rate": 4.254024432012787e-05, + "loss": 5.6101, + "step": 6640 + }, + { + "epoch": 0.7574463238225412, + "grad_norm": 6.739308834075928, + "learning_rate": 4.25288274917228e-05, + "loss": 5.5094, + "step": 6650 + }, + { + "epoch": 0.7585853408508457, + "grad_norm": 14.110841751098633, + "learning_rate": 4.251741066331773e-05, + "loss": 5.6044, + "step": 6660 + }, + { + "epoch": 0.7597243578791503, + "grad_norm": 11.060261726379395, + "learning_rate": 4.250599383491266e-05, + "loss": 5.8924, + "step": 6670 + }, + { + "epoch": 0.7608633749074548, + "grad_norm": 10.84189510345459, + "learning_rate": 4.249457700650759e-05, + "loss": 5.867, + "step": 6680 + }, + { + "epoch": 0.7620023919357595, + "grad_norm": 8.165534019470215, + "learning_rate": 4.2483160178102525e-05, + "loss": 5.894, + "step": 6690 + }, + { + "epoch": 0.763141408964064, + "grad_norm": 13.278603553771973, + "learning_rate": 4.247174334969746e-05, + "loss": 5.7121, + "step": 6700 + }, + { + "epoch": 0.7642804259923686, + "grad_norm": 4.628503799438477, + "learning_rate": 4.246032652129239e-05, + "loss": 6.1128, + "step": 6710 + }, + { + "epoch": 0.7654194430206731, + "grad_norm": 12.247485160827637, + "learning_rate": 4.244890969288732e-05, + "loss": 5.8867, + "step": 6720 + }, + { + "epoch": 0.7665584600489778, + "grad_norm": 7.217540740966797, + "learning_rate": 4.2437492864482246e-05, 
+ "loss": 5.7733, + "step": 6730 + }, + { + "epoch": 0.7676974770772823, + "grad_norm": 7.513230800628662, + "learning_rate": 4.242607603607718e-05, + "loss": 6.0099, + "step": 6740 + }, + { + "epoch": 0.7688364941055869, + "grad_norm": 7.3117499351501465, + "learning_rate": 4.241465920767211e-05, + "loss": 5.6005, + "step": 6750 + }, + { + "epoch": 0.7699755111338914, + "grad_norm": 6.87970495223999, + "learning_rate": 4.240324237926704e-05, + "loss": 6.2145, + "step": 6760 + }, + { + "epoch": 0.7711145281621961, + "grad_norm": 22.215946197509766, + "learning_rate": 4.239182555086197e-05, + "loss": 5.6321, + "step": 6770 + }, + { + "epoch": 0.7722535451905006, + "grad_norm": 8.393705368041992, + "learning_rate": 4.2380408722456905e-05, + "loss": 5.8547, + "step": 6780 + }, + { + "epoch": 0.7733925622188051, + "grad_norm": 6.210604190826416, + "learning_rate": 4.236899189405184e-05, + "loss": 5.6622, + "step": 6790 + }, + { + "epoch": 0.7745315792471097, + "grad_norm": 8.281871795654297, + "learning_rate": 4.235757506564676e-05, + "loss": 5.5941, + "step": 6800 + }, + { + "epoch": 0.7756705962754143, + "grad_norm": 8.00438404083252, + "learning_rate": 4.2346158237241694e-05, + "loss": 6.1111, + "step": 6810 + }, + { + "epoch": 0.7768096133037189, + "grad_norm": 16.651893615722656, + "learning_rate": 4.2334741408836625e-05, + "loss": 6.0269, + "step": 6820 + }, + { + "epoch": 0.7779486303320234, + "grad_norm": 8.054128646850586, + "learning_rate": 4.232332458043156e-05, + "loss": 5.5274, + "step": 6830 + }, + { + "epoch": 0.779087647360328, + "grad_norm": 6.785789489746094, + "learning_rate": 4.231190775202649e-05, + "loss": 6.1035, + "step": 6840 + }, + { + "epoch": 0.7802266643886326, + "grad_norm": 8.805694580078125, + "learning_rate": 4.230049092362142e-05, + "loss": 6.0284, + "step": 6850 + }, + { + "epoch": 0.7813656814169372, + "grad_norm": 8.542845726013184, + "learning_rate": 4.228907409521635e-05, + "loss": 5.5808, + "step": 6860 + }, + { + "epoch": 
0.7825046984452417, + "grad_norm": 10.117310523986816, + "learning_rate": 4.2277657266811285e-05, + "loss": 5.6974, + "step": 6870 + }, + { + "epoch": 0.7836437154735463, + "grad_norm": 7.026234149932861, + "learning_rate": 4.226624043840621e-05, + "loss": 5.6559, + "step": 6880 + }, + { + "epoch": 0.7847827325018509, + "grad_norm": 6.190097332000732, + "learning_rate": 4.225482361000114e-05, + "loss": 5.6631, + "step": 6890 + }, + { + "epoch": 0.7859217495301555, + "grad_norm": 5.212761878967285, + "learning_rate": 4.2243406781596073e-05, + "loss": 6.198, + "step": 6900 + }, + { + "epoch": 0.78706076655846, + "grad_norm": 4.255821228027344, + "learning_rate": 4.2231989953191005e-05, + "loss": 5.9244, + "step": 6910 + }, + { + "epoch": 0.7881997835867646, + "grad_norm": 3.5964879989624023, + "learning_rate": 4.222057312478594e-05, + "loss": 6.0321, + "step": 6920 + }, + { + "epoch": 0.7893388006150692, + "grad_norm": 5.330949783325195, + "learning_rate": 4.220915629638087e-05, + "loss": 5.7435, + "step": 6930 + }, + { + "epoch": 0.7904778176433738, + "grad_norm": 17.729000091552734, + "learning_rate": 4.21977394679758e-05, + "loss": 5.7756, + "step": 6940 + }, + { + "epoch": 0.7916168346716783, + "grad_norm": 4.763799667358398, + "learning_rate": 4.218632263957073e-05, + "loss": 5.8928, + "step": 6950 + }, + { + "epoch": 0.7927558516999829, + "grad_norm": 6.26491641998291, + "learning_rate": 4.217490581116566e-05, + "loss": 5.7309, + "step": 6960 + }, + { + "epoch": 0.7938948687282875, + "grad_norm": 6.810174465179443, + "learning_rate": 4.216348898276059e-05, + "loss": 5.6357, + "step": 6970 + }, + { + "epoch": 0.7950338857565921, + "grad_norm": 7.7829718589782715, + "learning_rate": 4.215207215435552e-05, + "loss": 5.7724, + "step": 6980 + }, + { + "epoch": 0.7961729027848966, + "grad_norm": 15.339445114135742, + "learning_rate": 4.214065532595045e-05, + "loss": 5.5577, + "step": 6990 + }, + { + "epoch": 0.7973119198132012, + "grad_norm": 9.3477201461792, + 
"learning_rate": 4.212923849754538e-05, + "loss": 6.085, + "step": 7000 + }, + { + "epoch": 0.7984509368415058, + "grad_norm": 5.963769435882568, + "learning_rate": 4.211782166914032e-05, + "loss": 5.9804, + "step": 7010 + }, + { + "epoch": 0.7995899538698104, + "grad_norm": 7.498968124389648, + "learning_rate": 4.210640484073525e-05, + "loss": 5.8361, + "step": 7020 + }, + { + "epoch": 0.8007289708981149, + "grad_norm": 15.94110107421875, + "learning_rate": 4.209498801233018e-05, + "loss": 5.8782, + "step": 7030 + }, + { + "epoch": 0.8018679879264194, + "grad_norm": 6.359627723693848, + "learning_rate": 4.2083571183925106e-05, + "loss": 5.9358, + "step": 7040 + }, + { + "epoch": 0.8030070049547241, + "grad_norm": 4.368448257446289, + "learning_rate": 4.207215435552004e-05, + "loss": 5.7184, + "step": 7050 + }, + { + "epoch": 0.8041460219830286, + "grad_norm": 8.287890434265137, + "learning_rate": 4.206073752711497e-05, + "loss": 5.7268, + "step": 7060 + }, + { + "epoch": 0.8052850390113332, + "grad_norm": 3.759228467941284, + "learning_rate": 4.20493206987099e-05, + "loss": 6.0169, + "step": 7070 + }, + { + "epoch": 0.8064240560396377, + "grad_norm": 7.766053199768066, + "learning_rate": 4.2037903870304826e-05, + "loss": 5.9249, + "step": 7080 + }, + { + "epoch": 0.8075630730679424, + "grad_norm": 5.910696029663086, + "learning_rate": 4.2026487041899765e-05, + "loss": 5.6922, + "step": 7090 + }, + { + "epoch": 0.8087020900962469, + "grad_norm": 11.534826278686523, + "learning_rate": 4.2015070213494697e-05, + "loss": 5.8633, + "step": 7100 + }, + { + "epoch": 0.8098411071245515, + "grad_norm": 11.53891372680664, + "learning_rate": 4.200365338508963e-05, + "loss": 6.0148, + "step": 7110 + }, + { + "epoch": 0.810980124152856, + "grad_norm": 9.300012588500977, + "learning_rate": 4.1992236556684553e-05, + "loss": 5.6454, + "step": 7120 + }, + { + "epoch": 0.8121191411811607, + "grad_norm": 10.440338134765625, + "learning_rate": 4.1980819728279485e-05, + "loss": 5.6689, 
+ "step": 7130 + }, + { + "epoch": 0.8132581582094652, + "grad_norm": 5.3286542892456055, + "learning_rate": 4.196940289987442e-05, + "loss": 5.7747, + "step": 7140 + }, + { + "epoch": 0.8143971752377698, + "grad_norm": 6.505975723266602, + "learning_rate": 4.195798607146935e-05, + "loss": 5.6918, + "step": 7150 + }, + { + "epoch": 0.8155361922660743, + "grad_norm": 4.70256233215332, + "learning_rate": 4.1946569243064274e-05, + "loss": 5.6079, + "step": 7160 + }, + { + "epoch": 0.816675209294379, + "grad_norm": 4.606108665466309, + "learning_rate": 4.193515241465921e-05, + "loss": 5.7555, + "step": 7170 + }, + { + "epoch": 0.8178142263226835, + "grad_norm": 5.840761661529541, + "learning_rate": 4.1923735586254145e-05, + "loss": 5.9702, + "step": 7180 + }, + { + "epoch": 0.8189532433509881, + "grad_norm": 7.519806385040283, + "learning_rate": 4.191231875784907e-05, + "loss": 5.8691, + "step": 7190 + }, + { + "epoch": 0.8200922603792927, + "grad_norm": 9.685820579528809, + "learning_rate": 4.1900901929444e-05, + "loss": 5.7984, + "step": 7200 + }, + { + "epoch": 0.8212312774075973, + "grad_norm": 5.0011515617370605, + "learning_rate": 4.188948510103893e-05, + "loss": 5.6861, + "step": 7210 + }, + { + "epoch": 0.8223702944359018, + "grad_norm": 7.479933738708496, + "learning_rate": 4.1878068272633865e-05, + "loss": 5.8211, + "step": 7220 + }, + { + "epoch": 0.8235093114642064, + "grad_norm": 24.248559951782227, + "learning_rate": 4.18666514442288e-05, + "loss": 6.106, + "step": 7230 + }, + { + "epoch": 0.824648328492511, + "grad_norm": 5.33579158782959, + "learning_rate": 4.185523461582372e-05, + "loss": 5.8561, + "step": 7240 + }, + { + "epoch": 0.8257873455208156, + "grad_norm": 5.8180341720581055, + "learning_rate": 4.1844959470259165e-05, + "loss": 5.671, + "step": 7250 + }, + { + "epoch": 0.8269263625491201, + "grad_norm": 7.822412967681885, + "learning_rate": 4.1833542641854097e-05, + "loss": 5.7731, + "step": 7260 + }, + { + "epoch": 0.8280653795774247, + 
"grad_norm": 4.881489276885986, + "learning_rate": 4.182212581344903e-05, + "loss": 6.17, + "step": 7270 + }, + { + "epoch": 0.8292043966057293, + "grad_norm": 9.92574691772461, + "learning_rate": 4.1810708985043953e-05, + "loss": 5.6865, + "step": 7280 + }, + { + "epoch": 0.8303434136340339, + "grad_norm": 6.111751556396484, + "learning_rate": 4.1799292156638885e-05, + "loss": 5.9666, + "step": 7290 + }, + { + "epoch": 0.8314824306623384, + "grad_norm": 16.988025665283203, + "learning_rate": 4.178787532823382e-05, + "loss": 5.5669, + "step": 7300 + }, + { + "epoch": 0.832621447690643, + "grad_norm": 7.006514549255371, + "learning_rate": 4.177645849982875e-05, + "loss": 5.8763, + "step": 7310 + }, + { + "epoch": 0.8337604647189476, + "grad_norm": 6.63994836807251, + "learning_rate": 4.176504167142368e-05, + "loss": 5.6928, + "step": 7320 + }, + { + "epoch": 0.8348994817472521, + "grad_norm": 7.609856128692627, + "learning_rate": 4.175362484301861e-05, + "loss": 6.0222, + "step": 7330 + }, + { + "epoch": 0.8360384987755567, + "grad_norm": 6.351384162902832, + "learning_rate": 4.1742208014613544e-05, + "loss": 6.1061, + "step": 7340 + }, + { + "epoch": 0.8371775158038612, + "grad_norm": 9.336108207702637, + "learning_rate": 4.1730791186208476e-05, + "loss": 5.681, + "step": 7350 + }, + { + "epoch": 0.8383165328321659, + "grad_norm": 6.0924272537231445, + "learning_rate": 4.17193743578034e-05, + "loss": 6.0066, + "step": 7360 + }, + { + "epoch": 0.8394555498604704, + "grad_norm": 9.09196662902832, + "learning_rate": 4.170795752939833e-05, + "loss": 6.15, + "step": 7370 + }, + { + "epoch": 0.840594566888775, + "grad_norm": 5.710347652435303, + "learning_rate": 4.1696540700993265e-05, + "loss": 5.6897, + "step": 7380 + }, + { + "epoch": 0.8417335839170795, + "grad_norm": 5.6023688316345215, + "learning_rate": 4.16851238725882e-05, + "loss": 5.6246, + "step": 7390 + }, + { + "epoch": 0.8428726009453842, + "grad_norm": 10.401467323303223, + "learning_rate": 
4.167370704418313e-05, + "loss": 5.9081, + "step": 7400 + }, + { + "epoch": 0.8440116179736887, + "grad_norm": 11.755331039428711, + "learning_rate": 4.166229021577806e-05, + "loss": 5.9648, + "step": 7410 + }, + { + "epoch": 0.8451506350019933, + "grad_norm": 5.7525248527526855, + "learning_rate": 4.165087338737299e-05, + "loss": 5.7465, + "step": 7420 + }, + { + "epoch": 0.8462896520302978, + "grad_norm": 8.131318092346191, + "learning_rate": 4.1639456558967924e-05, + "loss": 5.8959, + "step": 7430 + }, + { + "epoch": 0.8474286690586025, + "grad_norm": 6.65851354598999, + "learning_rate": 4.162803973056285e-05, + "loss": 5.5865, + "step": 7440 + }, + { + "epoch": 0.848567686086907, + "grad_norm": 5.441812992095947, + "learning_rate": 4.161662290215778e-05, + "loss": 5.6765, + "step": 7450 + }, + { + "epoch": 0.8497067031152116, + "grad_norm": 11.929362297058105, + "learning_rate": 4.160520607375271e-05, + "loss": 5.6566, + "step": 7460 + }, + { + "epoch": 0.8508457201435161, + "grad_norm": 10.595314025878906, + "learning_rate": 4.1593789245347645e-05, + "loss": 5.9156, + "step": 7470 + }, + { + "epoch": 0.8519847371718208, + "grad_norm": 25.11834144592285, + "learning_rate": 4.158237241694258e-05, + "loss": 5.8055, + "step": 7480 + }, + { + "epoch": 0.8531237542001253, + "grad_norm": 5.37290620803833, + "learning_rate": 4.157095558853751e-05, + "loss": 5.8243, + "step": 7490 + }, + { + "epoch": 0.8542627712284299, + "grad_norm": 10.526466369628906, + "learning_rate": 4.155953876013244e-05, + "loss": 5.9527, + "step": 7500 + }, + { + "epoch": 0.8554017882567344, + "grad_norm": 7.707073211669922, + "learning_rate": 4.1548121931727365e-05, + "loss": 5.6988, + "step": 7510 + }, + { + "epoch": 0.8565408052850391, + "grad_norm": 7.538547992706299, + "learning_rate": 4.15367051033223e-05, + "loss": 5.7646, + "step": 7520 + }, + { + "epoch": 0.8576798223133436, + "grad_norm": 9.679366111755371, + "learning_rate": 4.152528827491723e-05, + "loss": 5.7077, + "step": 7530 + 
}, + { + "epoch": 0.8588188393416482, + "grad_norm": 10.528566360473633, + "learning_rate": 4.151387144651216e-05, + "loss": 5.9138, + "step": 7540 + }, + { + "epoch": 0.8599578563699527, + "grad_norm": 7.699685573577881, + "learning_rate": 4.150245461810709e-05, + "loss": 5.7155, + "step": 7550 + }, + { + "epoch": 0.8610968733982574, + "grad_norm": 5.603579998016357, + "learning_rate": 4.1491037789702025e-05, + "loss": 5.5373, + "step": 7560 + }, + { + "epoch": 0.8622358904265619, + "grad_norm": 6.505054950714111, + "learning_rate": 4.1479620961296956e-05, + "loss": 5.6922, + "step": 7570 + }, + { + "epoch": 0.8633749074548664, + "grad_norm": 6.861274242401123, + "learning_rate": 4.146820413289189e-05, + "loss": 6.0833, + "step": 7580 + }, + { + "epoch": 0.864513924483171, + "grad_norm": 6.467966079711914, + "learning_rate": 4.145678730448681e-05, + "loss": 5.8859, + "step": 7590 + }, + { + "epoch": 0.8656529415114756, + "grad_norm": 6.942263603210449, + "learning_rate": 4.1445370476081745e-05, + "loss": 5.8636, + "step": 7600 + }, + { + "epoch": 0.8667919585397802, + "grad_norm": 6.266470432281494, + "learning_rate": 4.143395364767668e-05, + "loss": 5.7906, + "step": 7610 + }, + { + "epoch": 0.8679309755680847, + "grad_norm": 10.374220848083496, + "learning_rate": 4.142253681927161e-05, + "loss": 5.7794, + "step": 7620 + }, + { + "epoch": 0.8690699925963893, + "grad_norm": 8.868586540222168, + "learning_rate": 4.1411119990866534e-05, + "loss": 6.1445, + "step": 7630 + }, + { + "epoch": 0.8702090096246939, + "grad_norm": 7.694916725158691, + "learning_rate": 4.139970316246147e-05, + "loss": 5.7188, + "step": 7640 + }, + { + "epoch": 0.8713480266529985, + "grad_norm": 9.376993179321289, + "learning_rate": 4.1388286334056404e-05, + "loss": 5.5129, + "step": 7650 + }, + { + "epoch": 0.872487043681303, + "grad_norm": 7.108951568603516, + "learning_rate": 4.1376869505651336e-05, + "loss": 6.0777, + "step": 7660 + }, + { + "epoch": 0.8736260607096076, + "grad_norm": 
5.349836349487305, + "learning_rate": 4.136545267724626e-05, + "loss": 5.6746, + "step": 7670 + }, + { + "epoch": 0.8747650777379122, + "grad_norm": 23.05253028869629, + "learning_rate": 4.135403584884119e-05, + "loss": 5.6119, + "step": 7680 + }, + { + "epoch": 0.8759040947662168, + "grad_norm": 8.042037963867188, + "learning_rate": 4.1342619020436125e-05, + "loss": 6.0481, + "step": 7690 + }, + { + "epoch": 0.8770431117945213, + "grad_norm": 7.190995216369629, + "learning_rate": 4.133120219203106e-05, + "loss": 5.6291, + "step": 7700 + }, + { + "epoch": 0.8781821288228259, + "grad_norm": 6.2798871994018555, + "learning_rate": 4.131978536362598e-05, + "loss": 5.8272, + "step": 7710 + }, + { + "epoch": 0.8793211458511305, + "grad_norm": 6.415441036224365, + "learning_rate": 4.1308368535220914e-05, + "loss": 5.9067, + "step": 7720 + }, + { + "epoch": 0.8804601628794351, + "grad_norm": 9.83820915222168, + "learning_rate": 4.129695170681585e-05, + "loss": 6.2126, + "step": 7730 + }, + { + "epoch": 0.8815991799077396, + "grad_norm": 6.543364524841309, + "learning_rate": 4.1285534878410784e-05, + "loss": 5.7905, + "step": 7740 + }, + { + "epoch": 0.8827381969360442, + "grad_norm": 5.862452507019043, + "learning_rate": 4.127411805000571e-05, + "loss": 5.6371, + "step": 7750 + }, + { + "epoch": 0.8838772139643488, + "grad_norm": 6.425840377807617, + "learning_rate": 4.126270122160064e-05, + "loss": 5.6843, + "step": 7760 + }, + { + "epoch": 0.8850162309926534, + "grad_norm": 7.781904697418213, + "learning_rate": 4.125128439319557e-05, + "loss": 5.6806, + "step": 7770 + }, + { + "epoch": 0.8861552480209579, + "grad_norm": 6.408961772918701, + "learning_rate": 4.1239867564790505e-05, + "loss": 6.0425, + "step": 7780 + }, + { + "epoch": 0.8872942650492625, + "grad_norm": 6.187387943267822, + "learning_rate": 4.122845073638543e-05, + "loss": 6.662, + "step": 7790 + }, + { + "epoch": 0.8884332820775671, + "grad_norm": 5.426062107086182, + "learning_rate": 
4.121703390798036e-05, + "loss": 5.5295, + "step": 7800 + }, + { + "epoch": 0.8895722991058717, + "grad_norm": 6.168425559997559, + "learning_rate": 4.12056170795753e-05, + "loss": 5.8367, + "step": 7810 + }, + { + "epoch": 0.8907113161341762, + "grad_norm": 6.9254679679870605, + "learning_rate": 4.1194200251170225e-05, + "loss": 5.6099, + "step": 7820 + }, + { + "epoch": 0.8918503331624807, + "grad_norm": 4.471135139465332, + "learning_rate": 4.118278342276516e-05, + "loss": 5.8812, + "step": 7830 + }, + { + "epoch": 0.8929893501907854, + "grad_norm": 11.914435386657715, + "learning_rate": 4.117136659436009e-05, + "loss": 5.7133, + "step": 7840 + }, + { + "epoch": 0.89412836721909, + "grad_norm": 6.855250835418701, + "learning_rate": 4.115994976595502e-05, + "loss": 5.5045, + "step": 7850 + }, + { + "epoch": 0.8952673842473945, + "grad_norm": 4.5126190185546875, + "learning_rate": 4.114853293754995e-05, + "loss": 6.1189, + "step": 7860 + }, + { + "epoch": 0.896406401275699, + "grad_norm": 5.44443941116333, + "learning_rate": 4.113711610914488e-05, + "loss": 5.6838, + "step": 7870 + }, + { + "epoch": 0.8975454183040037, + "grad_norm": 6.447963237762451, + "learning_rate": 4.112569928073981e-05, + "loss": 5.7742, + "step": 7880 + }, + { + "epoch": 0.8986844353323082, + "grad_norm": 9.784255981445312, + "learning_rate": 4.111428245233475e-05, + "loss": 5.6421, + "step": 7890 + }, + { + "epoch": 0.8998234523606128, + "grad_norm": 4.79591178894043, + "learning_rate": 4.110286562392967e-05, + "loss": 5.8867, + "step": 7900 + }, + { + "epoch": 0.9009624693889173, + "grad_norm": 10.118879318237305, + "learning_rate": 4.1091448795524605e-05, + "loss": 5.7606, + "step": 7910 + }, + { + "epoch": 0.902101486417222, + "grad_norm": 4.7326483726501465, + "learning_rate": 4.108003196711954e-05, + "loss": 5.8425, + "step": 7920 + }, + { + "epoch": 0.9032405034455265, + "grad_norm": 9.731658935546875, + "learning_rate": 4.106861513871447e-05, + "loss": 5.6166, + "step": 7930 + }, + 
{ + "epoch": 0.9043795204738311, + "grad_norm": 9.106302261352539, + "learning_rate": 4.1057198310309394e-05, + "loss": 5.9631, + "step": 7940 + }, + { + "epoch": 0.9055185375021356, + "grad_norm": 8.407746315002441, + "learning_rate": 4.1045781481904326e-05, + "loss": 5.9826, + "step": 7950 + }, + { + "epoch": 0.9066575545304403, + "grad_norm": 7.38493537902832, + "learning_rate": 4.103436465349926e-05, + "loss": 5.3733, + "step": 7960 + }, + { + "epoch": 0.9077965715587448, + "grad_norm": 4.594963550567627, + "learning_rate": 4.1022947825094196e-05, + "loss": 5.5044, + "step": 7970 + }, + { + "epoch": 0.9089355885870494, + "grad_norm": 6.537250995635986, + "learning_rate": 4.101153099668912e-05, + "loss": 5.8437, + "step": 7980 + }, + { + "epoch": 0.9100746056153539, + "grad_norm": 8.928478240966797, + "learning_rate": 4.100011416828405e-05, + "loss": 5.8224, + "step": 7990 + }, + { + "epoch": 0.9112136226436586, + "grad_norm": 6.1974968910217285, + "learning_rate": 4.0988697339878985e-05, + "loss": 5.7735, + "step": 8000 + }, + { + "epoch": 0.9112136226436586, + "eval_loss": 5.963014125823975, + "eval_runtime": 12.0216, + "eval_samples_per_second": 1.248, + "eval_steps_per_second": 0.166, + "step": 8000 + }, + { + "epoch": 0.9123526396719631, + "grad_norm": 16.432716369628906, + "learning_rate": 4.0977280511473917e-05, + "loss": 5.3713, + "step": 8010 + }, + { + "epoch": 0.9134916567002677, + "grad_norm": 8.134661674499512, + "learning_rate": 4.096586368306884e-05, + "loss": 5.7447, + "step": 8020 + }, + { + "epoch": 0.9146306737285722, + "grad_norm": 6.499509334564209, + "learning_rate": 4.0954446854663773e-05, + "loss": 5.5174, + "step": 8030 + }, + { + "epoch": 0.9157696907568769, + "grad_norm": 6.545501708984375, + "learning_rate": 4.0943030026258705e-05, + "loss": 6.1282, + "step": 8040 + }, + { + "epoch": 0.9169087077851814, + "grad_norm": 5.5157976150512695, + "learning_rate": 4.093161319785364e-05, + "loss": 6.0457, + "step": 8050 + }, + { + "epoch": 
0.918047724813486, + "grad_norm": 6.671703338623047, + "learning_rate": 4.092019636944857e-05, + "loss": 5.6853, + "step": 8060 + }, + { + "epoch": 0.9191867418417905, + "grad_norm": 8.327787399291992, + "learning_rate": 4.09087795410435e-05, + "loss": 5.6512, + "step": 8070 + }, + { + "epoch": 0.9203257588700952, + "grad_norm": 5.751533031463623, + "learning_rate": 4.089736271263843e-05, + "loss": 5.5488, + "step": 8080 + }, + { + "epoch": 0.9214647758983997, + "grad_norm": 5.360276222229004, + "learning_rate": 4.0885945884233365e-05, + "loss": 5.8269, + "step": 8090 + }, + { + "epoch": 0.9226037929267042, + "grad_norm": 9.118461608886719, + "learning_rate": 4.087452905582829e-05, + "loss": 5.783, + "step": 8100 + }, + { + "epoch": 0.9237428099550088, + "grad_norm": 6.865748882293701, + "learning_rate": 4.086311222742322e-05, + "loss": 5.4973, + "step": 8110 + }, + { + "epoch": 0.9248818269833134, + "grad_norm": 5.93018102645874, + "learning_rate": 4.085169539901815e-05, + "loss": 5.8153, + "step": 8120 + }, + { + "epoch": 0.926020844011618, + "grad_norm": 4.858203887939453, + "learning_rate": 4.0840278570613085e-05, + "loss": 5.6685, + "step": 8130 + }, + { + "epoch": 0.9271598610399225, + "grad_norm": 17.164108276367188, + "learning_rate": 4.082886174220802e-05, + "loss": 6.1546, + "step": 8140 + }, + { + "epoch": 0.9282988780682271, + "grad_norm": 7.045877456665039, + "learning_rate": 4.081744491380295e-05, + "loss": 5.5681, + "step": 8150 + }, + { + "epoch": 0.9294378950965317, + "grad_norm": 7.193668365478516, + "learning_rate": 4.080602808539788e-05, + "loss": 5.7963, + "step": 8160 + }, + { + "epoch": 0.9305769121248363, + "grad_norm": 6.006307601928711, + "learning_rate": 4.079461125699281e-05, + "loss": 5.7842, + "step": 8170 + }, + { + "epoch": 0.9317159291531408, + "grad_norm": 5.786032199859619, + "learning_rate": 4.078319442858774e-05, + "loss": 5.7106, + "step": 8180 + }, + { + "epoch": 0.9328549461814454, + "grad_norm": 8.014665603637695, + 
"learning_rate": 4.077177760018267e-05, + "loss": 5.6216, + "step": 8190 + }, + { + "epoch": 0.93399396320975, + "grad_norm": 7.127926826477051, + "learning_rate": 4.07603607717776e-05, + "loss": 5.6205, + "step": 8200 + }, + { + "epoch": 0.9351329802380546, + "grad_norm": 7.680981159210205, + "learning_rate": 4.074894394337253e-05, + "loss": 5.5081, + "step": 8210 + }, + { + "epoch": 0.9362719972663591, + "grad_norm": 7.980518817901611, + "learning_rate": 4.0737527114967465e-05, + "loss": 5.7191, + "step": 8220 + }, + { + "epoch": 0.9374110142946637, + "grad_norm": 6.019864559173584, + "learning_rate": 4.07261102865624e-05, + "loss": 6.177, + "step": 8230 + }, + { + "epoch": 0.9385500313229683, + "grad_norm": 5.620800018310547, + "learning_rate": 4.071469345815733e-05, + "loss": 5.6332, + "step": 8240 + }, + { + "epoch": 0.9396890483512729, + "grad_norm": 6.723660945892334, + "learning_rate": 4.070327662975226e-05, + "loss": 5.5879, + "step": 8250 + }, + { + "epoch": 0.9408280653795774, + "grad_norm": 9.479137420654297, + "learning_rate": 4.0691859801347185e-05, + "loss": 5.8988, + "step": 8260 + }, + { + "epoch": 0.941967082407882, + "grad_norm": 5.0746989250183105, + "learning_rate": 4.068044297294212e-05, + "loss": 5.6945, + "step": 8270 + }, + { + "epoch": 0.9431060994361866, + "grad_norm": 8.483436584472656, + "learning_rate": 4.066902614453705e-05, + "loss": 5.6908, + "step": 8280 + }, + { + "epoch": 0.9442451164644912, + "grad_norm": 5.951892852783203, + "learning_rate": 4.065760931613198e-05, + "loss": 5.6396, + "step": 8290 + }, + { + "epoch": 0.9453841334927957, + "grad_norm": 6.2564496994018555, + "learning_rate": 4.064619248772691e-05, + "loss": 5.7533, + "step": 8300 + }, + { + "epoch": 0.9465231505211003, + "grad_norm": 6.395124435424805, + "learning_rate": 4.0634775659321845e-05, + "loss": 5.7885, + "step": 8310 + }, + { + "epoch": 0.9476621675494049, + "grad_norm": 4.897542953491211, + "learning_rate": 4.0623358830916776e-05, + "loss": 5.8378, + 
"step": 8320 + }, + { + "epoch": 0.9488011845777095, + "grad_norm": 7.198699951171875, + "learning_rate": 4.06119420025117e-05, + "loss": 5.9325, + "step": 8330 + }, + { + "epoch": 0.949940201606014, + "grad_norm": 5.802608966827393, + "learning_rate": 4.060052517410663e-05, + "loss": 5.9556, + "step": 8340 + }, + { + "epoch": 0.9510792186343185, + "grad_norm": 6.417184352874756, + "learning_rate": 4.0589108345701565e-05, + "loss": 5.5368, + "step": 8350 + }, + { + "epoch": 0.9522182356626232, + "grad_norm": 6.399797439575195, + "learning_rate": 4.05776915172965e-05, + "loss": 5.6566, + "step": 8360 + }, + { + "epoch": 0.9533572526909277, + "grad_norm": 6.135841369628906, + "learning_rate": 4.056627468889143e-05, + "loss": 5.5519, + "step": 8370 + }, + { + "epoch": 0.9544962697192323, + "grad_norm": 5.759917736053467, + "learning_rate": 4.055485786048636e-05, + "loss": 5.9158, + "step": 8380 + }, + { + "epoch": 0.9556352867475368, + "grad_norm": 4.6408796310424805, + "learning_rate": 4.054344103208129e-05, + "loss": 5.7527, + "step": 8390 + }, + { + "epoch": 0.9567743037758415, + "grad_norm": 7.432102203369141, + "learning_rate": 4.0532024203676224e-05, + "loss": 5.9422, + "step": 8400 + }, + { + "epoch": 0.957913320804146, + "grad_norm": 8.108816146850586, + "learning_rate": 4.052060737527115e-05, + "loss": 5.5413, + "step": 8410 + }, + { + "epoch": 0.9590523378324506, + "grad_norm": 5.005612850189209, + "learning_rate": 4.050919054686608e-05, + "loss": 5.7046, + "step": 8420 + }, + { + "epoch": 0.9601913548607551, + "grad_norm": 15.327434539794922, + "learning_rate": 4.049777371846101e-05, + "loss": 5.7096, + "step": 8430 + }, + { + "epoch": 0.9613303718890598, + "grad_norm": 5.380997180938721, + "learning_rate": 4.0486356890055945e-05, + "loss": 5.6903, + "step": 8440 + }, + { + "epoch": 0.9624693889173643, + "grad_norm": 8.846467971801758, + "learning_rate": 4.047494006165087e-05, + "loss": 5.732, + "step": 8450 + }, + { + "epoch": 0.9636084059456689, + 
"grad_norm": 4.473547458648682, + "learning_rate": 4.04635232332458e-05, + "loss": 5.9855, + "step": 8460 + }, + { + "epoch": 0.9647474229739734, + "grad_norm": 4.686957359313965, + "learning_rate": 4.045210640484074e-05, + "loss": 5.7405, + "step": 8470 + }, + { + "epoch": 0.9658864400022781, + "grad_norm": 8.524946212768555, + "learning_rate": 4.044068957643567e-05, + "loss": 5.7864, + "step": 8480 + }, + { + "epoch": 0.9670254570305826, + "grad_norm": 8.493307113647461, + "learning_rate": 4.04292727480306e-05, + "loss": 6.0512, + "step": 8490 + }, + { + "epoch": 0.9681644740588872, + "grad_norm": 4.178739547729492, + "learning_rate": 4.041785591962553e-05, + "loss": 5.9716, + "step": 8500 + }, + { + "epoch": 0.9693034910871917, + "grad_norm": 10.36215591430664, + "learning_rate": 4.040643909122046e-05, + "loss": 5.7114, + "step": 8510 + }, + { + "epoch": 0.9704425081154964, + "grad_norm": 20.450754165649414, + "learning_rate": 4.039502226281539e-05, + "loss": 5.5858, + "step": 8520 + }, + { + "epoch": 0.9715815251438009, + "grad_norm": 10.467815399169922, + "learning_rate": 4.038360543441032e-05, + "loss": 5.8745, + "step": 8530 + }, + { + "epoch": 0.9727205421721055, + "grad_norm": 4.771965026855469, + "learning_rate": 4.037218860600525e-05, + "loss": 5.937, + "step": 8540 + }, + { + "epoch": 0.97385955920041, + "grad_norm": 5.959897994995117, + "learning_rate": 4.036077177760019e-05, + "loss": 6.067, + "step": 8550 + }, + { + "epoch": 0.9749985762287147, + "grad_norm": 5.705534934997559, + "learning_rate": 4.034935494919512e-05, + "loss": 5.6886, + "step": 8560 + }, + { + "epoch": 0.9761375932570192, + "grad_norm": 6.315032005310059, + "learning_rate": 4.0337938120790045e-05, + "loss": 5.8835, + "step": 8570 + }, + { + "epoch": 0.9772766102853238, + "grad_norm": 14.058731079101562, + "learning_rate": 4.032652129238498e-05, + "loss": 5.9582, + "step": 8580 + }, + { + "epoch": 0.9784156273136283, + "grad_norm": 8.842247009277344, + "learning_rate": 
4.031510446397991e-05, + "loss": 5.7506, + "step": 8590 + }, + { + "epoch": 0.979554644341933, + "grad_norm": 6.420351982116699, + "learning_rate": 4.030368763557484e-05, + "loss": 5.7064, + "step": 8600 + }, + { + "epoch": 0.9806936613702375, + "grad_norm": 6.479267120361328, + "learning_rate": 4.0292270807169766e-05, + "loss": 5.5813, + "step": 8610 + }, + { + "epoch": 0.981832678398542, + "grad_norm": 13.055333137512207, + "learning_rate": 4.02808539787647e-05, + "loss": 5.4908, + "step": 8620 + }, + { + "epoch": 0.9829716954268466, + "grad_norm": 4.641709327697754, + "learning_rate": 4.0269437150359636e-05, + "loss": 6.0172, + "step": 8630 + }, + { + "epoch": 0.9841107124551512, + "grad_norm": 10.869007110595703, + "learning_rate": 4.025802032195457e-05, + "loss": 6.0174, + "step": 8640 + }, + { + "epoch": 0.9852497294834558, + "grad_norm": 6.283684253692627, + "learning_rate": 4.024660349354949e-05, + "loss": 6.0299, + "step": 8650 + }, + { + "epoch": 0.9863887465117603, + "grad_norm": 6.0538859367370605, + "learning_rate": 4.0235186665144425e-05, + "loss": 5.7183, + "step": 8660 + }, + { + "epoch": 0.9875277635400649, + "grad_norm": 6.773647785186768, + "learning_rate": 4.022376983673936e-05, + "loss": 5.9058, + "step": 8670 + }, + { + "epoch": 0.9886667805683695, + "grad_norm": 7.833893775939941, + "learning_rate": 4.021235300833429e-05, + "loss": 5.6367, + "step": 8680 + }, + { + "epoch": 0.9898057975966741, + "grad_norm": 7.423435688018799, + "learning_rate": 4.0200936179929214e-05, + "loss": 6.4342, + "step": 8690 + }, + { + "epoch": 0.9909448146249786, + "grad_norm": 6.709625244140625, + "learning_rate": 4.0189519351524146e-05, + "loss": 5.9904, + "step": 8700 + }, + { + "epoch": 0.9920838316532832, + "grad_norm": 4.907162666320801, + "learning_rate": 4.0178102523119084e-05, + "loss": 5.7625, + "step": 8710 + }, + { + "epoch": 0.9932228486815878, + "grad_norm": 7.05642557144165, + "learning_rate": 4.016668569471401e-05, + "loss": 5.6486, + "step": 8720 + 
}, + { + "epoch": 0.9943618657098924, + "grad_norm": 7.265207290649414, + "learning_rate": 4.015526886630894e-05, + "loss": 5.658, + "step": 8730 + }, + { + "epoch": 0.9955008827381969, + "grad_norm": 5.687889099121094, + "learning_rate": 4.014385203790387e-05, + "loss": 5.6476, + "step": 8740 + }, + { + "epoch": 0.9966398997665015, + "grad_norm": 11.792612075805664, + "learning_rate": 4.0132435209498805e-05, + "loss": 5.5635, + "step": 8750 + }, + { + "epoch": 0.9977789167948061, + "grad_norm": 6.816971778869629, + "learning_rate": 4.012101838109374e-05, + "loss": 5.8865, + "step": 8760 + }, + { + "epoch": 0.9989179338231107, + "grad_norm": 7.3285932540893555, + "learning_rate": 4.010960155268866e-05, + "loss": 5.7944, + "step": 8770 + }, + { + "epoch": 1.0000569508514152, + "grad_norm": 6.628791809082031, + "learning_rate": 4.0098184724283594e-05, + "loss": 5.5504, + "step": 8780 + }, + { + "epoch": 1.0011959678797198, + "grad_norm": 5.887302875518799, + "learning_rate": 4.0086767895878525e-05, + "loss": 5.254, + "step": 8790 + }, + { + "epoch": 1.0023349849080243, + "grad_norm": 4.866805076599121, + "learning_rate": 4.007535106747346e-05, + "loss": 5.3222, + "step": 8800 + }, + { + "epoch": 1.0034740019363289, + "grad_norm": 7.485427379608154, + "learning_rate": 4.006393423906839e-05, + "loss": 5.4493, + "step": 8810 + }, + { + "epoch": 1.0046130189646336, + "grad_norm": 7.071866035461426, + "learning_rate": 4.005251741066332e-05, + "loss": 5.3074, + "step": 8820 + }, + { + "epoch": 1.0057520359929382, + "grad_norm": 17.301889419555664, + "learning_rate": 4.004110058225825e-05, + "loss": 5.4764, + "step": 8830 + }, + { + "epoch": 1.0068910530212427, + "grad_norm": 13.042938232421875, + "learning_rate": 4.002968375385318e-05, + "loss": 5.1558, + "step": 8840 + }, + { + "epoch": 1.0080300700495473, + "grad_norm": 22.195083618164062, + "learning_rate": 4.001826692544811e-05, + "loss": 5.2606, + "step": 8850 + }, + { + "epoch": 1.0091690870778518, + "grad_norm": 
7.411557674407959, + "learning_rate": 4.000685009704304e-05, + "loss": 5.4076, + "step": 8860 + }, + { + "epoch": 1.0103081041061563, + "grad_norm": 8.647936820983887, + "learning_rate": 3.999543326863797e-05, + "loss": 5.4122, + "step": 8870 + }, + { + "epoch": 1.011447121134461, + "grad_norm": 6.743743419647217, + "learning_rate": 3.9984016440232905e-05, + "loss": 5.7691, + "step": 8880 + }, + { + "epoch": 1.0125861381627654, + "grad_norm": 7.781901836395264, + "learning_rate": 3.997259961182784e-05, + "loss": 5.4218, + "step": 8890 + }, + { + "epoch": 1.0137251551910702, + "grad_norm": 15.083586692810059, + "learning_rate": 3.996118278342277e-05, + "loss": 5.2873, + "step": 8900 + }, + { + "epoch": 1.0148641722193747, + "grad_norm": 12.412664413452148, + "learning_rate": 3.99497659550177e-05, + "loss": 5.3881, + "step": 8910 + }, + { + "epoch": 1.0160031892476793, + "grad_norm": 7.3285722732543945, + "learning_rate": 3.9938349126612626e-05, + "loss": 5.2282, + "step": 8920 + }, + { + "epoch": 1.0171422062759838, + "grad_norm": 6.983195781707764, + "learning_rate": 3.992693229820756e-05, + "loss": 5.2699, + "step": 8930 + }, + { + "epoch": 1.0182812233042884, + "grad_norm": 5.575123310089111, + "learning_rate": 3.991551546980249e-05, + "loss": 5.2499, + "step": 8940 + }, + { + "epoch": 1.019420240332593, + "grad_norm": 6.207281589508057, + "learning_rate": 3.990409864139742e-05, + "loss": 5.3387, + "step": 8950 + }, + { + "epoch": 1.0205592573608975, + "grad_norm": 9.925410270690918, + "learning_rate": 3.989268181299235e-05, + "loss": 5.0628, + "step": 8960 + }, + { + "epoch": 1.021698274389202, + "grad_norm": 8.906091690063477, + "learning_rate": 3.9881264984587285e-05, + "loss": 5.3465, + "step": 8970 + }, + { + "epoch": 1.0228372914175068, + "grad_norm": 7.547214031219482, + "learning_rate": 3.986984815618222e-05, + "loss": 5.2823, + "step": 8980 + }, + { + "epoch": 1.0239763084458113, + "grad_norm": 7.977101802825928, + "learning_rate": 3.985843132777715e-05, 
+ "loss": 5.5895, + "step": 8990 + }, + { + "epoch": 1.0251153254741159, + "grad_norm": 9.279818534851074, + "learning_rate": 3.9847014499372074e-05, + "loss": 6.1008, + "step": 9000 + }, + { + "epoch": 1.0262543425024204, + "grad_norm": 6.907566547393799, + "learning_rate": 3.9835597670967005e-05, + "loss": 5.1312, + "step": 9010 + }, + { + "epoch": 1.027393359530725, + "grad_norm": 15.964417457580566, + "learning_rate": 3.982418084256194e-05, + "loss": 5.2919, + "step": 9020 + }, + { + "epoch": 1.0285323765590295, + "grad_norm": 7.658446311950684, + "learning_rate": 3.981276401415687e-05, + "loss": 5.3453, + "step": 9030 + }, + { + "epoch": 1.029671393587334, + "grad_norm": 5.22704553604126, + "learning_rate": 3.98013471857518e-05, + "loss": 5.5017, + "step": 9040 + }, + { + "epoch": 1.0308104106156386, + "grad_norm": 5.284582614898682, + "learning_rate": 3.978993035734673e-05, + "loss": 5.3824, + "step": 9050 + }, + { + "epoch": 1.0319494276439434, + "grad_norm": 6.4581170082092285, + "learning_rate": 3.9778513528941665e-05, + "loss": 6.1145, + "step": 9060 + }, + { + "epoch": 1.033088444672248, + "grad_norm": 8.916783332824707, + "learning_rate": 3.9767096700536596e-05, + "loss": 5.3733, + "step": 9070 + }, + { + "epoch": 1.0342274617005525, + "grad_norm": 6.466397762298584, + "learning_rate": 3.975567987213152e-05, + "loss": 5.15, + "step": 9080 + }, + { + "epoch": 1.035366478728857, + "grad_norm": 5.718016147613525, + "learning_rate": 3.9744263043726453e-05, + "loss": 5.0449, + "step": 9090 + }, + { + "epoch": 1.0365054957571616, + "grad_norm": 8.794554710388184, + "learning_rate": 3.9732846215321385e-05, + "loss": 5.2021, + "step": 9100 + }, + { + "epoch": 1.037644512785466, + "grad_norm": 8.841976165771484, + "learning_rate": 3.972142938691632e-05, + "loss": 5.1749, + "step": 9110 + }, + { + "epoch": 1.0387835298137706, + "grad_norm": 6.140927791595459, + "learning_rate": 3.971001255851124e-05, + "loss": 5.2475, + "step": 9120 + }, + { + "epoch": 
1.0399225468420752, + "grad_norm": 5.292937755584717, + "learning_rate": 3.969859573010618e-05, + "loss": 5.1817, + "step": 9130 + }, + { + "epoch": 1.04106156387038, + "grad_norm": 6.455714702606201, + "learning_rate": 3.968717890170111e-05, + "loss": 5.3106, + "step": 9140 + }, + { + "epoch": 1.0422005808986845, + "grad_norm": 23.862918853759766, + "learning_rate": 3.9675762073296044e-05, + "loss": 5.5527, + "step": 9150 + }, + { + "epoch": 1.043339597926989, + "grad_norm": 6.65964412689209, + "learning_rate": 3.966434524489097e-05, + "loss": 5.2139, + "step": 9160 + }, + { + "epoch": 1.0444786149552936, + "grad_norm": 23.59825325012207, + "learning_rate": 3.96529284164859e-05, + "loss": 5.4358, + "step": 9170 + }, + { + "epoch": 1.0456176319835981, + "grad_norm": 6.902249813079834, + "learning_rate": 3.964151158808083e-05, + "loss": 5.2541, + "step": 9180 + }, + { + "epoch": 1.0467566490119027, + "grad_norm": 7.1851043701171875, + "learning_rate": 3.9630094759675765e-05, + "loss": 5.3055, + "step": 9190 + }, + { + "epoch": 1.0478956660402072, + "grad_norm": 8.189417839050293, + "learning_rate": 3.961867793127069e-05, + "loss": 6.0163, + "step": 9200 + }, + { + "epoch": 1.0490346830685118, + "grad_norm": 6.41946268081665, + "learning_rate": 3.960726110286563e-05, + "loss": 5.2336, + "step": 9210 + }, + { + "epoch": 1.0501737000968165, + "grad_norm": 9.957685470581055, + "learning_rate": 3.959584427446056e-05, + "loss": 5.4844, + "step": 9220 + }, + { + "epoch": 1.051312717125121, + "grad_norm": 14.342864036560059, + "learning_rate": 3.9584427446055486e-05, + "loss": 5.3257, + "step": 9230 + }, + { + "epoch": 1.0524517341534256, + "grad_norm": 6.256680488586426, + "learning_rate": 3.957301061765042e-05, + "loss": 5.7186, + "step": 9240 + }, + { + "epoch": 1.0535907511817302, + "grad_norm": 6.000284671783447, + "learning_rate": 3.956159378924535e-05, + "loss": 5.0134, + "step": 9250 + }, + { + "epoch": 1.0547297682100347, + "grad_norm": 7.8761396408081055, + 
"learning_rate": 3.955017696084028e-05, + "loss": 5.1664, + "step": 9260 + }, + { + "epoch": 1.0558687852383393, + "grad_norm": 9.180042266845703, + "learning_rate": 3.953876013243521e-05, + "loss": 5.2035, + "step": 9270 + }, + { + "epoch": 1.0570078022666438, + "grad_norm": 9.206707000732422, + "learning_rate": 3.952734330403014e-05, + "loss": 5.3212, + "step": 9280 + }, + { + "epoch": 1.0581468192949484, + "grad_norm": 7.057497501373291, + "learning_rate": 3.9515926475625077e-05, + "loss": 5.0746, + "step": 9290 + }, + { + "epoch": 1.0592858363232531, + "grad_norm": 8.683945655822754, + "learning_rate": 3.950450964722001e-05, + "loss": 5.3396, + "step": 9300 + }, + { + "epoch": 1.0604248533515577, + "grad_norm": 7.847707748413086, + "learning_rate": 3.9493092818814933e-05, + "loss": 5.1268, + "step": 9310 + }, + { + "epoch": 1.0615638703798622, + "grad_norm": 10.160079956054688, + "learning_rate": 3.9481675990409865e-05, + "loss": 5.3413, + "step": 9320 + }, + { + "epoch": 1.0627028874081668, + "grad_norm": 8.487902641296387, + "learning_rate": 3.94702591620048e-05, + "loss": 5.1777, + "step": 9330 + }, + { + "epoch": 1.0638419044364713, + "grad_norm": 11.462549209594727, + "learning_rate": 3.945884233359973e-05, + "loss": 5.3486, + "step": 9340 + }, + { + "epoch": 1.0649809214647759, + "grad_norm": 10.42354965209961, + "learning_rate": 3.9447425505194654e-05, + "loss": 5.3129, + "step": 9350 + }, + { + "epoch": 1.0661199384930804, + "grad_norm": 7.207667827606201, + "learning_rate": 3.9436008676789586e-05, + "loss": 5.2524, + "step": 9360 + }, + { + "epoch": 1.067258955521385, + "grad_norm": 5.92431116104126, + "learning_rate": 3.9424591848384525e-05, + "loss": 5.0543, + "step": 9370 + }, + { + "epoch": 1.0683979725496897, + "grad_norm": 7.101565837860107, + "learning_rate": 3.9413175019979456e-05, + "loss": 5.7859, + "step": 9380 + }, + { + "epoch": 1.0695369895779943, + "grad_norm": 6.356846809387207, + "learning_rate": 3.940175819157438e-05, + "loss": 
5.1667, + "step": 9390 + }, + { + "epoch": 1.0706760066062988, + "grad_norm": 7.87912654876709, + "learning_rate": 3.939034136316931e-05, + "loss": 5.1314, + "step": 9400 + }, + { + "epoch": 1.0718150236346033, + "grad_norm": 11.108393669128418, + "learning_rate": 3.9378924534764245e-05, + "loss": 5.2929, + "step": 9410 + }, + { + "epoch": 1.072954040662908, + "grad_norm": 9.422391891479492, + "learning_rate": 3.936750770635918e-05, + "loss": 5.6755, + "step": 9420 + }, + { + "epoch": 1.0740930576912124, + "grad_norm": 8.584696769714355, + "learning_rate": 3.93560908779541e-05, + "loss": 5.4, + "step": 9430 + }, + { + "epoch": 1.075232074719517, + "grad_norm": 8.486468315124512, + "learning_rate": 3.9344674049549034e-05, + "loss": 5.8283, + "step": 9440 + }, + { + "epoch": 1.0763710917478215, + "grad_norm": 5.808884620666504, + "learning_rate": 3.9333257221143966e-05, + "loss": 5.3142, + "step": 9450 + }, + { + "epoch": 1.0775101087761263, + "grad_norm": 7.499875068664551, + "learning_rate": 3.9321840392738904e-05, + "loss": 5.6757, + "step": 9460 + }, + { + "epoch": 1.0786491258044308, + "grad_norm": 6.805802345275879, + "learning_rate": 3.931042356433383e-05, + "loss": 5.4205, + "step": 9470 + }, + { + "epoch": 1.0797881428327354, + "grad_norm": 7.800095558166504, + "learning_rate": 3.929900673592876e-05, + "loss": 5.3918, + "step": 9480 + }, + { + "epoch": 1.08092715986104, + "grad_norm": 7.894876956939697, + "learning_rate": 3.928758990752369e-05, + "loss": 5.4332, + "step": 9490 + }, + { + "epoch": 1.0820661768893445, + "grad_norm": 10.799897193908691, + "learning_rate": 3.9276173079118625e-05, + "loss": 5.1566, + "step": 9500 + }, + { + "epoch": 1.083205193917649, + "grad_norm": 5.901568412780762, + "learning_rate": 3.926475625071355e-05, + "loss": 5.2164, + "step": 9510 + }, + { + "epoch": 1.0843442109459536, + "grad_norm": 8.62924575805664, + "learning_rate": 3.925333942230848e-05, + "loss": 5.3436, + "step": 9520 + }, + { + "epoch": 1.0854832279742581, + 
"grad_norm": 7.14743709564209, + "learning_rate": 3.9241922593903414e-05, + "loss": 5.6523, + "step": 9530 + }, + { + "epoch": 1.0866222450025629, + "grad_norm": 6.542008876800537, + "learning_rate": 3.9230505765498345e-05, + "loss": 5.4717, + "step": 9540 + }, + { + "epoch": 1.0877612620308674, + "grad_norm": 6.9539713859558105, + "learning_rate": 3.921908893709328e-05, + "loss": 5.4214, + "step": 9550 + }, + { + "epoch": 1.088900279059172, + "grad_norm": 7.2530927658081055, + "learning_rate": 3.920767210868821e-05, + "loss": 5.1499, + "step": 9560 + }, + { + "epoch": 1.0900392960874765, + "grad_norm": 8.217451095581055, + "learning_rate": 3.919625528028314e-05, + "loss": 5.3176, + "step": 9570 + }, + { + "epoch": 1.091178313115781, + "grad_norm": 7.244843482971191, + "learning_rate": 3.918483845187807e-05, + "loss": 5.1277, + "step": 9580 + }, + { + "epoch": 1.0923173301440856, + "grad_norm": 10.915613174438477, + "learning_rate": 3.9173421623473e-05, + "loss": 5.5916, + "step": 9590 + }, + { + "epoch": 1.0934563471723902, + "grad_norm": 9.508447647094727, + "learning_rate": 3.916200479506793e-05, + "loss": 5.055, + "step": 9600 + }, + { + "epoch": 1.0945953642006947, + "grad_norm": 8.85869312286377, + "learning_rate": 3.915058796666286e-05, + "loss": 5.7979, + "step": 9610 + }, + { + "epoch": 1.0957343812289995, + "grad_norm": 7.686898231506348, + "learning_rate": 3.913917113825779e-05, + "loss": 5.5152, + "step": 9620 + }, + { + "epoch": 1.096873398257304, + "grad_norm": 6.5317277908325195, + "learning_rate": 3.9127754309852725e-05, + "loss": 5.1522, + "step": 9630 + }, + { + "epoch": 1.0980124152856086, + "grad_norm": 16.195051193237305, + "learning_rate": 3.911633748144766e-05, + "loss": 5.2707, + "step": 9640 + }, + { + "epoch": 1.099151432313913, + "grad_norm": 11.292452812194824, + "learning_rate": 3.910492065304259e-05, + "loss": 5.1579, + "step": 9650 + }, + { + "epoch": 1.1002904493422176, + "grad_norm": 9.894390106201172, + "learning_rate": 
3.909350382463752e-05, + "loss": 5.3674, + "step": 9660 + }, + { + "epoch": 1.1014294663705222, + "grad_norm": 8.536913871765137, + "learning_rate": 3.9082086996232446e-05, + "loss": 5.2427, + "step": 9670 + }, + { + "epoch": 1.1025684833988267, + "grad_norm": 6.5137786865234375, + "learning_rate": 3.907067016782738e-05, + "loss": 5.3871, + "step": 9680 + }, + { + "epoch": 1.1037075004271313, + "grad_norm": 11.244919776916504, + "learning_rate": 3.905925333942231e-05, + "loss": 5.1847, + "step": 9690 + }, + { + "epoch": 1.104846517455436, + "grad_norm": 12.385736465454102, + "learning_rate": 3.904783651101724e-05, + "loss": 5.5966, + "step": 9700 + }, + { + "epoch": 1.1059855344837406, + "grad_norm": 5.9896016120910645, + "learning_rate": 3.903641968261217e-05, + "loss": 5.2129, + "step": 9710 + }, + { + "epoch": 1.1071245515120451, + "grad_norm": 7.364066123962402, + "learning_rate": 3.9025002854207105e-05, + "loss": 5.1776, + "step": 9720 + }, + { + "epoch": 1.1082635685403497, + "grad_norm": 8.434669494628906, + "learning_rate": 3.901358602580204e-05, + "loss": 5.2653, + "step": 9730 + }, + { + "epoch": 1.1094025855686542, + "grad_norm": 8.104005813598633, + "learning_rate": 3.900216919739696e-05, + "loss": 5.2718, + "step": 9740 + }, + { + "epoch": 1.1105416025969588, + "grad_norm": 8.960792541503906, + "learning_rate": 3.8990752368991894e-05, + "loss": 5.1523, + "step": 9750 + }, + { + "epoch": 1.1116806196252633, + "grad_norm": 10.277594566345215, + "learning_rate": 3.8979335540586826e-05, + "loss": 5.0689, + "step": 9760 + }, + { + "epoch": 1.1128196366535679, + "grad_norm": 15.712716102600098, + "learning_rate": 3.896791871218176e-05, + "loss": 5.2961, + "step": 9770 + }, + { + "epoch": 1.1139586536818726, + "grad_norm": 7.332018852233887, + "learning_rate": 3.895650188377669e-05, + "loss": 5.4115, + "step": 9780 + }, + { + "epoch": 1.1150976707101772, + "grad_norm": 10.549822807312012, + "learning_rate": 3.894508505537162e-05, + "loss": 5.397, + "step": 
9790 + }, + { + "epoch": 1.1162366877384817, + "grad_norm": 6.6530375480651855, + "learning_rate": 3.893366822696655e-05, + "loss": 5.7133, + "step": 9800 + }, + { + "epoch": 1.1173757047667863, + "grad_norm": 8.9078369140625, + "learning_rate": 3.8922251398561485e-05, + "loss": 4.867, + "step": 9810 + }, + { + "epoch": 1.1185147217950908, + "grad_norm": 8.685425758361816, + "learning_rate": 3.891083457015641e-05, + "loss": 5.1863, + "step": 9820 + }, + { + "epoch": 1.1196537388233954, + "grad_norm": 12.476484298706055, + "learning_rate": 3.889941774175134e-05, + "loss": 5.121, + "step": 9830 + }, + { + "epoch": 1.1207927558517, + "grad_norm": 8.18492317199707, + "learning_rate": 3.8888000913346273e-05, + "loss": 5.3926, + "step": 9840 + }, + { + "epoch": 1.1219317728800045, + "grad_norm": 12.082232475280762, + "learning_rate": 3.8876584084941205e-05, + "loss": 5.4308, + "step": 9850 + }, + { + "epoch": 1.1230707899083092, + "grad_norm": 6.331710338592529, + "learning_rate": 3.886516725653613e-05, + "loss": 5.1105, + "step": 9860 + }, + { + "epoch": 1.1242098069366138, + "grad_norm": 29.657867431640625, + "learning_rate": 3.885375042813107e-05, + "loss": 5.1145, + "step": 9870 + }, + { + "epoch": 1.1253488239649183, + "grad_norm": 6.528309345245361, + "learning_rate": 3.8842333599726e-05, + "loss": 5.356, + "step": 9880 + }, + { + "epoch": 1.1264878409932229, + "grad_norm": 8.70510196685791, + "learning_rate": 3.883091677132093e-05, + "loss": 5.1216, + "step": 9890 + }, + { + "epoch": 1.1276268580215274, + "grad_norm": 11.015145301818848, + "learning_rate": 3.881949994291586e-05, + "loss": 5.1805, + "step": 9900 + }, + { + "epoch": 1.128765875049832, + "grad_norm": 8.160832405090332, + "learning_rate": 3.880808311451079e-05, + "loss": 5.4908, + "step": 9910 + }, + { + "epoch": 1.1299048920781365, + "grad_norm": 6.731886386871338, + "learning_rate": 3.879666628610572e-05, + "loss": 5.1552, + "step": 9920 + }, + { + "epoch": 1.1310439091064413, + "grad_norm": 
9.394970893859863, + "learning_rate": 3.878524945770065e-05, + "loss": 5.6384, + "step": 9930 + }, + { + "epoch": 1.1321829261347458, + "grad_norm": 6.584465980529785, + "learning_rate": 3.877383262929558e-05, + "loss": 5.4019, + "step": 9940 + }, + { + "epoch": 1.1333219431630503, + "grad_norm": 9.031838417053223, + "learning_rate": 3.876241580089052e-05, + "loss": 5.308, + "step": 9950 + }, + { + "epoch": 1.134460960191355, + "grad_norm": 17.675561904907227, + "learning_rate": 3.875099897248545e-05, + "loss": 5.4112, + "step": 9960 + }, + { + "epoch": 1.1355999772196594, + "grad_norm": 8.194103240966797, + "learning_rate": 3.873958214408038e-05, + "loss": 5.294, + "step": 9970 + }, + { + "epoch": 1.136738994247964, + "grad_norm": 8.301799774169922, + "learning_rate": 3.8728165315675306e-05, + "loss": 5.3232, + "step": 9980 + }, + { + "epoch": 1.1378780112762685, + "grad_norm": 9.098228454589844, + "learning_rate": 3.871674848727024e-05, + "loss": 5.6319, + "step": 9990 + }, + { + "epoch": 1.139017028304573, + "grad_norm": 4.738903999328613, + "learning_rate": 3.870533165886517e-05, + "loss": 5.2591, + "step": 10000 + }, + { + "epoch": 1.139017028304573, + "eval_loss": 5.98829460144043, + "eval_runtime": 11.6367, + "eval_samples_per_second": 1.289, + "eval_steps_per_second": 0.172, + "step": 10000 + }, + { + "epoch": 1.1401560453328776, + "grad_norm": 8.139161109924316, + "learning_rate": 3.86939148304601e-05, + "loss": 5.36, + "step": 10010 + }, + { + "epoch": 1.1412950623611824, + "grad_norm": 5.483173847198486, + "learning_rate": 3.8682498002055026e-05, + "loss": 5.2019, + "step": 10020 + }, + { + "epoch": 1.142434079389487, + "grad_norm": 9.01739501953125, + "learning_rate": 3.8671081173649965e-05, + "loss": 5.138, + "step": 10030 + }, + { + "epoch": 1.1435730964177915, + "grad_norm": 8.21419906616211, + "learning_rate": 3.86596643452449e-05, + "loss": 5.5423, + "step": 10040 + }, + { + "epoch": 1.144712113446096, + "grad_norm": 6.223106384277344, + 
"learning_rate": 3.864824751683982e-05, + "loss": 5.555, + "step": 10050 + }, + { + "epoch": 1.1458511304744006, + "grad_norm": 8.907463073730469, + "learning_rate": 3.8636830688434754e-05, + "loss": 5.257, + "step": 10060 + }, + { + "epoch": 1.1469901475027051, + "grad_norm": 13.769535064697266, + "learning_rate": 3.8625413860029685e-05, + "loss": 5.3491, + "step": 10070 + }, + { + "epoch": 1.1481291645310097, + "grad_norm": 6.513309478759766, + "learning_rate": 3.861399703162462e-05, + "loss": 5.2318, + "step": 10080 + }, + { + "epoch": 1.1492681815593144, + "grad_norm": 7.231842041015625, + "learning_rate": 3.860258020321955e-05, + "loss": 5.3576, + "step": 10090 + }, + { + "epoch": 1.150407198587619, + "grad_norm": 7.973701000213623, + "learning_rate": 3.8591163374814474e-05, + "loss": 5.3865, + "step": 10100 + }, + { + "epoch": 1.1515462156159235, + "grad_norm": 10.383562088012695, + "learning_rate": 3.8579746546409406e-05, + "loss": 5.3199, + "step": 10110 + }, + { + "epoch": 1.152685232644228, + "grad_norm": 10.173810958862305, + "learning_rate": 3.8568329718004345e-05, + "loss": 5.4949, + "step": 10120 + }, + { + "epoch": 1.1538242496725326, + "grad_norm": 10.03913688659668, + "learning_rate": 3.855691288959927e-05, + "loss": 5.437, + "step": 10130 + }, + { + "epoch": 1.1549632667008372, + "grad_norm": 15.847329139709473, + "learning_rate": 3.85454960611942e-05, + "loss": 5.1987, + "step": 10140 + }, + { + "epoch": 1.1561022837291417, + "grad_norm": 8.826560020446777, + "learning_rate": 3.853407923278913e-05, + "loss": 5.2763, + "step": 10150 + }, + { + "epoch": 1.1572413007574462, + "grad_norm": 11.843634605407715, + "learning_rate": 3.8522662404384065e-05, + "loss": 5.0479, + "step": 10160 + }, + { + "epoch": 1.1583803177857508, + "grad_norm": 11.732421875, + "learning_rate": 3.8511245575979e-05, + "loss": 5.0428, + "step": 10170 + }, + { + "epoch": 1.1595193348140556, + "grad_norm": 8.850831985473633, + "learning_rate": 3.849982874757392e-05, + "loss": 
4.8044, + "step": 10180 + }, + { + "epoch": 1.16065835184236, + "grad_norm": 7.789608955383301, + "learning_rate": 3.8488411919168854e-05, + "loss": 5.1098, + "step": 10190 + }, + { + "epoch": 1.1617973688706646, + "grad_norm": 6.359009742736816, + "learning_rate": 3.847699509076379e-05, + "loss": 5.3908, + "step": 10200 + }, + { + "epoch": 1.1629363858989692, + "grad_norm": 16.158702850341797, + "learning_rate": 3.846557826235872e-05, + "loss": 5.3882, + "step": 10210 + }, + { + "epoch": 1.1640754029272737, + "grad_norm": 6.294594764709473, + "learning_rate": 3.845416143395365e-05, + "loss": 5.2433, + "step": 10220 + }, + { + "epoch": 1.1652144199555783, + "grad_norm": 6.593204021453857, + "learning_rate": 3.844274460554858e-05, + "loss": 5.4319, + "step": 10230 + }, + { + "epoch": 1.1663534369838828, + "grad_norm": 7.6413702964782715, + "learning_rate": 3.843132777714351e-05, + "loss": 5.2148, + "step": 10240 + }, + { + "epoch": 1.1674924540121876, + "grad_norm": 8.203592300415039, + "learning_rate": 3.841991094873844e-05, + "loss": 5.307, + "step": 10250 + }, + { + "epoch": 1.1686314710404921, + "grad_norm": 7.085062026977539, + "learning_rate": 3.840849412033337e-05, + "loss": 5.1634, + "step": 10260 + }, + { + "epoch": 1.1697704880687967, + "grad_norm": 5.813399791717529, + "learning_rate": 3.83970772919283e-05, + "loss": 5.4435, + "step": 10270 + }, + { + "epoch": 1.1709095050971012, + "grad_norm": 8.348408699035645, + "learning_rate": 3.838566046352324e-05, + "loss": 5.3024, + "step": 10280 + }, + { + "epoch": 1.1720485221254058, + "grad_norm": 6.418082237243652, + "learning_rate": 3.8374243635118165e-05, + "loss": 5.2993, + "step": 10290 + }, + { + "epoch": 1.1731875391537103, + "grad_norm": 9.121561050415039, + "learning_rate": 3.83628268067131e-05, + "loss": 5.2365, + "step": 10300 + }, + { + "epoch": 1.1743265561820149, + "grad_norm": 18.363595962524414, + "learning_rate": 3.835140997830803e-05, + "loss": 5.4046, + "step": 10310 + }, + { + "epoch": 
1.1754655732103194, + "grad_norm": 32.12479782104492, + "learning_rate": 3.833999314990296e-05, + "loss": 5.2378, + "step": 10320 + }, + { + "epoch": 1.176604590238624, + "grad_norm": 7.56660795211792, + "learning_rate": 3.8328576321497886e-05, + "loss": 5.2554, + "step": 10330 + }, + { + "epoch": 1.1777436072669287, + "grad_norm": 8.347277641296387, + "learning_rate": 3.831715949309282e-05, + "loss": 5.281, + "step": 10340 + }, + { + "epoch": 1.1788826242952333, + "grad_norm": 12.190389633178711, + "learning_rate": 3.830574266468775e-05, + "loss": 5.2473, + "step": 10350 + }, + { + "epoch": 1.1800216413235378, + "grad_norm": 9.21943187713623, + "learning_rate": 3.829432583628269e-05, + "loss": 5.602, + "step": 10360 + }, + { + "epoch": 1.1811606583518424, + "grad_norm": 9.936127662658691, + "learning_rate": 3.828290900787761e-05, + "loss": 5.3133, + "step": 10370 + }, + { + "epoch": 1.182299675380147, + "grad_norm": 5.087235450744629, + "learning_rate": 3.8271492179472545e-05, + "loss": 5.425, + "step": 10380 + }, + { + "epoch": 1.1834386924084515, + "grad_norm": 6.407649040222168, + "learning_rate": 3.826007535106748e-05, + "loss": 5.2866, + "step": 10390 + }, + { + "epoch": 1.184577709436756, + "grad_norm": 11.357165336608887, + "learning_rate": 3.824865852266241e-05, + "loss": 5.4531, + "step": 10400 + }, + { + "epoch": 1.1857167264650608, + "grad_norm": 6.227645397186279, + "learning_rate": 3.8237241694257334e-05, + "loss": 5.0796, + "step": 10410 + }, + { + "epoch": 1.1868557434933653, + "grad_norm": 7.409129619598389, + "learning_rate": 3.8225824865852266e-05, + "loss": 5.5382, + "step": 10420 + }, + { + "epoch": 1.1879947605216699, + "grad_norm": 7.063460350036621, + "learning_rate": 3.82144080374472e-05, + "loss": 5.2155, + "step": 10430 + }, + { + "epoch": 1.1891337775499744, + "grad_norm": 8.425795555114746, + "learning_rate": 3.820299120904213e-05, + "loss": 5.4379, + "step": 10440 + }, + { + "epoch": 1.190272794578279, + "grad_norm": 7.571264266967773, 
+ "learning_rate": 3.819157438063706e-05, + "loss": 5.4893, + "step": 10450 + }, + { + "epoch": 1.1914118116065835, + "grad_norm": 9.26445484161377, + "learning_rate": 3.818015755223199e-05, + "loss": 5.5383, + "step": 10460 + }, + { + "epoch": 1.192550828634888, + "grad_norm": 8.3720064163208, + "learning_rate": 3.8168740723826925e-05, + "loss": 5.0225, + "step": 10470 + }, + { + "epoch": 1.1936898456631926, + "grad_norm": 7.018798351287842, + "learning_rate": 3.815732389542186e-05, + "loss": 5.0177, + "step": 10480 + }, + { + "epoch": 1.1948288626914971, + "grad_norm": 9.46872329711914, + "learning_rate": 3.814590706701678e-05, + "loss": 5.2551, + "step": 10490 + }, + { + "epoch": 1.195967879719802, + "grad_norm": 9.977618217468262, + "learning_rate": 3.8134490238611714e-05, + "loss": 5.4843, + "step": 10500 + }, + { + "epoch": 1.1971068967481064, + "grad_norm": 7.310171127319336, + "learning_rate": 3.8123073410206646e-05, + "loss": 5.2402, + "step": 10510 + }, + { + "epoch": 1.198245913776411, + "grad_norm": 10.744500160217285, + "learning_rate": 3.811165658180158e-05, + "loss": 5.548, + "step": 10520 + }, + { + "epoch": 1.1993849308047155, + "grad_norm": 6.208596229553223, + "learning_rate": 3.810023975339651e-05, + "loss": 5.5792, + "step": 10530 + }, + { + "epoch": 1.20052394783302, + "grad_norm": 10.301777839660645, + "learning_rate": 3.808882292499144e-05, + "loss": 5.2611, + "step": 10540 + }, + { + "epoch": 1.2016629648613246, + "grad_norm": 11.960914611816406, + "learning_rate": 3.807740609658637e-05, + "loss": 5.0305, + "step": 10550 + }, + { + "epoch": 1.2028019818896292, + "grad_norm": 6.9739885330200195, + "learning_rate": 3.80659892681813e-05, + "loss": 5.1622, + "step": 10560 + }, + { + "epoch": 1.203940998917934, + "grad_norm": 12.12211799621582, + "learning_rate": 3.805457243977623e-05, + "loss": 5.6689, + "step": 10570 + }, + { + "epoch": 1.2050800159462385, + "grad_norm": 10.115104675292969, + "learning_rate": 3.804315561137116e-05, + "loss": 
5.4093, + "step": 10580 + }, + { + "epoch": 1.206219032974543, + "grad_norm": 9.459589004516602, + "learning_rate": 3.8031738782966093e-05, + "loss": 5.4789, + "step": 10590 + }, + { + "epoch": 1.2073580500028476, + "grad_norm": 8.0070161819458, + "learning_rate": 3.8020321954561025e-05, + "loss": 5.0064, + "step": 10600 + }, + { + "epoch": 1.2084970670311521, + "grad_norm": 6.625135898590088, + "learning_rate": 3.800890512615596e-05, + "loss": 5.3855, + "step": 10610 + }, + { + "epoch": 1.2096360840594567, + "grad_norm": 6.648497104644775, + "learning_rate": 3.799748829775089e-05, + "loss": 5.2896, + "step": 10620 + }, + { + "epoch": 1.2107751010877612, + "grad_norm": 16.452611923217773, + "learning_rate": 3.798607146934582e-05, + "loss": 5.346, + "step": 10630 + }, + { + "epoch": 1.2119141181160658, + "grad_norm": 7.3033447265625, + "learning_rate": 3.7974654640940746e-05, + "loss": 5.3142, + "step": 10640 + }, + { + "epoch": 1.2130531351443703, + "grad_norm": 6.744337558746338, + "learning_rate": 3.796323781253568e-05, + "loss": 5.138, + "step": 10650 + }, + { + "epoch": 1.214192152172675, + "grad_norm": 7.216135501861572, + "learning_rate": 3.795182098413061e-05, + "loss": 5.2378, + "step": 10660 + }, + { + "epoch": 1.2153311692009796, + "grad_norm": 5.454289436340332, + "learning_rate": 3.794040415572554e-05, + "loss": 5.5945, + "step": 10670 + }, + { + "epoch": 1.2164701862292842, + "grad_norm": 7.187722206115723, + "learning_rate": 3.792898732732047e-05, + "loss": 4.9304, + "step": 10680 + }, + { + "epoch": 1.2176092032575887, + "grad_norm": 9.496403694152832, + "learning_rate": 3.7917570498915405e-05, + "loss": 5.19, + "step": 10690 + }, + { + "epoch": 1.2187482202858932, + "grad_norm": 6.7584357261657715, + "learning_rate": 3.790615367051034e-05, + "loss": 5.471, + "step": 10700 + }, + { + "epoch": 1.2198872373141978, + "grad_norm": 10.78551959991455, + "learning_rate": 3.789473684210527e-05, + "loss": 5.2186, + "step": 10710 + }, + { + "epoch": 
1.2210262543425023, + "grad_norm": 5.996976852416992, + "learning_rate": 3.7883320013700194e-05, + "loss": 5.3735, + "step": 10720 + }, + { + "epoch": 1.222165271370807, + "grad_norm": 5.803508281707764, + "learning_rate": 3.7871903185295126e-05, + "loss": 5.2404, + "step": 10730 + }, + { + "epoch": 1.2233042883991117, + "grad_norm": 8.11447525024414, + "learning_rate": 3.786048635689006e-05, + "loss": 5.1215, + "step": 10740 + }, + { + "epoch": 1.2244433054274162, + "grad_norm": 10.285445213317871, + "learning_rate": 3.784906952848499e-05, + "loss": 5.3977, + "step": 10750 + }, + { + "epoch": 1.2255823224557207, + "grad_norm": 6.042105674743652, + "learning_rate": 3.7837652700079914e-05, + "loss": 5.1748, + "step": 10760 + }, + { + "epoch": 1.2267213394840253, + "grad_norm": 8.941543579101562, + "learning_rate": 3.782623587167485e-05, + "loss": 5.5393, + "step": 10770 + }, + { + "epoch": 1.2278603565123298, + "grad_norm": 6.521518230438232, + "learning_rate": 3.7814819043269785e-05, + "loss": 5.3153, + "step": 10780 + }, + { + "epoch": 1.2289993735406344, + "grad_norm": 14.907044410705566, + "learning_rate": 3.780340221486472e-05, + "loss": 5.3014, + "step": 10790 + }, + { + "epoch": 1.230138390568939, + "grad_norm": 19.862760543823242, + "learning_rate": 3.779198538645964e-05, + "loss": 5.332, + "step": 10800 + }, + { + "epoch": 1.2312774075972435, + "grad_norm": 5.705162525177002, + "learning_rate": 3.7780568558054574e-05, + "loss": 5.2788, + "step": 10810 + }, + { + "epoch": 1.2324164246255482, + "grad_norm": 6.770884037017822, + "learning_rate": 3.7769151729649505e-05, + "loss": 5.2975, + "step": 10820 + }, + { + "epoch": 1.2335554416538528, + "grad_norm": 9.910143852233887, + "learning_rate": 3.775773490124444e-05, + "loss": 4.8984, + "step": 10830 + }, + { + "epoch": 1.2346944586821573, + "grad_norm": 6.479272365570068, + "learning_rate": 3.774631807283936e-05, + "loss": 5.2465, + "step": 10840 + }, + { + "epoch": 1.2358334757104619, + "grad_norm": 
16.471927642822266, + "learning_rate": 3.7734901244434294e-05, + "loss": 5.461, + "step": 10850 + }, + { + "epoch": 1.2369724927387664, + "grad_norm": 8.394795417785645, + "learning_rate": 3.772348441602923e-05, + "loss": 5.1475, + "step": 10860 + }, + { + "epoch": 1.238111509767071, + "grad_norm": 10.2684326171875, + "learning_rate": 3.7712067587624165e-05, + "loss": 5.7925, + "step": 10870 + }, + { + "epoch": 1.2392505267953755, + "grad_norm": 11.658308029174805, + "learning_rate": 3.770065075921909e-05, + "loss": 5.0855, + "step": 10880 + }, + { + "epoch": 1.2403895438236803, + "grad_norm": 7.594724178314209, + "learning_rate": 3.768923393081402e-05, + "loss": 5.0243, + "step": 10890 + }, + { + "epoch": 1.2415285608519848, + "grad_norm": 9.657943725585938, + "learning_rate": 3.767781710240895e-05, + "loss": 5.18, + "step": 10900 + }, + { + "epoch": 1.2426675778802894, + "grad_norm": 10.352492332458496, + "learning_rate": 3.7666400274003885e-05, + "loss": 5.3739, + "step": 10910 + }, + { + "epoch": 1.243806594908594, + "grad_norm": 5.806910991668701, + "learning_rate": 3.765498344559881e-05, + "loss": 5.3483, + "step": 10920 + }, + { + "epoch": 1.2449456119368985, + "grad_norm": 7.94573450088501, + "learning_rate": 3.764356661719374e-05, + "loss": 5.545, + "step": 10930 + }, + { + "epoch": 1.246084628965203, + "grad_norm": 6.481786727905273, + "learning_rate": 3.763214978878868e-05, + "loss": 5.0522, + "step": 10940 + }, + { + "epoch": 1.2472236459935075, + "grad_norm": 7.557528972625732, + "learning_rate": 3.7620732960383606e-05, + "loss": 5.3473, + "step": 10950 + }, + { + "epoch": 1.248362663021812, + "grad_norm": 8.600546836853027, + "learning_rate": 3.760931613197854e-05, + "loss": 5.1727, + "step": 10960 + }, + { + "epoch": 1.2495016800501166, + "grad_norm": 13.744807243347168, + "learning_rate": 3.759789930357347e-05, + "loss": 5.2168, + "step": 10970 + }, + { + "epoch": 1.2506406970784214, + "grad_norm": 10.714949607849121, + "learning_rate": 
3.75864824751684e-05, + "loss": 5.4683, + "step": 10980 + }, + { + "epoch": 1.251779714106726, + "grad_norm": 6.328366756439209, + "learning_rate": 3.757506564676333e-05, + "loss": 5.3212, + "step": 10990 + }, + { + "epoch": 1.2529187311350305, + "grad_norm": 6.948147296905518, + "learning_rate": 3.756364881835826e-05, + "loss": 5.2868, + "step": 11000 + }, + { + "epoch": 1.254057748163335, + "grad_norm": 8.713366508483887, + "learning_rate": 3.755223198995319e-05, + "loss": 5.1815, + "step": 11010 + }, + { + "epoch": 1.2551967651916396, + "grad_norm": 9.481947898864746, + "learning_rate": 3.754081516154813e-05, + "loss": 5.2775, + "step": 11020 + }, + { + "epoch": 1.2563357822199441, + "grad_norm": 11.682991981506348, + "learning_rate": 3.7529398333143054e-05, + "loss": 5.2723, + "step": 11030 + }, + { + "epoch": 1.2574747992482487, + "grad_norm": 6.145923614501953, + "learning_rate": 3.7517981504737986e-05, + "loss": 5.4262, + "step": 11040 + }, + { + "epoch": 1.2586138162765534, + "grad_norm": 7.860983848571777, + "learning_rate": 3.750656467633292e-05, + "loss": 5.267, + "step": 11050 + }, + { + "epoch": 1.259752833304858, + "grad_norm": 16.078229904174805, + "learning_rate": 3.749514784792785e-05, + "loss": 5.5123, + "step": 11060 + }, + { + "epoch": 1.2608918503331625, + "grad_norm": 12.896660804748535, + "learning_rate": 3.7483731019522774e-05, + "loss": 5.6769, + "step": 11070 + }, + { + "epoch": 1.262030867361467, + "grad_norm": 6.555258274078369, + "learning_rate": 3.7472314191117706e-05, + "loss": 5.1755, + "step": 11080 + }, + { + "epoch": 1.2631698843897716, + "grad_norm": 8.567139625549316, + "learning_rate": 3.746089736271264e-05, + "loss": 5.2561, + "step": 11090 + }, + { + "epoch": 1.2643089014180762, + "grad_norm": 8.549598693847656, + "learning_rate": 3.7449480534307577e-05, + "loss": 5.5128, + "step": 11100 + }, + { + "epoch": 1.2654479184463807, + "grad_norm": 13.493013381958008, + "learning_rate": 3.74380637059025e-05, + "loss": 5.2305, + 
"step": 11110 + }, + { + "epoch": 1.2665869354746855, + "grad_norm": 5.492915630340576, + "learning_rate": 3.7426646877497433e-05, + "loss": 5.3072, + "step": 11120 + }, + { + "epoch": 1.2677259525029898, + "grad_norm": 7.769402027130127, + "learning_rate": 3.7415230049092365e-05, + "loss": 5.3097, + "step": 11130 + }, + { + "epoch": 1.2688649695312946, + "grad_norm": 6.967016696929932, + "learning_rate": 3.74038132206873e-05, + "loss": 5.518, + "step": 11140 + }, + { + "epoch": 1.2700039865595991, + "grad_norm": 8.325007438659668, + "learning_rate": 3.739239639228222e-05, + "loss": 5.1475, + "step": 11150 + }, + { + "epoch": 1.2711430035879037, + "grad_norm": 7.11507511138916, + "learning_rate": 3.7380979563877154e-05, + "loss": 5.44, + "step": 11160 + }, + { + "epoch": 1.2722820206162082, + "grad_norm": 16.661008834838867, + "learning_rate": 3.7369562735472086e-05, + "loss": 4.9281, + "step": 11170 + }, + { + "epoch": 1.2734210376445128, + "grad_norm": 6.7068400382995605, + "learning_rate": 3.735814590706702e-05, + "loss": 5.2555, + "step": 11180 + }, + { + "epoch": 1.2745600546728173, + "grad_norm": 7.5050554275512695, + "learning_rate": 3.734672907866195e-05, + "loss": 5.2036, + "step": 11190 + }, + { + "epoch": 1.2756990717011218, + "grad_norm": 8.88784122467041, + "learning_rate": 3.733531225025688e-05, + "loss": 5.419, + "step": 11200 + }, + { + "epoch": 1.2768380887294266, + "grad_norm": 6.129419326782227, + "learning_rate": 3.732389542185181e-05, + "loss": 5.1893, + "step": 11210 + }, + { + "epoch": 1.2779771057577312, + "grad_norm": 13.970202445983887, + "learning_rate": 3.7312478593446745e-05, + "loss": 4.972, + "step": 11220 + }, + { + "epoch": 1.2791161227860357, + "grad_norm": 6.938068389892578, + "learning_rate": 3.730220344788218e-05, + "loss": 5.4386, + "step": 11230 + }, + { + "epoch": 1.2802551398143402, + "grad_norm": 13.684118270874023, + "learning_rate": 3.7290786619477106e-05, + "loss": 5.0856, + "step": 11240 + }, + { + "epoch": 
1.2813941568426448, + "grad_norm": 5.631530284881592, + "learning_rate": 3.7279369791072045e-05, + "loss": 5.1861, + "step": 11250 + }, + { + "epoch": 1.2825331738709493, + "grad_norm": 8.32963752746582, + "learning_rate": 3.7267952962666976e-05, + "loss": 5.2015, + "step": 11260 + }, + { + "epoch": 1.2836721908992539, + "grad_norm": 7.235726833343506, + "learning_rate": 3.72565361342619e-05, + "loss": 5.2528, + "step": 11270 + }, + { + "epoch": 1.2848112079275587, + "grad_norm": 6.3417887687683105, + "learning_rate": 3.724511930585683e-05, + "loss": 5.5352, + "step": 11280 + }, + { + "epoch": 1.285950224955863, + "grad_norm": 35.59114074707031, + "learning_rate": 3.7233702477451765e-05, + "loss": 5.1489, + "step": 11290 + }, + { + "epoch": 1.2870892419841677, + "grad_norm": 9.192126274108887, + "learning_rate": 3.72222856490467e-05, + "loss": 5.0911, + "step": 11300 + }, + { + "epoch": 1.2882282590124723, + "grad_norm": 5.963927268981934, + "learning_rate": 3.721086882064163e-05, + "loss": 5.5495, + "step": 11310 + }, + { + "epoch": 1.2893672760407768, + "grad_norm": 5.8875885009765625, + "learning_rate": 3.7199451992236554e-05, + "loss": 5.6548, + "step": 11320 + }, + { + "epoch": 1.2905062930690814, + "grad_norm": 7.632376670837402, + "learning_rate": 3.718803516383149e-05, + "loss": 5.3078, + "step": 11330 + }, + { + "epoch": 1.291645310097386, + "grad_norm": 5.833863258361816, + "learning_rate": 3.7176618335426424e-05, + "loss": 5.4708, + "step": 11340 + }, + { + "epoch": 1.2927843271256905, + "grad_norm": 7.593703746795654, + "learning_rate": 3.716520150702135e-05, + "loss": 5.1828, + "step": 11350 + }, + { + "epoch": 1.293923344153995, + "grad_norm": 24.085445404052734, + "learning_rate": 3.715378467861628e-05, + "loss": 5.4162, + "step": 11360 + }, + { + "epoch": 1.2950623611822998, + "grad_norm": 10.530863761901855, + "learning_rate": 3.714236785021121e-05, + "loss": 5.252, + "step": 11370 + }, + { + "epoch": 1.2962013782106043, + "grad_norm": 
13.61907958984375, + "learning_rate": 3.7130951021806145e-05, + "loss": 4.842, + "step": 11380 + }, + { + "epoch": 1.2973403952389089, + "grad_norm": 6.181117534637451, + "learning_rate": 3.711953419340107e-05, + "loss": 5.3513, + "step": 11390 + }, + { + "epoch": 1.2984794122672134, + "grad_norm": 7.91089391708374, + "learning_rate": 3.7108117364996e-05, + "loss": 5.239, + "step": 11400 + }, + { + "epoch": 1.299618429295518, + "grad_norm": 5.9612202644348145, + "learning_rate": 3.709670053659094e-05, + "loss": 5.3079, + "step": 11410 + }, + { + "epoch": 1.3007574463238225, + "grad_norm": 9.246620178222656, + "learning_rate": 3.708528370818587e-05, + "loss": 5.4449, + "step": 11420 + }, + { + "epoch": 1.301896463352127, + "grad_norm": 9.88797378540039, + "learning_rate": 3.70738668797808e-05, + "loss": 5.5098, + "step": 11430 + }, + { + "epoch": 1.3030354803804318, + "grad_norm": 7.659064769744873, + "learning_rate": 3.706245005137573e-05, + "loss": 5.4119, + "step": 11440 + }, + { + "epoch": 1.3041744974087361, + "grad_norm": 5.793743133544922, + "learning_rate": 3.705103322297066e-05, + "loss": 5.6571, + "step": 11450 + }, + { + "epoch": 1.305313514437041, + "grad_norm": 11.31017780303955, + "learning_rate": 3.703961639456559e-05, + "loss": 5.6601, + "step": 11460 + }, + { + "epoch": 1.3064525314653455, + "grad_norm": 6.8008012771606445, + "learning_rate": 3.702819956616052e-05, + "loss": 5.4059, + "step": 11470 + }, + { + "epoch": 1.30759154849365, + "grad_norm": 11.843914031982422, + "learning_rate": 3.701678273775545e-05, + "loss": 5.3103, + "step": 11480 + }, + { + "epoch": 1.3087305655219545, + "grad_norm": 8.745990753173828, + "learning_rate": 3.700536590935039e-05, + "loss": 5.6514, + "step": 11490 + }, + { + "epoch": 1.309869582550259, + "grad_norm": 23.588220596313477, + "learning_rate": 3.699394908094532e-05, + "loss": 5.1553, + "step": 11500 + }, + { + "epoch": 1.3110085995785636, + "grad_norm": 6.995006561279297, + "learning_rate": 
3.6982532252540245e-05, + "loss": 5.4568, + "step": 11510 + }, + { + "epoch": 1.3121476166068682, + "grad_norm": 12.877575874328613, + "learning_rate": 3.697111542413518e-05, + "loss": 5.3316, + "step": 11520 + }, + { + "epoch": 1.313286633635173, + "grad_norm": 14.89468765258789, + "learning_rate": 3.695969859573011e-05, + "loss": 5.4251, + "step": 11530 + }, + { + "epoch": 1.3144256506634775, + "grad_norm": 6.0038838386535645, + "learning_rate": 3.694828176732504e-05, + "loss": 5.6329, + "step": 11540 + }, + { + "epoch": 1.315564667691782, + "grad_norm": 10.036600112915039, + "learning_rate": 3.6936864938919966e-05, + "loss": 5.4431, + "step": 11550 + }, + { + "epoch": 1.3167036847200866, + "grad_norm": 7.397878646850586, + "learning_rate": 3.69254481105149e-05, + "loss": 5.7798, + "step": 11560 + }, + { + "epoch": 1.3178427017483911, + "grad_norm": 7.196527481079102, + "learning_rate": 3.6914031282109836e-05, + "loss": 5.3741, + "step": 11570 + }, + { + "epoch": 1.3189817187766957, + "grad_norm": 9.94596004486084, + "learning_rate": 3.690261445370476e-05, + "loss": 5.2022, + "step": 11580 + }, + { + "epoch": 1.3201207358050002, + "grad_norm": 11.582137107849121, + "learning_rate": 3.689119762529969e-05, + "loss": 5.3073, + "step": 11590 + }, + { + "epoch": 1.321259752833305, + "grad_norm": 7.553103446960449, + "learning_rate": 3.6879780796894625e-05, + "loss": 5.2065, + "step": 11600 + }, + { + "epoch": 1.3223987698616093, + "grad_norm": 8.516357421875, + "learning_rate": 3.686836396848956e-05, + "loss": 5.4567, + "step": 11610 + }, + { + "epoch": 1.323537786889914, + "grad_norm": 5.805466651916504, + "learning_rate": 3.685694714008449e-05, + "loss": 5.4244, + "step": 11620 + }, + { + "epoch": 1.3246768039182186, + "grad_norm": 14.516117095947266, + "learning_rate": 3.6845530311679414e-05, + "loss": 5.3269, + "step": 11630 + }, + { + "epoch": 1.3258158209465232, + "grad_norm": 8.495697021484375, + "learning_rate": 3.6834113483274346e-05, + "loss": 5.311, + 
"step": 11640 + }, + { + "epoch": 1.3269548379748277, + "grad_norm": 6.78104305267334, + "learning_rate": 3.682269665486928e-05, + "loss": 5.355, + "step": 11650 + }, + { + "epoch": 1.3280938550031323, + "grad_norm": 7.765377998352051, + "learning_rate": 3.681127982646421e-05, + "loss": 5.1988, + "step": 11660 + }, + { + "epoch": 1.3292328720314368, + "grad_norm": 9.460587501525879, + "learning_rate": 3.679986299805914e-05, + "loss": 5.4055, + "step": 11670 + }, + { + "epoch": 1.3303718890597414, + "grad_norm": 13.057355880737305, + "learning_rate": 3.678844616965407e-05, + "loss": 5.0494, + "step": 11680 + }, + { + "epoch": 1.3315109060880461, + "grad_norm": 12.284239768981934, + "learning_rate": 3.6777029341249005e-05, + "loss": 5.5441, + "step": 11690 + }, + { + "epoch": 1.3326499231163507, + "grad_norm": 12.175599098205566, + "learning_rate": 3.676561251284393e-05, + "loss": 5.1674, + "step": 11700 + }, + { + "epoch": 1.3337889401446552, + "grad_norm": 6.106376647949219, + "learning_rate": 3.675419568443886e-05, + "loss": 5.5162, + "step": 11710 + }, + { + "epoch": 1.3349279571729598, + "grad_norm": 6.725399971008301, + "learning_rate": 3.6742778856033794e-05, + "loss": 5.5066, + "step": 11720 + }, + { + "epoch": 1.3360669742012643, + "grad_norm": 10.35840129852295, + "learning_rate": 3.6731362027628725e-05, + "loss": 5.4834, + "step": 11730 + }, + { + "epoch": 1.3372059912295688, + "grad_norm": 6.601029872894287, + "learning_rate": 3.671994519922366e-05, + "loss": 5.2114, + "step": 11740 + }, + { + "epoch": 1.3383450082578734, + "grad_norm": 6.504428863525391, + "learning_rate": 3.670852837081859e-05, + "loss": 5.3814, + "step": 11750 + }, + { + "epoch": 1.3394840252861782, + "grad_norm": 8.49081802368164, + "learning_rate": 3.669711154241352e-05, + "loss": 5.1191, + "step": 11760 + }, + { + "epoch": 1.3406230423144825, + "grad_norm": 30.99142074584961, + "learning_rate": 3.668569471400845e-05, + "loss": 5.2552, + "step": 11770 + }, + { + "epoch": 
1.3417620593427872, + "grad_norm": 16.623620986938477, + "learning_rate": 3.667427788560338e-05, + "loss": 5.5582, + "step": 11780 + }, + { + "epoch": 1.3429010763710918, + "grad_norm": 8.451813697814941, + "learning_rate": 3.666286105719831e-05, + "loss": 5.3429, + "step": 11790 + }, + { + "epoch": 1.3440400933993963, + "grad_norm": 7.930083274841309, + "learning_rate": 3.665144422879324e-05, + "loss": 5.5788, + "step": 11800 + }, + { + "epoch": 1.3451791104277009, + "grad_norm": 8.974008560180664, + "learning_rate": 3.664002740038817e-05, + "loss": 5.0104, + "step": 11810 + }, + { + "epoch": 1.3463181274560054, + "grad_norm": 11.199780464172363, + "learning_rate": 3.6628610571983105e-05, + "loss": 5.3469, + "step": 11820 + }, + { + "epoch": 1.34745714448431, + "grad_norm": 16.39617919921875, + "learning_rate": 3.661719374357804e-05, + "loss": 5.0454, + "step": 11830 + }, + { + "epoch": 1.3485961615126145, + "grad_norm": 15.259292602539062, + "learning_rate": 3.660577691517297e-05, + "loss": 5.606, + "step": 11840 + }, + { + "epoch": 1.3497351785409193, + "grad_norm": 5.952733039855957, + "learning_rate": 3.65943600867679e-05, + "loss": 5.2637, + "step": 11850 + }, + { + "epoch": 1.3508741955692238, + "grad_norm": 26.232786178588867, + "learning_rate": 3.6582943258362826e-05, + "loss": 5.3298, + "step": 11860 + }, + { + "epoch": 1.3520132125975284, + "grad_norm": 8.518367767333984, + "learning_rate": 3.657152642995776e-05, + "loss": 5.1626, + "step": 11870 + }, + { + "epoch": 1.353152229625833, + "grad_norm": 21.344398498535156, + "learning_rate": 3.656010960155269e-05, + "loss": 5.2306, + "step": 11880 + }, + { + "epoch": 1.3542912466541375, + "grad_norm": 5.3159589767456055, + "learning_rate": 3.654869277314762e-05, + "loss": 5.3372, + "step": 11890 + }, + { + "epoch": 1.355430263682442, + "grad_norm": 6.752881050109863, + "learning_rate": 3.653727594474255e-05, + "loss": 5.1163, + "step": 11900 + }, + { + "epoch": 1.3565692807107466, + "grad_norm": 
6.732866287231445, + "learning_rate": 3.6525859116337485e-05, + "loss": 5.3473, + "step": 11910 + }, + { + "epoch": 1.3577082977390513, + "grad_norm": 7.8479132652282715, + "learning_rate": 3.651444228793242e-05, + "loss": 5.3538, + "step": 11920 + }, + { + "epoch": 1.3588473147673557, + "grad_norm": 9.0760498046875, + "learning_rate": 3.650302545952735e-05, + "loss": 5.3047, + "step": 11930 + }, + { + "epoch": 1.3599863317956604, + "grad_norm": 5.797522068023682, + "learning_rate": 3.6491608631122274e-05, + "loss": 5.45, + "step": 11940 + }, + { + "epoch": 1.361125348823965, + "grad_norm": 10.67641544342041, + "learning_rate": 3.6480191802717206e-05, + "loss": 5.6555, + "step": 11950 + }, + { + "epoch": 1.3622643658522695, + "grad_norm": 10.890436172485352, + "learning_rate": 3.646877497431214e-05, + "loss": 5.6038, + "step": 11960 + }, + { + "epoch": 1.363403382880574, + "grad_norm": 10.217756271362305, + "learning_rate": 3.645735814590707e-05, + "loss": 5.2046, + "step": 11970 + }, + { + "epoch": 1.3645423999088786, + "grad_norm": 7.881415367126465, + "learning_rate": 3.6445941317501994e-05, + "loss": 5.2958, + "step": 11980 + }, + { + "epoch": 1.3656814169371831, + "grad_norm": 10.278264045715332, + "learning_rate": 3.643452448909693e-05, + "loss": 5.2545, + "step": 11990 + }, + { + "epoch": 1.3668204339654877, + "grad_norm": 6.571601867675781, + "learning_rate": 3.6423107660691865e-05, + "loss": 5.3605, + "step": 12000 + }, + { + "epoch": 1.3668204339654877, + "eval_loss": 5.954630374908447, + "eval_runtime": 10.9746, + "eval_samples_per_second": 1.367, + "eval_steps_per_second": 0.182, + "step": 12000 + }, + { + "epoch": 1.3679594509937925, + "grad_norm": 7.712740421295166, + "learning_rate": 3.6411690832286797e-05, + "loss": 5.1089, + "step": 12010 + }, + { + "epoch": 1.369098468022097, + "grad_norm": 15.281449317932129, + "learning_rate": 3.640027400388172e-05, + "loss": 4.9024, + "step": 12020 + }, + { + "epoch": 1.3702374850504015, + "grad_norm": 
11.042343139648438, + "learning_rate": 3.6388857175476653e-05, + "loss": 5.3989, + "step": 12030 + }, + { + "epoch": 1.371376502078706, + "grad_norm": 7.947473526000977, + "learning_rate": 3.6377440347071585e-05, + "loss": 5.5346, + "step": 12040 + }, + { + "epoch": 1.3725155191070106, + "grad_norm": 13.120514869689941, + "learning_rate": 3.636602351866652e-05, + "loss": 5.1866, + "step": 12050 + }, + { + "epoch": 1.3736545361353152, + "grad_norm": 6.44757080078125, + "learning_rate": 3.635460669026144e-05, + "loss": 5.406, + "step": 12060 + }, + { + "epoch": 1.3747935531636197, + "grad_norm": 49.004764556884766, + "learning_rate": 3.634318986185638e-05, + "loss": 5.2507, + "step": 12070 + }, + { + "epoch": 1.3759325701919245, + "grad_norm": 6.357001781463623, + "learning_rate": 3.633177303345131e-05, + "loss": 5.0962, + "step": 12080 + }, + { + "epoch": 1.3770715872202288, + "grad_norm": 6.734482765197754, + "learning_rate": 3.632035620504624e-05, + "loss": 5.2718, + "step": 12090 + }, + { + "epoch": 1.3782106042485336, + "grad_norm": 23.010496139526367, + "learning_rate": 3.630893937664117e-05, + "loss": 5.2241, + "step": 12100 + }, + { + "epoch": 1.3793496212768381, + "grad_norm": 5.788354396820068, + "learning_rate": 3.62975225482361e-05, + "loss": 5.5905, + "step": 12110 + }, + { + "epoch": 1.3804886383051427, + "grad_norm": 7.649424076080322, + "learning_rate": 3.628610571983103e-05, + "loss": 5.1133, + "step": 12120 + }, + { + "epoch": 1.3816276553334472, + "grad_norm": 13.524384498596191, + "learning_rate": 3.6274688891425965e-05, + "loss": 5.4897, + "step": 12130 + }, + { + "epoch": 1.3827666723617518, + "grad_norm": 16.350011825561523, + "learning_rate": 3.626327206302089e-05, + "loss": 5.3957, + "step": 12140 + }, + { + "epoch": 1.3839056893900563, + "grad_norm": 5.700869560241699, + "learning_rate": 3.625185523461583e-05, + "loss": 5.3942, + "step": 12150 + }, + { + "epoch": 1.3850447064183609, + "grad_norm": 13.569497108459473, + "learning_rate": 
3.624043840621076e-05, + "loss": 5.009, + "step": 12160 + }, + { + "epoch": 1.3861837234466656, + "grad_norm": 8.509134292602539, + "learning_rate": 3.6229021577805686e-05, + "loss": 5.1335, + "step": 12170 + }, + { + "epoch": 1.3873227404749702, + "grad_norm": 34.72011184692383, + "learning_rate": 3.621760474940062e-05, + "loss": 5.4954, + "step": 12180 + }, + { + "epoch": 1.3884617575032747, + "grad_norm": 9.994758605957031, + "learning_rate": 3.620618792099555e-05, + "loss": 5.8254, + "step": 12190 + }, + { + "epoch": 1.3896007745315793, + "grad_norm": 7.645510196685791, + "learning_rate": 3.619477109259048e-05, + "loss": 5.7755, + "step": 12200 + }, + { + "epoch": 1.3907397915598838, + "grad_norm": 16.956539154052734, + "learning_rate": 3.6183354264185406e-05, + "loss": 5.2381, + "step": 12210 + }, + { + "epoch": 1.3918788085881884, + "grad_norm": 10.36237621307373, + "learning_rate": 3.617193743578034e-05, + "loss": 5.2316, + "step": 12220 + }, + { + "epoch": 1.393017825616493, + "grad_norm": 9.028413772583008, + "learning_rate": 3.616052060737528e-05, + "loss": 5.8723, + "step": 12230 + }, + { + "epoch": 1.3941568426447977, + "grad_norm": 10.029345512390137, + "learning_rate": 3.614910377897021e-05, + "loss": 5.2414, + "step": 12240 + }, + { + "epoch": 1.395295859673102, + "grad_norm": 7.671345233917236, + "learning_rate": 3.6137686950565134e-05, + "loss": 5.2427, + "step": 12250 + }, + { + "epoch": 1.3964348767014068, + "grad_norm": 10.010140419006348, + "learning_rate": 3.6126270122160065e-05, + "loss": 5.1803, + "step": 12260 + }, + { + "epoch": 1.3975738937297113, + "grad_norm": 4.959784507751465, + "learning_rate": 3.6114853293755e-05, + "loss": 5.41, + "step": 12270 + }, + { + "epoch": 1.3987129107580158, + "grad_norm": 7.631646156311035, + "learning_rate": 3.610343646534993e-05, + "loss": 5.36, + "step": 12280 + }, + { + "epoch": 1.3998519277863204, + "grad_norm": 8.803350448608398, + "learning_rate": 3.6092019636944854e-05, + "loss": 5.161, + "step": 
12290 + }, + { + "epoch": 1.400990944814625, + "grad_norm": 6.224495887756348, + "learning_rate": 3.6080602808539786e-05, + "loss": 5.4243, + "step": 12300 + }, + { + "epoch": 1.4021299618429295, + "grad_norm": 11.932744979858398, + "learning_rate": 3.606918598013472e-05, + "loss": 5.3582, + "step": 12310 + }, + { + "epoch": 1.403268978871234, + "grad_norm": 7.398464679718018, + "learning_rate": 3.6057769151729656e-05, + "loss": 5.2339, + "step": 12320 + }, + { + "epoch": 1.4044079958995388, + "grad_norm": 13.733871459960938, + "learning_rate": 3.604635232332458e-05, + "loss": 5.4102, + "step": 12330 + }, + { + "epoch": 1.4055470129278433, + "grad_norm": 6.52939510345459, + "learning_rate": 3.603493549491951e-05, + "loss": 5.4492, + "step": 12340 + }, + { + "epoch": 1.4066860299561479, + "grad_norm": 12.400009155273438, + "learning_rate": 3.6023518666514445e-05, + "loss": 5.4458, + "step": 12350 + }, + { + "epoch": 1.4078250469844524, + "grad_norm": 7.083974361419678, + "learning_rate": 3.601210183810938e-05, + "loss": 5.4185, + "step": 12360 + }, + { + "epoch": 1.408964064012757, + "grad_norm": 7.966810703277588, + "learning_rate": 3.60006850097043e-05, + "loss": 5.4619, + "step": 12370 + }, + { + "epoch": 1.4101030810410615, + "grad_norm": 10.752161979675293, + "learning_rate": 3.5989268181299234e-05, + "loss": 5.3817, + "step": 12380 + }, + { + "epoch": 1.411242098069366, + "grad_norm": 6.735963821411133, + "learning_rate": 3.5977851352894166e-05, + "loss": 5.4496, + "step": 12390 + }, + { + "epoch": 1.4123811150976708, + "grad_norm": 8.230522155761719, + "learning_rate": 3.5966434524489104e-05, + "loss": 5.3922, + "step": 12400 + }, + { + "epoch": 1.4135201321259752, + "grad_norm": 22.14419937133789, + "learning_rate": 3.595501769608403e-05, + "loss": 5.4713, + "step": 12410 + }, + { + "epoch": 1.41465914915428, + "grad_norm": 6.678779602050781, + "learning_rate": 3.594360086767896e-05, + "loss": 5.3202, + "step": 12420 + }, + { + "epoch": 1.4157981661825845, + 
"grad_norm": 6.689286231994629, + "learning_rate": 3.593218403927389e-05, + "loss": 5.577, + "step": 12430 + }, + { + "epoch": 1.416937183210889, + "grad_norm": 9.552467346191406, + "learning_rate": 3.5920767210868825e-05, + "loss": 5.3421, + "step": 12440 + }, + { + "epoch": 1.4180762002391936, + "grad_norm": 6.153965950012207, + "learning_rate": 3.590935038246375e-05, + "loss": 5.0896, + "step": 12450 + }, + { + "epoch": 1.419215217267498, + "grad_norm": 7.473817825317383, + "learning_rate": 3.589793355405868e-05, + "loss": 5.2073, + "step": 12460 + }, + { + "epoch": 1.4203542342958027, + "grad_norm": 7.036187171936035, + "learning_rate": 3.5886516725653614e-05, + "loss": 5.4712, + "step": 12470 + }, + { + "epoch": 1.4214932513241072, + "grad_norm": 20.243093490600586, + "learning_rate": 3.5875099897248545e-05, + "loss": 5.4689, + "step": 12480 + }, + { + "epoch": 1.422632268352412, + "grad_norm": 8.695764541625977, + "learning_rate": 3.586368306884348e-05, + "loss": 5.308, + "step": 12490 + }, + { + "epoch": 1.4237712853807165, + "grad_norm": 6.1280837059021, + "learning_rate": 3.585226624043841e-05, + "loss": 5.4206, + "step": 12500 + }, + { + "epoch": 1.424910302409021, + "grad_norm": 10.041711807250977, + "learning_rate": 3.584084941203334e-05, + "loss": 5.4938, + "step": 12510 + }, + { + "epoch": 1.4260493194373256, + "grad_norm": 7.014266490936279, + "learning_rate": 3.582943258362827e-05, + "loss": 5.447, + "step": 12520 + }, + { + "epoch": 1.4271883364656301, + "grad_norm": 7.004973411560059, + "learning_rate": 3.58180157552232e-05, + "loss": 5.1397, + "step": 12530 + }, + { + "epoch": 1.4283273534939347, + "grad_norm": 5.695499420166016, + "learning_rate": 3.580659892681813e-05, + "loss": 5.3655, + "step": 12540 + }, + { + "epoch": 1.4294663705222392, + "grad_norm": 6.106477737426758, + "learning_rate": 3.579518209841306e-05, + "loss": 5.2462, + "step": 12550 + }, + { + "epoch": 1.430605387550544, + "grad_norm": 10.030777931213379, + "learning_rate": 
3.578376527000799e-05, + "loss": 5.52, + "step": 12560 + }, + { + "epoch": 1.4317444045788483, + "grad_norm": 5.281642436981201, + "learning_rate": 3.5772348441602925e-05, + "loss": 5.1858, + "step": 12570 + }, + { + "epoch": 1.432883421607153, + "grad_norm": 5.482350826263428, + "learning_rate": 3.576093161319786e-05, + "loss": 5.1776, + "step": 12580 + }, + { + "epoch": 1.4340224386354576, + "grad_norm": 10.585066795349121, + "learning_rate": 3.574951478479279e-05, + "loss": 5.1566, + "step": 12590 + }, + { + "epoch": 1.4351614556637622, + "grad_norm": 5.751795768737793, + "learning_rate": 3.5738097956387714e-05, + "loss": 5.6618, + "step": 12600 + }, + { + "epoch": 1.4363004726920667, + "grad_norm": 9.302414894104004, + "learning_rate": 3.5726681127982646e-05, + "loss": 5.4944, + "step": 12610 + }, + { + "epoch": 1.4374394897203713, + "grad_norm": 8.983521461486816, + "learning_rate": 3.571526429957758e-05, + "loss": 5.3993, + "step": 12620 + }, + { + "epoch": 1.4385785067486758, + "grad_norm": 8.087730407714844, + "learning_rate": 3.570384747117251e-05, + "loss": 5.8598, + "step": 12630 + }, + { + "epoch": 1.4397175237769804, + "grad_norm": 96.82878112792969, + "learning_rate": 3.569243064276744e-05, + "loss": 5.3293, + "step": 12640 + }, + { + "epoch": 1.4408565408052851, + "grad_norm": 30.524112701416016, + "learning_rate": 3.568101381436237e-05, + "loss": 5.3401, + "step": 12650 + }, + { + "epoch": 1.4419955578335897, + "grad_norm": 11.92377758026123, + "learning_rate": 3.5669596985957305e-05, + "loss": 5.8283, + "step": 12660 + }, + { + "epoch": 1.4431345748618942, + "grad_norm": 7.544785022735596, + "learning_rate": 3.565818015755224e-05, + "loss": 5.0848, + "step": 12670 + }, + { + "epoch": 1.4442735918901988, + "grad_norm": 8.341391563415527, + "learning_rate": 3.564676332914716e-05, + "loss": 5.5114, + "step": 12680 + }, + { + "epoch": 1.4454126089185033, + "grad_norm": 13.230775833129883, + "learning_rate": 3.5635346500742094e-05, + "loss": 5.5077, + 
"step": 12690 + }, + { + "epoch": 1.4465516259468079, + "grad_norm": 10.898941993713379, + "learning_rate": 3.5623929672337026e-05, + "loss": 5.7832, + "step": 12700 + }, + { + "epoch": 1.4476906429751124, + "grad_norm": 20.98676109313965, + "learning_rate": 3.561251284393196e-05, + "loss": 5.2055, + "step": 12710 + }, + { + "epoch": 1.4488296600034172, + "grad_norm": 9.478466033935547, + "learning_rate": 3.560109601552688e-05, + "loss": 5.0588, + "step": 12720 + }, + { + "epoch": 1.4499686770317215, + "grad_norm": 14.445795059204102, + "learning_rate": 3.558967918712182e-05, + "loss": 5.4359, + "step": 12730 + }, + { + "epoch": 1.4511076940600263, + "grad_norm": 7.075606822967529, + "learning_rate": 3.557826235871675e-05, + "loss": 5.2279, + "step": 12740 + }, + { + "epoch": 1.4522467110883308, + "grad_norm": 7.631337642669678, + "learning_rate": 3.5566845530311685e-05, + "loss": 5.3887, + "step": 12750 + }, + { + "epoch": 1.4533857281166354, + "grad_norm": 5.999801158905029, + "learning_rate": 3.555542870190661e-05, + "loss": 5.6362, + "step": 12760 + }, + { + "epoch": 1.45452474514494, + "grad_norm": 9.004730224609375, + "learning_rate": 3.554401187350154e-05, + "loss": 5.3229, + "step": 12770 + }, + { + "epoch": 1.4556637621732444, + "grad_norm": 7.371973991394043, + "learning_rate": 3.5532595045096473e-05, + "loss": 5.3307, + "step": 12780 + }, + { + "epoch": 1.456802779201549, + "grad_norm": 8.362858772277832, + "learning_rate": 3.5521178216691405e-05, + "loss": 5.2371, + "step": 12790 + }, + { + "epoch": 1.4579417962298535, + "grad_norm": 6.346038341522217, + "learning_rate": 3.550976138828633e-05, + "loss": 5.3268, + "step": 12800 + }, + { + "epoch": 1.4590808132581583, + "grad_norm": 7.912856578826904, + "learning_rate": 3.549834455988127e-05, + "loss": 5.0665, + "step": 12810 + }, + { + "epoch": 1.4602198302864628, + "grad_norm": 26.517242431640625, + "learning_rate": 3.54869277314762e-05, + "loss": 5.4491, + "step": 12820 + }, + { + "epoch": 
1.4613588473147674, + "grad_norm": 41.19887924194336, + "learning_rate": 3.547551090307113e-05, + "loss": 5.6174, + "step": 12830 + }, + { + "epoch": 1.462497864343072, + "grad_norm": 8.585516929626465, + "learning_rate": 3.546409407466606e-05, + "loss": 5.3851, + "step": 12840 + }, + { + "epoch": 1.4636368813713765, + "grad_norm": 8.645813941955566, + "learning_rate": 3.545267724626099e-05, + "loss": 5.136, + "step": 12850 + }, + { + "epoch": 1.464775898399681, + "grad_norm": 14.942729949951172, + "learning_rate": 3.544126041785592e-05, + "loss": 5.3279, + "step": 12860 + }, + { + "epoch": 1.4659149154279856, + "grad_norm": 10.440014839172363, + "learning_rate": 3.542984358945085e-05, + "loss": 5.4267, + "step": 12870 + }, + { + "epoch": 1.4670539324562903, + "grad_norm": 8.172478675842285, + "learning_rate": 3.541842676104578e-05, + "loss": 5.43, + "step": 12880 + }, + { + "epoch": 1.4681929494845947, + "grad_norm": 24.180456161499023, + "learning_rate": 3.540700993264072e-05, + "loss": 5.4406, + "step": 12890 + }, + { + "epoch": 1.4693319665128994, + "grad_norm": 7.078608989715576, + "learning_rate": 3.539559310423565e-05, + "loss": 5.1815, + "step": 12900 + }, + { + "epoch": 1.470470983541204, + "grad_norm": 6.023872375488281, + "learning_rate": 3.538417627583058e-05, + "loss": 5.3495, + "step": 12910 + }, + { + "epoch": 1.4716100005695085, + "grad_norm": 7.708781719207764, + "learning_rate": 3.5372759447425506e-05, + "loss": 5.1646, + "step": 12920 + }, + { + "epoch": 1.472749017597813, + "grad_norm": 8.54637622833252, + "learning_rate": 3.536134261902044e-05, + "loss": 5.3512, + "step": 12930 + }, + { + "epoch": 1.4738880346261176, + "grad_norm": 7.830763816833496, + "learning_rate": 3.534992579061537e-05, + "loss": 5.1291, + "step": 12940 + }, + { + "epoch": 1.4750270516544222, + "grad_norm": 9.371572494506836, + "learning_rate": 3.53385089622103e-05, + "loss": 5.4404, + "step": 12950 + }, + { + "epoch": 1.4761660686827267, + "grad_norm": 6.507216453552246, 
+ "learning_rate": 3.5327092133805226e-05, + "loss": 5.5138, + "step": 12960 + }, + { + "epoch": 1.4773050857110315, + "grad_norm": 8.929028511047363, + "learning_rate": 3.531567530540016e-05, + "loss": 5.7131, + "step": 12970 + }, + { + "epoch": 1.478444102739336, + "grad_norm": 6.6559906005859375, + "learning_rate": 3.53042584769951e-05, + "loss": 5.3745, + "step": 12980 + }, + { + "epoch": 1.4795831197676406, + "grad_norm": 5.723512649536133, + "learning_rate": 3.529284164859002e-05, + "loss": 5.2418, + "step": 12990 + }, + { + "epoch": 1.480722136795945, + "grad_norm": 14.410907745361328, + "learning_rate": 3.5281424820184954e-05, + "loss": 5.1752, + "step": 13000 + }, + { + "epoch": 1.4818611538242497, + "grad_norm": 6.995190620422363, + "learning_rate": 3.5270007991779885e-05, + "loss": 5.3642, + "step": 13010 + }, + { + "epoch": 1.4830001708525542, + "grad_norm": 7.952390193939209, + "learning_rate": 3.525859116337482e-05, + "loss": 5.3698, + "step": 13020 + }, + { + "epoch": 1.4841391878808587, + "grad_norm": 9.407118797302246, + "learning_rate": 3.524717433496975e-05, + "loss": 5.6339, + "step": 13030 + }, + { + "epoch": 1.4852782049091635, + "grad_norm": 7.846121788024902, + "learning_rate": 3.5235757506564674e-05, + "loss": 5.4137, + "step": 13040 + }, + { + "epoch": 1.4864172219374678, + "grad_norm": 8.195741653442383, + "learning_rate": 3.5224340678159606e-05, + "loss": 5.6291, + "step": 13050 + }, + { + "epoch": 1.4875562389657726, + "grad_norm": 5.081000328063965, + "learning_rate": 3.5212923849754545e-05, + "loss": 5.3807, + "step": 13060 + }, + { + "epoch": 1.4886952559940771, + "grad_norm": 8.810487747192383, + "learning_rate": 3.520150702134947e-05, + "loss": 5.1507, + "step": 13070 + }, + { + "epoch": 1.4898342730223817, + "grad_norm": 13.622875213623047, + "learning_rate": 3.51900901929444e-05, + "loss": 5.2288, + "step": 13080 + }, + { + "epoch": 1.4909732900506862, + "grad_norm": 6.059431552886963, + "learning_rate": 3.517867336453933e-05, + 
"loss": 5.2828, + "step": 13090 + }, + { + "epoch": 1.4921123070789908, + "grad_norm": 7.449197292327881, + "learning_rate": 3.5167256536134265e-05, + "loss": 5.2901, + "step": 13100 + }, + { + "epoch": 1.4932513241072953, + "grad_norm": 9.480165481567383, + "learning_rate": 3.515583970772919e-05, + "loss": 5.3054, + "step": 13110 + }, + { + "epoch": 1.4943903411355999, + "grad_norm": 13.476594924926758, + "learning_rate": 3.514442287932412e-05, + "loss": 5.3112, + "step": 13120 + }, + { + "epoch": 1.4955293581639046, + "grad_norm": 9.43685531616211, + "learning_rate": 3.5133006050919054e-05, + "loss": 5.501, + "step": 13130 + }, + { + "epoch": 1.4966683751922092, + "grad_norm": 31.36126708984375, + "learning_rate": 3.512158922251399e-05, + "loss": 5.358, + "step": 13140 + }, + { + "epoch": 1.4978073922205137, + "grad_norm": 6.15058708190918, + "learning_rate": 3.511017239410892e-05, + "loss": 5.2292, + "step": 13150 + }, + { + "epoch": 1.4989464092488183, + "grad_norm": 12.908248901367188, + "learning_rate": 3.509875556570385e-05, + "loss": 5.2005, + "step": 13160 + }, + { + "epoch": 1.5000854262771228, + "grad_norm": 7.87893533706665, + "learning_rate": 3.508733873729878e-05, + "loss": 5.2167, + "step": 13170 + }, + { + "epoch": 1.5012244433054274, + "grad_norm": 7.283593654632568, + "learning_rate": 3.507592190889371e-05, + "loss": 5.0384, + "step": 13180 + }, + { + "epoch": 1.502363460333732, + "grad_norm": 7.5403289794921875, + "learning_rate": 3.506450508048864e-05, + "loss": 5.5789, + "step": 13190 + }, + { + "epoch": 1.5035024773620367, + "grad_norm": 27.158863067626953, + "learning_rate": 3.505308825208357e-05, + "loss": 5.1138, + "step": 13200 + }, + { + "epoch": 1.504641494390341, + "grad_norm": 8.950972557067871, + "learning_rate": 3.50416714236785e-05, + "loss": 5.1794, + "step": 13210 + }, + { + "epoch": 1.5057805114186458, + "grad_norm": 6.999053955078125, + "learning_rate": 3.503025459527344e-05, + "loss": 5.2138, + "step": 13220 + }, + { + "epoch": 
1.5069195284469503, + "grad_norm": 24.662425994873047, + "learning_rate": 3.5018837766868366e-05, + "loss": 5.5911, + "step": 13230 + }, + { + "epoch": 1.5080585454752549, + "grad_norm": 10.630668640136719, + "learning_rate": 3.50074209384633e-05, + "loss": 5.2046, + "step": 13240 + }, + { + "epoch": 1.5091975625035594, + "grad_norm": 7.645869255065918, + "learning_rate": 3.499600411005823e-05, + "loss": 5.1066, + "step": 13250 + }, + { + "epoch": 1.510336579531864, + "grad_norm": 7.014522552490234, + "learning_rate": 3.498458728165316e-05, + "loss": 5.5894, + "step": 13260 + }, + { + "epoch": 1.5114755965601687, + "grad_norm": 8.533949851989746, + "learning_rate": 3.4973170453248086e-05, + "loss": 4.9576, + "step": 13270 + }, + { + "epoch": 1.512614613588473, + "grad_norm": 6.9750237464904785, + "learning_rate": 3.496175362484302e-05, + "loss": 5.1692, + "step": 13280 + }, + { + "epoch": 1.5137536306167778, + "grad_norm": 5.311960220336914, + "learning_rate": 3.495033679643795e-05, + "loss": 5.0933, + "step": 13290 + }, + { + "epoch": 1.5148926476450821, + "grad_norm": 18.698442459106445, + "learning_rate": 3.493891996803288e-05, + "loss": 5.3646, + "step": 13300 + }, + { + "epoch": 1.516031664673387, + "grad_norm": 17.7315616607666, + "learning_rate": 3.4927503139627813e-05, + "loss": 5.2188, + "step": 13310 + }, + { + "epoch": 1.5171706817016914, + "grad_norm": 11.710564613342285, + "learning_rate": 3.4916086311222745e-05, + "loss": 5.2846, + "step": 13320 + }, + { + "epoch": 1.518309698729996, + "grad_norm": 5.661081314086914, + "learning_rate": 3.490466948281768e-05, + "loss": 5.2878, + "step": 13330 + }, + { + "epoch": 1.5194487157583005, + "grad_norm": 12.068194389343262, + "learning_rate": 3.489325265441261e-05, + "loss": 5.2608, + "step": 13340 + }, + { + "epoch": 1.520587732786605, + "grad_norm": 8.578582763671875, + "learning_rate": 3.4881835826007534e-05, + "loss": 4.9035, + "step": 13350 + }, + { + "epoch": 1.5217267498149099, + "grad_norm": 
9.248255729675293, + "learning_rate": 3.4870418997602466e-05, + "loss": 5.0897, + "step": 13360 + }, + { + "epoch": 1.5228657668432142, + "grad_norm": 5.910317420959473, + "learning_rate": 3.48590021691974e-05, + "loss": 5.1177, + "step": 13370 + }, + { + "epoch": 1.524004783871519, + "grad_norm": 9.493122100830078, + "learning_rate": 3.484758534079233e-05, + "loss": 5.5372, + "step": 13380 + }, + { + "epoch": 1.5251438008998235, + "grad_norm": 14.408531188964844, + "learning_rate": 3.483616851238726e-05, + "loss": 5.1862, + "step": 13390 + }, + { + "epoch": 1.526282817928128, + "grad_norm": 9.839924812316895, + "learning_rate": 3.482475168398219e-05, + "loss": 5.4061, + "step": 13400 + }, + { + "epoch": 1.5274218349564326, + "grad_norm": 8.718000411987305, + "learning_rate": 3.4813334855577125e-05, + "loss": 5.4663, + "step": 13410 + }, + { + "epoch": 1.5285608519847371, + "grad_norm": 6.590074062347412, + "learning_rate": 3.480191802717206e-05, + "loss": 5.3317, + "step": 13420 + }, + { + "epoch": 1.529699869013042, + "grad_norm": 6.715322017669678, + "learning_rate": 3.479050119876698e-05, + "loss": 5.1962, + "step": 13430 + }, + { + "epoch": 1.5308388860413462, + "grad_norm": 8.20048713684082, + "learning_rate": 3.4779084370361914e-05, + "loss": 5.5784, + "step": 13440 + }, + { + "epoch": 1.531977903069651, + "grad_norm": 17.340343475341797, + "learning_rate": 3.4767667541956846e-05, + "loss": 5.4492, + "step": 13450 + }, + { + "epoch": 1.5331169200979553, + "grad_norm": 6.961179256439209, + "learning_rate": 3.475625071355178e-05, + "loss": 5.4859, + "step": 13460 + }, + { + "epoch": 1.53425593712626, + "grad_norm": 8.843058586120605, + "learning_rate": 3.474483388514671e-05, + "loss": 5.1902, + "step": 13470 + }, + { + "epoch": 1.5353949541545646, + "grad_norm": 7.162143707275391, + "learning_rate": 3.473341705674164e-05, + "loss": 5.3569, + "step": 13480 + }, + { + "epoch": 1.5365339711828692, + "grad_norm": 7.354994297027588, + "learning_rate": 
3.472200022833657e-05, + "loss": 5.347, + "step": 13490 + }, + { + "epoch": 1.5376729882111737, + "grad_norm": 14.971761703491211, + "learning_rate": 3.47105833999315e-05, + "loss": 5.3153, + "step": 13500 + }, + { + "epoch": 1.5388120052394783, + "grad_norm": 9.347837448120117, + "learning_rate": 3.469916657152643e-05, + "loss": 5.3571, + "step": 13510 + }, + { + "epoch": 1.539951022267783, + "grad_norm": 7.197291374206543, + "learning_rate": 3.468774974312136e-05, + "loss": 5.5024, + "step": 13520 + }, + { + "epoch": 1.5410900392960873, + "grad_norm": 11.419710159301758, + "learning_rate": 3.4676332914716294e-05, + "loss": 5.2976, + "step": 13530 + }, + { + "epoch": 1.5422290563243921, + "grad_norm": 7.408755302429199, + "learning_rate": 3.4664916086311225e-05, + "loss": 4.9251, + "step": 13540 + }, + { + "epoch": 1.5433680733526967, + "grad_norm": 7.224884033203125, + "learning_rate": 3.465349925790616e-05, + "loss": 5.1885, + "step": 13550 + }, + { + "epoch": 1.5445070903810012, + "grad_norm": 9.694978713989258, + "learning_rate": 3.464208242950109e-05, + "loss": 5.1941, + "step": 13560 + }, + { + "epoch": 1.5456461074093057, + "grad_norm": 14.073616027832031, + "learning_rate": 3.463066560109602e-05, + "loss": 5.104, + "step": 13570 + }, + { + "epoch": 1.5467851244376103, + "grad_norm": 5.462674617767334, + "learning_rate": 3.4619248772690946e-05, + "loss": 5.4142, + "step": 13580 + }, + { + "epoch": 1.547924141465915, + "grad_norm": 8.231867790222168, + "learning_rate": 3.460783194428588e-05, + "loss": 5.1634, + "step": 13590 + }, + { + "epoch": 1.5490631584942194, + "grad_norm": 6.0484619140625, + "learning_rate": 3.459641511588081e-05, + "loss": 5.351, + "step": 13600 + }, + { + "epoch": 1.5502021755225242, + "grad_norm": 6.931220054626465, + "learning_rate": 3.458499828747574e-05, + "loss": 5.1483, + "step": 13610 + }, + { + "epoch": 1.5513411925508285, + "grad_norm": 23.10702896118164, + "learning_rate": 3.4573581459070667e-05, + "loss": 5.3331, + "step": 
13620 + }, + { + "epoch": 1.5524802095791332, + "grad_norm": 15.228353500366211, + "learning_rate": 3.4562164630665605e-05, + "loss": 5.3901, + "step": 13630 + }, + { + "epoch": 1.5536192266074378, + "grad_norm": 7.844686508178711, + "learning_rate": 3.455074780226054e-05, + "loss": 5.4937, + "step": 13640 + }, + { + "epoch": 1.5547582436357423, + "grad_norm": 6.5618743896484375, + "learning_rate": 3.453933097385547e-05, + "loss": 5.0538, + "step": 13650 + }, + { + "epoch": 1.5558972606640469, + "grad_norm": 6.531167507171631, + "learning_rate": 3.4527914145450394e-05, + "loss": 5.435, + "step": 13660 + }, + { + "epoch": 1.5570362776923514, + "grad_norm": 13.602387428283691, + "learning_rate": 3.4516497317045326e-05, + "loss": 5.3281, + "step": 13670 + }, + { + "epoch": 1.5581752947206562, + "grad_norm": 7.583394527435303, + "learning_rate": 3.450508048864026e-05, + "loss": 5.315, + "step": 13680 + }, + { + "epoch": 1.5593143117489605, + "grad_norm": 8.912744522094727, + "learning_rate": 3.449366366023519e-05, + "loss": 5.3501, + "step": 13690 + }, + { + "epoch": 1.5604533287772653, + "grad_norm": 6.5284423828125, + "learning_rate": 3.4482246831830114e-05, + "loss": 5.2478, + "step": 13700 + }, + { + "epoch": 1.5615923458055698, + "grad_norm": 18.133039474487305, + "learning_rate": 3.4470830003425046e-05, + "loss": 5.1715, + "step": 13710 + }, + { + "epoch": 1.5627313628338744, + "grad_norm": 7.198716640472412, + "learning_rate": 3.4459413175019985e-05, + "loss": 5.2315, + "step": 13720 + }, + { + "epoch": 1.563870379862179, + "grad_norm": 8.62277603149414, + "learning_rate": 3.444799634661492e-05, + "loss": 5.1835, + "step": 13730 + }, + { + "epoch": 1.5650093968904835, + "grad_norm": 8.421860694885254, + "learning_rate": 3.443657951820984e-05, + "loss": 5.1822, + "step": 13740 + }, + { + "epoch": 1.5661484139187882, + "grad_norm": 7.427688121795654, + "learning_rate": 3.4425162689804774e-05, + "loss": 5.5495, + "step": 13750 + }, + { + "epoch": 
1.5672874309470926, + "grad_norm": 7.007988929748535, + "learning_rate": 3.4413745861399705e-05, + "loss": 5.1895, + "step": 13760 + }, + { + "epoch": 1.5684264479753973, + "grad_norm": 9.902037620544434, + "learning_rate": 3.440232903299464e-05, + "loss": 5.2564, + "step": 13770 + }, + { + "epoch": 1.5695654650037016, + "grad_norm": 8.029926300048828, + "learning_rate": 3.439091220458956e-05, + "loss": 5.3869, + "step": 13780 + }, + { + "epoch": 1.5707044820320064, + "grad_norm": 11.344751358032227, + "learning_rate": 3.4379495376184494e-05, + "loss": 5.3192, + "step": 13790 + }, + { + "epoch": 1.571843499060311, + "grad_norm": 19.97797393798828, + "learning_rate": 3.436807854777943e-05, + "loss": 4.9832, + "step": 13800 + }, + { + "epoch": 1.5729825160886155, + "grad_norm": 9.381373405456543, + "learning_rate": 3.435666171937436e-05, + "loss": 5.2027, + "step": 13810 + }, + { + "epoch": 1.57412153311692, + "grad_norm": 7.4374613761901855, + "learning_rate": 3.434524489096929e-05, + "loss": 5.2427, + "step": 13820 + }, + { + "epoch": 1.5752605501452246, + "grad_norm": 8.768608093261719, + "learning_rate": 3.433382806256422e-05, + "loss": 5.182, + "step": 13830 + }, + { + "epoch": 1.5763995671735294, + "grad_norm": 10.891498565673828, + "learning_rate": 3.432241123415915e-05, + "loss": 5.765, + "step": 13840 + }, + { + "epoch": 1.5775385842018337, + "grad_norm": 20.340749740600586, + "learning_rate": 3.4310994405754085e-05, + "loss": 5.4202, + "step": 13850 + }, + { + "epoch": 1.5786776012301384, + "grad_norm": 5.067477226257324, + "learning_rate": 3.429957757734901e-05, + "loss": 5.5461, + "step": 13860 + }, + { + "epoch": 1.579816618258443, + "grad_norm": 9.956294059753418, + "learning_rate": 3.428816074894394e-05, + "loss": 5.2925, + "step": 13870 + }, + { + "epoch": 1.5809556352867475, + "grad_norm": 8.55966854095459, + "learning_rate": 3.427674392053888e-05, + "loss": 5.3078, + "step": 13880 + }, + { + "epoch": 1.582094652315052, + "grad_norm": 
10.623746871948242, + "learning_rate": 3.4265327092133806e-05, + "loss": 5.0856, + "step": 13890 + }, + { + "epoch": 1.5832336693433566, + "grad_norm": 7.9208526611328125, + "learning_rate": 3.425391026372874e-05, + "loss": 5.313, + "step": 13900 + }, + { + "epoch": 1.5843726863716614, + "grad_norm": 9.109285354614258, + "learning_rate": 3.424249343532367e-05, + "loss": 5.4487, + "step": 13910 + }, + { + "epoch": 1.5855117033999657, + "grad_norm": 15.563692092895508, + "learning_rate": 3.42310766069186e-05, + "loss": 5.2342, + "step": 13920 + }, + { + "epoch": 1.5866507204282705, + "grad_norm": 15.028312683105469, + "learning_rate": 3.421965977851353e-05, + "loss": 5.0988, + "step": 13930 + }, + { + "epoch": 1.5877897374565748, + "grad_norm": 10.812498092651367, + "learning_rate": 3.420824295010846e-05, + "loss": 4.8925, + "step": 13940 + }, + { + "epoch": 1.5889287544848796, + "grad_norm": 12.279333114624023, + "learning_rate": 3.419682612170339e-05, + "loss": 5.1012, + "step": 13950 + }, + { + "epoch": 1.5900677715131841, + "grad_norm": 18.0694637298584, + "learning_rate": 3.418540929329833e-05, + "loss": 5.4058, + "step": 13960 + }, + { + "epoch": 1.5912067885414887, + "grad_norm": 12.514638900756836, + "learning_rate": 3.4173992464893254e-05, + "loss": 5.4547, + "step": 13970 + }, + { + "epoch": 1.5923458055697932, + "grad_norm": 7.83701229095459, + "learning_rate": 3.4162575636488186e-05, + "loss": 5.3477, + "step": 13980 + }, + { + "epoch": 1.5934848225980978, + "grad_norm": 7.048572063446045, + "learning_rate": 3.415115880808312e-05, + "loss": 5.1777, + "step": 13990 + }, + { + "epoch": 1.5946238396264025, + "grad_norm": 8.231867790222168, + "learning_rate": 3.413974197967805e-05, + "loss": 5.3191, + "step": 14000 + }, + { + "epoch": 1.5946238396264025, + "eval_loss": 5.865710258483887, + "eval_runtime": 11.9559, + "eval_samples_per_second": 1.255, + "eval_steps_per_second": 0.167, + "step": 14000 + }, + { + "epoch": 1.5957628566547069, + "grad_norm": 
7.9335150718688965, + "learning_rate": 3.4128325151272974e-05, + "loss": 5.1679, + "step": 14010 + }, + { + "epoch": 1.5969018736830116, + "grad_norm": 11.78247356414795, + "learning_rate": 3.4116908322867906e-05, + "loss": 5.1418, + "step": 14020 + }, + { + "epoch": 1.5980408907113162, + "grad_norm": 8.302976608276367, + "learning_rate": 3.410549149446284e-05, + "loss": 5.2641, + "step": 14030 + }, + { + "epoch": 1.5991799077396207, + "grad_norm": 6.566915988922119, + "learning_rate": 3.409407466605777e-05, + "loss": 5.4475, + "step": 14040 + }, + { + "epoch": 1.6003189247679253, + "grad_norm": 9.077897071838379, + "learning_rate": 3.40826578376527e-05, + "loss": 5.3925, + "step": 14050 + }, + { + "epoch": 1.6014579417962298, + "grad_norm": 7.3145880699157715, + "learning_rate": 3.4071241009247633e-05, + "loss": 5.4688, + "step": 14060 + }, + { + "epoch": 1.6025969588245346, + "grad_norm": 9.912899017333984, + "learning_rate": 3.4059824180842565e-05, + "loss": 5.1155, + "step": 14070 + }, + { + "epoch": 1.603735975852839, + "grad_norm": 8.687728881835938, + "learning_rate": 3.40484073524375e-05, + "loss": 5.4758, + "step": 14080 + }, + { + "epoch": 1.6048749928811437, + "grad_norm": 16.98023796081543, + "learning_rate": 3.403699052403242e-05, + "loss": 5.2506, + "step": 14090 + }, + { + "epoch": 1.606014009909448, + "grad_norm": 5.677768230438232, + "learning_rate": 3.4025573695627354e-05, + "loss": 5.4057, + "step": 14100 + }, + { + "epoch": 1.6071530269377527, + "grad_norm": 7.748041152954102, + "learning_rate": 3.4014156867222286e-05, + "loss": 5.3705, + "step": 14110 + }, + { + "epoch": 1.6082920439660573, + "grad_norm": 6.371578216552734, + "learning_rate": 3.400274003881722e-05, + "loss": 5.1812, + "step": 14120 + }, + { + "epoch": 1.6094310609943618, + "grad_norm": 7.417301654815674, + "learning_rate": 3.399132321041215e-05, + "loss": 5.9261, + "step": 14130 + }, + { + "epoch": 1.6105700780226664, + "grad_norm": 10.484310150146484, + "learning_rate": 
3.397990638200708e-05, + "loss": 5.3389, + "step": 14140 + }, + { + "epoch": 1.611709095050971, + "grad_norm": 22.2833251953125, + "learning_rate": 3.396848955360201e-05, + "loss": 5.2009, + "step": 14150 + }, + { + "epoch": 1.6128481120792757, + "grad_norm": 9.973012924194336, + "learning_rate": 3.3957072725196945e-05, + "loss": 5.4385, + "step": 14160 + }, + { + "epoch": 1.61398712910758, + "grad_norm": 7.872779369354248, + "learning_rate": 3.394565589679187e-05, + "loss": 5.4088, + "step": 14170 + }, + { + "epoch": 1.6151261461358848, + "grad_norm": 9.12864875793457, + "learning_rate": 3.39342390683868e-05, + "loss": 5.3384, + "step": 14180 + }, + { + "epoch": 1.6162651631641893, + "grad_norm": 6.356509685516357, + "learning_rate": 3.3922822239981734e-05, + "loss": 5.1973, + "step": 14190 + }, + { + "epoch": 1.6174041801924939, + "grad_norm": 9.473197937011719, + "learning_rate": 3.3912547094417177e-05, + "loss": 5.555, + "step": 14200 + }, + { + "epoch": 1.6185431972207984, + "grad_norm": 8.52722454071045, + "learning_rate": 3.39011302660121e-05, + "loss": 5.0841, + "step": 14210 + }, + { + "epoch": 1.619682214249103, + "grad_norm": 7.906297206878662, + "learning_rate": 3.3889713437607033e-05, + "loss": 5.3532, + "step": 14220 + }, + { + "epoch": 1.6208212312774077, + "grad_norm": 13.823141098022461, + "learning_rate": 3.3878296609201965e-05, + "loss": 5.3732, + "step": 14230 + }, + { + "epoch": 1.621960248305712, + "grad_norm": 8.63819408416748, + "learning_rate": 3.38668797807969e-05, + "loss": 5.7671, + "step": 14240 + }, + { + "epoch": 1.6230992653340168, + "grad_norm": 11.065068244934082, + "learning_rate": 3.385546295239182e-05, + "loss": 5.1148, + "step": 14250 + }, + { + "epoch": 1.6242382823623212, + "grad_norm": 6.661491870880127, + "learning_rate": 3.3844046123986754e-05, + "loss": 5.1601, + "step": 14260 + }, + { + "epoch": 1.625377299390626, + "grad_norm": 7.48806619644165, + "learning_rate": 3.383262929558169e-05, + "loss": 5.4801, + "step": 14270 
+ }, + { + "epoch": 1.6265163164189305, + "grad_norm": 6.75936222076416, + "learning_rate": 3.3821212467176624e-05, + "loss": 4.6146, + "step": 14280 + }, + { + "epoch": 1.627655333447235, + "grad_norm": 6.72158670425415, + "learning_rate": 3.380979563877155e-05, + "loss": 5.2395, + "step": 14290 + }, + { + "epoch": 1.6287943504755396, + "grad_norm": 9.066991806030273, + "learning_rate": 3.379837881036648e-05, + "loss": 5.4662, + "step": 14300 + }, + { + "epoch": 1.629933367503844, + "grad_norm": 9.151389122009277, + "learning_rate": 3.378696198196141e-05, + "loss": 5.5807, + "step": 14310 + }, + { + "epoch": 1.6310723845321489, + "grad_norm": 5.07937479019165, + "learning_rate": 3.3775545153556345e-05, + "loss": 5.0922, + "step": 14320 + }, + { + "epoch": 1.6322114015604532, + "grad_norm": 4.731130599975586, + "learning_rate": 3.376412832515127e-05, + "loss": 5.3785, + "step": 14330 + }, + { + "epoch": 1.633350418588758, + "grad_norm": 7.364837646484375, + "learning_rate": 3.37527114967462e-05, + "loss": 5.0001, + "step": 14340 + }, + { + "epoch": 1.6344894356170625, + "grad_norm": 9.038762092590332, + "learning_rate": 3.374129466834114e-05, + "loss": 5.3599, + "step": 14350 + }, + { + "epoch": 1.635628452645367, + "grad_norm": 8.42864990234375, + "learning_rate": 3.372987783993607e-05, + "loss": 5.2821, + "step": 14360 + }, + { + "epoch": 1.6367674696736716, + "grad_norm": 5.388772487640381, + "learning_rate": 3.3718461011531e-05, + "loss": 5.3046, + "step": 14370 + }, + { + "epoch": 1.6379064867019761, + "grad_norm": 6.146626949310303, + "learning_rate": 3.370704418312593e-05, + "loss": 5.1354, + "step": 14380 + }, + { + "epoch": 1.639045503730281, + "grad_norm": 28.261077880859375, + "learning_rate": 3.369562735472086e-05, + "loss": 5.1599, + "step": 14390 + }, + { + "epoch": 1.6401845207585852, + "grad_norm": 16.488996505737305, + "learning_rate": 3.368421052631579e-05, + "loss": 5.5803, + "step": 14400 + }, + { + "epoch": 1.64132353778689, + "grad_norm": 
5.829269886016846, + "learning_rate": 3.367279369791072e-05, + "loss": 4.9764, + "step": 14410 + }, + { + "epoch": 1.6424625548151945, + "grad_norm": 5.7588419914245605, + "learning_rate": 3.366137686950565e-05, + "loss": 5.5567, + "step": 14420 + }, + { + "epoch": 1.643601571843499, + "grad_norm": 7.560709476470947, + "learning_rate": 3.364996004110059e-05, + "loss": 5.4832, + "step": 14430 + }, + { + "epoch": 1.6447405888718036, + "grad_norm": 13.212182998657227, + "learning_rate": 3.3638543212695514e-05, + "loss": 5.0167, + "step": 14440 + }, + { + "epoch": 1.6458796059001082, + "grad_norm": 6.811690807342529, + "learning_rate": 3.3627126384290445e-05, + "loss": 5.7124, + "step": 14450 + }, + { + "epoch": 1.647018622928413, + "grad_norm": 8.75715446472168, + "learning_rate": 3.361570955588538e-05, + "loss": 5.3427, + "step": 14460 + }, + { + "epoch": 1.6481576399567173, + "grad_norm": 9.054322242736816, + "learning_rate": 3.360429272748031e-05, + "loss": 5.1287, + "step": 14470 + }, + { + "epoch": 1.649296656985022, + "grad_norm": 8.605569839477539, + "learning_rate": 3.359287589907524e-05, + "loss": 5.3172, + "step": 14480 + }, + { + "epoch": 1.6504356740133264, + "grad_norm": 4.9488348960876465, + "learning_rate": 3.3581459070670166e-05, + "loss": 5.2897, + "step": 14490 + }, + { + "epoch": 1.6515746910416311, + "grad_norm": 7.640530109405518, + "learning_rate": 3.35700422422651e-05, + "loss": 5.1434, + "step": 14500 + }, + { + "epoch": 1.6527137080699357, + "grad_norm": 18.76363182067871, + "learning_rate": 3.355862541386003e-05, + "loss": 5.0798, + "step": 14510 + }, + { + "epoch": 1.6538527250982402, + "grad_norm": 10.534653663635254, + "learning_rate": 3.354720858545496e-05, + "loss": 5.4461, + "step": 14520 + }, + { + "epoch": 1.6549917421265448, + "grad_norm": 11.95950698852539, + "learning_rate": 3.353579175704989e-05, + "loss": 5.403, + "step": 14530 + }, + { + "epoch": 1.6561307591548493, + "grad_norm": 19.345565795898438, + "learning_rate": 
3.3524374928644825e-05, + "loss": 5.4681, + "step": 14540 + }, + { + "epoch": 1.657269776183154, + "grad_norm": 21.54135513305664, + "learning_rate": 3.351295810023976e-05, + "loss": 5.1326, + "step": 14550 + }, + { + "epoch": 1.6584087932114584, + "grad_norm": 10.59844970703125, + "learning_rate": 3.350154127183469e-05, + "loss": 5.2563, + "step": 14560 + }, + { + "epoch": 1.6595478102397632, + "grad_norm": 5.983888149261475, + "learning_rate": 3.3490124443429614e-05, + "loss": 5.4246, + "step": 14570 + }, + { + "epoch": 1.6606868272680677, + "grad_norm": 10.829444885253906, + "learning_rate": 3.3478707615024546e-05, + "loss": 5.3801, + "step": 14580 + }, + { + "epoch": 1.6618258442963723, + "grad_norm": 7.411653995513916, + "learning_rate": 3.346729078661948e-05, + "loss": 5.4264, + "step": 14590 + }, + { + "epoch": 1.6629648613246768, + "grad_norm": 4.255529880523682, + "learning_rate": 3.345587395821441e-05, + "loss": 5.114, + "step": 14600 + }, + { + "epoch": 1.6641038783529813, + "grad_norm": 7.266125202178955, + "learning_rate": 3.344445712980934e-05, + "loss": 5.3673, + "step": 14610 + }, + { + "epoch": 1.6652428953812861, + "grad_norm": 11.747828483581543, + "learning_rate": 3.343304030140427e-05, + "loss": 5.0465, + "step": 14620 + }, + { + "epoch": 1.6663819124095904, + "grad_norm": 7.091350555419922, + "learning_rate": 3.3421623472999205e-05, + "loss": 5.2813, + "step": 14630 + }, + { + "epoch": 1.6675209294378952, + "grad_norm": 7.287014484405518, + "learning_rate": 3.341020664459413e-05, + "loss": 5.3202, + "step": 14640 + }, + { + "epoch": 1.6686599464661995, + "grad_norm": 8.400530815124512, + "learning_rate": 3.339878981618906e-05, + "loss": 4.9966, + "step": 14650 + }, + { + "epoch": 1.6697989634945043, + "grad_norm": 8.203120231628418, + "learning_rate": 3.3387372987783994e-05, + "loss": 5.0657, + "step": 14660 + }, + { + "epoch": 1.6709379805228088, + "grad_norm": 8.2782564163208, + "learning_rate": 3.3375956159378925e-05, + "loss": 5.4649, + 
"step": 14670 + }, + { + "epoch": 1.6720769975511134, + "grad_norm": 8.495993614196777, + "learning_rate": 3.336453933097386e-05, + "loss": 4.9707, + "step": 14680 + }, + { + "epoch": 1.673216014579418, + "grad_norm": 13.784844398498535, + "learning_rate": 3.335312250256879e-05, + "loss": 5.396, + "step": 14690 + }, + { + "epoch": 1.6743550316077225, + "grad_norm": 6.243724822998047, + "learning_rate": 3.334170567416372e-05, + "loss": 5.1599, + "step": 14700 + }, + { + "epoch": 1.6754940486360272, + "grad_norm": 6.918847560882568, + "learning_rate": 3.333028884575865e-05, + "loss": 5.0621, + "step": 14710 + }, + { + "epoch": 1.6766330656643316, + "grad_norm": 12.930042266845703, + "learning_rate": 3.331887201735358e-05, + "loss": 5.3289, + "step": 14720 + }, + { + "epoch": 1.6777720826926363, + "grad_norm": 8.358970642089844, + "learning_rate": 3.330745518894851e-05, + "loss": 5.1417, + "step": 14730 + }, + { + "epoch": 1.6789110997209409, + "grad_norm": 5.784093379974365, + "learning_rate": 3.329603836054344e-05, + "loss": 5.0608, + "step": 14740 + }, + { + "epoch": 1.6800501167492454, + "grad_norm": 9.293664932250977, + "learning_rate": 3.328462153213837e-05, + "loss": 5.0801, + "step": 14750 + }, + { + "epoch": 1.68118913377755, + "grad_norm": 7.548481464385986, + "learning_rate": 3.3273204703733305e-05, + "loss": 5.3661, + "step": 14760 + }, + { + "epoch": 1.6823281508058545, + "grad_norm": 6.076251983642578, + "learning_rate": 3.326178787532824e-05, + "loss": 5.3517, + "step": 14770 + }, + { + "epoch": 1.6834671678341593, + "grad_norm": 11.553271293640137, + "learning_rate": 3.325037104692317e-05, + "loss": 5.1621, + "step": 14780 + }, + { + "epoch": 1.6846061848624636, + "grad_norm": 5.663163185119629, + "learning_rate": 3.32389542185181e-05, + "loss": 5.2807, + "step": 14790 + }, + { + "epoch": 1.6857452018907684, + "grad_norm": 8.15644359588623, + "learning_rate": 3.3227537390113026e-05, + "loss": 5.6571, + "step": 14800 + }, + { + "epoch": 
1.6868842189190727, + "grad_norm": 13.200905799865723, + "learning_rate": 3.321612056170796e-05, + "loss": 5.1325, + "step": 14810 + }, + { + "epoch": 1.6880232359473775, + "grad_norm": 10.035882949829102, + "learning_rate": 3.320470373330289e-05, + "loss": 5.2823, + "step": 14820 + }, + { + "epoch": 1.689162252975682, + "grad_norm": 6.265589237213135, + "learning_rate": 3.319328690489782e-05, + "loss": 5.2748, + "step": 14830 + }, + { + "epoch": 1.6903012700039866, + "grad_norm": 8.532353401184082, + "learning_rate": 3.3181870076492746e-05, + "loss": 5.4935, + "step": 14840 + }, + { + "epoch": 1.691440287032291, + "grad_norm": 7.839117050170898, + "learning_rate": 3.3170453248087685e-05, + "loss": 5.4935, + "step": 14850 + }, + { + "epoch": 1.6925793040605956, + "grad_norm": 6.248348712921143, + "learning_rate": 3.315903641968262e-05, + "loss": 5.4433, + "step": 14860 + }, + { + "epoch": 1.6937183210889004, + "grad_norm": 8.618048667907715, + "learning_rate": 3.314761959127755e-05, + "loss": 5.2831, + "step": 14870 + }, + { + "epoch": 1.6948573381172047, + "grad_norm": 7.2228312492370605, + "learning_rate": 3.3136202762872474e-05, + "loss": 5.3584, + "step": 14880 + }, + { + "epoch": 1.6959963551455095, + "grad_norm": 10.249879837036133, + "learning_rate": 3.3124785934467406e-05, + "loss": 5.4896, + "step": 14890 + }, + { + "epoch": 1.697135372173814, + "grad_norm": 13.266846656799316, + "learning_rate": 3.311336910606234e-05, + "loss": 5.8706, + "step": 14900 + }, + { + "epoch": 1.6982743892021186, + "grad_norm": 16.368671417236328, + "learning_rate": 3.310195227765727e-05, + "loss": 5.1789, + "step": 14910 + }, + { + "epoch": 1.6994134062304231, + "grad_norm": 5.892331123352051, + "learning_rate": 3.3090535449252194e-05, + "loss": 5.8149, + "step": 14920 + }, + { + "epoch": 1.7005524232587277, + "grad_norm": 7.839752674102783, + "learning_rate": 3.307911862084713e-05, + "loss": 5.1733, + "step": 14930 + }, + { + "epoch": 1.7016914402870325, + "grad_norm": 
6.328537940979004, + "learning_rate": 3.3067701792442065e-05, + "loss": 5.3515, + "step": 14940 + }, + { + "epoch": 1.7028304573153368, + "grad_norm": 11.081585884094238, + "learning_rate": 3.305628496403699e-05, + "loss": 5.7846, + "step": 14950 + }, + { + "epoch": 1.7039694743436415, + "grad_norm": 9.543967247009277, + "learning_rate": 3.304486813563192e-05, + "loss": 5.3206, + "step": 14960 + }, + { + "epoch": 1.7051084913719459, + "grad_norm": 13.206875801086426, + "learning_rate": 3.3033451307226853e-05, + "loss": 5.2824, + "step": 14970 + }, + { + "epoch": 1.7062475084002506, + "grad_norm": 9.329044342041016, + "learning_rate": 3.3022034478821785e-05, + "loss": 5.7017, + "step": 14980 + }, + { + "epoch": 1.7073865254285552, + "grad_norm": 7.400033950805664, + "learning_rate": 3.301061765041672e-05, + "loss": 5.2255, + "step": 14990 + }, + { + "epoch": 1.7085255424568597, + "grad_norm": 10.00680923461914, + "learning_rate": 3.299920082201164e-05, + "loss": 5.3857, + "step": 15000 + }, + { + "epoch": 1.7096645594851643, + "grad_norm": 6.304871559143066, + "learning_rate": 3.298778399360658e-05, + "loss": 5.2932, + "step": 15010 + }, + { + "epoch": 1.7108035765134688, + "grad_norm": 6.080683708190918, + "learning_rate": 3.297636716520151e-05, + "loss": 5.2007, + "step": 15020 + }, + { + "epoch": 1.7119425935417736, + "grad_norm": 11.78959846496582, + "learning_rate": 3.296495033679644e-05, + "loss": 5.2121, + "step": 15030 + }, + { + "epoch": 1.713081610570078, + "grad_norm": 12.254242897033691, + "learning_rate": 3.295353350839137e-05, + "loss": 5.2893, + "step": 15040 + }, + { + "epoch": 1.7142206275983827, + "grad_norm": 11.91922378540039, + "learning_rate": 3.29421166799863e-05, + "loss": 5.2825, + "step": 15050 + }, + { + "epoch": 1.7153596446266872, + "grad_norm": 6.525363922119141, + "learning_rate": 3.293069985158123e-05, + "loss": 5.2046, + "step": 15060 + }, + { + "epoch": 1.7164986616549918, + "grad_norm": 9.117419242858887, + "learning_rate": 
3.2919283023176165e-05, + "loss": 5.5099, + "step": 15070 + }, + { + "epoch": 1.7176376786832963, + "grad_norm": 7.740299224853516, + "learning_rate": 3.290786619477109e-05, + "loss": 5.4564, + "step": 15080 + }, + { + "epoch": 1.7187766957116009, + "grad_norm": 32.48822021484375, + "learning_rate": 3.289644936636603e-05, + "loss": 5.2745, + "step": 15090 + }, + { + "epoch": 1.7199157127399056, + "grad_norm": 8.313048362731934, + "learning_rate": 3.288503253796096e-05, + "loss": 5.4062, + "step": 15100 + }, + { + "epoch": 1.72105472976821, + "grad_norm": 12.474053382873535, + "learning_rate": 3.2873615709555886e-05, + "loss": 5.4008, + "step": 15110 + }, + { + "epoch": 1.7221937467965147, + "grad_norm": 7.4052958488464355, + "learning_rate": 3.286219888115082e-05, + "loss": 5.1607, + "step": 15120 + }, + { + "epoch": 1.723332763824819, + "grad_norm": 8.364946365356445, + "learning_rate": 3.285078205274575e-05, + "loss": 5.3591, + "step": 15130 + }, + { + "epoch": 1.7244717808531238, + "grad_norm": 11.5457763671875, + "learning_rate": 3.283936522434068e-05, + "loss": 5.294, + "step": 15140 + }, + { + "epoch": 1.7256107978814283, + "grad_norm": 5.80129337310791, + "learning_rate": 3.2827948395935606e-05, + "loss": 5.1861, + "step": 15150 + }, + { + "epoch": 1.726749814909733, + "grad_norm": 12.946269989013672, + "learning_rate": 3.281653156753054e-05, + "loss": 5.5425, + "step": 15160 + }, + { + "epoch": 1.7278888319380374, + "grad_norm": 11.868324279785156, + "learning_rate": 3.280511473912547e-05, + "loss": 5.1072, + "step": 15170 + }, + { + "epoch": 1.729027848966342, + "grad_norm": 20.0992374420166, + "learning_rate": 3.279369791072041e-05, + "loss": 5.1057, + "step": 15180 + }, + { + "epoch": 1.7301668659946468, + "grad_norm": 7.570152759552002, + "learning_rate": 3.2782281082315334e-05, + "loss": 5.4605, + "step": 15190 + }, + { + "epoch": 1.731305883022951, + "grad_norm": 8.00123119354248, + "learning_rate": 3.2770864253910265e-05, + "loss": 5.9011, + "step": 
15200 + }, + { + "epoch": 1.7324449000512558, + "grad_norm": 6.339069366455078, + "learning_rate": 3.27594474255052e-05, + "loss": 5.5464, + "step": 15210 + }, + { + "epoch": 1.7335839170795604, + "grad_norm": 7.332450866699219, + "learning_rate": 3.274803059710013e-05, + "loss": 5.2134, + "step": 15220 + }, + { + "epoch": 1.734722934107865, + "grad_norm": 7.234862327575684, + "learning_rate": 3.2736613768695054e-05, + "loss": 5.4308, + "step": 15230 + }, + { + "epoch": 1.7358619511361695, + "grad_norm": 8.013717651367188, + "learning_rate": 3.2725196940289986e-05, + "loss": 5.2408, + "step": 15240 + }, + { + "epoch": 1.737000968164474, + "grad_norm": 5.108926296234131, + "learning_rate": 3.271378011188492e-05, + "loss": 5.0276, + "step": 15250 + }, + { + "epoch": 1.7381399851927788, + "grad_norm": 9.906007766723633, + "learning_rate": 3.2702363283479856e-05, + "loss": 5.1256, + "step": 15260 + }, + { + "epoch": 1.7392790022210831, + "grad_norm": 5.640520095825195, + "learning_rate": 3.269094645507478e-05, + "loss": 5.5672, + "step": 15270 + }, + { + "epoch": 1.7404180192493879, + "grad_norm": 6.319045066833496, + "learning_rate": 3.267952962666971e-05, + "loss": 4.8686, + "step": 15280 + }, + { + "epoch": 1.7415570362776922, + "grad_norm": 29.221023559570312, + "learning_rate": 3.2668112798264645e-05, + "loss": 5.4805, + "step": 15290 + }, + { + "epoch": 1.742696053305997, + "grad_norm": 5.332036972045898, + "learning_rate": 3.265669596985958e-05, + "loss": 5.1916, + "step": 15300 + }, + { + "epoch": 1.7438350703343015, + "grad_norm": 11.026018142700195, + "learning_rate": 3.26452791414545e-05, + "loss": 5.64, + "step": 15310 + }, + { + "epoch": 1.744974087362606, + "grad_norm": 12.852426528930664, + "learning_rate": 3.2633862313049434e-05, + "loss": 5.72, + "step": 15320 + }, + { + "epoch": 1.7461131043909106, + "grad_norm": 15.072467803955078, + "learning_rate": 3.2622445484644366e-05, + "loss": 5.2566, + "step": 15330 + }, + { + "epoch": 1.7472521214192152, + 
"grad_norm": 12.933263778686523, + "learning_rate": 3.26110286562393e-05, + "loss": 5.4592, + "step": 15340 + }, + { + "epoch": 1.74839113844752, + "grad_norm": 8.705323219299316, + "learning_rate": 3.259961182783423e-05, + "loss": 5.1086, + "step": 15350 + }, + { + "epoch": 1.7495301554758242, + "grad_norm": 21.354663848876953, + "learning_rate": 3.258819499942916e-05, + "loss": 5.2691, + "step": 15360 + }, + { + "epoch": 1.750669172504129, + "grad_norm": 18.818647384643555, + "learning_rate": 3.257677817102409e-05, + "loss": 5.1616, + "step": 15370 + }, + { + "epoch": 1.7518081895324336, + "grad_norm": 8.70785140991211, + "learning_rate": 3.2565361342619025e-05, + "loss": 5.4155, + "step": 15380 + }, + { + "epoch": 1.752947206560738, + "grad_norm": 10.73972225189209, + "learning_rate": 3.255394451421395e-05, + "loss": 5.5494, + "step": 15390 + }, + { + "epoch": 1.7540862235890426, + "grad_norm": 14.080718994140625, + "learning_rate": 3.254252768580888e-05, + "loss": 5.2188, + "step": 15400 + }, + { + "epoch": 1.7552252406173472, + "grad_norm": 15.136094093322754, + "learning_rate": 3.2531110857403814e-05, + "loss": 5.5364, + "step": 15410 + }, + { + "epoch": 1.756364257645652, + "grad_norm": 8.25161075592041, + "learning_rate": 3.2519694028998745e-05, + "loss": 5.0991, + "step": 15420 + }, + { + "epoch": 1.7575032746739563, + "grad_norm": 23.618043899536133, + "learning_rate": 3.250827720059368e-05, + "loss": 5.0023, + "step": 15430 + }, + { + "epoch": 1.758642291702261, + "grad_norm": 12.272988319396973, + "learning_rate": 3.249686037218861e-05, + "loss": 5.2807, + "step": 15440 + }, + { + "epoch": 1.7597813087305654, + "grad_norm": 10.583504676818848, + "learning_rate": 3.248544354378354e-05, + "loss": 5.1016, + "step": 15450 + }, + { + "epoch": 1.7609203257588701, + "grad_norm": 10.107946395874023, + "learning_rate": 3.2474026715378466e-05, + "loss": 5.402, + "step": 15460 + }, + { + "epoch": 1.7620593427871747, + "grad_norm": 7.89888334274292, + 
"learning_rate": 3.24626098869734e-05, + "loss": 5.1111, + "step": 15470 + }, + { + "epoch": 1.7631983598154792, + "grad_norm": 8.583559036254883, + "learning_rate": 3.245119305856833e-05, + "loss": 4.8444, + "step": 15480 + }, + { + "epoch": 1.7643373768437838, + "grad_norm": 6.2021918296813965, + "learning_rate": 3.243977623016326e-05, + "loss": 5.0835, + "step": 15490 + }, + { + "epoch": 1.7654763938720883, + "grad_norm": 7.720624923706055, + "learning_rate": 3.2428359401758193e-05, + "loss": 5.19, + "step": 15500 + }, + { + "epoch": 1.766615410900393, + "grad_norm": 10.650630950927734, + "learning_rate": 3.2416942573353125e-05, + "loss": 5.3394, + "step": 15510 + }, + { + "epoch": 1.7677544279286974, + "grad_norm": 8.247625350952148, + "learning_rate": 3.240552574494806e-05, + "loss": 4.9535, + "step": 15520 + }, + { + "epoch": 1.7688934449570022, + "grad_norm": 35.305152893066406, + "learning_rate": 3.239410891654299e-05, + "loss": 5.6442, + "step": 15530 + }, + { + "epoch": 1.7700324619853067, + "grad_norm": 9.185782432556152, + "learning_rate": 3.2382692088137914e-05, + "loss": 5.2315, + "step": 15540 + }, + { + "epoch": 1.7711714790136113, + "grad_norm": 8.288222312927246, + "learning_rate": 3.2371275259732846e-05, + "loss": 5.0178, + "step": 15550 + }, + { + "epoch": 1.7723104960419158, + "grad_norm": 13.48383903503418, + "learning_rate": 3.235985843132778e-05, + "loss": 5.2955, + "step": 15560 + }, + { + "epoch": 1.7734495130702204, + "grad_norm": 7.542227745056152, + "learning_rate": 3.234844160292271e-05, + "loss": 5.2314, + "step": 15570 + }, + { + "epoch": 1.7745885300985251, + "grad_norm": 8.83359146118164, + "learning_rate": 3.233702477451764e-05, + "loss": 5.1114, + "step": 15580 + }, + { + "epoch": 1.7757275471268295, + "grad_norm": 8.529522895812988, + "learning_rate": 3.232560794611257e-05, + "loss": 5.0844, + "step": 15590 + }, + { + "epoch": 1.7768665641551342, + "grad_norm": 8.735173225402832, + "learning_rate": 3.2314191117707505e-05, + 
"loss": 4.9814, + "step": 15600 + }, + { + "epoch": 1.7780055811834385, + "grad_norm": 10.227621078491211, + "learning_rate": 3.230277428930244e-05, + "loss": 5.6574, + "step": 15610 + }, + { + "epoch": 1.7791445982117433, + "grad_norm": 6.311840534210205, + "learning_rate": 3.229135746089736e-05, + "loss": 5.2249, + "step": 15620 + }, + { + "epoch": 1.7802836152400479, + "grad_norm": 10.498848915100098, + "learning_rate": 3.2279940632492294e-05, + "loss": 5.1075, + "step": 15630 + }, + { + "epoch": 1.7814226322683524, + "grad_norm": 12.254350662231445, + "learning_rate": 3.2268523804087226e-05, + "loss": 5.3643, + "step": 15640 + }, + { + "epoch": 1.782561649296657, + "grad_norm": 11.579363822937012, + "learning_rate": 3.225710697568216e-05, + "loss": 5.1973, + "step": 15650 + }, + { + "epoch": 1.7837006663249615, + "grad_norm": 21.702810287475586, + "learning_rate": 3.224569014727708e-05, + "loss": 5.1932, + "step": 15660 + }, + { + "epoch": 1.7848396833532663, + "grad_norm": 6.838681697845459, + "learning_rate": 3.223427331887202e-05, + "loss": 5.1427, + "step": 15670 + }, + { + "epoch": 1.7859787003815706, + "grad_norm": 6.818762302398682, + "learning_rate": 3.222285649046695e-05, + "loss": 5.1011, + "step": 15680 + }, + { + "epoch": 1.7871177174098754, + "grad_norm": 24.59422492980957, + "learning_rate": 3.2211439662061885e-05, + "loss": 5.0368, + "step": 15690 + }, + { + "epoch": 1.78825673443818, + "grad_norm": 7.4565510749816895, + "learning_rate": 3.220002283365681e-05, + "loss": 5.2689, + "step": 15700 + }, + { + "epoch": 1.7893957514664844, + "grad_norm": 6.894626140594482, + "learning_rate": 3.218860600525174e-05, + "loss": 5.3947, + "step": 15710 + }, + { + "epoch": 1.790534768494789, + "grad_norm": 7.664463520050049, + "learning_rate": 3.2177189176846674e-05, + "loss": 5.6121, + "step": 15720 + }, + { + "epoch": 1.7916737855230935, + "grad_norm": 8.247864723205566, + "learning_rate": 3.2165772348441605e-05, + "loss": 5.5169, + "step": 15730 + }, + { + 
"epoch": 1.7928128025513983, + "grad_norm": 7.653250694274902, + "learning_rate": 3.215435552003653e-05, + "loss": 5.2645, + "step": 15740 + }, + { + "epoch": 1.7939518195797026, + "grad_norm": 6.541507720947266, + "learning_rate": 3.214293869163147e-05, + "loss": 5.7815, + "step": 15750 + }, + { + "epoch": 1.7950908366080074, + "grad_norm": 5.535106658935547, + "learning_rate": 3.21315218632264e-05, + "loss": 5.5604, + "step": 15760 + }, + { + "epoch": 1.7962298536363117, + "grad_norm": 5.451852321624756, + "learning_rate": 3.212010503482133e-05, + "loss": 5.2591, + "step": 15770 + }, + { + "epoch": 1.7973688706646165, + "grad_norm": 5.946657180786133, + "learning_rate": 3.210868820641626e-05, + "loss": 5.259, + "step": 15780 + }, + { + "epoch": 1.798507887692921, + "grad_norm": 7.167417526245117, + "learning_rate": 3.209727137801119e-05, + "loss": 5.2543, + "step": 15790 + }, + { + "epoch": 1.7996469047212256, + "grad_norm": 9.464091300964355, + "learning_rate": 3.208585454960612e-05, + "loss": 5.4242, + "step": 15800 + }, + { + "epoch": 1.8007859217495301, + "grad_norm": 7.651808261871338, + "learning_rate": 3.207443772120105e-05, + "loss": 5.557, + "step": 15810 + }, + { + "epoch": 1.8019249387778347, + "grad_norm": 7.241791725158691, + "learning_rate": 3.206302089279598e-05, + "loss": 5.3039, + "step": 15820 + }, + { + "epoch": 1.8030639558061394, + "grad_norm": 35.77559280395508, + "learning_rate": 3.205160406439091e-05, + "loss": 5.1588, + "step": 15830 + }, + { + "epoch": 1.8042029728344438, + "grad_norm": 6.354983329772949, + "learning_rate": 3.204018723598585e-05, + "loss": 5.0738, + "step": 15840 + }, + { + "epoch": 1.8053419898627485, + "grad_norm": 11.231024742126465, + "learning_rate": 3.2028770407580774e-05, + "loss": 5.1951, + "step": 15850 + }, + { + "epoch": 1.806481006891053, + "grad_norm": 4.689850807189941, + "learning_rate": 3.2017353579175706e-05, + "loss": 5.5869, + "step": 15860 + }, + { + "epoch": 1.8076200239193576, + "grad_norm": 
10.245040893554688, + "learning_rate": 3.200593675077064e-05, + "loss": 5.2374, + "step": 15870 + }, + { + "epoch": 1.8087590409476622, + "grad_norm": 14.079906463623047, + "learning_rate": 3.199451992236557e-05, + "loss": 5.1183, + "step": 15880 + }, + { + "epoch": 1.8098980579759667, + "grad_norm": 9.724322319030762, + "learning_rate": 3.19831030939605e-05, + "loss": 5.1199, + "step": 15890 + }, + { + "epoch": 1.8110370750042715, + "grad_norm": 7.243402004241943, + "learning_rate": 3.1971686265555426e-05, + "loss": 5.3126, + "step": 15900 + }, + { + "epoch": 1.8121760920325758, + "grad_norm": 10.498720169067383, + "learning_rate": 3.196026943715036e-05, + "loss": 5.0482, + "step": 15910 + }, + { + "epoch": 1.8133151090608806, + "grad_norm": 8.319934844970703, + "learning_rate": 3.19488526087453e-05, + "loss": 5.0945, + "step": 15920 + }, + { + "epoch": 1.8144541260891849, + "grad_norm": 8.889106750488281, + "learning_rate": 3.193743578034022e-05, + "loss": 5.1267, + "step": 15930 + }, + { + "epoch": 1.8155931431174896, + "grad_norm": 10.439918518066406, + "learning_rate": 3.1926018951935154e-05, + "loss": 4.9774, + "step": 15940 + }, + { + "epoch": 1.8167321601457942, + "grad_norm": 9.230948448181152, + "learning_rate": 3.1914602123530085e-05, + "loss": 5.2242, + "step": 15950 + }, + { + "epoch": 1.8178711771740987, + "grad_norm": 5.9906535148620605, + "learning_rate": 3.190318529512502e-05, + "loss": 5.7379, + "step": 15960 + }, + { + "epoch": 1.8190101942024033, + "grad_norm": 12.100125312805176, + "learning_rate": 3.189176846671994e-05, + "loss": 5.1342, + "step": 15970 + }, + { + "epoch": 1.8201492112307078, + "grad_norm": 8.128509521484375, + "learning_rate": 3.1880351638314874e-05, + "loss": 5.0393, + "step": 15980 + }, + { + "epoch": 1.8212882282590126, + "grad_norm": 11.983037948608398, + "learning_rate": 3.1868934809909806e-05, + "loss": 5.4135, + "step": 15990 + }, + { + "epoch": 1.822427245287317, + "grad_norm": 12.307679176330566, + "learning_rate": 
3.1857517981504745e-05, + "loss": 5.2544, + "step": 16000 + }, + { + "epoch": 1.822427245287317, + "eval_loss": 5.832084655761719, + "eval_runtime": 11.4932, + "eval_samples_per_second": 1.305, + "eval_steps_per_second": 0.174, + "step": 16000 + }, + { + "epoch": 1.8235662623156217, + "grad_norm": 9.702421188354492, + "learning_rate": 3.184610115309967e-05, + "loss": 5.2784, + "step": 16010 + }, + { + "epoch": 1.8247052793439262, + "grad_norm": 8.732382774353027, + "learning_rate": 3.18346843246946e-05, + "loss": 5.1524, + "step": 16020 + }, + { + "epoch": 1.8258442963722308, + "grad_norm": 10.099857330322266, + "learning_rate": 3.182326749628953e-05, + "loss": 5.0381, + "step": 16030 + }, + { + "epoch": 1.8269833134005353, + "grad_norm": 7.625748157501221, + "learning_rate": 3.1811850667884465e-05, + "loss": 5.3359, + "step": 16040 + }, + { + "epoch": 1.8281223304288399, + "grad_norm": 12.038113594055176, + "learning_rate": 3.180043383947939e-05, + "loss": 5.247, + "step": 16050 + }, + { + "epoch": 1.8292613474571446, + "grad_norm": 6.477821350097656, + "learning_rate": 3.178901701107432e-05, + "loss": 5.4063, + "step": 16060 + }, + { + "epoch": 1.830400364485449, + "grad_norm": 10.657258033752441, + "learning_rate": 3.1777600182669254e-05, + "loss": 5.4068, + "step": 16070 + }, + { + "epoch": 1.8315393815137537, + "grad_norm": 10.139802932739258, + "learning_rate": 3.176618335426419e-05, + "loss": 5.3456, + "step": 16080 + }, + { + "epoch": 1.832678398542058, + "grad_norm": 8.613080978393555, + "learning_rate": 3.175476652585912e-05, + "loss": 5.5659, + "step": 16090 + }, + { + "epoch": 1.8338174155703628, + "grad_norm": 7.020716190338135, + "learning_rate": 3.174334969745405e-05, + "loss": 5.31, + "step": 16100 + }, + { + "epoch": 1.8349564325986674, + "grad_norm": 11.441906929016113, + "learning_rate": 3.173193286904898e-05, + "loss": 5.436, + "step": 16110 + }, + { + "epoch": 1.836095449626972, + "grad_norm": 7.583320140838623, + "learning_rate": 
3.172051604064391e-05, + "loss": 5.405, + "step": 16120 + }, + { + "epoch": 1.8372344666552765, + "grad_norm": 6.088997840881348, + "learning_rate": 3.170909921223884e-05, + "loss": 5.1049, + "step": 16130 + }, + { + "epoch": 1.838373483683581, + "grad_norm": 7.740018367767334, + "learning_rate": 3.169768238383377e-05, + "loss": 5.158, + "step": 16140 + }, + { + "epoch": 1.8395125007118858, + "grad_norm": 7.6566972732543945, + "learning_rate": 3.16862655554287e-05, + "loss": 5.357, + "step": 16150 + }, + { + "epoch": 1.84065151774019, + "grad_norm": 6.360335826873779, + "learning_rate": 3.1674848727023634e-05, + "loss": 5.1247, + "step": 16160 + }, + { + "epoch": 1.8417905347684949, + "grad_norm": 9.910289764404297, + "learning_rate": 3.1663431898618566e-05, + "loss": 5.3082, + "step": 16170 + }, + { + "epoch": 1.8429295517967994, + "grad_norm": 8.577327728271484, + "learning_rate": 3.16520150702135e-05, + "loss": 5.2903, + "step": 16180 + }, + { + "epoch": 1.844068568825104, + "grad_norm": 13.031941413879395, + "learning_rate": 3.164059824180843e-05, + "loss": 5.0962, + "step": 16190 + }, + { + "epoch": 1.8452075858534085, + "grad_norm": 10.391229629516602, + "learning_rate": 3.162918141340336e-05, + "loss": 5.0891, + "step": 16200 + }, + { + "epoch": 1.846346602881713, + "grad_norm": 7.560512065887451, + "learning_rate": 3.1617764584998286e-05, + "loss": 5.1244, + "step": 16210 + }, + { + "epoch": 1.8474856199100178, + "grad_norm": 11.551033020019531, + "learning_rate": 3.160634775659322e-05, + "loss": 5.262, + "step": 16220 + }, + { + "epoch": 1.8486246369383221, + "grad_norm": 7.193599224090576, + "learning_rate": 3.159493092818815e-05, + "loss": 5.0939, + "step": 16230 + }, + { + "epoch": 1.849763653966627, + "grad_norm": 18.756372451782227, + "learning_rate": 3.158351409978308e-05, + "loss": 5.2957, + "step": 16240 + }, + { + "epoch": 1.8509026709949312, + "grad_norm": 13.374124526977539, + "learning_rate": 3.1572097271378013e-05, + "loss": 5.2796, + "step": 
16250 + }, + { + "epoch": 1.852041688023236, + "grad_norm": 10.634527206420898, + "learning_rate": 3.1560680442972945e-05, + "loss": 5.3388, + "step": 16260 + }, + { + "epoch": 1.8531807050515405, + "grad_norm": 7.483227729797363, + "learning_rate": 3.154926361456788e-05, + "loss": 5.4911, + "step": 16270 + }, + { + "epoch": 1.854319722079845, + "grad_norm": 12.502581596374512, + "learning_rate": 3.153784678616281e-05, + "loss": 5.0431, + "step": 16280 + }, + { + "epoch": 1.8554587391081496, + "grad_norm": 12.948872566223145, + "learning_rate": 3.1526429957757734e-05, + "loss": 5.5631, + "step": 16290 + }, + { + "epoch": 1.8565977561364542, + "grad_norm": 7.303791046142578, + "learning_rate": 3.1515013129352666e-05, + "loss": 5.5589, + "step": 16300 + }, + { + "epoch": 1.857736773164759, + "grad_norm": 15.113411903381348, + "learning_rate": 3.15035963009476e-05, + "loss": 5.2462, + "step": 16310 + }, + { + "epoch": 1.8588757901930633, + "grad_norm": 11.068880081176758, + "learning_rate": 3.149217947254253e-05, + "loss": 5.4357, + "step": 16320 + }, + { + "epoch": 1.860014807221368, + "grad_norm": 6.116614818572998, + "learning_rate": 3.148076264413746e-05, + "loss": 5.4016, + "step": 16330 + }, + { + "epoch": 1.8611538242496726, + "grad_norm": 7.408304214477539, + "learning_rate": 3.146934581573239e-05, + "loss": 5.3268, + "step": 16340 + }, + { + "epoch": 1.8622928412779771, + "grad_norm": 7.863326072692871, + "learning_rate": 3.1457928987327325e-05, + "loss": 5.178, + "step": 16350 + }, + { + "epoch": 1.8634318583062817, + "grad_norm": 10.515961647033691, + "learning_rate": 3.144651215892225e-05, + "loss": 5.3988, + "step": 16360 + }, + { + "epoch": 1.8645708753345862, + "grad_norm": 6.62656831741333, + "learning_rate": 3.143509533051718e-05, + "loss": 5.3556, + "step": 16370 + }, + { + "epoch": 1.865709892362891, + "grad_norm": 10.1450834274292, + "learning_rate": 3.1423678502112114e-05, + "loss": 5.249, + "step": 16380 + }, + { + "epoch": 1.8668489093911953, + 
"grad_norm": 7.126070022583008, + "learning_rate": 3.1412261673707046e-05, + "loss": 5.3152, + "step": 16390 + }, + { + "epoch": 1.8679879264195, + "grad_norm": 10.022063255310059, + "learning_rate": 3.140084484530198e-05, + "loss": 5.1321, + "step": 16400 + }, + { + "epoch": 1.8691269434478044, + "grad_norm": 7.533326625823975, + "learning_rate": 3.138942801689691e-05, + "loss": 5.2327, + "step": 16410 + }, + { + "epoch": 1.8702659604761092, + "grad_norm": 8.48379898071289, + "learning_rate": 3.137801118849184e-05, + "loss": 4.9058, + "step": 16420 + }, + { + "epoch": 1.8714049775044137, + "grad_norm": 9.660683631896973, + "learning_rate": 3.136659436008677e-05, + "loss": 5.2705, + "step": 16430 + }, + { + "epoch": 1.8725439945327182, + "grad_norm": 5.680925369262695, + "learning_rate": 3.13551775316817e-05, + "loss": 5.5768, + "step": 16440 + }, + { + "epoch": 1.8736830115610228, + "grad_norm": 6.803483963012695, + "learning_rate": 3.134376070327663e-05, + "loss": 5.3157, + "step": 16450 + }, + { + "epoch": 1.8748220285893273, + "grad_norm": 11.37948989868164, + "learning_rate": 3.133234387487156e-05, + "loss": 5.2156, + "step": 16460 + }, + { + "epoch": 1.875961045617632, + "grad_norm": 9.787893295288086, + "learning_rate": 3.1320927046466494e-05, + "loss": 5.084, + "step": 16470 + }, + { + "epoch": 1.8771000626459364, + "grad_norm": 22.197452545166016, + "learning_rate": 3.130951021806142e-05, + "loss": 5.2033, + "step": 16480 + }, + { + "epoch": 1.8782390796742412, + "grad_norm": 7.01490592956543, + "learning_rate": 3.129809338965636e-05, + "loss": 5.0844, + "step": 16490 + }, + { + "epoch": 1.8793780967025457, + "grad_norm": 6.314261436462402, + "learning_rate": 3.128667656125129e-05, + "loss": 5.2189, + "step": 16500 + }, + { + "epoch": 1.8805171137308503, + "grad_norm": 6.676644802093506, + "learning_rate": 3.127525973284622e-05, + "loss": 5.1678, + "step": 16510 + }, + { + "epoch": 1.8816561307591548, + "grad_norm": 7.425068378448486, + "learning_rate": 
3.1263842904441146e-05, + "loss": 5.2719, + "step": 16520 + }, + { + "epoch": 1.8827951477874594, + "grad_norm": 10.819995880126953, + "learning_rate": 3.125242607603608e-05, + "loss": 5.1188, + "step": 16530 + }, + { + "epoch": 1.8839341648157641, + "grad_norm": 8.873900413513184, + "learning_rate": 3.124100924763101e-05, + "loss": 5.2426, + "step": 16540 + }, + { + "epoch": 1.8850731818440685, + "grad_norm": 16.555330276489258, + "learning_rate": 3.122959241922594e-05, + "loss": 5.3134, + "step": 16550 + }, + { + "epoch": 1.8862121988723732, + "grad_norm": 8.569807052612305, + "learning_rate": 3.1218175590820867e-05, + "loss": 5.363, + "step": 16560 + }, + { + "epoch": 1.8873512159006776, + "grad_norm": 9.172271728515625, + "learning_rate": 3.12067587624158e-05, + "loss": 4.8459, + "step": 16570 + }, + { + "epoch": 1.8884902329289823, + "grad_norm": 14.09700870513916, + "learning_rate": 3.119534193401074e-05, + "loss": 5.2349, + "step": 16580 + }, + { + "epoch": 1.8896292499572869, + "grad_norm": 12.434288024902344, + "learning_rate": 3.118392510560567e-05, + "loss": 5.2453, + "step": 16590 + }, + { + "epoch": 1.8907682669855914, + "grad_norm": 6.912929058074951, + "learning_rate": 3.1172508277200594e-05, + "loss": 5.0789, + "step": 16600 + }, + { + "epoch": 1.891907284013896, + "grad_norm": 7.448489665985107, + "learning_rate": 3.1161091448795526e-05, + "loss": 5.465, + "step": 16610 + }, + { + "epoch": 1.8930463010422005, + "grad_norm": 6.203219890594482, + "learning_rate": 3.114967462039046e-05, + "loss": 5.2743, + "step": 16620 + }, + { + "epoch": 1.8941853180705053, + "grad_norm": 32.255104064941406, + "learning_rate": 3.113825779198539e-05, + "loss": 5.4738, + "step": 16630 + }, + { + "epoch": 1.8953243350988096, + "grad_norm": 11.738665580749512, + "learning_rate": 3.1126840963580314e-05, + "loss": 5.0485, + "step": 16640 + }, + { + "epoch": 1.8964633521271144, + "grad_norm": 12.012779235839844, + "learning_rate": 3.1115424135175246e-05, + "loss": 5.4544, 
+ "step": 16650 + }, + { + "epoch": 1.897602369155419, + "grad_norm": 9.032222747802734, + "learning_rate": 3.1104007306770185e-05, + "loss": 5.3856, + "step": 16660 + }, + { + "epoch": 1.8987413861837235, + "grad_norm": 10.080476760864258, + "learning_rate": 3.109259047836512e-05, + "loss": 5.7127, + "step": 16670 + }, + { + "epoch": 1.899880403212028, + "grad_norm": 8.859971046447754, + "learning_rate": 3.108117364996004e-05, + "loss": 5.102, + "step": 16680 + }, + { + "epoch": 1.9010194202403325, + "grad_norm": 6.79989767074585, + "learning_rate": 3.1069756821554974e-05, + "loss": 5.2618, + "step": 16690 + }, + { + "epoch": 1.9021584372686373, + "grad_norm": 14.299168586730957, + "learning_rate": 3.1058339993149905e-05, + "loss": 5.1497, + "step": 16700 + }, + { + "epoch": 1.9032974542969416, + "grad_norm": 7.730276107788086, + "learning_rate": 3.104692316474484e-05, + "loss": 5.4198, + "step": 16710 + }, + { + "epoch": 1.9044364713252464, + "grad_norm": 6.5476226806640625, + "learning_rate": 3.103550633633976e-05, + "loss": 5.2444, + "step": 16720 + }, + { + "epoch": 1.9055754883535507, + "grad_norm": 7.926487445831299, + "learning_rate": 3.1024089507934694e-05, + "loss": 5.1846, + "step": 16730 + }, + { + "epoch": 1.9067145053818555, + "grad_norm": 9.054329872131348, + "learning_rate": 3.101267267952963e-05, + "loss": 5.5013, + "step": 16740 + }, + { + "epoch": 1.90785352241016, + "grad_norm": 8.103936195373535, + "learning_rate": 3.100125585112456e-05, + "loss": 5.6069, + "step": 16750 + }, + { + "epoch": 1.9089925394384646, + "grad_norm": 11.002752304077148, + "learning_rate": 3.098983902271949e-05, + "loss": 5.0676, + "step": 16760 + }, + { + "epoch": 1.9101315564667691, + "grad_norm": 7.584782600402832, + "learning_rate": 3.097842219431442e-05, + "loss": 5.6041, + "step": 16770 + }, + { + "epoch": 1.9112705734950737, + "grad_norm": 6.534191608428955, + "learning_rate": 3.0967005365909353e-05, + "loss": 5.2058, + "step": 16780 + }, + { + "epoch": 
1.9124095905233784, + "grad_norm": 7.534450531005859, + "learning_rate": 3.0955588537504285e-05, + "loss": 5.1247, + "step": 16790 + }, + { + "epoch": 1.9135486075516828, + "grad_norm": 11.882919311523438, + "learning_rate": 3.094417170909921e-05, + "loss": 5.3567, + "step": 16800 + }, + { + "epoch": 1.9146876245799875, + "grad_norm": 7.071094036102295, + "learning_rate": 3.093275488069414e-05, + "loss": 5.8045, + "step": 16810 + }, + { + "epoch": 1.915826641608292, + "grad_norm": 39.18484878540039, + "learning_rate": 3.092133805228908e-05, + "loss": 5.2192, + "step": 16820 + }, + { + "epoch": 1.9169656586365966, + "grad_norm": 8.408158302307129, + "learning_rate": 3.0909921223884006e-05, + "loss": 5.7599, + "step": 16830 + }, + { + "epoch": 1.9181046756649012, + "grad_norm": 30.49919891357422, + "learning_rate": 3.089850439547894e-05, + "loss": 5.4905, + "step": 16840 + }, + { + "epoch": 1.9192436926932057, + "grad_norm": 9.431031227111816, + "learning_rate": 3.088708756707387e-05, + "loss": 5.402, + "step": 16850 + }, + { + "epoch": 1.9203827097215105, + "grad_norm": 16.699037551879883, + "learning_rate": 3.08756707386688e-05, + "loss": 5.1874, + "step": 16860 + }, + { + "epoch": 1.9215217267498148, + "grad_norm": 7.762238025665283, + "learning_rate": 3.0864253910263726e-05, + "loss": 5.2737, + "step": 16870 + }, + { + "epoch": 1.9226607437781196, + "grad_norm": 12.07967758178711, + "learning_rate": 3.085283708185866e-05, + "loss": 5.4094, + "step": 16880 + }, + { + "epoch": 1.923799760806424, + "grad_norm": 9.500292778015137, + "learning_rate": 3.084142025345359e-05, + "loss": 5.3492, + "step": 16890 + }, + { + "epoch": 1.9249387778347287, + "grad_norm": 18.876487731933594, + "learning_rate": 3.083000342504852e-05, + "loss": 5.1222, + "step": 16900 + }, + { + "epoch": 1.9260777948630332, + "grad_norm": 11.098732948303223, + "learning_rate": 3.0818586596643454e-05, + "loss": 4.4881, + "step": 16910 + }, + { + "epoch": 1.9272168118913378, + "grad_norm": 
10.376971244812012, + "learning_rate": 3.0807169768238386e-05, + "loss": 4.6965, + "step": 16920 + }, + { + "epoch": 1.9283558289196423, + "grad_norm": 12.247511863708496, + "learning_rate": 3.079575293983332e-05, + "loss": 5.2123, + "step": 16930 + }, + { + "epoch": 1.9294948459479468, + "grad_norm": 8.139228820800781, + "learning_rate": 3.078433611142825e-05, + "loss": 5.3492, + "step": 16940 + }, + { + "epoch": 1.9306338629762516, + "grad_norm": 9.468058586120605, + "learning_rate": 3.0772919283023174e-05, + "loss": 5.3156, + "step": 16950 + }, + { + "epoch": 1.931772880004556, + "grad_norm": 6.062070369720459, + "learning_rate": 3.0761502454618106e-05, + "loss": 5.3921, + "step": 16960 + }, + { + "epoch": 1.9329118970328607, + "grad_norm": 6.7293314933776855, + "learning_rate": 3.075008562621304e-05, + "loss": 5.1339, + "step": 16970 + }, + { + "epoch": 1.9340509140611652, + "grad_norm": 6.088140487670898, + "learning_rate": 3.073866879780797e-05, + "loss": 4.9764, + "step": 16980 + }, + { + "epoch": 1.9351899310894698, + "grad_norm": 6.771167278289795, + "learning_rate": 3.07272519694029e-05, + "loss": 5.2947, + "step": 16990 + }, + { + "epoch": 1.9363289481177743, + "grad_norm": 17.922042846679688, + "learning_rate": 3.0715835140997834e-05, + "loss": 4.9927, + "step": 17000 + }, + { + "epoch": 1.9374679651460789, + "grad_norm": 7.334212303161621, + "learning_rate": 3.0704418312592765e-05, + "loss": 5.1482, + "step": 17010 + }, + { + "epoch": 1.9386069821743837, + "grad_norm": 6.488142490386963, + "learning_rate": 3.06930014841877e-05, + "loss": 5.1893, + "step": 17020 + }, + { + "epoch": 1.939745999202688, + "grad_norm": 9.199004173278809, + "learning_rate": 3.068158465578262e-05, + "loss": 5.6558, + "step": 17030 + }, + { + "epoch": 1.9408850162309927, + "grad_norm": 7.08030366897583, + "learning_rate": 3.0670167827377554e-05, + "loss": 5.2228, + "step": 17040 + }, + { + "epoch": 1.942024033259297, + "grad_norm": 16.6004695892334, + "learning_rate": 
3.0658750998972486e-05, + "loss": 5.1263, + "step": 17050 + }, + { + "epoch": 1.9431630502876018, + "grad_norm": 5.704008102416992, + "learning_rate": 3.064733417056742e-05, + "loss": 5.6937, + "step": 17060 + }, + { + "epoch": 1.9443020673159064, + "grad_norm": 8.700068473815918, + "learning_rate": 3.063591734216235e-05, + "loss": 5.8973, + "step": 17070 + }, + { + "epoch": 1.945441084344211, + "grad_norm": 20.755605697631836, + "learning_rate": 3.062450051375728e-05, + "loss": 5.4337, + "step": 17080 + }, + { + "epoch": 1.9465801013725155, + "grad_norm": 29.836252212524414, + "learning_rate": 3.061308368535221e-05, + "loss": 5.333, + "step": 17090 + }, + { + "epoch": 1.94771911840082, + "grad_norm": 8.315689086914062, + "learning_rate": 3.0601666856947145e-05, + "loss": 5.1461, + "step": 17100 + }, + { + "epoch": 1.9488581354291248, + "grad_norm": 22.537151336669922, + "learning_rate": 3.059025002854207e-05, + "loss": 4.9269, + "step": 17110 + }, + { + "epoch": 1.949997152457429, + "grad_norm": 8.498539924621582, + "learning_rate": 3.0578833200137e-05, + "loss": 4.9942, + "step": 17120 + }, + { + "epoch": 1.9511361694857339, + "grad_norm": 7.357529163360596, + "learning_rate": 3.0567416371731934e-05, + "loss": 5.1522, + "step": 17130 + }, + { + "epoch": 1.9522751865140384, + "grad_norm": 7.00098180770874, + "learning_rate": 3.0555999543326866e-05, + "loss": 5.3363, + "step": 17140 + }, + { + "epoch": 1.953414203542343, + "grad_norm": 7.778203010559082, + "learning_rate": 3.05445827149218e-05, + "loss": 5.0716, + "step": 17150 + }, + { + "epoch": 1.9545532205706475, + "grad_norm": 13.908317565917969, + "learning_rate": 3.053316588651673e-05, + "loss": 5.5496, + "step": 17160 + }, + { + "epoch": 1.955692237598952, + "grad_norm": 12.319000244140625, + "learning_rate": 3.052174905811166e-05, + "loss": 5.1008, + "step": 17170 + }, + { + "epoch": 1.9568312546272568, + "grad_norm": 6.238765716552734, + "learning_rate": 3.051033222970659e-05, + "loss": 5.3298, + "step": 
17180 + }, + { + "epoch": 1.9579702716555611, + "grad_norm": 34.22773742675781, + "learning_rate": 3.049891540130152e-05, + "loss": 5.2814, + "step": 17190 + }, + { + "epoch": 1.959109288683866, + "grad_norm": 8.195430755615234, + "learning_rate": 3.048749857289645e-05, + "loss": 5.232, + "step": 17200 + }, + { + "epoch": 1.9602483057121702, + "grad_norm": 7.08268928527832, + "learning_rate": 3.0476081744491382e-05, + "loss": 4.9855, + "step": 17210 + }, + { + "epoch": 1.961387322740475, + "grad_norm": 11.708693504333496, + "learning_rate": 3.046466491608631e-05, + "loss": 5.2377, + "step": 17220 + }, + { + "epoch": 1.9625263397687795, + "grad_norm": 12.489038467407227, + "learning_rate": 3.0453248087681242e-05, + "loss": 5.1972, + "step": 17230 + }, + { + "epoch": 1.963665356797084, + "grad_norm": 8.34151554107666, + "learning_rate": 3.0441831259276177e-05, + "loss": 5.1917, + "step": 17240 + }, + { + "epoch": 1.9648043738253886, + "grad_norm": 6.540013790130615, + "learning_rate": 3.0430414430871106e-05, + "loss": 5.2306, + "step": 17250 + }, + { + "epoch": 1.9659433908536932, + "grad_norm": 10.032349586486816, + "learning_rate": 3.0418997602466038e-05, + "loss": 5.107, + "step": 17260 + }, + { + "epoch": 1.967082407881998, + "grad_norm": 6.8237128257751465, + "learning_rate": 3.0407580774060966e-05, + "loss": 5.3935, + "step": 17270 + }, + { + "epoch": 1.9682214249103023, + "grad_norm": 16.39888572692871, + "learning_rate": 3.0396163945655898e-05, + "loss": 5.3169, + "step": 17280 + }, + { + "epoch": 1.969360441938607, + "grad_norm": 7.8522748947143555, + "learning_rate": 3.038474711725083e-05, + "loss": 5.4692, + "step": 17290 + }, + { + "epoch": 1.9704994589669116, + "grad_norm": 6.113000869750977, + "learning_rate": 3.0373330288845758e-05, + "loss": 5.3042, + "step": 17300 + }, + { + "epoch": 1.9716384759952161, + "grad_norm": 8.276409149169922, + "learning_rate": 3.036191346044069e-05, + "loss": 5.3894, + "step": 17310 + }, + { + "epoch": 1.9727774930235207, 
+ "grad_norm": 8.825841903686523, + "learning_rate": 3.0350496632035625e-05, + "loss": 5.4161, + "step": 17320 + }, + { + "epoch": 1.9739165100518252, + "grad_norm": 12.91003704071045, + "learning_rate": 3.0339079803630554e-05, + "loss": 4.9437, + "step": 17330 + }, + { + "epoch": 1.97505552708013, + "grad_norm": 7.1281418800354, + "learning_rate": 3.0327662975225485e-05, + "loss": 5.4292, + "step": 17340 + }, + { + "epoch": 1.9761945441084343, + "grad_norm": 8.980794906616211, + "learning_rate": 3.0316246146820414e-05, + "loss": 5.6428, + "step": 17350 + }, + { + "epoch": 1.977333561136739, + "grad_norm": 6.971176624298096, + "learning_rate": 3.0304829318415346e-05, + "loss": 5.0774, + "step": 17360 + }, + { + "epoch": 1.9784725781650434, + "grad_norm": 10.74820613861084, + "learning_rate": 3.0293412490010274e-05, + "loss": 5.1101, + "step": 17370 + }, + { + "epoch": 1.9796115951933482, + "grad_norm": 4.974326133728027, + "learning_rate": 3.0281995661605206e-05, + "loss": 5.4859, + "step": 17380 + }, + { + "epoch": 1.9807506122216527, + "grad_norm": 10.470224380493164, + "learning_rate": 3.0270578833200135e-05, + "loss": 5.2855, + "step": 17390 + }, + { + "epoch": 1.9818896292499573, + "grad_norm": 12.764081954956055, + "learning_rate": 3.0259162004795073e-05, + "loss": 5.5172, + "step": 17400 + }, + { + "epoch": 1.9830286462782618, + "grad_norm": 6.461774826049805, + "learning_rate": 3.024774517639e-05, + "loss": 5.252, + "step": 17410 + }, + { + "epoch": 1.9841676633065664, + "grad_norm": 9.746689796447754, + "learning_rate": 3.0236328347984933e-05, + "loss": 5.2766, + "step": 17420 + }, + { + "epoch": 1.9853066803348711, + "grad_norm": 9.114880561828613, + "learning_rate": 3.0224911519579862e-05, + "loss": 5.182, + "step": 17430 + }, + { + "epoch": 1.9864456973631754, + "grad_norm": 6.150300979614258, + "learning_rate": 3.0213494691174794e-05, + "loss": 5.1747, + "step": 17440 + }, + { + "epoch": 1.9875847143914802, + "grad_norm": 8.727187156677246, + 
"learning_rate": 3.0202077862769722e-05, + "loss": 5.3149, + "step": 17450 + }, + { + "epoch": 1.9887237314197848, + "grad_norm": 8.779123306274414, + "learning_rate": 3.0190661034364654e-05, + "loss": 5.4248, + "step": 17460 + }, + { + "epoch": 1.9898627484480893, + "grad_norm": 20.234495162963867, + "learning_rate": 3.0179244205959582e-05, + "loss": 5.2006, + "step": 17470 + }, + { + "epoch": 1.9910017654763938, + "grad_norm": 6.3343424797058105, + "learning_rate": 3.016782737755452e-05, + "loss": 5.7433, + "step": 17480 + }, + { + "epoch": 1.9921407825046984, + "grad_norm": 6.843255519866943, + "learning_rate": 3.015641054914945e-05, + "loss": 6.1519, + "step": 17490 + }, + { + "epoch": 1.9932797995330032, + "grad_norm": 6.034940242767334, + "learning_rate": 3.014499372074438e-05, + "loss": 5.5021, + "step": 17500 + }, + { + "epoch": 1.9944188165613075, + "grad_norm": 13.856889724731445, + "learning_rate": 3.013357689233931e-05, + "loss": 5.3194, + "step": 17510 + }, + { + "epoch": 1.9955578335896123, + "grad_norm": 8.151268005371094, + "learning_rate": 3.012216006393424e-05, + "loss": 5.645, + "step": 17520 + }, + { + "epoch": 1.9966968506179166, + "grad_norm": 7.212371826171875, + "learning_rate": 3.011074323552917e-05, + "loss": 5.2082, + "step": 17530 + }, + { + "epoch": 1.9978358676462213, + "grad_norm": 5.859493732452393, + "learning_rate": 3.0099326407124102e-05, + "loss": 5.1702, + "step": 17540 + }, + { + "epoch": 1.9989748846745259, + "grad_norm": 12.061100006103516, + "learning_rate": 3.008790957871903e-05, + "loss": 5.4642, + "step": 17550 + }, + { + "epoch": 2.0001139017028304, + "grad_norm": 10.034649848937988, + "learning_rate": 3.0076492750313962e-05, + "loss": 5.104, + "step": 17560 + }, + { + "epoch": 2.001252918731135, + "grad_norm": 9.353052139282227, + "learning_rate": 3.0065075921908897e-05, + "loss": 4.5714, + "step": 17570 + }, + { + "epoch": 2.0023919357594395, + "grad_norm": 10.620824813842773, + "learning_rate": 3.005365909350383e-05, 
+ "loss": 4.7921, + "step": 17580 + }, + { + "epoch": 2.0035309527877443, + "grad_norm": 10.396838188171387, + "learning_rate": 3.0042242265098758e-05, + "loss": 4.3342, + "step": 17590 + }, + { + "epoch": 2.0046699698160486, + "grad_norm": 27.1042423248291, + "learning_rate": 3.003082543669369e-05, + "loss": 4.4587, + "step": 17600 + }, + { + "epoch": 2.0058089868443534, + "grad_norm": 10.320976257324219, + "learning_rate": 3.0019408608288618e-05, + "loss": 4.2594, + "step": 17610 + }, + { + "epoch": 2.0069480038726577, + "grad_norm": 8.026143074035645, + "learning_rate": 3.000799177988355e-05, + "loss": 4.7785, + "step": 17620 + }, + { + "epoch": 2.0080870209009625, + "grad_norm": 10.650627136230469, + "learning_rate": 2.9996574951478478e-05, + "loss": 4.5333, + "step": 17630 + }, + { + "epoch": 2.0092260379292672, + "grad_norm": 18.135908126831055, + "learning_rate": 2.998515812307341e-05, + "loss": 4.6306, + "step": 17640 + }, + { + "epoch": 2.0103650549575716, + "grad_norm": 9.425822257995605, + "learning_rate": 2.9973741294668345e-05, + "loss": 4.4276, + "step": 17650 + }, + { + "epoch": 2.0115040719858763, + "grad_norm": 7.995151519775391, + "learning_rate": 2.9962324466263274e-05, + "loss": 4.1761, + "step": 17660 + }, + { + "epoch": 2.0126430890141807, + "grad_norm": 6.981862545013428, + "learning_rate": 2.9950907637858206e-05, + "loss": 4.3842, + "step": 17670 + }, + { + "epoch": 2.0137821060424854, + "grad_norm": 8.841754913330078, + "learning_rate": 2.9939490809453134e-05, + "loss": 4.599, + "step": 17680 + }, + { + "epoch": 2.0149211230707897, + "grad_norm": 23.87337875366211, + "learning_rate": 2.9928073981048066e-05, + "loss": 4.4764, + "step": 17690 + }, + { + "epoch": 2.0160601400990945, + "grad_norm": 10.238081932067871, + "learning_rate": 2.9916657152642998e-05, + "loss": 4.6655, + "step": 17700 + }, + { + "epoch": 2.017199157127399, + "grad_norm": 8.939787864685059, + "learning_rate": 2.9905240324237926e-05, + "loss": 4.5585, + "step": 17710 + 
}, + { + "epoch": 2.0183381741557036, + "grad_norm": 13.194611549377441, + "learning_rate": 2.9893823495832858e-05, + "loss": 4.4062, + "step": 17720 + }, + { + "epoch": 2.0194771911840084, + "grad_norm": 7.173688888549805, + "learning_rate": 2.9882406667427793e-05, + "loss": 4.4404, + "step": 17730 + }, + { + "epoch": 2.0206162082123127, + "grad_norm": 26.673742294311523, + "learning_rate": 2.9870989839022722e-05, + "loss": 4.2668, + "step": 17740 + }, + { + "epoch": 2.0217552252406175, + "grad_norm": 10.922196388244629, + "learning_rate": 2.9859573010617654e-05, + "loss": 4.1519, + "step": 17750 + }, + { + "epoch": 2.022894242268922, + "grad_norm": 34.457366943359375, + "learning_rate": 2.9848156182212582e-05, + "loss": 4.0883, + "step": 17760 + }, + { + "epoch": 2.0240332592972266, + "grad_norm": 24.143638610839844, + "learning_rate": 2.9836739353807514e-05, + "loss": 4.4006, + "step": 17770 + }, + { + "epoch": 2.025172276325531, + "grad_norm": 8.693916320800781, + "learning_rate": 2.9825322525402442e-05, + "loss": 4.5388, + "step": 17780 + }, + { + "epoch": 2.0263112933538356, + "grad_norm": 11.42645263671875, + "learning_rate": 2.9813905696997374e-05, + "loss": 4.6226, + "step": 17790 + }, + { + "epoch": 2.0274503103821404, + "grad_norm": 10.090991020202637, + "learning_rate": 2.9802488868592303e-05, + "loss": 4.3568, + "step": 17800 + }, + { + "epoch": 2.0285893274104447, + "grad_norm": 8.924347877502441, + "learning_rate": 2.979107204018724e-05, + "loss": 4.4656, + "step": 17810 + }, + { + "epoch": 2.0297283444387495, + "grad_norm": 8.989141464233398, + "learning_rate": 2.977965521178217e-05, + "loss": 4.6345, + "step": 17820 + }, + { + "epoch": 2.030867361467054, + "grad_norm": 12.552188873291016, + "learning_rate": 2.97682383833771e-05, + "loss": 4.3445, + "step": 17830 + }, + { + "epoch": 2.0320063784953586, + "grad_norm": 28.57890510559082, + "learning_rate": 2.975682155497203e-05, + "loss": 4.5935, + "step": 17840 + }, + { + "epoch": 2.033145395523663, 
+ "grad_norm": 16.600643157958984, + "learning_rate": 2.9745404726566962e-05, + "loss": 4.5515, + "step": 17850 + }, + { + "epoch": 2.0342844125519677, + "grad_norm": 10.375631332397461, + "learning_rate": 2.973398789816189e-05, + "loss": 4.4477, + "step": 17860 + }, + { + "epoch": 2.0354234295802724, + "grad_norm": 11.461358070373535, + "learning_rate": 2.9722571069756822e-05, + "loss": 4.3483, + "step": 17870 + }, + { + "epoch": 2.0365624466085768, + "grad_norm": 9.288942337036133, + "learning_rate": 2.971115424135175e-05, + "loss": 4.5621, + "step": 17880 + }, + { + "epoch": 2.0377014636368815, + "grad_norm": 9.822104454040527, + "learning_rate": 2.9699737412946682e-05, + "loss": 4.3366, + "step": 17890 + }, + { + "epoch": 2.038840480665186, + "grad_norm": 8.849560737609863, + "learning_rate": 2.9688320584541618e-05, + "loss": 4.2618, + "step": 17900 + }, + { + "epoch": 2.0399794976934906, + "grad_norm": 11.033799171447754, + "learning_rate": 2.967690375613655e-05, + "loss": 4.519, + "step": 17910 + }, + { + "epoch": 2.041118514721795, + "grad_norm": 10.335617065429688, + "learning_rate": 2.9665486927731478e-05, + "loss": 4.77, + "step": 17920 + }, + { + "epoch": 2.0422575317500997, + "grad_norm": 10.127538681030273, + "learning_rate": 2.965407009932641e-05, + "loss": 4.6134, + "step": 17930 + }, + { + "epoch": 2.043396548778404, + "grad_norm": 13.892292976379395, + "learning_rate": 2.9642653270921338e-05, + "loss": 4.7826, + "step": 17940 + }, + { + "epoch": 2.044535565806709, + "grad_norm": 22.508115768432617, + "learning_rate": 2.963123644251627e-05, + "loss": 4.9485, + "step": 17950 + }, + { + "epoch": 2.0456745828350136, + "grad_norm": 9.200506210327148, + "learning_rate": 2.96198196141112e-05, + "loss": 4.2296, + "step": 17960 + }, + { + "epoch": 2.046813599863318, + "grad_norm": 20.13701057434082, + "learning_rate": 2.960840278570613e-05, + "loss": 4.496, + "step": 17970 + }, + { + "epoch": 2.0479526168916227, + "grad_norm": 17.2512149810791, + 
"learning_rate": 2.9596985957301065e-05, + "loss": 4.316, + "step": 17980 + }, + { + "epoch": 2.049091633919927, + "grad_norm": 11.529026985168457, + "learning_rate": 2.9585569128895997e-05, + "loss": 4.4849, + "step": 17990 + }, + { + "epoch": 2.0502306509482318, + "grad_norm": 13.813348770141602, + "learning_rate": 2.9574152300490926e-05, + "loss": 4.5282, + "step": 18000 + }, + { + "epoch": 2.0502306509482318, + "eval_loss": 6.060417175292969, + "eval_runtime": 10.2589, + "eval_samples_per_second": 1.462, + "eval_steps_per_second": 0.195, + "step": 18000 + }, + { + "epoch": 2.051369667976536, + "grad_norm": 10.74505615234375, + "learning_rate": 2.9562735472085858e-05, + "loss": 4.1056, + "step": 18010 + }, + { + "epoch": 2.052508685004841, + "grad_norm": 10.346863746643066, + "learning_rate": 2.9551318643680786e-05, + "loss": 4.3538, + "step": 18020 + }, + { + "epoch": 2.053647702033145, + "grad_norm": 8.979138374328613, + "learning_rate": 2.9539901815275718e-05, + "loss": 4.642, + "step": 18030 + }, + { + "epoch": 2.05478671906145, + "grad_norm": 20.48455810546875, + "learning_rate": 2.9528484986870646e-05, + "loss": 4.1862, + "step": 18040 + }, + { + "epoch": 2.0559257360897547, + "grad_norm": 24.90452003479004, + "learning_rate": 2.9517068158465578e-05, + "loss": 4.1786, + "step": 18050 + }, + { + "epoch": 2.057064753118059, + "grad_norm": 8.899307250976562, + "learning_rate": 2.9505651330060513e-05, + "loss": 4.3966, + "step": 18060 + }, + { + "epoch": 2.058203770146364, + "grad_norm": 40.624732971191406, + "learning_rate": 2.9494234501655442e-05, + "loss": 4.4136, + "step": 18070 + }, + { + "epoch": 2.059342787174668, + "grad_norm": 12.927209854125977, + "learning_rate": 2.9482817673250374e-05, + "loss": 4.4222, + "step": 18080 + }, + { + "epoch": 2.060481804202973, + "grad_norm": 12.320836067199707, + "learning_rate": 2.9471400844845306e-05, + "loss": 4.4919, + "step": 18090 + }, + { + "epoch": 2.061620821231277, + "grad_norm": 8.779129981994629, + 
"learning_rate": 2.9459984016440234e-05, + "loss": 4.658, + "step": 18100 + }, + { + "epoch": 2.062759838259582, + "grad_norm": 10.036825180053711, + "learning_rate": 2.9448567188035166e-05, + "loss": 4.8078, + "step": 18110 + }, + { + "epoch": 2.0638988552878867, + "grad_norm": 11.447505950927734, + "learning_rate": 2.9437150359630094e-05, + "loss": 4.3399, + "step": 18120 + }, + { + "epoch": 2.065037872316191, + "grad_norm": 8.548605918884277, + "learning_rate": 2.9425733531225026e-05, + "loss": 4.4198, + "step": 18130 + }, + { + "epoch": 2.066176889344496, + "grad_norm": 10.151397705078125, + "learning_rate": 2.941431670281996e-05, + "loss": 4.1909, + "step": 18140 + }, + { + "epoch": 2.0673159063728, + "grad_norm": 7.891269683837891, + "learning_rate": 2.940289987441489e-05, + "loss": 4.5161, + "step": 18150 + }, + { + "epoch": 2.068454923401105, + "grad_norm": 21.275150299072266, + "learning_rate": 2.939148304600982e-05, + "loss": 4.4974, + "step": 18160 + }, + { + "epoch": 2.0695939404294093, + "grad_norm": 10.612809181213379, + "learning_rate": 2.938006621760475e-05, + "loss": 4.5919, + "step": 18170 + }, + { + "epoch": 2.070732957457714, + "grad_norm": 10.864121437072754, + "learning_rate": 2.9368649389199682e-05, + "loss": 4.5019, + "step": 18180 + }, + { + "epoch": 2.071871974486019, + "grad_norm": 15.571329116821289, + "learning_rate": 2.935723256079461e-05, + "loss": 4.2866, + "step": 18190 + }, + { + "epoch": 2.073010991514323, + "grad_norm": 11.20418930053711, + "learning_rate": 2.9345815732389542e-05, + "loss": 4.4531, + "step": 18200 + }, + { + "epoch": 2.074150008542628, + "grad_norm": 13.021940231323242, + "learning_rate": 2.9334398903984474e-05, + "loss": 4.5566, + "step": 18210 + }, + { + "epoch": 2.075289025570932, + "grad_norm": 35.43565368652344, + "learning_rate": 2.9322982075579402e-05, + "loss": 4.2142, + "step": 18220 + }, + { + "epoch": 2.076428042599237, + "grad_norm": 16.484302520751953, + "learning_rate": 2.9311565247174338e-05, + 
"loss": 4.2342, + "step": 18230 + }, + { + "epoch": 2.0775670596275413, + "grad_norm": 11.920555114746094, + "learning_rate": 2.930014841876927e-05, + "loss": 4.574, + "step": 18240 + }, + { + "epoch": 2.078706076655846, + "grad_norm": 10.326672554016113, + "learning_rate": 2.9288731590364198e-05, + "loss": 4.4499, + "step": 18250 + }, + { + "epoch": 2.0798450936841504, + "grad_norm": 10.69715690612793, + "learning_rate": 2.927731476195913e-05, + "loss": 4.2972, + "step": 18260 + }, + { + "epoch": 2.080984110712455, + "grad_norm": 13.71670150756836, + "learning_rate": 2.9265897933554058e-05, + "loss": 4.7965, + "step": 18270 + }, + { + "epoch": 2.08212312774076, + "grad_norm": 12.21806526184082, + "learning_rate": 2.925448110514899e-05, + "loss": 4.4754, + "step": 18280 + }, + { + "epoch": 2.0832621447690642, + "grad_norm": 8.209394454956055, + "learning_rate": 2.924306427674392e-05, + "loss": 4.3476, + "step": 18290 + }, + { + "epoch": 2.084401161797369, + "grad_norm": 14.19764518737793, + "learning_rate": 2.923164744833885e-05, + "loss": 4.2957, + "step": 18300 + }, + { + "epoch": 2.0855401788256733, + "grad_norm": 15.750473022460938, + "learning_rate": 2.9220230619933786e-05, + "loss": 4.3946, + "step": 18310 + }, + { + "epoch": 2.086679195853978, + "grad_norm": 12.75074577331543, + "learning_rate": 2.9208813791528717e-05, + "loss": 4.7376, + "step": 18320 + }, + { + "epoch": 2.0878182128822824, + "grad_norm": 10.48817253112793, + "learning_rate": 2.9197396963123646e-05, + "loss": 4.0535, + "step": 18330 + }, + { + "epoch": 2.088957229910587, + "grad_norm": 9.252484321594238, + "learning_rate": 2.9185980134718578e-05, + "loss": 4.4967, + "step": 18340 + }, + { + "epoch": 2.0900962469388915, + "grad_norm": 12.832139015197754, + "learning_rate": 2.9174563306313506e-05, + "loss": 4.5884, + "step": 18350 + }, + { + "epoch": 2.0912352639671963, + "grad_norm": 9.247235298156738, + "learning_rate": 2.9163146477908438e-05, + "loss": 4.2827, + "step": 18360 + }, + { + 
"epoch": 2.092374280995501, + "grad_norm": 13.298909187316895, + "learning_rate": 2.9151729649503366e-05, + "loss": 4.4996, + "step": 18370 + }, + { + "epoch": 2.0935132980238054, + "grad_norm": 10.92052173614502, + "learning_rate": 2.91403128210983e-05, + "loss": 4.5275, + "step": 18380 + }, + { + "epoch": 2.09465231505211, + "grad_norm": 9.866982460021973, + "learning_rate": 2.9128895992693234e-05, + "loss": 4.321, + "step": 18390 + }, + { + "epoch": 2.0957913320804145, + "grad_norm": 11.814825057983398, + "learning_rate": 2.9117479164288165e-05, + "loss": 4.1376, + "step": 18400 + }, + { + "epoch": 2.0969303491087192, + "grad_norm": 13.49726676940918, + "learning_rate": 2.9106062335883094e-05, + "loss": 4.6261, + "step": 18410 + }, + { + "epoch": 2.0980693661370236, + "grad_norm": 13.917762756347656, + "learning_rate": 2.9094645507478026e-05, + "loss": 4.2507, + "step": 18420 + }, + { + "epoch": 2.0992083831653283, + "grad_norm": 10.437445640563965, + "learning_rate": 2.9083228679072954e-05, + "loss": 4.3688, + "step": 18430 + }, + { + "epoch": 2.100347400193633, + "grad_norm": 10.63725471496582, + "learning_rate": 2.9071811850667886e-05, + "loss": 4.1495, + "step": 18440 + }, + { + "epoch": 2.1014864172219374, + "grad_norm": 12.689176559448242, + "learning_rate": 2.9060395022262814e-05, + "loss": 4.4013, + "step": 18450 + }, + { + "epoch": 2.102625434250242, + "grad_norm": 10.703514099121094, + "learning_rate": 2.9048978193857746e-05, + "loss": 4.0065, + "step": 18460 + }, + { + "epoch": 2.1037644512785465, + "grad_norm": 12.065997123718262, + "learning_rate": 2.903756136545268e-05, + "loss": 4.6333, + "step": 18470 + }, + { + "epoch": 2.1049034683068513, + "grad_norm": 9.926651000976562, + "learning_rate": 2.902614453704761e-05, + "loss": 4.4492, + "step": 18480 + }, + { + "epoch": 2.1060424853351556, + "grad_norm": 24.569591522216797, + "learning_rate": 2.9014727708642542e-05, + "loss": 4.4071, + "step": 18490 + }, + { + "epoch": 2.1071815023634604, + 
"grad_norm": 15.291176795959473, + "learning_rate": 2.9003310880237474e-05, + "loss": 4.3494, + "step": 18500 + }, + { + "epoch": 2.108320519391765, + "grad_norm": 10.331123352050781, + "learning_rate": 2.8991894051832402e-05, + "loss": 4.2621, + "step": 18510 + }, + { + "epoch": 2.1094595364200694, + "grad_norm": 13.224411010742188, + "learning_rate": 2.8980477223427334e-05, + "loss": 4.5029, + "step": 18520 + }, + { + "epoch": 2.110598553448374, + "grad_norm": 16.534788131713867, + "learning_rate": 2.8969060395022262e-05, + "loss": 4.4198, + "step": 18530 + }, + { + "epoch": 2.1117375704766785, + "grad_norm": 37.1566276550293, + "learning_rate": 2.8957643566617194e-05, + "loss": 4.6362, + "step": 18540 + }, + { + "epoch": 2.1128765875049833, + "grad_norm": 10.607845306396484, + "learning_rate": 2.894622673821213e-05, + "loss": 4.0439, + "step": 18550 + }, + { + "epoch": 2.1140156045332876, + "grad_norm": 12.826197624206543, + "learning_rate": 2.8934809909807058e-05, + "loss": 4.4564, + "step": 18560 + }, + { + "epoch": 2.1151546215615924, + "grad_norm": 13.908170700073242, + "learning_rate": 2.892339308140199e-05, + "loss": 4.3056, + "step": 18570 + }, + { + "epoch": 2.1162936385898967, + "grad_norm": 8.121379852294922, + "learning_rate": 2.8911976252996918e-05, + "loss": 4.328, + "step": 18580 + }, + { + "epoch": 2.1174326556182015, + "grad_norm": 10.788400650024414, + "learning_rate": 2.890055942459185e-05, + "loss": 4.4259, + "step": 18590 + }, + { + "epoch": 2.1185716726465063, + "grad_norm": 15.44107437133789, + "learning_rate": 2.888914259618678e-05, + "loss": 4.2949, + "step": 18600 + }, + { + "epoch": 2.1197106896748106, + "grad_norm": 29.66758155822754, + "learning_rate": 2.887772576778171e-05, + "loss": 4.4404, + "step": 18610 + }, + { + "epoch": 2.1208497067031153, + "grad_norm": 34.616451263427734, + "learning_rate": 2.8866308939376642e-05, + "loss": 4.4044, + "step": 18620 + }, + { + "epoch": 2.1219887237314197, + "grad_norm": 11.029510498046875, + 
"learning_rate": 2.885489211097157e-05, + "loss": 4.4095, + "step": 18630 + }, + { + "epoch": 2.1231277407597244, + "grad_norm": 10.427742958068848, + "learning_rate": 2.8843475282566506e-05, + "loss": 4.5589, + "step": 18640 + }, + { + "epoch": 2.1242667577880288, + "grad_norm": 23.671968460083008, + "learning_rate": 2.8832058454161438e-05, + "loss": 4.3538, + "step": 18650 + }, + { + "epoch": 2.1254057748163335, + "grad_norm": 12.53283405303955, + "learning_rate": 2.8820641625756366e-05, + "loss": 4.2531, + "step": 18660 + }, + { + "epoch": 2.126544791844638, + "grad_norm": 14.044245719909668, + "learning_rate": 2.8809224797351298e-05, + "loss": 4.6159, + "step": 18670 + }, + { + "epoch": 2.1276838088729426, + "grad_norm": 13.84389877319336, + "learning_rate": 2.8797807968946226e-05, + "loss": 4.1674, + "step": 18680 + }, + { + "epoch": 2.1288228259012474, + "grad_norm": 34.24262237548828, + "learning_rate": 2.8786391140541158e-05, + "loss": 4.4919, + "step": 18690 + }, + { + "epoch": 2.1299618429295517, + "grad_norm": 11.81908130645752, + "learning_rate": 2.8774974312136087e-05, + "loss": 4.454, + "step": 18700 + }, + { + "epoch": 2.1311008599578565, + "grad_norm": 25.702617645263672, + "learning_rate": 2.876355748373102e-05, + "loss": 4.3196, + "step": 18710 + }, + { + "epoch": 2.132239876986161, + "grad_norm": 31.084396362304688, + "learning_rate": 2.8752140655325954e-05, + "loss": 4.1424, + "step": 18720 + }, + { + "epoch": 2.1333788940144656, + "grad_norm": 10.866751670837402, + "learning_rate": 2.8740723826920886e-05, + "loss": 4.7732, + "step": 18730 + }, + { + "epoch": 2.13451791104277, + "grad_norm": 14.681591033935547, + "learning_rate": 2.8729306998515814e-05, + "loss": 4.1544, + "step": 18740 + }, + { + "epoch": 2.1356569280710747, + "grad_norm": 14.033744812011719, + "learning_rate": 2.8717890170110746e-05, + "loss": 4.477, + "step": 18750 + }, + { + "epoch": 2.1367959450993794, + "grad_norm": 21.682268142700195, + "learning_rate": 
2.8706473341705674e-05, + "loss": 4.3857, + "step": 18760 + }, + { + "epoch": 2.1379349621276837, + "grad_norm": 17.77375602722168, + "learning_rate": 2.8695056513300606e-05, + "loss": 4.6613, + "step": 18770 + }, + { + "epoch": 2.1390739791559885, + "grad_norm": 10.164037704467773, + "learning_rate": 2.8683639684895535e-05, + "loss": 4.8563, + "step": 18780 + }, + { + "epoch": 2.140212996184293, + "grad_norm": 14.451604843139648, + "learning_rate": 2.8672222856490466e-05, + "loss": 4.3895, + "step": 18790 + }, + { + "epoch": 2.1413520132125976, + "grad_norm": 8.25815200805664, + "learning_rate": 2.86608060280854e-05, + "loss": 4.5051, + "step": 18800 + }, + { + "epoch": 2.142491030240902, + "grad_norm": 19.291881561279297, + "learning_rate": 2.8649389199680333e-05, + "loss": 4.3857, + "step": 18810 + }, + { + "epoch": 2.1436300472692067, + "grad_norm": 11.34103775024414, + "learning_rate": 2.8637972371275262e-05, + "loss": 4.2127, + "step": 18820 + }, + { + "epoch": 2.1447690642975115, + "grad_norm": 11.805054664611816, + "learning_rate": 2.8626555542870194e-05, + "loss": 4.2074, + "step": 18830 + }, + { + "epoch": 2.145908081325816, + "grad_norm": 10.304930686950684, + "learning_rate": 2.8615138714465122e-05, + "loss": 4.2003, + "step": 18840 + }, + { + "epoch": 2.1470470983541206, + "grad_norm": 11.176360130310059, + "learning_rate": 2.8603721886060054e-05, + "loss": 4.3713, + "step": 18850 + }, + { + "epoch": 2.148186115382425, + "grad_norm": 10.51423454284668, + "learning_rate": 2.8592305057654982e-05, + "loss": 4.5065, + "step": 18860 + }, + { + "epoch": 2.1493251324107296, + "grad_norm": 20.841686248779297, + "learning_rate": 2.8580888229249914e-05, + "loss": 4.0744, + "step": 18870 + }, + { + "epoch": 2.150464149439034, + "grad_norm": 13.774003028869629, + "learning_rate": 2.856947140084485e-05, + "loss": 4.5333, + "step": 18880 + }, + { + "epoch": 2.1516031664673387, + "grad_norm": 13.642642974853516, + "learning_rate": 2.8558054572439778e-05, + "loss": 
4.2307, + "step": 18890 + }, + { + "epoch": 2.152742183495643, + "grad_norm": 17.952116012573242, + "learning_rate": 2.854663774403471e-05, + "loss": 4.5442, + "step": 18900 + }, + { + "epoch": 2.153881200523948, + "grad_norm": 10.366288185119629, + "learning_rate": 2.853522091562964e-05, + "loss": 4.4885, + "step": 18910 + }, + { + "epoch": 2.1550202175522526, + "grad_norm": 13.921804428100586, + "learning_rate": 2.852380408722457e-05, + "loss": 4.2344, + "step": 18920 + }, + { + "epoch": 2.156159234580557, + "grad_norm": 9.826135635375977, + "learning_rate": 2.8512387258819502e-05, + "loss": 4.5081, + "step": 18930 + }, + { + "epoch": 2.1572982516088617, + "grad_norm": 10.746516227722168, + "learning_rate": 2.850097043041443e-05, + "loss": 4.361, + "step": 18940 + }, + { + "epoch": 2.158437268637166, + "grad_norm": 16.652273178100586, + "learning_rate": 2.8489553602009362e-05, + "loss": 4.3208, + "step": 18950 + }, + { + "epoch": 2.1595762856654708, + "grad_norm": 11.606681823730469, + "learning_rate": 2.847813677360429e-05, + "loss": 4.4263, + "step": 18960 + }, + { + "epoch": 2.160715302693775, + "grad_norm": 11.555379867553711, + "learning_rate": 2.8466719945199226e-05, + "loss": 4.1051, + "step": 18970 + }, + { + "epoch": 2.16185431972208, + "grad_norm": 14.31595516204834, + "learning_rate": 2.8455303116794158e-05, + "loss": 4.5425, + "step": 18980 + }, + { + "epoch": 2.162993336750384, + "grad_norm": 16.75832176208496, + "learning_rate": 2.8443886288389086e-05, + "loss": 4.2823, + "step": 18990 + }, + { + "epoch": 2.164132353778689, + "grad_norm": 25.401622772216797, + "learning_rate": 2.8432469459984018e-05, + "loss": 4.4266, + "step": 19000 + }, + { + "epoch": 2.1652713708069937, + "grad_norm": 10.595063209533691, + "learning_rate": 2.842105263157895e-05, + "loss": 4.5758, + "step": 19010 + }, + { + "epoch": 2.166410387835298, + "grad_norm": 12.421451568603516, + "learning_rate": 2.840963580317388e-05, + "loss": 4.3854, + "step": 19020 + }, + { + "epoch": 
2.167549404863603, + "grad_norm": 11.803699493408203, + "learning_rate": 2.839821897476881e-05, + "loss": 4.1064, + "step": 19030 + }, + { + "epoch": 2.168688421891907, + "grad_norm": 14.315633773803711, + "learning_rate": 2.838680214636374e-05, + "loss": 4.447, + "step": 19040 + }, + { + "epoch": 2.169827438920212, + "grad_norm": 9.619046211242676, + "learning_rate": 2.8375385317958674e-05, + "loss": 4.3029, + "step": 19050 + }, + { + "epoch": 2.1709664559485162, + "grad_norm": 9.873806953430176, + "learning_rate": 2.8363968489553606e-05, + "loss": 4.4561, + "step": 19060 + }, + { + "epoch": 2.172105472976821, + "grad_norm": 11.56844425201416, + "learning_rate": 2.8352551661148534e-05, + "loss": 4.5356, + "step": 19070 + }, + { + "epoch": 2.1732444900051258, + "grad_norm": 19.031890869140625, + "learning_rate": 2.8341134832743466e-05, + "loss": 4.3294, + "step": 19080 + }, + { + "epoch": 2.17438350703343, + "grad_norm": 11.496087074279785, + "learning_rate": 2.8329718004338394e-05, + "loss": 4.6025, + "step": 19090 + }, + { + "epoch": 2.175522524061735, + "grad_norm": 23.368568420410156, + "learning_rate": 2.8318301175933326e-05, + "loss": 4.2988, + "step": 19100 + }, + { + "epoch": 2.176661541090039, + "grad_norm": 8.582066535949707, + "learning_rate": 2.8306884347528255e-05, + "loss": 4.523, + "step": 19110 + }, + { + "epoch": 2.177800558118344, + "grad_norm": 10.590781211853027, + "learning_rate": 2.8295467519123187e-05, + "loss": 4.3902, + "step": 19120 + }, + { + "epoch": 2.1789395751466483, + "grad_norm": 11.430822372436523, + "learning_rate": 2.8284050690718122e-05, + "loss": 4.6761, + "step": 19130 + }, + { + "epoch": 2.180078592174953, + "grad_norm": 22.227521896362305, + "learning_rate": 2.8272633862313054e-05, + "loss": 4.287, + "step": 19140 + }, + { + "epoch": 2.181217609203258, + "grad_norm": 7.667318820953369, + "learning_rate": 2.8261217033907982e-05, + "loss": 4.2209, + "step": 19150 + }, + { + "epoch": 2.182356626231562, + "grad_norm": 
33.098731994628906, + "learning_rate": 2.8249800205502914e-05, + "loss": 4.2548, + "step": 19160 + }, + { + "epoch": 2.183495643259867, + "grad_norm": 15.808575630187988, + "learning_rate": 2.8238383377097842e-05, + "loss": 4.3751, + "step": 19170 + }, + { + "epoch": 2.184634660288171, + "grad_norm": 33.99452209472656, + "learning_rate": 2.8226966548692774e-05, + "loss": 4.537, + "step": 19180 + }, + { + "epoch": 2.185773677316476, + "grad_norm": 17.135643005371094, + "learning_rate": 2.8215549720287703e-05, + "loss": 4.2375, + "step": 19190 + }, + { + "epoch": 2.1869126943447803, + "grad_norm": 11.814332962036133, + "learning_rate": 2.8204132891882634e-05, + "loss": 4.3262, + "step": 19200 + }, + { + "epoch": 2.188051711373085, + "grad_norm": 9.625064849853516, + "learning_rate": 2.819271606347757e-05, + "loss": 4.3138, + "step": 19210 + }, + { + "epoch": 2.1891907284013894, + "grad_norm": 11.318060874938965, + "learning_rate": 2.81812992350725e-05, + "loss": 4.3632, + "step": 19220 + }, + { + "epoch": 2.190329745429694, + "grad_norm": 11.130118370056152, + "learning_rate": 2.816988240666743e-05, + "loss": 4.4463, + "step": 19230 + }, + { + "epoch": 2.191468762457999, + "grad_norm": 40.44393539428711, + "learning_rate": 2.8158465578262362e-05, + "loss": 4.3744, + "step": 19240 + }, + { + "epoch": 2.1926077794863033, + "grad_norm": 11.919827461242676, + "learning_rate": 2.814704874985729e-05, + "loss": 4.7022, + "step": 19250 + }, + { + "epoch": 2.193746796514608, + "grad_norm": 7.852390289306641, + "learning_rate": 2.8135631921452222e-05, + "loss": 4.5084, + "step": 19260 + }, + { + "epoch": 2.1948858135429123, + "grad_norm": 12.34241008758545, + "learning_rate": 2.812421509304715e-05, + "loss": 4.5451, + "step": 19270 + }, + { + "epoch": 2.196024830571217, + "grad_norm": 10.834270477294922, + "learning_rate": 2.8112798264642082e-05, + "loss": 4.67, + "step": 19280 + }, + { + "epoch": 2.1971638475995214, + "grad_norm": 12.868874549865723, + "learning_rate": 
2.810138143623701e-05, + "loss": 4.1292, + "step": 19290 + }, + { + "epoch": 2.198302864627826, + "grad_norm": 18.698772430419922, + "learning_rate": 2.808996460783195e-05, + "loss": 4.7825, + "step": 19300 + }, + { + "epoch": 2.1994418816561305, + "grad_norm": 10.663459777832031, + "learning_rate": 2.8078547779426878e-05, + "loss": 4.1828, + "step": 19310 + }, + { + "epoch": 2.2005808986844353, + "grad_norm": 11.447162628173828, + "learning_rate": 2.806713095102181e-05, + "loss": 4.3824, + "step": 19320 + }, + { + "epoch": 2.20171991571274, + "grad_norm": 11.612629890441895, + "learning_rate": 2.8055714122616738e-05, + "loss": 4.4154, + "step": 19330 + }, + { + "epoch": 2.2028589327410444, + "grad_norm": 22.35382080078125, + "learning_rate": 2.804429729421167e-05, + "loss": 4.2377, + "step": 19340 + }, + { + "epoch": 2.203997949769349, + "grad_norm": 14.205232620239258, + "learning_rate": 2.80328804658066e-05, + "loss": 4.1946, + "step": 19350 + }, + { + "epoch": 2.2051369667976535, + "grad_norm": 8.173402786254883, + "learning_rate": 2.802146363740153e-05, + "loss": 4.3419, + "step": 19360 + }, + { + "epoch": 2.2062759838259582, + "grad_norm": 13.870865821838379, + "learning_rate": 2.801004680899646e-05, + "loss": 4.2369, + "step": 19370 + }, + { + "epoch": 2.2074150008542626, + "grad_norm": 13.200169563293457, + "learning_rate": 2.7998629980591394e-05, + "loss": 4.2176, + "step": 19380 + }, + { + "epoch": 2.2085540178825673, + "grad_norm": 11.639530181884766, + "learning_rate": 2.7987213152186326e-05, + "loss": 4.5517, + "step": 19390 + }, + { + "epoch": 2.209693034910872, + "grad_norm": 8.542173385620117, + "learning_rate": 2.7975796323781254e-05, + "loss": 4.5128, + "step": 19400 + }, + { + "epoch": 2.2108320519391764, + "grad_norm": 11.450385093688965, + "learning_rate": 2.7964379495376186e-05, + "loss": 4.4245, + "step": 19410 + }, + { + "epoch": 2.211971068967481, + "grad_norm": 10.551682472229004, + "learning_rate": 2.7952962666971118e-05, + "loss": 
4.4777, + "step": 19420 + }, + { + "epoch": 2.2131100859957855, + "grad_norm": 20.971851348876953, + "learning_rate": 2.7941545838566046e-05, + "loss": 4.4916, + "step": 19430 + }, + { + "epoch": 2.2142491030240903, + "grad_norm": 22.77348518371582, + "learning_rate": 2.7930129010160978e-05, + "loss": 4.2798, + "step": 19440 + }, + { + "epoch": 2.2153881200523946, + "grad_norm": 8.859170913696289, + "learning_rate": 2.7918712181755907e-05, + "loss": 4.567, + "step": 19450 + }, + { + "epoch": 2.2165271370806994, + "grad_norm": 14.664798736572266, + "learning_rate": 2.7907295353350842e-05, + "loss": 4.2937, + "step": 19460 + }, + { + "epoch": 2.217666154109004, + "grad_norm": 14.15429973602295, + "learning_rate": 2.7895878524945774e-05, + "loss": 4.3192, + "step": 19470 + }, + { + "epoch": 2.2188051711373085, + "grad_norm": 12.038146018981934, + "learning_rate": 2.7884461696540702e-05, + "loss": 4.6759, + "step": 19480 + }, + { + "epoch": 2.2199441881656132, + "grad_norm": 9.109766006469727, + "learning_rate": 2.7873044868135634e-05, + "loss": 4.2986, + "step": 19490 + }, + { + "epoch": 2.2210832051939176, + "grad_norm": 14.091863632202148, + "learning_rate": 2.7861628039730562e-05, + "loss": 4.3147, + "step": 19500 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 32.21546173095703, + "learning_rate": 2.7850211211325494e-05, + "loss": 4.6065, + "step": 19510 + }, + { + "epoch": 2.2233612392505266, + "grad_norm": 31.96328353881836, + "learning_rate": 2.7838794382920426e-05, + "loss": 4.5492, + "step": 19520 + }, + { + "epoch": 2.2245002562788314, + "grad_norm": 15.795702934265137, + "learning_rate": 2.7827377554515355e-05, + "loss": 4.3908, + "step": 19530 + }, + { + "epoch": 2.2256392733071357, + "grad_norm": 20.312828063964844, + "learning_rate": 2.781596072611029e-05, + "loss": 4.3908, + "step": 19540 + }, + { + "epoch": 2.2267782903354405, + "grad_norm": 11.913114547729492, + "learning_rate": 2.780454389770522e-05, + "loss": 4.0156, + "step": 19550 + }, + { 
+ "epoch": 2.2279173073637453, + "grad_norm": 67.34058380126953, + "learning_rate": 2.779312706930015e-05, + "loss": 4.1589, + "step": 19560 + }, + { + "epoch": 2.2290563243920496, + "grad_norm": 13.381096839904785, + "learning_rate": 2.7781710240895082e-05, + "loss": 4.3149, + "step": 19570 + }, + { + "epoch": 2.2301953414203544, + "grad_norm": 24.699398040771484, + "learning_rate": 2.777029341249001e-05, + "loss": 4.8308, + "step": 19580 + }, + { + "epoch": 2.2313343584486587, + "grad_norm": 14.611007690429688, + "learning_rate": 2.7758876584084942e-05, + "loss": 4.355, + "step": 19590 + }, + { + "epoch": 2.2324733754769635, + "grad_norm": 9.360657691955566, + "learning_rate": 2.774745975567987e-05, + "loss": 4.3769, + "step": 19600 + }, + { + "epoch": 2.2336123925052678, + "grad_norm": 12.764294624328613, + "learning_rate": 2.7736042927274803e-05, + "loss": 3.9479, + "step": 19610 + }, + { + "epoch": 2.2347514095335725, + "grad_norm": 16.35887336730957, + "learning_rate": 2.772462609886973e-05, + "loss": 4.4006, + "step": 19620 + }, + { + "epoch": 2.235890426561877, + "grad_norm": 10.12515926361084, + "learning_rate": 2.771320927046467e-05, + "loss": 4.412, + "step": 19630 + }, + { + "epoch": 2.2370294435901816, + "grad_norm": 11.307372093200684, + "learning_rate": 2.7701792442059598e-05, + "loss": 4.7658, + "step": 19640 + }, + { + "epoch": 2.2381684606184864, + "grad_norm": 9.900909423828125, + "learning_rate": 2.769037561365453e-05, + "loss": 4.399, + "step": 19650 + }, + { + "epoch": 2.2393074776467907, + "grad_norm": 14.501945495605469, + "learning_rate": 2.767895878524946e-05, + "loss": 4.2879, + "step": 19660 + }, + { + "epoch": 2.2404464946750955, + "grad_norm": 10.298788070678711, + "learning_rate": 2.766754195684439e-05, + "loss": 4.5807, + "step": 19670 + }, + { + "epoch": 2.2415855117034, + "grad_norm": 17.297077178955078, + "learning_rate": 2.765612512843932e-05, + "loss": 4.3349, + "step": 19680 + }, + { + "epoch": 2.2427245287317046, + 
"grad_norm": 8.442501068115234, + "learning_rate": 2.764470830003425e-05, + "loss": 4.2717, + "step": 19690 + }, + { + "epoch": 2.243863545760009, + "grad_norm": 14.779812812805176, + "learning_rate": 2.763329147162918e-05, + "loss": 4.5482, + "step": 19700 + }, + { + "epoch": 2.2450025627883137, + "grad_norm": 10.94418716430664, + "learning_rate": 2.7621874643224118e-05, + "loss": 4.1592, + "step": 19710 + }, + { + "epoch": 2.2461415798166184, + "grad_norm": 9.443648338317871, + "learning_rate": 2.7610457814819046e-05, + "loss": 4.485, + "step": 19720 + }, + { + "epoch": 2.2472805968449228, + "grad_norm": 14.997633934020996, + "learning_rate": 2.7599040986413978e-05, + "loss": 4.4784, + "step": 19730 + }, + { + "epoch": 2.2484196138732275, + "grad_norm": 7.396342754364014, + "learning_rate": 2.7587624158008906e-05, + "loss": 4.903, + "step": 19740 + }, + { + "epoch": 2.249558630901532, + "grad_norm": 9.457368850708008, + "learning_rate": 2.7576207329603838e-05, + "loss": 4.5902, + "step": 19750 + }, + { + "epoch": 2.2506976479298366, + "grad_norm": 12.0418701171875, + "learning_rate": 2.7564790501198767e-05, + "loss": 4.4375, + "step": 19760 + }, + { + "epoch": 2.251836664958141, + "grad_norm": 8.690922737121582, + "learning_rate": 2.75533736727937e-05, + "loss": 4.3008, + "step": 19770 + }, + { + "epoch": 2.2529756819864457, + "grad_norm": 17.454952239990234, + "learning_rate": 2.7541956844388627e-05, + "loss": 4.369, + "step": 19780 + }, + { + "epoch": 2.2541146990147505, + "grad_norm": 9.971555709838867, + "learning_rate": 2.7530540015983562e-05, + "loss": 4.3026, + "step": 19790 + }, + { + "epoch": 2.255253716043055, + "grad_norm": 15.098048210144043, + "learning_rate": 2.7519123187578494e-05, + "loss": 4.1767, + "step": 19800 + }, + { + "epoch": 2.2563927330713596, + "grad_norm": 15.469975471496582, + "learning_rate": 2.7507706359173426e-05, + "loss": 4.2714, + "step": 19810 + }, + { + "epoch": 2.257531750099664, + "grad_norm": 8.48707389831543, + 
"learning_rate": 2.7496289530768354e-05, + "loss": 4.3728, + "step": 19820 + }, + { + "epoch": 2.2586707671279687, + "grad_norm": 11.636101722717285, + "learning_rate": 2.7484872702363286e-05, + "loss": 4.6468, + "step": 19830 + }, + { + "epoch": 2.259809784156273, + "grad_norm": 13.015317916870117, + "learning_rate": 2.7473455873958214e-05, + "loss": 4.2907, + "step": 19840 + }, + { + "epoch": 2.2609488011845777, + "grad_norm": 26.91872215270996, + "learning_rate": 2.7462039045553146e-05, + "loss": 4.3164, + "step": 19850 + }, + { + "epoch": 2.2620878182128825, + "grad_norm": 17.38819122314453, + "learning_rate": 2.7450622217148075e-05, + "loss": 4.5248, + "step": 19860 + }, + { + "epoch": 2.263226835241187, + "grad_norm": 10.703744888305664, + "learning_rate": 2.743920538874301e-05, + "loss": 4.5878, + "step": 19870 + }, + { + "epoch": 2.2643658522694916, + "grad_norm": 23.601512908935547, + "learning_rate": 2.7427788560337942e-05, + "loss": 4.291, + "step": 19880 + }, + { + "epoch": 2.265504869297796, + "grad_norm": 27.674367904663086, + "learning_rate": 2.741637173193287e-05, + "loss": 4.6971, + "step": 19890 + }, + { + "epoch": 2.2666438863261007, + "grad_norm": 11.249340057373047, + "learning_rate": 2.7404954903527802e-05, + "loss": 4.4546, + "step": 19900 + }, + { + "epoch": 2.267782903354405, + "grad_norm": 18.939645767211914, + "learning_rate": 2.739353807512273e-05, + "loss": 4.4617, + "step": 19910 + }, + { + "epoch": 2.26892192038271, + "grad_norm": 8.411871910095215, + "learning_rate": 2.7382121246717662e-05, + "loss": 4.4564, + "step": 19920 + }, + { + "epoch": 2.270060937411014, + "grad_norm": 19.94964027404785, + "learning_rate": 2.7370704418312594e-05, + "loss": 4.4576, + "step": 19930 + }, + { + "epoch": 2.271199954439319, + "grad_norm": 16.94525146484375, + "learning_rate": 2.7359287589907523e-05, + "loss": 4.4211, + "step": 19940 + }, + { + "epoch": 2.272338971467623, + "grad_norm": 12.117650985717773, + "learning_rate": 2.7347870761502454e-05, 
+ "loss": 4.3448, + "step": 19950 + }, + { + "epoch": 2.273477988495928, + "grad_norm": 17.60885238647461, + "learning_rate": 2.733645393309739e-05, + "loss": 4.0793, + "step": 19960 + }, + { + "epoch": 2.2746170055242327, + "grad_norm": 22.5484561920166, + "learning_rate": 2.7325037104692318e-05, + "loss": 4.504, + "step": 19970 + }, + { + "epoch": 2.275756022552537, + "grad_norm": 12.337329864501953, + "learning_rate": 2.731362027628725e-05, + "loss": 4.4758, + "step": 19980 + }, + { + "epoch": 2.276895039580842, + "grad_norm": 10.084304809570312, + "learning_rate": 2.730220344788218e-05, + "loss": 4.5065, + "step": 19990 + }, + { + "epoch": 2.278034056609146, + "grad_norm": 28.399568557739258, + "learning_rate": 2.729078661947711e-05, + "loss": 4.3735, + "step": 20000 + }, + { + "epoch": 2.278034056609146, + "eval_loss": 6.095102310180664, + "eval_runtime": 12.419, + "eval_samples_per_second": 1.208, + "eval_steps_per_second": 0.161, + "step": 20000 + }, + { + "epoch": 2.279173073637451, + "grad_norm": 9.15103530883789, + "learning_rate": 2.727936979107204e-05, + "loss": 4.2103, + "step": 20010 + }, + { + "epoch": 2.2803120906657552, + "grad_norm": 15.227364540100098, + "learning_rate": 2.726795296266697e-05, + "loss": 4.3143, + "step": 20020 + }, + { + "epoch": 2.28145110769406, + "grad_norm": 18.669891357421875, + "learning_rate": 2.7256536134261902e-05, + "loss": 4.2012, + "step": 20030 + }, + { + "epoch": 2.2825901247223648, + "grad_norm": 11.993313789367676, + "learning_rate": 2.7245119305856838e-05, + "loss": 4.2507, + "step": 20040 + }, + { + "epoch": 2.283729141750669, + "grad_norm": 12.1185941696167, + "learning_rate": 2.7233702477451766e-05, + "loss": 4.5801, + "step": 20050 + }, + { + "epoch": 2.284868158778974, + "grad_norm": 9.721781730651855, + "learning_rate": 2.7222285649046698e-05, + "loss": 4.8564, + "step": 20060 + }, + { + "epoch": 2.286007175807278, + "grad_norm": 10.034998893737793, + "learning_rate": 2.7210868820641626e-05, + "loss": 
4.3931, + "step": 20070 + }, + { + "epoch": 2.287146192835583, + "grad_norm": 15.754368782043457, + "learning_rate": 2.7199451992236558e-05, + "loss": 4.4728, + "step": 20080 + }, + { + "epoch": 2.2882852098638873, + "grad_norm": 15.889591217041016, + "learning_rate": 2.7188035163831487e-05, + "loss": 4.456, + "step": 20090 + }, + { + "epoch": 2.289424226892192, + "grad_norm": 8.562204360961914, + "learning_rate": 2.717661833542642e-05, + "loss": 4.7066, + "step": 20100 + }, + { + "epoch": 2.290563243920497, + "grad_norm": 38.456298828125, + "learning_rate": 2.7165201507021347e-05, + "loss": 4.5442, + "step": 20110 + }, + { + "epoch": 2.291702260948801, + "grad_norm": 9.698710441589355, + "learning_rate": 2.7153784678616286e-05, + "loss": 4.5983, + "step": 20120 + }, + { + "epoch": 2.292841277977106, + "grad_norm": 21.27276039123535, + "learning_rate": 2.7142367850211214e-05, + "loss": 4.5653, + "step": 20130 + }, + { + "epoch": 2.2939802950054102, + "grad_norm": 14.186841011047363, + "learning_rate": 2.7130951021806146e-05, + "loss": 4.2409, + "step": 20140 + }, + { + "epoch": 2.295119312033715, + "grad_norm": 19.440866470336914, + "learning_rate": 2.7119534193401074e-05, + "loss": 4.2499, + "step": 20150 + }, + { + "epoch": 2.2962583290620193, + "grad_norm": 24.299516677856445, + "learning_rate": 2.7108117364996006e-05, + "loss": 4.4182, + "step": 20160 + }, + { + "epoch": 2.297397346090324, + "grad_norm": 13.558712005615234, + "learning_rate": 2.7096700536590935e-05, + "loss": 4.4855, + "step": 20170 + }, + { + "epoch": 2.298536363118629, + "grad_norm": 9.241644859313965, + "learning_rate": 2.7085283708185866e-05, + "loss": 4.3026, + "step": 20180 + }, + { + "epoch": 2.299675380146933, + "grad_norm": 13.817281723022461, + "learning_rate": 2.7073866879780795e-05, + "loss": 4.1999, + "step": 20190 + }, + { + "epoch": 2.300814397175238, + "grad_norm": 15.542625427246094, + "learning_rate": 2.706245005137573e-05, + "loss": 4.4185, + "step": 20200 + }, + { + "epoch": 
2.3019534142035423, + "grad_norm": 9.056170463562012, + "learning_rate": 2.7051033222970662e-05, + "loss": 4.4653, + "step": 20210 + }, + { + "epoch": 2.303092431231847, + "grad_norm": 29.517549514770508, + "learning_rate": 2.7039616394565594e-05, + "loss": 4.2953, + "step": 20220 + }, + { + "epoch": 2.3042314482601514, + "grad_norm": 22.298954010009766, + "learning_rate": 2.7028199566160522e-05, + "loss": 4.1913, + "step": 20230 + }, + { + "epoch": 2.305370465288456, + "grad_norm": 28.369159698486328, + "learning_rate": 2.7016782737755454e-05, + "loss": 4.4942, + "step": 20240 + }, + { + "epoch": 2.3065094823167605, + "grad_norm": 18.9779109954834, + "learning_rate": 2.700650759219089e-05, + "loss": 4.053, + "step": 20250 + }, + { + "epoch": 2.307648499345065, + "grad_norm": 11.17407512664795, + "learning_rate": 2.6995090763785825e-05, + "loss": 4.5072, + "step": 20260 + }, + { + "epoch": 2.3087875163733695, + "grad_norm": 15.445562362670898, + "learning_rate": 2.6983673935380754e-05, + "loss": 4.3479, + "step": 20270 + }, + { + "epoch": 2.3099265334016743, + "grad_norm": 11.916383743286133, + "learning_rate": 2.6972257106975686e-05, + "loss": 4.2965, + "step": 20280 + }, + { + "epoch": 2.311065550429979, + "grad_norm": 13.146663665771484, + "learning_rate": 2.6960840278570614e-05, + "loss": 4.6778, + "step": 20290 + }, + { + "epoch": 2.3122045674582834, + "grad_norm": 11.7923002243042, + "learning_rate": 2.6949423450165546e-05, + "loss": 4.3451, + "step": 20300 + }, + { + "epoch": 2.313343584486588, + "grad_norm": 13.446571350097656, + "learning_rate": 2.6938006621760474e-05, + "loss": 4.2385, + "step": 20310 + }, + { + "epoch": 2.3144826015148925, + "grad_norm": 9.76078987121582, + "learning_rate": 2.6926589793355406e-05, + "loss": 3.9775, + "step": 20320 + }, + { + "epoch": 2.3156216185431973, + "grad_norm": 8.01722526550293, + "learning_rate": 2.6915172964950335e-05, + "loss": 4.1899, + "step": 20330 + }, + { + "epoch": 2.3167606355715016, + "grad_norm": 
15.921638488769531, + "learning_rate": 2.6903756136545273e-05, + "loss": 4.4662, + "step": 20340 + }, + { + "epoch": 2.3178996525998063, + "grad_norm": 20.366044998168945, + "learning_rate": 2.68923393081402e-05, + "loss": 4.007, + "step": 20350 + }, + { + "epoch": 2.319038669628111, + "grad_norm": 13.331708908081055, + "learning_rate": 2.6880922479735133e-05, + "loss": 4.3506, + "step": 20360 + }, + { + "epoch": 2.3201776866564154, + "grad_norm": 14.432374000549316, + "learning_rate": 2.6869505651330062e-05, + "loss": 4.3741, + "step": 20370 + }, + { + "epoch": 2.32131670368472, + "grad_norm": 17.888835906982422, + "learning_rate": 2.6858088822924994e-05, + "loss": 4.4641, + "step": 20380 + }, + { + "epoch": 2.3224557207130245, + "grad_norm": 12.111369132995605, + "learning_rate": 2.6846671994519922e-05, + "loss": 4.609, + "step": 20390 + }, + { + "epoch": 2.3235947377413293, + "grad_norm": 18.915489196777344, + "learning_rate": 2.6835255166114854e-05, + "loss": 4.3178, + "step": 20400 + }, + { + "epoch": 2.3247337547696336, + "grad_norm": 13.839448928833008, + "learning_rate": 2.6823838337709782e-05, + "loss": 4.4563, + "step": 20410 + }, + { + "epoch": 2.3258727717979384, + "grad_norm": 22.157007217407227, + "learning_rate": 2.6812421509304714e-05, + "loss": 4.3481, + "step": 20420 + }, + { + "epoch": 2.327011788826243, + "grad_norm": 11.417917251586914, + "learning_rate": 2.680100468089965e-05, + "loss": 4.5919, + "step": 20430 + }, + { + "epoch": 2.3281508058545475, + "grad_norm": 10.939807891845703, + "learning_rate": 2.678958785249458e-05, + "loss": 4.1816, + "step": 20440 + }, + { + "epoch": 2.3292898228828522, + "grad_norm": 19.246337890625, + "learning_rate": 2.677817102408951e-05, + "loss": 4.3146, + "step": 20450 + }, + { + "epoch": 2.3304288399111566, + "grad_norm": 25.213848114013672, + "learning_rate": 2.676675419568444e-05, + "loss": 4.372, + "step": 20460 + }, + { + "epoch": 2.3315678569394613, + "grad_norm": 10.462803840637207, + "learning_rate": 
2.675533736727937e-05, + "loss": 4.5615, + "step": 20470 + }, + { + "epoch": 2.3327068739677657, + "grad_norm": 25.962284088134766, + "learning_rate": 2.6743920538874302e-05, + "loss": 4.5635, + "step": 20480 + }, + { + "epoch": 2.3338458909960704, + "grad_norm": 15.802549362182617, + "learning_rate": 2.673250371046923e-05, + "loss": 4.3188, + "step": 20490 + }, + { + "epoch": 2.334984908024375, + "grad_norm": 9.226036071777344, + "learning_rate": 2.6721086882064162e-05, + "loss": 4.2887, + "step": 20500 + }, + { + "epoch": 2.3361239250526795, + "grad_norm": 23.429420471191406, + "learning_rate": 2.6709670053659097e-05, + "loss": 4.31, + "step": 20510 + }, + { + "epoch": 2.3372629420809843, + "grad_norm": 11.576409339904785, + "learning_rate": 2.6698253225254026e-05, + "loss": 4.6043, + "step": 20520 + }, + { + "epoch": 2.3384019591092886, + "grad_norm": 13.73523235321045, + "learning_rate": 2.6686836396848958e-05, + "loss": 4.4029, + "step": 20530 + }, + { + "epoch": 2.3395409761375934, + "grad_norm": 9.554508209228516, + "learning_rate": 2.667541956844389e-05, + "loss": 4.5881, + "step": 20540 + }, + { + "epoch": 2.3406799931658977, + "grad_norm": 15.159540176391602, + "learning_rate": 2.6664002740038818e-05, + "loss": 4.4314, + "step": 20550 + }, + { + "epoch": 2.3418190101942025, + "grad_norm": 9.74484634399414, + "learning_rate": 2.665258591163375e-05, + "loss": 4.3374, + "step": 20560 + }, + { + "epoch": 2.342958027222507, + "grad_norm": 14.005760192871094, + "learning_rate": 2.664116908322868e-05, + "loss": 4.3698, + "step": 20570 + }, + { + "epoch": 2.3440970442508116, + "grad_norm": 18.01361846923828, + "learning_rate": 2.662975225482361e-05, + "loss": 4.5266, + "step": 20580 + }, + { + "epoch": 2.345236061279116, + "grad_norm": 9.59418773651123, + "learning_rate": 2.6618335426418545e-05, + "loss": 4.4088, + "step": 20590 + }, + { + "epoch": 2.3463750783074206, + "grad_norm": 11.596879005432129, + "learning_rate": 2.6606918598013474e-05, + "loss": 4.4025, 
+ "step": 20600 + }, + { + "epoch": 2.3475140953357254, + "grad_norm": 19.087871551513672, + "learning_rate": 2.6595501769608406e-05, + "loss": 4.2458, + "step": 20610 + }, + { + "epoch": 2.3486531123640297, + "grad_norm": 14.203076362609863, + "learning_rate": 2.6584084941203334e-05, + "loss": 4.7268, + "step": 20620 + }, + { + "epoch": 2.3497921293923345, + "grad_norm": 25.49370002746582, + "learning_rate": 2.6572668112798266e-05, + "loss": 4.303, + "step": 20630 + }, + { + "epoch": 2.350931146420639, + "grad_norm": 23.890228271484375, + "learning_rate": 2.6561251284393194e-05, + "loss": 4.7874, + "step": 20640 + }, + { + "epoch": 2.3520701634489436, + "grad_norm": 11.49235725402832, + "learning_rate": 2.6549834455988126e-05, + "loss": 4.4918, + "step": 20650 + }, + { + "epoch": 2.353209180477248, + "grad_norm": 15.059396743774414, + "learning_rate": 2.6538417627583058e-05, + "loss": 4.3722, + "step": 20660 + }, + { + "epoch": 2.3543481975055527, + "grad_norm": 9.131157875061035, + "learning_rate": 2.6527000799177993e-05, + "loss": 4.2016, + "step": 20670 + }, + { + "epoch": 2.3554872145338575, + "grad_norm": 9.978961944580078, + "learning_rate": 2.6515583970772922e-05, + "loss": 4.5545, + "step": 20680 + }, + { + "epoch": 2.3566262315621618, + "grad_norm": 10.577940940856934, + "learning_rate": 2.6504167142367854e-05, + "loss": 4.2959, + "step": 20690 + }, + { + "epoch": 2.3577652485904665, + "grad_norm": 13.292899131774902, + "learning_rate": 2.6492750313962782e-05, + "loss": 4.4357, + "step": 20700 + }, + { + "epoch": 2.358904265618771, + "grad_norm": 12.609500885009766, + "learning_rate": 2.6481333485557714e-05, + "loss": 4.3743, + "step": 20710 + }, + { + "epoch": 2.3600432826470756, + "grad_norm": 12.299931526184082, + "learning_rate": 2.6469916657152642e-05, + "loss": 4.5427, + "step": 20720 + }, + { + "epoch": 2.36118229967538, + "grad_norm": 15.316471099853516, + "learning_rate": 2.6458499828747574e-05, + "loss": 4.3887, + "step": 20730 + }, + { + 
"epoch": 2.3623213167036847, + "grad_norm": 15.284895896911621, + "learning_rate": 2.6447083000342503e-05, + "loss": 4.2833, + "step": 20740 + }, + { + "epoch": 2.3634603337319895, + "grad_norm": 7.876543045043945, + "learning_rate": 2.6435666171937434e-05, + "loss": 4.3327, + "step": 20750 + }, + { + "epoch": 2.364599350760294, + "grad_norm": 10.778501510620117, + "learning_rate": 2.642424934353237e-05, + "loss": 4.2446, + "step": 20760 + }, + { + "epoch": 2.3657383677885986, + "grad_norm": 15.023706436157227, + "learning_rate": 2.64128325151273e-05, + "loss": 4.6691, + "step": 20770 + }, + { + "epoch": 2.366877384816903, + "grad_norm": 14.946088790893555, + "learning_rate": 2.640141568672223e-05, + "loss": 4.1224, + "step": 20780 + }, + { + "epoch": 2.3680164018452077, + "grad_norm": 13.244171142578125, + "learning_rate": 2.6389998858317162e-05, + "loss": 5.0236, + "step": 20790 + }, + { + "epoch": 2.369155418873512, + "grad_norm": 15.137276649475098, + "learning_rate": 2.6379723712752598e-05, + "loss": 4.3492, + "step": 20800 + }, + { + "epoch": 2.3702944359018168, + "grad_norm": 12.232105255126953, + "learning_rate": 2.6368306884347533e-05, + "loss": 4.4269, + "step": 20810 + }, + { + "epoch": 2.3714334529301215, + "grad_norm": 14.004165649414062, + "learning_rate": 2.635689005594246e-05, + "loss": 4.4677, + "step": 20820 + }, + { + "epoch": 2.372572469958426, + "grad_norm": 16.884231567382812, + "learning_rate": 2.6345473227537393e-05, + "loss": 4.4402, + "step": 20830 + }, + { + "epoch": 2.3737114869867306, + "grad_norm": 12.156578063964844, + "learning_rate": 2.6334056399132322e-05, + "loss": 4.7016, + "step": 20840 + }, + { + "epoch": 2.374850504015035, + "grad_norm": 12.994196891784668, + "learning_rate": 2.6322639570727254e-05, + "loss": 4.3935, + "step": 20850 + }, + { + "epoch": 2.3759895210433397, + "grad_norm": 23.809234619140625, + "learning_rate": 2.6311222742322182e-05, + "loss": 4.7228, + "step": 20860 + }, + { + "epoch": 2.377128538071644, + 
"grad_norm": 16.818578720092773, + "learning_rate": 2.6299805913917114e-05, + "loss": 4.3697, + "step": 20870 + }, + { + "epoch": 2.378267555099949, + "grad_norm": 10.739876747131348, + "learning_rate": 2.6288389085512046e-05, + "loss": 4.2883, + "step": 20880 + }, + { + "epoch": 2.379406572128253, + "grad_norm": 11.96601676940918, + "learning_rate": 2.6276972257106974e-05, + "loss": 4.2737, + "step": 20890 + }, + { + "epoch": 2.380545589156558, + "grad_norm": 12.246533393859863, + "learning_rate": 2.626555542870191e-05, + "loss": 4.4658, + "step": 20900 + }, + { + "epoch": 2.381684606184862, + "grad_norm": 12.089892387390137, + "learning_rate": 2.625413860029684e-05, + "loss": 4.3181, + "step": 20910 + }, + { + "epoch": 2.382823623213167, + "grad_norm": 11.611150741577148, + "learning_rate": 2.624272177189177e-05, + "loss": 4.7861, + "step": 20920 + }, + { + "epoch": 2.3839626402414718, + "grad_norm": 15.781271934509277, + "learning_rate": 2.62313049434867e-05, + "loss": 4.2517, + "step": 20930 + }, + { + "epoch": 2.385101657269776, + "grad_norm": 7.367682456970215, + "learning_rate": 2.621988811508163e-05, + "loss": 4.3842, + "step": 20940 + }, + { + "epoch": 2.386240674298081, + "grad_norm": 16.008838653564453, + "learning_rate": 2.6208471286676562e-05, + "loss": 4.3608, + "step": 20950 + }, + { + "epoch": 2.387379691326385, + "grad_norm": 7.852641582489014, + "learning_rate": 2.619705445827149e-05, + "loss": 4.8198, + "step": 20960 + }, + { + "epoch": 2.38851870835469, + "grad_norm": 9.072783470153809, + "learning_rate": 2.6185637629866422e-05, + "loss": 4.6759, + "step": 20970 + }, + { + "epoch": 2.3896577253829943, + "grad_norm": 8.352493286132812, + "learning_rate": 2.6174220801461357e-05, + "loss": 4.4424, + "step": 20980 + }, + { + "epoch": 2.390796742411299, + "grad_norm": 12.035454750061035, + "learning_rate": 2.616280397305629e-05, + "loss": 4.6051, + "step": 20990 + }, + { + "epoch": 2.391935759439604, + "grad_norm": 15.381999969482422, + 
"learning_rate": 2.6151387144651218e-05, + "loss": 4.5452, + "step": 21000 + }, + { + "epoch": 2.393074776467908, + "grad_norm": 8.592596054077148, + "learning_rate": 2.613997031624615e-05, + "loss": 4.6688, + "step": 21010 + }, + { + "epoch": 2.394213793496213, + "grad_norm": 12.32676887512207, + "learning_rate": 2.6128553487841078e-05, + "loss": 4.3724, + "step": 21020 + }, + { + "epoch": 2.395352810524517, + "grad_norm": 10.219091415405273, + "learning_rate": 2.611713665943601e-05, + "loss": 4.5526, + "step": 21030 + }, + { + "epoch": 2.396491827552822, + "grad_norm": 23.337568283081055, + "learning_rate": 2.6105719831030938e-05, + "loss": 4.4877, + "step": 21040 + }, + { + "epoch": 2.3976308445811263, + "grad_norm": 23.432353973388672, + "learning_rate": 2.609430300262587e-05, + "loss": 4.2944, + "step": 21050 + }, + { + "epoch": 2.398769861609431, + "grad_norm": 21.79475212097168, + "learning_rate": 2.6082886174220805e-05, + "loss": 4.1989, + "step": 21060 + }, + { + "epoch": 2.399908878637736, + "grad_norm": 12.396292686462402, + "learning_rate": 2.6071469345815737e-05, + "loss": 4.45, + "step": 21070 + }, + { + "epoch": 2.40104789566604, + "grad_norm": 18.911386489868164, + "learning_rate": 2.6060052517410665e-05, + "loss": 4.5491, + "step": 21080 + }, + { + "epoch": 2.402186912694345, + "grad_norm": 10.625134468078613, + "learning_rate": 2.6048635689005597e-05, + "loss": 4.3925, + "step": 21090 + }, + { + "epoch": 2.4033259297226492, + "grad_norm": 11.378150939941406, + "learning_rate": 2.6037218860600526e-05, + "loss": 4.4926, + "step": 21100 + }, + { + "epoch": 2.404464946750954, + "grad_norm": 14.170574188232422, + "learning_rate": 2.6025802032195458e-05, + "loss": 4.7183, + "step": 21110 + }, + { + "epoch": 2.4056039637792583, + "grad_norm": 21.01306915283203, + "learning_rate": 2.6014385203790386e-05, + "loss": 4.1274, + "step": 21120 + }, + { + "epoch": 2.406742980807563, + "grad_norm": 12.135096549987793, + "learning_rate": 2.6002968375385318e-05, + 
"loss": 4.8377, + "step": 21130 + }, + { + "epoch": 2.407881997835868, + "grad_norm": 10.106658935546875, + "learning_rate": 2.5991551546980253e-05, + "loss": 4.4585, + "step": 21140 + }, + { + "epoch": 2.409021014864172, + "grad_norm": 22.058813095092773, + "learning_rate": 2.598013471857518e-05, + "loss": 4.1022, + "step": 21150 + }, + { + "epoch": 2.410160031892477, + "grad_norm": 9.689461708068848, + "learning_rate": 2.5968717890170113e-05, + "loss": 4.2402, + "step": 21160 + }, + { + "epoch": 2.4112990489207813, + "grad_norm": 22.227386474609375, + "learning_rate": 2.5957301061765045e-05, + "loss": 4.6234, + "step": 21170 + }, + { + "epoch": 2.412438065949086, + "grad_norm": 14.180185317993164, + "learning_rate": 2.5945884233359974e-05, + "loss": 4.1885, + "step": 21180 + }, + { + "epoch": 2.4135770829773904, + "grad_norm": 11.204601287841797, + "learning_rate": 2.5934467404954906e-05, + "loss": 4.6157, + "step": 21190 + }, + { + "epoch": 2.414716100005695, + "grad_norm": 13.349756240844727, + "learning_rate": 2.5923050576549834e-05, + "loss": 4.4678, + "step": 21200 + }, + { + "epoch": 2.4158551170339995, + "grad_norm": 8.626503944396973, + "learning_rate": 2.5911633748144766e-05, + "loss": 4.1648, + "step": 21210 + }, + { + "epoch": 2.4169941340623042, + "grad_norm": 21.226703643798828, + "learning_rate": 2.5900216919739694e-05, + "loss": 4.4587, + "step": 21220 + }, + { + "epoch": 2.4181331510906086, + "grad_norm": 11.00561237335205, + "learning_rate": 2.588880009133463e-05, + "loss": 4.484, + "step": 21230 + }, + { + "epoch": 2.4192721681189133, + "grad_norm": 21.933300018310547, + "learning_rate": 2.587738326292956e-05, + "loss": 4.5352, + "step": 21240 + }, + { + "epoch": 2.420411185147218, + "grad_norm": 8.876955032348633, + "learning_rate": 2.586596643452449e-05, + "loss": 4.0314, + "step": 21250 + }, + { + "epoch": 2.4215502021755224, + "grad_norm": 16.549108505249023, + "learning_rate": 2.585454960611942e-05, + "loss": 4.4442, + "step": 21260 + }, + 
{ + "epoch": 2.422689219203827, + "grad_norm": 9.397076606750488, + "learning_rate": 2.584313277771435e-05, + "loss": 4.3439, + "step": 21270 + }, + { + "epoch": 2.4238282362321315, + "grad_norm": 13.82287311553955, + "learning_rate": 2.5831715949309282e-05, + "loss": 4.4222, + "step": 21280 + }, + { + "epoch": 2.4249672532604363, + "grad_norm": 13.481350898742676, + "learning_rate": 2.5820299120904214e-05, + "loss": 4.558, + "step": 21290 + }, + { + "epoch": 2.4261062702887406, + "grad_norm": 17.883499145507812, + "learning_rate": 2.5808882292499142e-05, + "loss": 4.8666, + "step": 21300 + }, + { + "epoch": 2.4272452873170454, + "grad_norm": 12.39294719696045, + "learning_rate": 2.5797465464094077e-05, + "loss": 4.5578, + "step": 21310 + }, + { + "epoch": 2.42838430434535, + "grad_norm": 16.293148040771484, + "learning_rate": 2.578604863568901e-05, + "loss": 4.5579, + "step": 21320 + }, + { + "epoch": 2.4295233213736545, + "grad_norm": 8.16700267791748, + "learning_rate": 2.5774631807283938e-05, + "loss": 4.5132, + "step": 21330 + }, + { + "epoch": 2.430662338401959, + "grad_norm": 12.83481216430664, + "learning_rate": 2.576321497887887e-05, + "loss": 4.6121, + "step": 21340 + }, + { + "epoch": 2.4318013554302635, + "grad_norm": 16.078487396240234, + "learning_rate": 2.5751798150473798e-05, + "loss": 4.3257, + "step": 21350 + }, + { + "epoch": 2.4329403724585683, + "grad_norm": 10.014603614807129, + "learning_rate": 2.574038132206873e-05, + "loss": 4.3713, + "step": 21360 + }, + { + "epoch": 2.4340793894868726, + "grad_norm": 10.41618824005127, + "learning_rate": 2.5728964493663658e-05, + "loss": 4.6494, + "step": 21370 + }, + { + "epoch": 2.4352184065151774, + "grad_norm": 12.7681245803833, + "learning_rate": 2.571754766525859e-05, + "loss": 4.4948, + "step": 21380 + }, + { + "epoch": 2.436357423543482, + "grad_norm": 14.007975578308105, + "learning_rate": 2.5706130836853525e-05, + "loss": 4.2392, + "step": 21390 + }, + { + "epoch": 2.4374964405717865, + 
"grad_norm": 22.04856300354004, + "learning_rate": 2.5694714008448457e-05, + "loss": 4.0833, + "step": 21400 + }, + { + "epoch": 2.4386354576000913, + "grad_norm": 10.801676750183105, + "learning_rate": 2.5683297180043386e-05, + "loss": 4.4641, + "step": 21410 + }, + { + "epoch": 2.4397744746283956, + "grad_norm": 15.246986389160156, + "learning_rate": 2.5671880351638317e-05, + "loss": 4.2553, + "step": 21420 + }, + { + "epoch": 2.4409134916567004, + "grad_norm": 8.889477729797363, + "learning_rate": 2.5660463523233246e-05, + "loss": 4.4263, + "step": 21430 + }, + { + "epoch": 2.4420525086850047, + "grad_norm": 12.95500373840332, + "learning_rate": 2.5649046694828178e-05, + "loss": 4.6147, + "step": 21440 + }, + { + "epoch": 2.4431915257133094, + "grad_norm": 13.485177993774414, + "learning_rate": 2.5637629866423106e-05, + "loss": 4.5826, + "step": 21450 + }, + { + "epoch": 2.444330542741614, + "grad_norm": 11.917974472045898, + "learning_rate": 2.5626213038018038e-05, + "loss": 4.1817, + "step": 21460 + }, + { + "epoch": 2.4454695597699185, + "grad_norm": 17.149272918701172, + "learning_rate": 2.5614796209612973e-05, + "loss": 4.4273, + "step": 21470 + }, + { + "epoch": 2.4466085767982233, + "grad_norm": 10.053730964660645, + "learning_rate": 2.5603379381207905e-05, + "loss": 4.4531, + "step": 21480 + }, + { + "epoch": 2.4477475938265276, + "grad_norm": 26.15191078186035, + "learning_rate": 2.5591962552802834e-05, + "loss": 4.2981, + "step": 21490 + }, + { + "epoch": 2.4488866108548324, + "grad_norm": 24.09326171875, + "learning_rate": 2.5580545724397765e-05, + "loss": 4.4, + "step": 21500 + }, + { + "epoch": 2.4500256278831367, + "grad_norm": 10.37264633178711, + "learning_rate": 2.5569128895992694e-05, + "loss": 4.7429, + "step": 21510 + }, + { + "epoch": 2.4511646449114415, + "grad_norm": 10.826128959655762, + "learning_rate": 2.5557712067587626e-05, + "loss": 3.9984, + "step": 21520 + }, + { + "epoch": 2.4523036619397462, + "grad_norm": 12.696362495422363, + 
"learning_rate": 2.5546295239182554e-05, + "loss": 4.599, + "step": 21530 + }, + { + "epoch": 2.4534426789680506, + "grad_norm": 10.249017715454102, + "learning_rate": 2.5534878410777486e-05, + "loss": 4.6064, + "step": 21540 + }, + { + "epoch": 2.4545816959963553, + "grad_norm": 17.17274284362793, + "learning_rate": 2.5523461582372414e-05, + "loss": 4.3368, + "step": 21550 + }, + { + "epoch": 2.4557207130246597, + "grad_norm": 14.794090270996094, + "learning_rate": 2.551204475396735e-05, + "loss": 4.4248, + "step": 21560 + }, + { + "epoch": 2.4568597300529644, + "grad_norm": 18.64275550842285, + "learning_rate": 2.550062792556228e-05, + "loss": 4.4132, + "step": 21570 + }, + { + "epoch": 2.4579987470812688, + "grad_norm": 52.16679763793945, + "learning_rate": 2.5489211097157213e-05, + "loss": 4.7241, + "step": 21580 + }, + { + "epoch": 2.4591377641095735, + "grad_norm": 10.204142570495605, + "learning_rate": 2.5477794268752142e-05, + "loss": 4.2251, + "step": 21590 + }, + { + "epoch": 2.460276781137878, + "grad_norm": 13.011360168457031, + "learning_rate": 2.5466377440347074e-05, + "loss": 4.2051, + "step": 21600 + }, + { + "epoch": 2.4614157981661826, + "grad_norm": 13.616783142089844, + "learning_rate": 2.5454960611942002e-05, + "loss": 4.3595, + "step": 21610 + }, + { + "epoch": 2.462554815194487, + "grad_norm": 13.076457977294922, + "learning_rate": 2.5443543783536934e-05, + "loss": 4.4622, + "step": 21620 + }, + { + "epoch": 2.4636938322227917, + "grad_norm": 16.63337516784668, + "learning_rate": 2.5432126955131862e-05, + "loss": 4.4244, + "step": 21630 + }, + { + "epoch": 2.4648328492510965, + "grad_norm": 18.748550415039062, + "learning_rate": 2.5420710126726798e-05, + "loss": 4.375, + "step": 21640 + }, + { + "epoch": 2.465971866279401, + "grad_norm": 9.516826629638672, + "learning_rate": 2.540929329832173e-05, + "loss": 4.3369, + "step": 21650 + }, + { + "epoch": 2.4671108833077056, + "grad_norm": 24.19636344909668, + "learning_rate": 
2.5397876469916658e-05, + "loss": 4.1031, + "step": 21660 + }, + { + "epoch": 2.46824990033601, + "grad_norm": 16.849750518798828, + "learning_rate": 2.538645964151159e-05, + "loss": 4.4579, + "step": 21670 + }, + { + "epoch": 2.4693889173643147, + "grad_norm": 13.501879692077637, + "learning_rate": 2.537504281310652e-05, + "loss": 4.7379, + "step": 21680 + }, + { + "epoch": 2.470527934392619, + "grad_norm": 22.789653778076172, + "learning_rate": 2.536362598470145e-05, + "loss": 4.3932, + "step": 21690 + }, + { + "epoch": 2.4716669514209237, + "grad_norm": 12.54429817199707, + "learning_rate": 2.5352209156296382e-05, + "loss": 4.3613, + "step": 21700 + }, + { + "epoch": 2.4728059684492285, + "grad_norm": 26.30118751525879, + "learning_rate": 2.534079232789131e-05, + "loss": 4.3107, + "step": 21710 + }, + { + "epoch": 2.473944985477533, + "grad_norm": 10.114954948425293, + "learning_rate": 2.5329375499486245e-05, + "loss": 4.2892, + "step": 21720 + }, + { + "epoch": 2.4750840025058376, + "grad_norm": 21.673948287963867, + "learning_rate": 2.5317958671081177e-05, + "loss": 4.5669, + "step": 21730 + }, + { + "epoch": 2.476223019534142, + "grad_norm": 14.892142295837402, + "learning_rate": 2.5306541842676106e-05, + "loss": 4.2368, + "step": 21740 + }, + { + "epoch": 2.4773620365624467, + "grad_norm": 17.35381507873535, + "learning_rate": 2.5295125014271038e-05, + "loss": 4.3717, + "step": 21750 + }, + { + "epoch": 2.478501053590751, + "grad_norm": 18.587623596191406, + "learning_rate": 2.5283708185865966e-05, + "loss": 4.4697, + "step": 21760 + }, + { + "epoch": 2.479640070619056, + "grad_norm": 32.892051696777344, + "learning_rate": 2.5272291357460898e-05, + "loss": 4.3698, + "step": 21770 + }, + { + "epoch": 2.4807790876473605, + "grad_norm": 26.555744171142578, + "learning_rate": 2.5260874529055826e-05, + "loss": 4.2927, + "step": 21780 + }, + { + "epoch": 2.481918104675665, + "grad_norm": 8.519964218139648, + "learning_rate": 2.5249457700650758e-05, + "loss": 
4.534, + "step": 21790 + }, + { + "epoch": 2.4830571217039696, + "grad_norm": 13.621071815490723, + "learning_rate": 2.5238040872245693e-05, + "loss": 4.5586, + "step": 21800 + }, + { + "epoch": 2.484196138732274, + "grad_norm": 15.400689125061035, + "learning_rate": 2.5226624043840625e-05, + "loss": 4.2739, + "step": 21810 + }, + { + "epoch": 2.4853351557605787, + "grad_norm": 14.989195823669434, + "learning_rate": 2.5215207215435554e-05, + "loss": 4.5925, + "step": 21820 + }, + { + "epoch": 2.486474172788883, + "grad_norm": 13.892701148986816, + "learning_rate": 2.5203790387030486e-05, + "loss": 4.274, + "step": 21830 + }, + { + "epoch": 2.487613189817188, + "grad_norm": 11.06151294708252, + "learning_rate": 2.5192373558625414e-05, + "loss": 4.2846, + "step": 21840 + }, + { + "epoch": 2.4887522068454926, + "grad_norm": 15.464399337768555, + "learning_rate": 2.5180956730220346e-05, + "loss": 4.3184, + "step": 21850 + }, + { + "epoch": 2.489891223873797, + "grad_norm": 16.73919105529785, + "learning_rate": 2.5169539901815274e-05, + "loss": 4.4767, + "step": 21860 + }, + { + "epoch": 2.4910302409021017, + "grad_norm": 10.512547492980957, + "learning_rate": 2.5158123073410206e-05, + "loss": 4.4806, + "step": 21870 + }, + { + "epoch": 2.492169257930406, + "grad_norm": 12.304697036743164, + "learning_rate": 2.5146706245005135e-05, + "loss": 4.5144, + "step": 21880 + }, + { + "epoch": 2.4933082749587108, + "grad_norm": 12.59383487701416, + "learning_rate": 2.5135289416600073e-05, + "loss": 4.4376, + "step": 21890 + }, + { + "epoch": 2.494447291987015, + "grad_norm": 28.72754669189453, + "learning_rate": 2.5123872588195e-05, + "loss": 4.2984, + "step": 21900 + }, + { + "epoch": 2.49558630901532, + "grad_norm": 15.549976348876953, + "learning_rate": 2.5112455759789933e-05, + "loss": 4.3363, + "step": 21910 + }, + { + "epoch": 2.496725326043624, + "grad_norm": 13.449384689331055, + "learning_rate": 2.5101038931384862e-05, + "loss": 4.5456, + "step": 21920 + }, + { + 
"epoch": 2.497864343071929, + "grad_norm": 15.404632568359375, + "learning_rate": 2.5089622102979794e-05, + "loss": 4.5399, + "step": 21930 + }, + { + "epoch": 2.4990033601002333, + "grad_norm": 36.435874938964844, + "learning_rate": 2.5078205274574722e-05, + "loss": 4.4067, + "step": 21940 + }, + { + "epoch": 2.500142377128538, + "grad_norm": 13.879344940185547, + "learning_rate": 2.5066788446169654e-05, + "loss": 4.2819, + "step": 21950 + }, + { + "epoch": 2.501281394156843, + "grad_norm": 12.27030086517334, + "learning_rate": 2.5055371617764582e-05, + "loss": 4.5013, + "step": 21960 + }, + { + "epoch": 2.502420411185147, + "grad_norm": 9.679683685302734, + "learning_rate": 2.504395478935952e-05, + "loss": 4.4269, + "step": 21970 + }, + { + "epoch": 2.503559428213452, + "grad_norm": 21.375797271728516, + "learning_rate": 2.503253796095445e-05, + "loss": 4.6372, + "step": 21980 + }, + { + "epoch": 2.504698445241756, + "grad_norm": 13.330611228942871, + "learning_rate": 2.502112113254938e-05, + "loss": 4.267, + "step": 21990 + }, + { + "epoch": 2.505837462270061, + "grad_norm": 31.16680908203125, + "learning_rate": 2.500970430414431e-05, + "loss": 4.541, + "step": 22000 + }, + { + "epoch": 2.505837462270061, + "eval_loss": 6.085844039916992, + "eval_runtime": 11.247, + "eval_samples_per_second": 1.334, + "eval_steps_per_second": 0.178, + "step": 22000 + }, + { + "epoch": 2.5069764792983653, + "grad_norm": 15.743983268737793, + "learning_rate": 2.499828747573924e-05, + "loss": 4.5104, + "step": 22010 + }, + { + "epoch": 2.50811549632667, + "grad_norm": 22.402015686035156, + "learning_rate": 2.498687064733417e-05, + "loss": 4.4935, + "step": 22020 + }, + { + "epoch": 2.509254513354975, + "grad_norm": 12.458966255187988, + "learning_rate": 2.4975453818929105e-05, + "loss": 4.166, + "step": 22030 + }, + { + "epoch": 2.510393530383279, + "grad_norm": 11.811493873596191, + "learning_rate": 2.4964036990524034e-05, + "loss": 4.2387, + "step": 22040 + }, + { + "epoch": 
2.511532547411584, + "grad_norm": 10.788464546203613, + "learning_rate": 2.4952620162118966e-05, + "loss": 4.4599, + "step": 22050 + }, + { + "epoch": 2.5126715644398883, + "grad_norm": 19.103111267089844, + "learning_rate": 2.4941203333713894e-05, + "loss": 4.2352, + "step": 22060 + }, + { + "epoch": 2.513810581468193, + "grad_norm": 11.614288330078125, + "learning_rate": 2.4929786505308826e-05, + "loss": 4.3215, + "step": 22070 + }, + { + "epoch": 2.5149495984964974, + "grad_norm": 11.717473030090332, + "learning_rate": 2.4918369676903758e-05, + "loss": 4.5536, + "step": 22080 + }, + { + "epoch": 2.516088615524802, + "grad_norm": 10.13782024383545, + "learning_rate": 2.490695284849869e-05, + "loss": 5.0282, + "step": 22090 + }, + { + "epoch": 2.517227632553107, + "grad_norm": 19.63833999633789, + "learning_rate": 2.4895536020093618e-05, + "loss": 4.5342, + "step": 22100 + }, + { + "epoch": 2.518366649581411, + "grad_norm": 21.288978576660156, + "learning_rate": 2.488411919168855e-05, + "loss": 4.2393, + "step": 22110 + }, + { + "epoch": 2.519505666609716, + "grad_norm": 14.36782455444336, + "learning_rate": 2.4872702363283482e-05, + "loss": 4.5864, + "step": 22120 + }, + { + "epoch": 2.5206446836380203, + "grad_norm": 15.093581199645996, + "learning_rate": 2.486128553487841e-05, + "loss": 4.7658, + "step": 22130 + }, + { + "epoch": 2.521783700666325, + "grad_norm": 13.700653076171875, + "learning_rate": 2.4849868706473342e-05, + "loss": 4.2336, + "step": 22140 + }, + { + "epoch": 2.5229227176946294, + "grad_norm": 7.751123428344727, + "learning_rate": 2.4838451878068274e-05, + "loss": 4.8393, + "step": 22150 + }, + { + "epoch": 2.524061734722934, + "grad_norm": 13.696725845336914, + "learning_rate": 2.4827035049663206e-05, + "loss": 4.5059, + "step": 22160 + }, + { + "epoch": 2.525200751751239, + "grad_norm": 8.605315208435059, + "learning_rate": 2.4815618221258134e-05, + "loss": 4.4345, + "step": 22170 + }, + { + "epoch": 2.5263397687795432, + "grad_norm": 
22.05066680908203, + "learning_rate": 2.4804201392853066e-05, + "loss": 4.2108, + "step": 22180 + }, + { + "epoch": 2.5274787858078476, + "grad_norm": 21.944713592529297, + "learning_rate": 2.4792784564447998e-05, + "loss": 4.3732, + "step": 22190 + }, + { + "epoch": 2.5286178028361523, + "grad_norm": 19.73705291748047, + "learning_rate": 2.478136773604293e-05, + "loss": 4.8541, + "step": 22200 + }, + { + "epoch": 2.529756819864457, + "grad_norm": 10.252677917480469, + "learning_rate": 2.4769950907637858e-05, + "loss": 4.4317, + "step": 22210 + }, + { + "epoch": 2.5308958368927614, + "grad_norm": 10.081154823303223, + "learning_rate": 2.475853407923279e-05, + "loss": 4.5045, + "step": 22220 + }, + { + "epoch": 2.532034853921066, + "grad_norm": 12.180397987365723, + "learning_rate": 2.474711725082772e-05, + "loss": 4.654, + "step": 22230 + }, + { + "epoch": 2.533173870949371, + "grad_norm": 11.417940139770508, + "learning_rate": 2.4735700422422654e-05, + "loss": 4.6833, + "step": 22240 + }, + { + "epoch": 2.5343128879776753, + "grad_norm": 14.055529594421387, + "learning_rate": 2.4724283594017582e-05, + "loss": 4.5018, + "step": 22250 + }, + { + "epoch": 2.5354519050059796, + "grad_norm": 27.471111297607422, + "learning_rate": 2.4712866765612514e-05, + "loss": 4.3245, + "step": 22260 + }, + { + "epoch": 2.5365909220342844, + "grad_norm": 10.814193725585938, + "learning_rate": 2.4701449937207442e-05, + "loss": 4.4528, + "step": 22270 + }, + { + "epoch": 2.537729939062589, + "grad_norm": 22.31861686706543, + "learning_rate": 2.4690033108802378e-05, + "loss": 4.5534, + "step": 22280 + }, + { + "epoch": 2.5388689560908935, + "grad_norm": 8.343460083007812, + "learning_rate": 2.4678616280397306e-05, + "loss": 4.6753, + "step": 22290 + }, + { + "epoch": 2.5400079731191982, + "grad_norm": 92.80477142333984, + "learning_rate": 2.4667199451992238e-05, + "loss": 5.0062, + "step": 22300 + }, + { + "epoch": 2.5411469901475026, + "grad_norm": 18.66383934020996, + 
"learning_rate": 2.4655782623587166e-05, + "loss": 4.2345, + "step": 22310 + }, + { + "epoch": 2.5422860071758073, + "grad_norm": 17.985797882080078, + "learning_rate": 2.46443657951821e-05, + "loss": 4.4421, + "step": 22320 + }, + { + "epoch": 2.5434250242041117, + "grad_norm": 11.158591270446777, + "learning_rate": 2.463294896677703e-05, + "loss": 4.4239, + "step": 22330 + }, + { + "epoch": 2.5445640412324164, + "grad_norm": 11.361953735351562, + "learning_rate": 2.4621532138371962e-05, + "loss": 4.4585, + "step": 22340 + }, + { + "epoch": 2.545703058260721, + "grad_norm": 11.874592781066895, + "learning_rate": 2.461011530996689e-05, + "loss": 4.3894, + "step": 22350 + }, + { + "epoch": 2.5468420752890255, + "grad_norm": 10.548654556274414, + "learning_rate": 2.4598698481561825e-05, + "loss": 4.2967, + "step": 22360 + }, + { + "epoch": 2.5479810923173303, + "grad_norm": 9.199511528015137, + "learning_rate": 2.4587281653156754e-05, + "loss": 4.4433, + "step": 22370 + }, + { + "epoch": 2.5491201093456346, + "grad_norm": 11.830724716186523, + "learning_rate": 2.4575864824751686e-05, + "loss": 4.2873, + "step": 22380 + }, + { + "epoch": 2.5502591263739394, + "grad_norm": 13.431696891784668, + "learning_rate": 2.4564447996346614e-05, + "loss": 4.6081, + "step": 22390 + }, + { + "epoch": 2.5513981434022437, + "grad_norm": 9.607420921325684, + "learning_rate": 2.455303116794155e-05, + "loss": 4.528, + "step": 22400 + }, + { + "epoch": 2.5525371604305485, + "grad_norm": 9.920609474182129, + "learning_rate": 2.4541614339536478e-05, + "loss": 4.3601, + "step": 22410 + }, + { + "epoch": 2.5536761774588532, + "grad_norm": 17.428359985351562, + "learning_rate": 2.453019751113141e-05, + "loss": 4.6, + "step": 22420 + }, + { + "epoch": 2.5548151944871575, + "grad_norm": 15.154529571533203, + "learning_rate": 2.4518780682726338e-05, + "loss": 4.621, + "step": 22430 + }, + { + "epoch": 2.5559542115154623, + "grad_norm": 8.525446891784668, + "learning_rate": 
2.4507363854321273e-05, + "loss": 4.2658, + "step": 22440 + }, + { + "epoch": 2.5570932285437666, + "grad_norm": 13.262287139892578, + "learning_rate": 2.4495947025916202e-05, + "loss": 4.4597, + "step": 22450 + }, + { + "epoch": 2.5582322455720714, + "grad_norm": 16.254911422729492, + "learning_rate": 2.4484530197511134e-05, + "loss": 4.3065, + "step": 22460 + }, + { + "epoch": 2.5593712626003757, + "grad_norm": 9.883187294006348, + "learning_rate": 2.4473113369106062e-05, + "loss": 4.7282, + "step": 22470 + }, + { + "epoch": 2.5605102796286805, + "grad_norm": 12.782791137695312, + "learning_rate": 2.4461696540700997e-05, + "loss": 4.4512, + "step": 22480 + }, + { + "epoch": 2.5616492966569853, + "grad_norm": 16.31939697265625, + "learning_rate": 2.4450279712295926e-05, + "loss": 4.5547, + "step": 22490 + }, + { + "epoch": 2.5627883136852896, + "grad_norm": 9.065730094909668, + "learning_rate": 2.4438862883890858e-05, + "loss": 4.404, + "step": 22500 + }, + { + "epoch": 2.563927330713594, + "grad_norm": 12.488701820373535, + "learning_rate": 2.4427446055485786e-05, + "loss": 4.5696, + "step": 22510 + }, + { + "epoch": 2.5650663477418987, + "grad_norm": 9.828531265258789, + "learning_rate": 2.4416029227080718e-05, + "loss": 4.3566, + "step": 22520 + }, + { + "epoch": 2.5662053647702034, + "grad_norm": 14.15413761138916, + "learning_rate": 2.440461239867565e-05, + "loss": 4.5018, + "step": 22530 + }, + { + "epoch": 2.5673443817985078, + "grad_norm": 14.06329345703125, + "learning_rate": 2.439319557027058e-05, + "loss": 4.4367, + "step": 22540 + }, + { + "epoch": 2.5684833988268125, + "grad_norm": 13.392003059387207, + "learning_rate": 2.438177874186551e-05, + "loss": 4.9594, + "step": 22550 + }, + { + "epoch": 2.5696224158551173, + "grad_norm": 9.897027969360352, + "learning_rate": 2.4370361913460442e-05, + "loss": 4.4857, + "step": 22560 + }, + { + "epoch": 2.5707614328834216, + "grad_norm": 16.1251163482666, + "learning_rate": 2.4358945085055374e-05, + "loss": 
4.2417, + "step": 22570 + }, + { + "epoch": 2.571900449911726, + "grad_norm": 17.117238998413086, + "learning_rate": 2.4347528256650302e-05, + "loss": 4.3507, + "step": 22580 + }, + { + "epoch": 2.5730394669400307, + "grad_norm": 15.291457176208496, + "learning_rate": 2.4336111428245234e-05, + "loss": 4.5056, + "step": 22590 + }, + { + "epoch": 2.5741784839683355, + "grad_norm": 12.484232902526855, + "learning_rate": 2.4324694599840166e-05, + "loss": 4.3101, + "step": 22600 + }, + { + "epoch": 2.57531750099664, + "grad_norm": 13.828505516052246, + "learning_rate": 2.4313277771435098e-05, + "loss": 4.3826, + "step": 22610 + }, + { + "epoch": 2.5764565180249446, + "grad_norm": 22.372825622558594, + "learning_rate": 2.4301860943030026e-05, + "loss": 4.4154, + "step": 22620 + }, + { + "epoch": 2.577595535053249, + "grad_norm": 27.027711868286133, + "learning_rate": 2.4290444114624958e-05, + "loss": 4.0432, + "step": 22630 + }, + { + "epoch": 2.5787345520815537, + "grad_norm": 12.284584045410156, + "learning_rate": 2.4279027286219886e-05, + "loss": 4.6365, + "step": 22640 + }, + { + "epoch": 2.579873569109858, + "grad_norm": 42.92761993408203, + "learning_rate": 2.426761045781482e-05, + "loss": 4.3148, + "step": 22650 + }, + { + "epoch": 2.5810125861381628, + "grad_norm": 13.668088912963867, + "learning_rate": 2.425619362940975e-05, + "loss": 4.459, + "step": 22660 + }, + { + "epoch": 2.5821516031664675, + "grad_norm": 30.51060676574707, + "learning_rate": 2.4244776801004682e-05, + "loss": 4.2199, + "step": 22670 + }, + { + "epoch": 2.583290620194772, + "grad_norm": 7.775396823883057, + "learning_rate": 2.423335997259961e-05, + "loss": 4.3111, + "step": 22680 + }, + { + "epoch": 2.5844296372230766, + "grad_norm": 17.842409133911133, + "learning_rate": 2.4221943144194546e-05, + "loss": 4.2721, + "step": 22690 + }, + { + "epoch": 2.585568654251381, + "grad_norm": 16.127065658569336, + "learning_rate": 2.4210526315789474e-05, + "loss": 4.544, + "step": 22700 + }, + { + 
"epoch": 2.5867076712796857, + "grad_norm": 10.717621803283691, + "learning_rate": 2.4199109487384406e-05, + "loss": 4.2006, + "step": 22710 + }, + { + "epoch": 2.58784668830799, + "grad_norm": 78.16552734375, + "learning_rate": 2.4187692658979334e-05, + "loss": 4.6037, + "step": 22720 + }, + { + "epoch": 2.588985705336295, + "grad_norm": 10.776037216186523, + "learning_rate": 2.417627583057427e-05, + "loss": 4.1843, + "step": 22730 + }, + { + "epoch": 2.5901247223645996, + "grad_norm": 12.085357666015625, + "learning_rate": 2.4164859002169198e-05, + "loss": 4.2983, + "step": 22740 + }, + { + "epoch": 2.591263739392904, + "grad_norm": 14.134846687316895, + "learning_rate": 2.415344217376413e-05, + "loss": 4.4293, + "step": 22750 + }, + { + "epoch": 2.5924027564212087, + "grad_norm": 13.935742378234863, + "learning_rate": 2.414202534535906e-05, + "loss": 4.2963, + "step": 22760 + }, + { + "epoch": 2.593541773449513, + "grad_norm": 11.380349159240723, + "learning_rate": 2.4130608516953994e-05, + "loss": 4.6185, + "step": 22770 + }, + { + "epoch": 2.5946807904778177, + "grad_norm": 9.011507987976074, + "learning_rate": 2.4119191688548922e-05, + "loss": 4.3858, + "step": 22780 + }, + { + "epoch": 2.595819807506122, + "grad_norm": 13.143577575683594, + "learning_rate": 2.4107774860143854e-05, + "loss": 4.801, + "step": 22790 + }, + { + "epoch": 2.596958824534427, + "grad_norm": 10.079553604125977, + "learning_rate": 2.4096358031738782e-05, + "loss": 4.4776, + "step": 22800 + }, + { + "epoch": 2.5980978415627316, + "grad_norm": 15.193427085876465, + "learning_rate": 2.4084941203333717e-05, + "loss": 4.1893, + "step": 22810 + }, + { + "epoch": 2.599236858591036, + "grad_norm": 18.3494815826416, + "learning_rate": 2.4073524374928646e-05, + "loss": 4.2356, + "step": 22820 + }, + { + "epoch": 2.6003758756193402, + "grad_norm": 9.987313270568848, + "learning_rate": 2.4062107546523578e-05, + "loss": 4.1031, + "step": 22830 + }, + { + "epoch": 2.601514892647645, + "grad_norm": 
10.688200950622559, + "learning_rate": 2.4050690718118506e-05, + "loss": 4.5624, + "step": 22840 + }, + { + "epoch": 2.60265390967595, + "grad_norm": 17.7220401763916, + "learning_rate": 2.403927388971344e-05, + "loss": 4.2588, + "step": 22850 + }, + { + "epoch": 2.603792926704254, + "grad_norm": 15.036736488342285, + "learning_rate": 2.402785706130837e-05, + "loss": 4.3631, + "step": 22860 + }, + { + "epoch": 2.604931943732559, + "grad_norm": 13.211268424987793, + "learning_rate": 2.4016440232903302e-05, + "loss": 4.1452, + "step": 22870 + }, + { + "epoch": 2.6060709607608636, + "grad_norm": 11.788058280944824, + "learning_rate": 2.400502340449823e-05, + "loss": 4.6403, + "step": 22880 + }, + { + "epoch": 2.607209977789168, + "grad_norm": 11.017247200012207, + "learning_rate": 2.3993606576093162e-05, + "loss": 4.6076, + "step": 22890 + }, + { + "epoch": 2.6083489948174723, + "grad_norm": 32.86994171142578, + "learning_rate": 2.3982189747688094e-05, + "loss": 4.045, + "step": 22900 + }, + { + "epoch": 2.609488011845777, + "grad_norm": 11.733973503112793, + "learning_rate": 2.3970772919283026e-05, + "loss": 4.4082, + "step": 22910 + }, + { + "epoch": 2.610627028874082, + "grad_norm": 12.394948959350586, + "learning_rate": 2.3959356090877954e-05, + "loss": 4.4233, + "step": 22920 + }, + { + "epoch": 2.611766045902386, + "grad_norm": 8.273716926574707, + "learning_rate": 2.3947939262472886e-05, + "loss": 4.3974, + "step": 22930 + }, + { + "epoch": 2.612905062930691, + "grad_norm": 12.322443962097168, + "learning_rate": 2.3936522434067818e-05, + "loss": 4.5955, + "step": 22940 + }, + { + "epoch": 2.6140440799589952, + "grad_norm": 15.869829177856445, + "learning_rate": 2.392510560566275e-05, + "loss": 4.3106, + "step": 22950 + }, + { + "epoch": 2.6151830969873, + "grad_norm": 9.611001968383789, + "learning_rate": 2.3913688777257678e-05, + "loss": 4.4168, + "step": 22960 + }, + { + "epoch": 2.6163221140156043, + "grad_norm": 10.356759071350098, + "learning_rate": 
2.390227194885261e-05, + "loss": 4.4961, + "step": 22970 + }, + { + "epoch": 2.617461131043909, + "grad_norm": 34.95732879638672, + "learning_rate": 2.3890855120447542e-05, + "loss": 4.0985, + "step": 22980 + }, + { + "epoch": 2.618600148072214, + "grad_norm": 39.855934143066406, + "learning_rate": 2.3879438292042474e-05, + "loss": 4.1007, + "step": 22990 + }, + { + "epoch": 2.619739165100518, + "grad_norm": 9.185993194580078, + "learning_rate": 2.3868021463637402e-05, + "loss": 4.4709, + "step": 23000 + }, + { + "epoch": 2.620878182128823, + "grad_norm": 14.936957359313965, + "learning_rate": 2.3856604635232334e-05, + "loss": 4.6402, + "step": 23010 + }, + { + "epoch": 2.6220171991571273, + "grad_norm": 11.28511905670166, + "learning_rate": 2.3845187806827266e-05, + "loss": 4.8301, + "step": 23020 + }, + { + "epoch": 2.623156216185432, + "grad_norm": 14.267093658447266, + "learning_rate": 2.3833770978422194e-05, + "loss": 4.439, + "step": 23030 + }, + { + "epoch": 2.6242952332137364, + "grad_norm": 9.55140209197998, + "learning_rate": 2.3822354150017126e-05, + "loss": 4.4499, + "step": 23040 + }, + { + "epoch": 2.625434250242041, + "grad_norm": 12.693892478942871, + "learning_rate": 2.3810937321612058e-05, + "loss": 4.3799, + "step": 23050 + }, + { + "epoch": 2.626573267270346, + "grad_norm": 19.29254150390625, + "learning_rate": 2.379952049320699e-05, + "loss": 4.4028, + "step": 23060 + }, + { + "epoch": 2.6277122842986502, + "grad_norm": 18.440366744995117, + "learning_rate": 2.3788103664801918e-05, + "loss": 4.2386, + "step": 23070 + }, + { + "epoch": 2.628851301326955, + "grad_norm": 11.92179012298584, + "learning_rate": 2.377668683639685e-05, + "loss": 4.1973, + "step": 23080 + }, + { + "epoch": 2.6299903183552593, + "grad_norm": 10.336345672607422, + "learning_rate": 2.376527000799178e-05, + "loss": 4.4222, + "step": 23090 + }, + { + "epoch": 2.631129335383564, + "grad_norm": 20.774919509887695, + "learning_rate": 2.3753853179586714e-05, + "loss": 4.2102, + 
"step": 23100 + }, + { + "epoch": 2.6322683524118684, + "grad_norm": 11.118274688720703, + "learning_rate": 2.3742436351181642e-05, + "loss": 4.7242, + "step": 23110 + }, + { + "epoch": 2.633407369440173, + "grad_norm": 9.938228607177734, + "learning_rate": 2.3731019522776574e-05, + "loss": 4.5976, + "step": 23120 + }, + { + "epoch": 2.634546386468478, + "grad_norm": 13.257858276367188, + "learning_rate": 2.3720744377212013e-05, + "loss": 4.2183, + "step": 23130 + }, + { + "epoch": 2.6356854034967823, + "grad_norm": 12.497907638549805, + "learning_rate": 2.3709327548806942e-05, + "loss": 4.5292, + "step": 23140 + }, + { + "epoch": 2.6368244205250866, + "grad_norm": 11.595098495483398, + "learning_rate": 2.3697910720401874e-05, + "loss": 4.2823, + "step": 23150 + }, + { + "epoch": 2.6379634375533914, + "grad_norm": 10.914962768554688, + "learning_rate": 2.3686493891996805e-05, + "loss": 4.4755, + "step": 23160 + }, + { + "epoch": 2.639102454581696, + "grad_norm": 8.901376724243164, + "learning_rate": 2.3675077063591737e-05, + "loss": 4.5236, + "step": 23170 + }, + { + "epoch": 2.6402414716100004, + "grad_norm": 9.814475059509277, + "learning_rate": 2.3663660235186666e-05, + "loss": 4.5523, + "step": 23180 + }, + { + "epoch": 2.641380488638305, + "grad_norm": 26.18296241760254, + "learning_rate": 2.3652243406781598e-05, + "loss": 4.3089, + "step": 23190 + }, + { + "epoch": 2.64251950566661, + "grad_norm": 16.817184448242188, + "learning_rate": 2.364082657837653e-05, + "loss": 4.3481, + "step": 23200 + }, + { + "epoch": 2.6436585226949143, + "grad_norm": 11.565564155578613, + "learning_rate": 2.3629409749971458e-05, + "loss": 4.4284, + "step": 23210 + }, + { + "epoch": 2.6447975397232186, + "grad_norm": 18.328445434570312, + "learning_rate": 2.361799292156639e-05, + "loss": 4.5163, + "step": 23220 + }, + { + "epoch": 2.6459365567515234, + "grad_norm": 17.54689598083496, + "learning_rate": 2.360657609316132e-05, + "loss": 4.1599, + "step": 23230 + }, + { + "epoch": 
2.647075573779828, + "grad_norm": 11.191732406616211, + "learning_rate": 2.3595159264756253e-05, + "loss": 4.4104, + "step": 23240 + }, + { + "epoch": 2.6482145908081325, + "grad_norm": 9.781057357788086, + "learning_rate": 2.3583742436351182e-05, + "loss": 4.2188, + "step": 23250 + }, + { + "epoch": 2.6493536078364373, + "grad_norm": 14.09641170501709, + "learning_rate": 2.3572325607946114e-05, + "loss": 4.1248, + "step": 23260 + }, + { + "epoch": 2.6504926248647416, + "grad_norm": 21.603864669799805, + "learning_rate": 2.3560908779541045e-05, + "loss": 4.4381, + "step": 23270 + }, + { + "epoch": 2.6516316418930463, + "grad_norm": 34.63275909423828, + "learning_rate": 2.3549491951135977e-05, + "loss": 4.3809, + "step": 23280 + }, + { + "epoch": 2.6527706589213507, + "grad_norm": 15.982152938842773, + "learning_rate": 2.3538075122730906e-05, + "loss": 4.2162, + "step": 23290 + }, + { + "epoch": 2.6539096759496554, + "grad_norm": 11.666319847106934, + "learning_rate": 2.3526658294325838e-05, + "loss": 4.7708, + "step": 23300 + }, + { + "epoch": 2.65504869297796, + "grad_norm": 13.447303771972656, + "learning_rate": 2.3515241465920766e-05, + "loss": 4.6003, + "step": 23310 + }, + { + "epoch": 2.6561877100062645, + "grad_norm": 30.06685447692871, + "learning_rate": 2.35038246375157e-05, + "loss": 4.3196, + "step": 23320 + }, + { + "epoch": 2.6573267270345693, + "grad_norm": 18.421146392822266, + "learning_rate": 2.349240780911063e-05, + "loss": 4.5392, + "step": 23330 + }, + { + "epoch": 2.6584657440628736, + "grad_norm": 7.0810227394104, + "learning_rate": 2.348099098070556e-05, + "loss": 4.5955, + "step": 23340 + }, + { + "epoch": 2.6596047610911784, + "grad_norm": 18.516857147216797, + "learning_rate": 2.346957415230049e-05, + "loss": 4.3521, + "step": 23350 + }, + { + "epoch": 2.6607437781194827, + "grad_norm": 10.7908935546875, + "learning_rate": 2.3458157323895422e-05, + "loss": 4.2535, + "step": 23360 + }, + { + "epoch": 2.6618827951477875, + "grad_norm": 
46.05020523071289, + "learning_rate": 2.3446740495490354e-05, + "loss": 4.271, + "step": 23370 + }, + { + "epoch": 2.6630218121760922, + "grad_norm": 16.857059478759766, + "learning_rate": 2.3435323667085286e-05, + "loss": 4.7449, + "step": 23380 + }, + { + "epoch": 2.6641608292043966, + "grad_norm": 15.106719017028809, + "learning_rate": 2.3423906838680214e-05, + "loss": 4.6435, + "step": 23390 + }, + { + "epoch": 2.6652998462327013, + "grad_norm": 19.65445899963379, + "learning_rate": 2.3412490010275146e-05, + "loss": 4.3389, + "step": 23400 + }, + { + "epoch": 2.6664388632610057, + "grad_norm": 10.110185623168945, + "learning_rate": 2.3401073181870078e-05, + "loss": 4.544, + "step": 23410 + }, + { + "epoch": 2.6675778802893104, + "grad_norm": 24.37737274169922, + "learning_rate": 2.338965635346501e-05, + "loss": 4.5911, + "step": 23420 + }, + { + "epoch": 2.6687168973176147, + "grad_norm": 26.02604103088379, + "learning_rate": 2.3378239525059938e-05, + "loss": 4.1215, + "step": 23430 + }, + { + "epoch": 2.6698559143459195, + "grad_norm": 10.337889671325684, + "learning_rate": 2.336682269665487e-05, + "loss": 4.4812, + "step": 23440 + }, + { + "epoch": 2.6709949313742243, + "grad_norm": 12.305615425109863, + "learning_rate": 2.33554058682498e-05, + "loss": 4.3729, + "step": 23450 + }, + { + "epoch": 2.6721339484025286, + "grad_norm": 23.679931640625, + "learning_rate": 2.3343989039844733e-05, + "loss": 4.5077, + "step": 23460 + }, + { + "epoch": 2.673272965430833, + "grad_norm": 19.276321411132812, + "learning_rate": 2.3332572211439662e-05, + "loss": 4.7323, + "step": 23470 + }, + { + "epoch": 2.6744119824591377, + "grad_norm": 18.629981994628906, + "learning_rate": 2.3321155383034594e-05, + "loss": 4.6628, + "step": 23480 + }, + { + "epoch": 2.6755509994874425, + "grad_norm": 57.32600021362305, + "learning_rate": 2.3309738554629526e-05, + "loss": 4.3868, + "step": 23490 + }, + { + "epoch": 2.676690016515747, + "grad_norm": 10.035811424255371, + "learning_rate": 
2.3298321726224457e-05, + "loss": 4.4898, + "step": 23500 + }, + { + "epoch": 2.6778290335440516, + "grad_norm": 11.750822067260742, + "learning_rate": 2.3286904897819386e-05, + "loss": 4.542, + "step": 23510 + }, + { + "epoch": 2.6789680505723563, + "grad_norm": 11.74675178527832, + "learning_rate": 2.3275488069414318e-05, + "loss": 4.7508, + "step": 23520 + }, + { + "epoch": 2.6801070676006606, + "grad_norm": 13.379685401916504, + "learning_rate": 2.326407124100925e-05, + "loss": 4.4905, + "step": 23530 + }, + { + "epoch": 2.681246084628965, + "grad_norm": 17.032699584960938, + "learning_rate": 2.325265441260418e-05, + "loss": 4.4351, + "step": 23540 + }, + { + "epoch": 2.6823851016572697, + "grad_norm": 11.6006441116333, + "learning_rate": 2.324123758419911e-05, + "loss": 4.0651, + "step": 23550 + }, + { + "epoch": 2.6835241186855745, + "grad_norm": 10.766636848449707, + "learning_rate": 2.322982075579404e-05, + "loss": 4.6497, + "step": 23560 + }, + { + "epoch": 2.684663135713879, + "grad_norm": 17.304126739501953, + "learning_rate": 2.3218403927388973e-05, + "loss": 4.3875, + "step": 23570 + }, + { + "epoch": 2.6858021527421836, + "grad_norm": 8.679882049560547, + "learning_rate": 2.3206987098983905e-05, + "loss": 4.3275, + "step": 23580 + }, + { + "epoch": 2.686941169770488, + "grad_norm": 12.993999481201172, + "learning_rate": 2.3195570270578834e-05, + "loss": 4.5775, + "step": 23590 + }, + { + "epoch": 2.6880801867987927, + "grad_norm": 19.391279220581055, + "learning_rate": 2.3184153442173766e-05, + "loss": 4.3392, + "step": 23600 + }, + { + "epoch": 2.689219203827097, + "grad_norm": 9.784872055053711, + "learning_rate": 2.3172736613768697e-05, + "loss": 4.4187, + "step": 23610 + }, + { + "epoch": 2.6903582208554018, + "grad_norm": 17.43511199951172, + "learning_rate": 2.316131978536363e-05, + "loss": 4.6063, + "step": 23620 + }, + { + "epoch": 2.6914972378837065, + "grad_norm": 47.65671157836914, + "learning_rate": 2.3149902956958558e-05, + "loss": 
4.1528, + "step": 23630 + }, + { + "epoch": 2.692636254912011, + "grad_norm": 11.7083740234375, + "learning_rate": 2.313848612855349e-05, + "loss": 4.5552, + "step": 23640 + }, + { + "epoch": 2.6937752719403156, + "grad_norm": 42.59748840332031, + "learning_rate": 2.312706930014842e-05, + "loss": 4.3025, + "step": 23650 + }, + { + "epoch": 2.69491428896862, + "grad_norm": 23.80282974243164, + "learning_rate": 2.311565247174335e-05, + "loss": 4.593, + "step": 23660 + }, + { + "epoch": 2.6960533059969247, + "grad_norm": 12.07687759399414, + "learning_rate": 2.310423564333828e-05, + "loss": 4.6442, + "step": 23670 + }, + { + "epoch": 2.697192323025229, + "grad_norm": 9.943156242370605, + "learning_rate": 2.3092818814933214e-05, + "loss": 4.3141, + "step": 23680 + }, + { + "epoch": 2.698331340053534, + "grad_norm": 45.91529846191406, + "learning_rate": 2.3081401986528142e-05, + "loss": 4.6511, + "step": 23690 + }, + { + "epoch": 2.6994703570818386, + "grad_norm": 14.183732032775879, + "learning_rate": 2.3069985158123074e-05, + "loss": 4.3711, + "step": 23700 + }, + { + "epoch": 2.700609374110143, + "grad_norm": 10.457621574401855, + "learning_rate": 2.3058568329718006e-05, + "loss": 4.1473, + "step": 23710 + }, + { + "epoch": 2.7017483911384477, + "grad_norm": 15.812500953674316, + "learning_rate": 2.3047151501312934e-05, + "loss": 4.4821, + "step": 23720 + }, + { + "epoch": 2.702887408166752, + "grad_norm": 10.635324478149414, + "learning_rate": 2.3035734672907866e-05, + "loss": 4.6118, + "step": 23730 + }, + { + "epoch": 2.7040264251950568, + "grad_norm": 23.687637329101562, + "learning_rate": 2.3024317844502798e-05, + "loss": 4.4089, + "step": 23740 + }, + { + "epoch": 2.705165442223361, + "grad_norm": 11.307161331176758, + "learning_rate": 2.301290101609773e-05, + "loss": 4.4876, + "step": 23750 + }, + { + "epoch": 2.706304459251666, + "grad_norm": 9.728133201599121, + "learning_rate": 2.3001484187692658e-05, + "loss": 4.4325, + "step": 23760 + }, + { + "epoch": 
2.7074434762799706, + "grad_norm": 17.820701599121094, + "learning_rate": 2.299006735928759e-05, + "loss": 4.4467, + "step": 23770 + }, + { + "epoch": 2.708582493308275, + "grad_norm": 14.636841773986816, + "learning_rate": 2.2978650530882522e-05, + "loss": 4.4374, + "step": 23780 + }, + { + "epoch": 2.7097215103365793, + "grad_norm": 14.008092880249023, + "learning_rate": 2.2967233702477454e-05, + "loss": 3.9471, + "step": 23790 + }, + { + "epoch": 2.710860527364884, + "grad_norm": 11.054430961608887, + "learning_rate": 2.2955816874072382e-05, + "loss": 4.1937, + "step": 23800 + }, + { + "epoch": 2.711999544393189, + "grad_norm": 10.359509468078613, + "learning_rate": 2.2944400045667314e-05, + "loss": 4.6554, + "step": 23810 + }, + { + "epoch": 2.713138561421493, + "grad_norm": 11.266721725463867, + "learning_rate": 2.2932983217262246e-05, + "loss": 4.2294, + "step": 23820 + }, + { + "epoch": 2.714277578449798, + "grad_norm": 13.704965591430664, + "learning_rate": 2.2921566388857178e-05, + "loss": 4.446, + "step": 23830 + }, + { + "epoch": 2.7154165954781027, + "grad_norm": 10.502473831176758, + "learning_rate": 2.2910149560452106e-05, + "loss": 4.669, + "step": 23840 + }, + { + "epoch": 2.716555612506407, + "grad_norm": 11.850542068481445, + "learning_rate": 2.2898732732047038e-05, + "loss": 4.5317, + "step": 23850 + }, + { + "epoch": 2.7176946295347113, + "grad_norm": 28.340858459472656, + "learning_rate": 2.288731590364197e-05, + "loss": 4.4106, + "step": 23860 + }, + { + "epoch": 2.718833646563016, + "grad_norm": 13.200830459594727, + "learning_rate": 2.28758990752369e-05, + "loss": 4.4204, + "step": 23870 + }, + { + "epoch": 2.719972663591321, + "grad_norm": 22.98053741455078, + "learning_rate": 2.286448224683183e-05, + "loss": 4.1871, + "step": 23880 + }, + { + "epoch": 2.721111680619625, + "grad_norm": 20.95444107055664, + "learning_rate": 2.2853065418426762e-05, + "loss": 4.3372, + "step": 23890 + }, + { + "epoch": 2.72225069764793, + "grad_norm": 
11.450345993041992, + "learning_rate": 2.2841648590021694e-05, + "loss": 4.4479, + "step": 23900 + }, + { + "epoch": 2.7233897146762343, + "grad_norm": 48.11774826049805, + "learning_rate": 2.2830231761616625e-05, + "loss": 4.208, + "step": 23910 + }, + { + "epoch": 2.724528731704539, + "grad_norm": 17.685771942138672, + "learning_rate": 2.2818814933211554e-05, + "loss": 4.3589, + "step": 23920 + }, + { + "epoch": 2.7256677487328433, + "grad_norm": 13.555407524108887, + "learning_rate": 2.2807398104806486e-05, + "loss": 4.4688, + "step": 23930 + }, + { + "epoch": 2.726806765761148, + "grad_norm": 12.690849304199219, + "learning_rate": 2.2795981276401418e-05, + "loss": 4.1132, + "step": 23940 + }, + { + "epoch": 2.727945782789453, + "grad_norm": 12.816424369812012, + "learning_rate": 2.278456444799635e-05, + "loss": 4.5699, + "step": 23950 + }, + { + "epoch": 2.729084799817757, + "grad_norm": 10.102202415466309, + "learning_rate": 2.2773147619591278e-05, + "loss": 4.3418, + "step": 23960 + }, + { + "epoch": 2.730223816846062, + "grad_norm": 12.096992492675781, + "learning_rate": 2.276173079118621e-05, + "loss": 4.3054, + "step": 23970 + }, + { + "epoch": 2.7313628338743663, + "grad_norm": 11.40463638305664, + "learning_rate": 2.275031396278114e-05, + "loss": 4.4429, + "step": 23980 + }, + { + "epoch": 2.732501850902671, + "grad_norm": 28.24003028869629, + "learning_rate": 2.2738897134376073e-05, + "loss": 4.2202, + "step": 23990 + }, + { + "epoch": 2.7336408679309754, + "grad_norm": 12.26915168762207, + "learning_rate": 2.2727480305971002e-05, + "loss": 4.4253, + "step": 24000 + }, + { + "epoch": 2.7336408679309754, + "eval_loss": 6.015596389770508, + "eval_runtime": 11.1696, + "eval_samples_per_second": 1.343, + "eval_steps_per_second": 0.179, + "step": 24000 + }, + { + "epoch": 2.73477988495928, + "grad_norm": 11.82247543334961, + "learning_rate": 2.2716063477565934e-05, + "loss": 4.3248, + "step": 24010 + }, + { + "epoch": 2.735918901987585, + "grad_norm": 
19.194496154785156, + "learning_rate": 2.2704646649160862e-05, + "loss": 4.542, + "step": 24020 + }, + { + "epoch": 2.7370579190158892, + "grad_norm": 21.586811065673828, + "learning_rate": 2.2693229820755797e-05, + "loss": 4.1183, + "step": 24030 + }, + { + "epoch": 2.738196936044194, + "grad_norm": 10.829854011535645, + "learning_rate": 2.2681812992350726e-05, + "loss": 4.367, + "step": 24040 + }, + { + "epoch": 2.7393359530724983, + "grad_norm": 9.324262619018555, + "learning_rate": 2.2670396163945658e-05, + "loss": 4.5154, + "step": 24050 + }, + { + "epoch": 2.740474970100803, + "grad_norm": 13.50275993347168, + "learning_rate": 2.2658979335540586e-05, + "loss": 4.6754, + "step": 24060 + }, + { + "epoch": 2.7416139871291074, + "grad_norm": 20.0113468170166, + "learning_rate": 2.264756250713552e-05, + "loss": 4.7198, + "step": 24070 + }, + { + "epoch": 2.742753004157412, + "grad_norm": 10.483927726745605, + "learning_rate": 2.263614567873045e-05, + "loss": 4.4205, + "step": 24080 + }, + { + "epoch": 2.743892021185717, + "grad_norm": 16.790781021118164, + "learning_rate": 2.262472885032538e-05, + "loss": 4.5606, + "step": 24090 + }, + { + "epoch": 2.7450310382140213, + "grad_norm": 11.87458610534668, + "learning_rate": 2.261331202192031e-05, + "loss": 4.3453, + "step": 24100 + }, + { + "epoch": 2.7461700552423256, + "grad_norm": 11.971658706665039, + "learning_rate": 2.2601895193515242e-05, + "loss": 4.116, + "step": 24110 + }, + { + "epoch": 2.7473090722706304, + "grad_norm": 9.94765853881836, + "learning_rate": 2.2590478365110174e-05, + "loss": 4.4187, + "step": 24120 + }, + { + "epoch": 2.748448089298935, + "grad_norm": 16.397294998168945, + "learning_rate": 2.2579061536705106e-05, + "loss": 4.5822, + "step": 24130 + }, + { + "epoch": 2.7495871063272395, + "grad_norm": 18.915102005004883, + "learning_rate": 2.2567644708300034e-05, + "loss": 4.5509, + "step": 24140 + }, + { + "epoch": 2.7507261233555442, + "grad_norm": 19.87112808227539, + "learning_rate": 
2.2556227879894966e-05, + "loss": 4.7238, + "step": 24150 + }, + { + "epoch": 2.751865140383849, + "grad_norm": 39.02969741821289, + "learning_rate": 2.2544811051489898e-05, + "loss": 4.2455, + "step": 24160 + }, + { + "epoch": 2.7530041574121533, + "grad_norm": 13.967534065246582, + "learning_rate": 2.2533394223084826e-05, + "loss": 4.7613, + "step": 24170 + }, + { + "epoch": 2.7541431744404576, + "grad_norm": 21.92757797241211, + "learning_rate": 2.2521977394679758e-05, + "loss": 4.2869, + "step": 24180 + }, + { + "epoch": 2.7552821914687624, + "grad_norm": 12.102161407470703, + "learning_rate": 2.251056056627469e-05, + "loss": 4.2129, + "step": 24190 + }, + { + "epoch": 2.756421208497067, + "grad_norm": 8.07431411743164, + "learning_rate": 2.249914373786962e-05, + "loss": 4.4516, + "step": 24200 + }, + { + "epoch": 2.7575602255253715, + "grad_norm": 8.32588005065918, + "learning_rate": 2.248772690946455e-05, + "loss": 4.4829, + "step": 24210 + }, + { + "epoch": 2.7586992425536763, + "grad_norm": 7.788595676422119, + "learning_rate": 2.2476310081059482e-05, + "loss": 4.4237, + "step": 24220 + }, + { + "epoch": 2.7598382595819806, + "grad_norm": 9.158100128173828, + "learning_rate": 2.2464893252654414e-05, + "loss": 4.1744, + "step": 24230 + }, + { + "epoch": 2.7609772766102854, + "grad_norm": 12.381442070007324, + "learning_rate": 2.2453476424249346e-05, + "loss": 4.3906, + "step": 24240 + }, + { + "epoch": 2.7621162936385897, + "grad_norm": 11.133394241333008, + "learning_rate": 2.2442059595844274e-05, + "loss": 4.1881, + "step": 24250 + }, + { + "epoch": 2.7632553106668944, + "grad_norm": 14.3223295211792, + "learning_rate": 2.2430642767439206e-05, + "loss": 4.4534, + "step": 24260 + }, + { + "epoch": 2.764394327695199, + "grad_norm": 11.660000801086426, + "learning_rate": 2.2419225939034138e-05, + "loss": 4.399, + "step": 24270 + }, + { + "epoch": 2.7655333447235035, + "grad_norm": 9.384432792663574, + "learning_rate": 2.240780911062907e-05, + "loss": 4.447, + 
"step": 24280 + }, + { + "epoch": 2.7666723617518083, + "grad_norm": 10.920495986938477, + "learning_rate": 2.2396392282223998e-05, + "loss": 4.4813, + "step": 24290 + }, + { + "epoch": 2.7678113787801126, + "grad_norm": 10.877670288085938, + "learning_rate": 2.238497545381893e-05, + "loss": 4.5073, + "step": 24300 + }, + { + "epoch": 2.7689503958084174, + "grad_norm": 11.857516288757324, + "learning_rate": 2.237355862541386e-05, + "loss": 4.4858, + "step": 24310 + }, + { + "epoch": 2.7700894128367217, + "grad_norm": 19.363431930541992, + "learning_rate": 2.2362141797008794e-05, + "loss": 4.4434, + "step": 24320 + }, + { + "epoch": 2.7712284298650265, + "grad_norm": 13.742807388305664, + "learning_rate": 2.2350724968603722e-05, + "loss": 3.8316, + "step": 24330 + }, + { + "epoch": 2.7723674468933313, + "grad_norm": 7.8158135414123535, + "learning_rate": 2.2339308140198654e-05, + "loss": 4.4209, + "step": 24340 + }, + { + "epoch": 2.7735064639216356, + "grad_norm": 19.696626663208008, + "learning_rate": 2.2327891311793586e-05, + "loss": 4.4275, + "step": 24350 + }, + { + "epoch": 2.7746454809499403, + "grad_norm": 13.6576509475708, + "learning_rate": 2.2316474483388517e-05, + "loss": 4.53, + "step": 24360 + }, + { + "epoch": 2.7757844979782447, + "grad_norm": 12.400209426879883, + "learning_rate": 2.2305057654983446e-05, + "loss": 4.3855, + "step": 24370 + }, + { + "epoch": 2.7769235150065494, + "grad_norm": 24.432838439941406, + "learning_rate": 2.2293640826578378e-05, + "loss": 4.2563, + "step": 24380 + }, + { + "epoch": 2.7780625320348538, + "grad_norm": 9.757552146911621, + "learning_rate": 2.2282223998173306e-05, + "loss": 4.2891, + "step": 24390 + }, + { + "epoch": 2.7792015490631585, + "grad_norm": 50.52961349487305, + "learning_rate": 2.227080716976824e-05, + "loss": 4.4685, + "step": 24400 + }, + { + "epoch": 2.7803405660914633, + "grad_norm": 28.591514587402344, + "learning_rate": 2.225939034136317e-05, + "loss": 4.6151, + "step": 24410 + }, + { + "epoch": 
2.7814795831197676, + "grad_norm": 13.835251808166504, + "learning_rate": 2.2247973512958102e-05, + "loss": 4.3123, + "step": 24420 + }, + { + "epoch": 2.782618600148072, + "grad_norm": 9.26570987701416, + "learning_rate": 2.223655668455303e-05, + "loss": 4.4072, + "step": 24430 + }, + { + "epoch": 2.7837576171763767, + "grad_norm": 13.097249031066895, + "learning_rate": 2.2225139856147965e-05, + "loss": 4.4835, + "step": 24440 + }, + { + "epoch": 2.7848966342046815, + "grad_norm": 8.850903511047363, + "learning_rate": 2.2213723027742894e-05, + "loss": 4.8888, + "step": 24450 + }, + { + "epoch": 2.786035651232986, + "grad_norm": 14.328838348388672, + "learning_rate": 2.2202306199337826e-05, + "loss": 4.1116, + "step": 24460 + }, + { + "epoch": 2.7871746682612906, + "grad_norm": 12.785542488098145, + "learning_rate": 2.2190889370932754e-05, + "loss": 4.6598, + "step": 24470 + }, + { + "epoch": 2.7883136852895953, + "grad_norm": 11.574085235595703, + "learning_rate": 2.217947254252769e-05, + "loss": 4.7404, + "step": 24480 + }, + { + "epoch": 2.7894527023178997, + "grad_norm": 13.864222526550293, + "learning_rate": 2.2168055714122618e-05, + "loss": 4.5075, + "step": 24490 + }, + { + "epoch": 2.790591719346204, + "grad_norm": 9.302299499511719, + "learning_rate": 2.215663888571755e-05, + "loss": 4.4674, + "step": 24500 + }, + { + "epoch": 2.7917307363745087, + "grad_norm": 11.267061233520508, + "learning_rate": 2.2145222057312478e-05, + "loss": 4.5262, + "step": 24510 + }, + { + "epoch": 2.7928697534028135, + "grad_norm": 15.274855613708496, + "learning_rate": 2.213380522890741e-05, + "loss": 4.1499, + "step": 24520 + }, + { + "epoch": 2.794008770431118, + "grad_norm": 12.671558380126953, + "learning_rate": 2.2122388400502342e-05, + "loss": 4.6206, + "step": 24530 + }, + { + "epoch": 2.7951477874594226, + "grad_norm": 12.626591682434082, + "learning_rate": 2.2110971572097274e-05, + "loss": 4.5262, + "step": 24540 + }, + { + "epoch": 2.796286804487727, + "grad_norm": 
33.061283111572266, + "learning_rate": 2.2099554743692202e-05, + "loss": 3.9884, + "step": 24550 + }, + { + "epoch": 2.7974258215160317, + "grad_norm": 14.592642784118652, + "learning_rate": 2.2088137915287134e-05, + "loss": 4.2963, + "step": 24560 + }, + { + "epoch": 2.798564838544336, + "grad_norm": 20.097068786621094, + "learning_rate": 2.2076721086882066e-05, + "loss": 4.4814, + "step": 24570 + }, + { + "epoch": 2.799703855572641, + "grad_norm": 11.807546615600586, + "learning_rate": 2.2065304258476998e-05, + "loss": 4.2837, + "step": 24580 + }, + { + "epoch": 2.8008428726009456, + "grad_norm": 13.322919845581055, + "learning_rate": 2.2053887430071926e-05, + "loss": 4.328, + "step": 24590 + }, + { + "epoch": 2.80198188962925, + "grad_norm": 10.876242637634277, + "learning_rate": 2.2042470601666858e-05, + "loss": 4.4833, + "step": 24600 + }, + { + "epoch": 2.8031209066575546, + "grad_norm": 8.46240234375, + "learning_rate": 2.203105377326179e-05, + "loss": 4.0987, + "step": 24610 + }, + { + "epoch": 2.804259923685859, + "grad_norm": 8.846264839172363, + "learning_rate": 2.2019636944856718e-05, + "loss": 4.2597, + "step": 24620 + }, + { + "epoch": 2.8053989407141637, + "grad_norm": 11.06393814086914, + "learning_rate": 2.200822011645165e-05, + "loss": 4.4988, + "step": 24630 + }, + { + "epoch": 2.806537957742468, + "grad_norm": 16.189380645751953, + "learning_rate": 2.1996803288046582e-05, + "loss": 4.245, + "step": 24640 + }, + { + "epoch": 2.807676974770773, + "grad_norm": 9.137038230895996, + "learning_rate": 2.1985386459641514e-05, + "loss": 4.3353, + "step": 24650 + }, + { + "epoch": 2.8088159917990776, + "grad_norm": 20.32309913635254, + "learning_rate": 2.1973969631236442e-05, + "loss": 4.1225, + "step": 24660 + }, + { + "epoch": 2.809955008827382, + "grad_norm": 9.596814155578613, + "learning_rate": 2.1962552802831374e-05, + "loss": 4.2842, + "step": 24670 + }, + { + "epoch": 2.8110940258556867, + "grad_norm": 42.25757598876953, + "learning_rate": 
2.1951135974426306e-05, + "loss": 4.3953, + "step": 24680 + }, + { + "epoch": 2.812233042883991, + "grad_norm": 10.706561088562012, + "learning_rate": 2.1939719146021238e-05, + "loss": 4.3581, + "step": 24690 + }, + { + "epoch": 2.8133720599122958, + "grad_norm": 8.807883262634277, + "learning_rate": 2.1928302317616166e-05, + "loss": 4.4784, + "step": 24700 + }, + { + "epoch": 2.8145110769406, + "grad_norm": 12.849080085754395, + "learning_rate": 2.1916885489211098e-05, + "loss": 4.1881, + "step": 24710 + }, + { + "epoch": 2.815650093968905, + "grad_norm": 31.404457092285156, + "learning_rate": 2.1905468660806026e-05, + "loss": 3.8575, + "step": 24720 + }, + { + "epoch": 2.8167891109972096, + "grad_norm": 10.283102989196777, + "learning_rate": 2.189405183240096e-05, + "loss": 4.3447, + "step": 24730 + }, + { + "epoch": 2.817928128025514, + "grad_norm": 14.568113327026367, + "learning_rate": 2.188263500399589e-05, + "loss": 4.3281, + "step": 24740 + }, + { + "epoch": 2.8190671450538183, + "grad_norm": 19.217689514160156, + "learning_rate": 2.1871218175590822e-05, + "loss": 4.1652, + "step": 24750 + }, + { + "epoch": 2.820206162082123, + "grad_norm": 13.5247802734375, + "learning_rate": 2.185980134718575e-05, + "loss": 4.4339, + "step": 24760 + }, + { + "epoch": 2.821345179110428, + "grad_norm": 31.91325569152832, + "learning_rate": 2.1848384518780686e-05, + "loss": 4.5414, + "step": 24770 + }, + { + "epoch": 2.822484196138732, + "grad_norm": 10.52747631072998, + "learning_rate": 2.1836967690375614e-05, + "loss": 4.1582, + "step": 24780 + }, + { + "epoch": 2.823623213167037, + "grad_norm": 10.953377723693848, + "learning_rate": 2.1825550861970546e-05, + "loss": 4.7382, + "step": 24790 + }, + { + "epoch": 2.8247622301953417, + "grad_norm": 16.29425048828125, + "learning_rate": 2.1814134033565474e-05, + "loss": 4.3219, + "step": 24800 + }, + { + "epoch": 2.825901247223646, + "grad_norm": 11.031655311584473, + "learning_rate": 2.180271720516041e-05, + "loss": 4.5626, + 
"step": 24810 + }, + { + "epoch": 2.8270402642519503, + "grad_norm": 10.524728775024414, + "learning_rate": 2.1791300376755338e-05, + "loss": 4.4956, + "step": 24820 + }, + { + "epoch": 2.828179281280255, + "grad_norm": 18.397052764892578, + "learning_rate": 2.177988354835027e-05, + "loss": 4.6385, + "step": 24830 + }, + { + "epoch": 2.82931829830856, + "grad_norm": 9.142768859863281, + "learning_rate": 2.1768466719945198e-05, + "loss": 4.0946, + "step": 24840 + }, + { + "epoch": 2.830457315336864, + "grad_norm": 21.6534366607666, + "learning_rate": 2.1757049891540133e-05, + "loss": 4.0786, + "step": 24850 + }, + { + "epoch": 2.831596332365169, + "grad_norm": 15.395153045654297, + "learning_rate": 2.1745633063135062e-05, + "loss": 4.2351, + "step": 24860 + }, + { + "epoch": 2.8327353493934733, + "grad_norm": 10.324874877929688, + "learning_rate": 2.1734216234729994e-05, + "loss": 4.5813, + "step": 24870 + }, + { + "epoch": 2.833874366421778, + "grad_norm": 8.68514347076416, + "learning_rate": 2.1722799406324922e-05, + "loss": 4.1804, + "step": 24880 + }, + { + "epoch": 2.8350133834500824, + "grad_norm": 24.952619552612305, + "learning_rate": 2.1711382577919857e-05, + "loss": 4.4424, + "step": 24890 + }, + { + "epoch": 2.836152400478387, + "grad_norm": 10.112604141235352, + "learning_rate": 2.1699965749514786e-05, + "loss": 4.3136, + "step": 24900 + }, + { + "epoch": 2.837291417506692, + "grad_norm": 9.500236511230469, + "learning_rate": 2.1688548921109718e-05, + "loss": 4.2868, + "step": 24910 + }, + { + "epoch": 2.838430434534996, + "grad_norm": 11.29405403137207, + "learning_rate": 2.1677132092704646e-05, + "loss": 4.8407, + "step": 24920 + }, + { + "epoch": 2.839569451563301, + "grad_norm": 14.321564674377441, + "learning_rate": 2.166571526429958e-05, + "loss": 4.587, + "step": 24930 + }, + { + "epoch": 2.8407084685916053, + "grad_norm": 14.331136703491211, + "learning_rate": 2.165429843589451e-05, + "loss": 4.5715, + "step": 24940 + }, + { + "epoch": 
2.84184748561991, + "grad_norm": 10.820293426513672, + "learning_rate": 2.164288160748944e-05, + "loss": 4.4391, + "step": 24950 + }, + { + "epoch": 2.8429865026482144, + "grad_norm": 17.32324981689453, + "learning_rate": 2.163146477908437e-05, + "loss": 4.2561, + "step": 24960 + }, + { + "epoch": 2.844125519676519, + "grad_norm": 31.806528091430664, + "learning_rate": 2.1620047950679302e-05, + "loss": 4.1464, + "step": 24970 + }, + { + "epoch": 2.845264536704824, + "grad_norm": 37.59844970703125, + "learning_rate": 2.1608631122274234e-05, + "loss": 4.1737, + "step": 24980 + }, + { + "epoch": 2.8464035537331283, + "grad_norm": 11.175416946411133, + "learning_rate": 2.1597214293869166e-05, + "loss": 4.13, + "step": 24990 + }, + { + "epoch": 2.847542570761433, + "grad_norm": 12.340738296508789, + "learning_rate": 2.1585797465464094e-05, + "loss": 4.8189, + "step": 25000 + }, + { + "epoch": 2.8486815877897373, + "grad_norm": 13.959515571594238, + "learning_rate": 2.1574380637059026e-05, + "loss": 4.2027, + "step": 25010 + }, + { + "epoch": 2.849820604818042, + "grad_norm": 10.458575248718262, + "learning_rate": 2.1562963808653958e-05, + "loss": 4.1396, + "step": 25020 + }, + { + "epoch": 2.8509596218463464, + "grad_norm": 21.41703224182129, + "learning_rate": 2.1551546980248886e-05, + "loss": 4.3719, + "step": 25030 + }, + { + "epoch": 2.852098638874651, + "grad_norm": 11.951301574707031, + "learning_rate": 2.1540130151843818e-05, + "loss": 4.4561, + "step": 25040 + }, + { + "epoch": 2.853237655902956, + "grad_norm": 20.143756866455078, + "learning_rate": 2.152871332343875e-05, + "loss": 4.3852, + "step": 25050 + }, + { + "epoch": 2.8543766729312603, + "grad_norm": 13.41882038116455, + "learning_rate": 2.1517296495033682e-05, + "loss": 4.3855, + "step": 25060 + }, + { + "epoch": 2.8555156899595646, + "grad_norm": 21.393659591674805, + "learning_rate": 2.150587966662861e-05, + "loss": 4.4363, + "step": 25070 + }, + { + "epoch": 2.8566547069878694, + "grad_norm": 
10.970490455627441, + "learning_rate": 2.1494462838223542e-05, + "loss": 4.3659, + "step": 25080 + }, + { + "epoch": 2.857793724016174, + "grad_norm": 14.912433624267578, + "learning_rate": 2.148304600981847e-05, + "loss": 4.567, + "step": 25090 + }, + { + "epoch": 2.8589327410444785, + "grad_norm": 23.709640502929688, + "learning_rate": 2.1471629181413406e-05, + "loss": 4.416, + "step": 25100 + }, + { + "epoch": 2.8600717580727832, + "grad_norm": 10.713531494140625, + "learning_rate": 2.1460212353008334e-05, + "loss": 4.2958, + "step": 25110 + }, + { + "epoch": 2.861210775101088, + "grad_norm": 20.735071182250977, + "learning_rate": 2.1448795524603266e-05, + "loss": 4.6377, + "step": 25120 + }, + { + "epoch": 2.8623497921293923, + "grad_norm": 14.454633712768555, + "learning_rate": 2.1437378696198194e-05, + "loss": 4.7845, + "step": 25130 + }, + { + "epoch": 2.8634888091576967, + "grad_norm": 11.987555503845215, + "learning_rate": 2.142596186779313e-05, + "loss": 4.3835, + "step": 25140 + }, + { + "epoch": 2.8646278261860014, + "grad_norm": 28.97137451171875, + "learning_rate": 2.1414545039388058e-05, + "loss": 4.3128, + "step": 25150 + }, + { + "epoch": 2.865766843214306, + "grad_norm": 12.924393653869629, + "learning_rate": 2.140312821098299e-05, + "loss": 4.5555, + "step": 25160 + }, + { + "epoch": 2.8669058602426105, + "grad_norm": 8.018871307373047, + "learning_rate": 2.139171138257792e-05, + "loss": 4.6651, + "step": 25170 + }, + { + "epoch": 2.8680448772709153, + "grad_norm": 8.445608139038086, + "learning_rate": 2.1380294554172854e-05, + "loss": 4.9271, + "step": 25180 + }, + { + "epoch": 2.86918389429922, + "grad_norm": 11.492569923400879, + "learning_rate": 2.1368877725767782e-05, + "loss": 4.4513, + "step": 25190 + }, + { + "epoch": 2.8703229113275244, + "grad_norm": 10.3760347366333, + "learning_rate": 2.1357460897362714e-05, + "loss": 4.4288, + "step": 25200 + }, + { + "epoch": 2.8714619283558287, + "grad_norm": 10.512873649597168, + "learning_rate": 
2.1346044068957642e-05, + "loss": 4.4764, + "step": 25210 + }, + { + "epoch": 2.8726009453841335, + "grad_norm": 15.735414505004883, + "learning_rate": 2.1334627240552578e-05, + "loss": 4.4197, + "step": 25220 + }, + { + "epoch": 2.8737399624124382, + "grad_norm": 11.466535568237305, + "learning_rate": 2.1323210412147506e-05, + "loss": 4.5131, + "step": 25230 + }, + { + "epoch": 2.8748789794407426, + "grad_norm": 12.852303504943848, + "learning_rate": 2.1311793583742438e-05, + "loss": 4.3155, + "step": 25240 + }, + { + "epoch": 2.8760179964690473, + "grad_norm": 10.068410873413086, + "learning_rate": 2.1300376755337366e-05, + "loss": 4.4798, + "step": 25250 + }, + { + "epoch": 2.8771570134973516, + "grad_norm": 13.780503273010254, + "learning_rate": 2.12889599269323e-05, + "loss": 4.4898, + "step": 25260 + }, + { + "epoch": 2.8782960305256564, + "grad_norm": 15.617043495178223, + "learning_rate": 2.127754309852723e-05, + "loss": 4.4464, + "step": 25270 + }, + { + "epoch": 2.8794350475539607, + "grad_norm": 11.50779914855957, + "learning_rate": 2.1266126270122162e-05, + "loss": 4.3775, + "step": 25280 + }, + { + "epoch": 2.8805740645822655, + "grad_norm": 9.568475723266602, + "learning_rate": 2.125470944171709e-05, + "loss": 4.6033, + "step": 25290 + }, + { + "epoch": 2.8817130816105703, + "grad_norm": 12.430456161499023, + "learning_rate": 2.1243292613312026e-05, + "loss": 4.3264, + "step": 25300 + }, + { + "epoch": 2.8828520986388746, + "grad_norm": 18.374462127685547, + "learning_rate": 2.1231875784906954e-05, + "loss": 4.4276, + "step": 25310 + }, + { + "epoch": 2.8839911156671794, + "grad_norm": 12.369324684143066, + "learning_rate": 2.1220458956501886e-05, + "loss": 4.5652, + "step": 25320 + }, + { + "epoch": 2.8851301326954837, + "grad_norm": 10.463457107543945, + "learning_rate": 2.1209042128096814e-05, + "loss": 4.6378, + "step": 25330 + }, + { + "epoch": 2.8862691497237885, + "grad_norm": 15.187032699584961, + "learning_rate": 2.119762529969175e-05, + 
"loss": 4.2882, + "step": 25340 + }, + { + "epoch": 2.8874081667520928, + "grad_norm": 14.741857528686523, + "learning_rate": 2.1186208471286678e-05, + "loss": 4.8769, + "step": 25350 + }, + { + "epoch": 2.8885471837803975, + "grad_norm": 12.209632873535156, + "learning_rate": 2.117479164288161e-05, + "loss": 4.6269, + "step": 25360 + }, + { + "epoch": 2.8896862008087023, + "grad_norm": 10.20417594909668, + "learning_rate": 2.1163374814476538e-05, + "loss": 4.4955, + "step": 25370 + }, + { + "epoch": 2.8908252178370066, + "grad_norm": 51.44119644165039, + "learning_rate": 2.115195798607147e-05, + "loss": 4.6589, + "step": 25380 + }, + { + "epoch": 2.8919642348653114, + "grad_norm": 23.65181541442871, + "learning_rate": 2.1140541157666402e-05, + "loss": 4.0472, + "step": 25390 + }, + { + "epoch": 2.8931032518936157, + "grad_norm": 9.626806259155273, + "learning_rate": 2.1129124329261334e-05, + "loss": 4.2082, + "step": 25400 + }, + { + "epoch": 2.8942422689219205, + "grad_norm": 18.213037490844727, + "learning_rate": 2.1117707500856262e-05, + "loss": 4.5842, + "step": 25410 + }, + { + "epoch": 2.895381285950225, + "grad_norm": 12.3046293258667, + "learning_rate": 2.1106290672451194e-05, + "loss": 4.6834, + "step": 25420 + }, + { + "epoch": 2.8965203029785296, + "grad_norm": 9.713820457458496, + "learning_rate": 2.1094873844046126e-05, + "loss": 4.3921, + "step": 25430 + }, + { + "epoch": 2.8976593200068343, + "grad_norm": 12.635478019714355, + "learning_rate": 2.1083457015641058e-05, + "loss": 4.2344, + "step": 25440 + }, + { + "epoch": 2.8987983370351387, + "grad_norm": 8.912229537963867, + "learning_rate": 2.1072040187235986e-05, + "loss": 4.4152, + "step": 25450 + }, + { + "epoch": 2.899937354063443, + "grad_norm": 10.33163070678711, + "learning_rate": 2.1060623358830918e-05, + "loss": 4.408, + "step": 25460 + }, + { + "epoch": 2.9010763710917478, + "grad_norm": 14.34139347076416, + "learning_rate": 2.104920653042585e-05, + "loss": 4.4, + "step": 25470 + }, + { + 
"epoch": 2.9022153881200525, + "grad_norm": 13.725435256958008, + "learning_rate": 2.1037789702020778e-05, + "loss": 5.1318, + "step": 25480 + }, + { + "epoch": 2.903354405148357, + "grad_norm": 27.68000602722168, + "learning_rate": 2.102637287361571e-05, + "loss": 4.7994, + "step": 25490 + }, + { + "epoch": 2.9044934221766616, + "grad_norm": 13.529292106628418, + "learning_rate": 2.1014956045210642e-05, + "loss": 4.4018, + "step": 25500 + }, + { + "epoch": 2.9056324392049664, + "grad_norm": 11.97656536102295, + "learning_rate": 2.1003539216805574e-05, + "loss": 4.4313, + "step": 25510 + }, + { + "epoch": 2.9067714562332707, + "grad_norm": 11.98920726776123, + "learning_rate": 2.0992122388400502e-05, + "loss": 4.5136, + "step": 25520 + }, + { + "epoch": 2.907910473261575, + "grad_norm": 17.4196834564209, + "learning_rate": 2.0980705559995434e-05, + "loss": 4.1763, + "step": 25530 + }, + { + "epoch": 2.90904949028988, + "grad_norm": 9.960898399353027, + "learning_rate": 2.0969288731590362e-05, + "loss": 4.4402, + "step": 25540 + }, + { + "epoch": 2.9101885073181846, + "grad_norm": 10.668099403381348, + "learning_rate": 2.0957871903185298e-05, + "loss": 4.3085, + "step": 25550 + }, + { + "epoch": 2.911327524346489, + "grad_norm": 10.457843780517578, + "learning_rate": 2.0946455074780226e-05, + "loss": 4.3377, + "step": 25560 + }, + { + "epoch": 2.9124665413747937, + "grad_norm": 28.04571533203125, + "learning_rate": 2.0935038246375158e-05, + "loss": 4.4546, + "step": 25570 + }, + { + "epoch": 2.913605558403098, + "grad_norm": 65.01653289794922, + "learning_rate": 2.0923621417970086e-05, + "loss": 4.3514, + "step": 25580 + }, + { + "epoch": 2.9147445754314028, + "grad_norm": 23.05303955078125, + "learning_rate": 2.091220458956502e-05, + "loss": 4.3337, + "step": 25590 + }, + { + "epoch": 2.915883592459707, + "grad_norm": 17.922874450683594, + "learning_rate": 2.090078776115995e-05, + "loss": 4.2132, + "step": 25600 + }, + { + "epoch": 2.917022609488012, + "grad_norm": 
14.76237964630127, + "learning_rate": 2.0889370932754882e-05, + "loss": 4.3352, + "step": 25610 + }, + { + "epoch": 2.9181616265163166, + "grad_norm": 9.139527320861816, + "learning_rate": 2.087795410434981e-05, + "loss": 4.4611, + "step": 25620 + }, + { + "epoch": 2.919300643544621, + "grad_norm": 28.567909240722656, + "learning_rate": 2.0866537275944746e-05, + "loss": 4.8546, + "step": 25630 + }, + { + "epoch": 2.9204396605729257, + "grad_norm": 9.053948402404785, + "learning_rate": 2.0855120447539674e-05, + "loss": 4.2187, + "step": 25640 + }, + { + "epoch": 2.92157867760123, + "grad_norm": 10.97348403930664, + "learning_rate": 2.0843703619134606e-05, + "loss": 4.0639, + "step": 25650 + }, + { + "epoch": 2.922717694629535, + "grad_norm": 12.089532852172852, + "learning_rate": 2.0832286790729534e-05, + "loss": 4.6351, + "step": 25660 + }, + { + "epoch": 2.923856711657839, + "grad_norm": 58.029869079589844, + "learning_rate": 2.082086996232447e-05, + "loss": 4.2459, + "step": 25670 + }, + { + "epoch": 2.924995728686144, + "grad_norm": 19.57094955444336, + "learning_rate": 2.0809453133919398e-05, + "loss": 4.583, + "step": 25680 + }, + { + "epoch": 2.9261347457144486, + "grad_norm": 22.886457443237305, + "learning_rate": 2.0799177988354837e-05, + "loss": 4.3523, + "step": 25690 + }, + { + "epoch": 2.927273762742753, + "grad_norm": 17.789207458496094, + "learning_rate": 2.0787761159949766e-05, + "loss": 4.2271, + "step": 25700 + }, + { + "epoch": 2.9284127797710577, + "grad_norm": 14.033312797546387, + "learning_rate": 2.0776344331544698e-05, + "loss": 4.4841, + "step": 25710 + }, + { + "epoch": 2.929551796799362, + "grad_norm": 9.000493049621582, + "learning_rate": 2.076492750313963e-05, + "loss": 4.4725, + "step": 25720 + }, + { + "epoch": 2.930690813827667, + "grad_norm": 12.288798332214355, + "learning_rate": 2.075351067473456e-05, + "loss": 4.4778, + "step": 25730 + }, + { + "epoch": 2.931829830855971, + "grad_norm": 15.534489631652832, + "learning_rate": 
2.074209384632949e-05, + "loss": 4.3485, + "step": 25740 + }, + { + "epoch": 2.932968847884276, + "grad_norm": 11.241649627685547, + "learning_rate": 2.073067701792442e-05, + "loss": 4.4533, + "step": 25750 + }, + { + "epoch": 2.9341078649125807, + "grad_norm": 12.891498565673828, + "learning_rate": 2.071926018951935e-05, + "loss": 4.2297, + "step": 25760 + }, + { + "epoch": 2.935246881940885, + "grad_norm": 10.07817268371582, + "learning_rate": 2.0707843361114285e-05, + "loss": 4.5676, + "step": 25770 + }, + { + "epoch": 2.9363858989691893, + "grad_norm": 10.336227416992188, + "learning_rate": 2.0696426532709214e-05, + "loss": 4.3517, + "step": 25780 + }, + { + "epoch": 2.937524915997494, + "grad_norm": 25.487110137939453, + "learning_rate": 2.0685009704304146e-05, + "loss": 3.9777, + "step": 25790 + }, + { + "epoch": 2.938663933025799, + "grad_norm": 10.53114128112793, + "learning_rate": 2.0673592875899074e-05, + "loss": 4.2856, + "step": 25800 + }, + { + "epoch": 2.939802950054103, + "grad_norm": 38.20022964477539, + "learning_rate": 2.066217604749401e-05, + "loss": 4.1268, + "step": 25810 + }, + { + "epoch": 2.940941967082408, + "grad_norm": 11.7878999710083, + "learning_rate": 2.0650759219088938e-05, + "loss": 4.5761, + "step": 25820 + }, + { + "epoch": 2.9420809841107127, + "grad_norm": 9.836922645568848, + "learning_rate": 2.063934239068387e-05, + "loss": 4.415, + "step": 25830 + }, + { + "epoch": 2.943220001139017, + "grad_norm": 29.2979736328125, + "learning_rate": 2.0627925562278798e-05, + "loss": 4.4441, + "step": 25840 + }, + { + "epoch": 2.9443590181673214, + "grad_norm": 9.555939674377441, + "learning_rate": 2.061650873387373e-05, + "loss": 4.7018, + "step": 25850 + }, + { + "epoch": 2.945498035195626, + "grad_norm": 9.398253440856934, + "learning_rate": 2.060509190546866e-05, + "loss": 4.3748, + "step": 25860 + }, + { + "epoch": 2.946637052223931, + "grad_norm": 15.86281681060791, + "learning_rate": 2.0593675077063594e-05, + "loss": 4.3274, + "step": 
25870 + }, + { + "epoch": 2.9477760692522352, + "grad_norm": 19.11648941040039, + "learning_rate": 2.0582258248658522e-05, + "loss": 4.8558, + "step": 25880 + }, + { + "epoch": 2.94891508628054, + "grad_norm": 12.06795883178711, + "learning_rate": 2.0570841420253454e-05, + "loss": 4.8912, + "step": 25890 + }, + { + "epoch": 2.9500541033088443, + "grad_norm": 14.896605491638184, + "learning_rate": 2.0559424591848386e-05, + "loss": 4.6473, + "step": 25900 + }, + { + "epoch": 2.951193120337149, + "grad_norm": 14.027856826782227, + "learning_rate": 2.0548007763443317e-05, + "loss": 4.4415, + "step": 25910 + }, + { + "epoch": 2.9523321373654534, + "grad_norm": 10.832018852233887, + "learning_rate": 2.0536590935038246e-05, + "loss": 4.125, + "step": 25920 + }, + { + "epoch": 2.953471154393758, + "grad_norm": 18.01789093017578, + "learning_rate": 2.0525174106633178e-05, + "loss": 4.1041, + "step": 25930 + }, + { + "epoch": 2.954610171422063, + "grad_norm": 19.601741790771484, + "learning_rate": 2.051375727822811e-05, + "loss": 4.1179, + "step": 25940 + }, + { + "epoch": 2.9557491884503673, + "grad_norm": 11.704947471618652, + "learning_rate": 2.050234044982304e-05, + "loss": 4.5004, + "step": 25950 + }, + { + "epoch": 2.956888205478672, + "grad_norm": 11.281723976135254, + "learning_rate": 2.049092362141797e-05, + "loss": 4.6813, + "step": 25960 + }, + { + "epoch": 2.9580272225069764, + "grad_norm": 36.630462646484375, + "learning_rate": 2.0479506793012902e-05, + "loss": 4.3721, + "step": 25970 + }, + { + "epoch": 2.959166239535281, + "grad_norm": 10.66319751739502, + "learning_rate": 2.0468089964607834e-05, + "loss": 4.341, + "step": 25980 + }, + { + "epoch": 2.9603052565635855, + "grad_norm": 24.53282928466797, + "learning_rate": 2.0456673136202765e-05, + "loss": 4.1195, + "step": 25990 + }, + { + "epoch": 2.96144427359189, + "grad_norm": 10.011075019836426, + "learning_rate": 2.0445256307797694e-05, + "loss": 4.4799, + "step": 26000 + }, + { + "epoch": 
2.96144427359189, + "eval_loss": 6.0944600105285645, + "eval_runtime": 12.0712, + "eval_samples_per_second": 1.243, + "eval_steps_per_second": 0.166, + "step": 26000 + }, + { + "epoch": 2.962583290620195, + "grad_norm": 9.768843650817871, + "learning_rate": 2.0433839479392626e-05, + "loss": 4.3493, + "step": 26010 + }, + { + "epoch": 2.9637223076484993, + "grad_norm": 8.531599998474121, + "learning_rate": 2.0422422650987558e-05, + "loss": 4.5169, + "step": 26020 + }, + { + "epoch": 2.964861324676804, + "grad_norm": 13.053476333618164, + "learning_rate": 2.041100582258249e-05, + "loss": 4.4832, + "step": 26030 + }, + { + "epoch": 2.9660003417051084, + "grad_norm": 17.686269760131836, + "learning_rate": 2.0399588994177418e-05, + "loss": 4.5952, + "step": 26040 + }, + { + "epoch": 2.967139358733413, + "grad_norm": 11.529505729675293, + "learning_rate": 2.038817216577235e-05, + "loss": 4.3991, + "step": 26050 + }, + { + "epoch": 2.9682783757617175, + "grad_norm": 41.76747512817383, + "learning_rate": 2.037675533736728e-05, + "loss": 4.6966, + "step": 26060 + }, + { + "epoch": 2.9694173927900223, + "grad_norm": 16.205829620361328, + "learning_rate": 2.0365338508962213e-05, + "loss": 4.4837, + "step": 26070 + }, + { + "epoch": 2.970556409818327, + "grad_norm": 12.326767921447754, + "learning_rate": 2.0353921680557142e-05, + "loss": 4.306, + "step": 26080 + }, + { + "epoch": 2.9716954268466313, + "grad_norm": 16.592323303222656, + "learning_rate": 2.0342504852152074e-05, + "loss": 4.2738, + "step": 26090 + }, + { + "epoch": 2.9728344438749357, + "grad_norm": 14.362883567810059, + "learning_rate": 2.0331088023747005e-05, + "loss": 4.4225, + "step": 26100 + }, + { + "epoch": 2.9739734609032404, + "grad_norm": 17.716650009155273, + "learning_rate": 2.0319671195341934e-05, + "loss": 4.447, + "step": 26110 + }, + { + "epoch": 2.975112477931545, + "grad_norm": 15.68718147277832, + "learning_rate": 2.0308254366936866e-05, + "loss": 4.4186, + "step": 26120 + }, + { + "epoch": 
2.9762514949598495, + "grad_norm": 8.298096656799316, + "learning_rate": 2.0296837538531798e-05, + "loss": 4.3944, + "step": 26130 + }, + { + "epoch": 2.9773905119881543, + "grad_norm": 14.487730979919434, + "learning_rate": 2.028542071012673e-05, + "loss": 4.4883, + "step": 26140 + }, + { + "epoch": 2.978529529016459, + "grad_norm": 45.469085693359375, + "learning_rate": 2.0274003881721658e-05, + "loss": 4.3144, + "step": 26150 + }, + { + "epoch": 2.9796685460447634, + "grad_norm": 11.404308319091797, + "learning_rate": 2.026258705331659e-05, + "loss": 4.1847, + "step": 26160 + }, + { + "epoch": 2.9808075630730677, + "grad_norm": 14.415459632873535, + "learning_rate": 2.0251170224911518e-05, + "loss": 4.5272, + "step": 26170 + }, + { + "epoch": 2.9819465801013725, + "grad_norm": 22.10926055908203, + "learning_rate": 2.0239753396506453e-05, + "loss": 4.6152, + "step": 26180 + }, + { + "epoch": 2.9830855971296772, + "grad_norm": 9.495277404785156, + "learning_rate": 2.0228336568101382e-05, + "loss": 4.6948, + "step": 26190 + }, + { + "epoch": 2.9842246141579816, + "grad_norm": 8.213211059570312, + "learning_rate": 2.0216919739696314e-05, + "loss": 4.3108, + "step": 26200 + }, + { + "epoch": 2.9853636311862863, + "grad_norm": 15.879585266113281, + "learning_rate": 2.0205502911291242e-05, + "loss": 4.4845, + "step": 26210 + }, + { + "epoch": 2.9865026482145907, + "grad_norm": 7.781096935272217, + "learning_rate": 2.0194086082886174e-05, + "loss": 4.3855, + "step": 26220 + }, + { + "epoch": 2.9876416652428954, + "grad_norm": 17.566858291625977, + "learning_rate": 2.0182669254481106e-05, + "loss": 4.2371, + "step": 26230 + }, + { + "epoch": 2.9887806822711998, + "grad_norm": 10.631742477416992, + "learning_rate": 2.0171252426076038e-05, + "loss": 4.5712, + "step": 26240 + }, + { + "epoch": 2.9899196992995045, + "grad_norm": 11.58210277557373, + "learning_rate": 2.0159835597670966e-05, + "loss": 4.3123, + "step": 26250 + }, + { + "epoch": 2.9910587163278093, + 
"grad_norm": 12.677685737609863, + "learning_rate": 2.0148418769265898e-05, + "loss": 4.1608, + "step": 26260 + }, + { + "epoch": 2.9921977333561136, + "grad_norm": 9.210295677185059, + "learning_rate": 2.013700194086083e-05, + "loss": 4.4583, + "step": 26270 + }, + { + "epoch": 2.9933367503844184, + "grad_norm": 43.07964324951172, + "learning_rate": 2.012558511245576e-05, + "loss": 4.2155, + "step": 26280 + }, + { + "epoch": 2.9944757674127227, + "grad_norm": 12.508554458618164, + "learning_rate": 2.011416828405069e-05, + "loss": 4.1294, + "step": 26290 + }, + { + "epoch": 2.9956147844410275, + "grad_norm": 9.088356018066406, + "learning_rate": 2.0102751455645622e-05, + "loss": 4.3443, + "step": 26300 + }, + { + "epoch": 2.996753801469332, + "grad_norm": 10.917645454406738, + "learning_rate": 2.0091334627240554e-05, + "loss": 4.356, + "step": 26310 + }, + { + "epoch": 2.9978928184976366, + "grad_norm": 12.575218200683594, + "learning_rate": 2.0079917798835486e-05, + "loss": 4.1655, + "step": 26320 + }, + { + "epoch": 2.9990318355259413, + "grad_norm": 12.658269882202148, + "learning_rate": 2.0068500970430414e-05, + "loss": 4.5894, + "step": 26330 + }, + { + "epoch": 3.0001708525542456, + "grad_norm": 8.49130630493164, + "learning_rate": 2.0057084142025346e-05, + "loss": 4.3359, + "step": 26340 + }, + { + "epoch": 3.0013098695825504, + "grad_norm": 18.979001998901367, + "learning_rate": 2.0045667313620278e-05, + "loss": 3.4778, + "step": 26350 + }, + { + "epoch": 3.0024488866108547, + "grad_norm": 20.550743103027344, + "learning_rate": 2.003425048521521e-05, + "loss": 3.1705, + "step": 26360 + }, + { + "epoch": 3.0035879036391595, + "grad_norm": 23.528396606445312, + "learning_rate": 2.0022833656810138e-05, + "loss": 3.3643, + "step": 26370 + }, + { + "epoch": 3.004726920667464, + "grad_norm": 13.984031677246094, + "learning_rate": 2.001141682840507e-05, + "loss": 3.1264, + "step": 26380 + }, + { + "epoch": 3.0058659376957686, + "grad_norm": 8.928504943847656, + 
"learning_rate": 2e-05, + "loss": 3.389, + "step": 26390 + }, + { + "epoch": 3.007004954724073, + "grad_norm": 26.28722381591797, + "learning_rate": 1.9988583171594933e-05, + "loss": 3.4283, + "step": 26400 + }, + { + "epoch": 3.0081439717523777, + "grad_norm": 19.788816452026367, + "learning_rate": 1.9977166343189862e-05, + "loss": 3.0866, + "step": 26410 + }, + { + "epoch": 3.0092829887806825, + "grad_norm": 12.7147216796875, + "learning_rate": 1.9965749514784794e-05, + "loss": 3.226, + "step": 26420 + }, + { + "epoch": 3.010422005808987, + "grad_norm": 12.794295310974121, + "learning_rate": 1.9954332686379726e-05, + "loss": 3.5337, + "step": 26430 + }, + { + "epoch": 3.0115610228372915, + "grad_norm": 25.851694107055664, + "learning_rate": 1.9942915857974657e-05, + "loss": 3.2126, + "step": 26440 + }, + { + "epoch": 3.012700039865596, + "grad_norm": 13.408803939819336, + "learning_rate": 1.9931499029569586e-05, + "loss": 3.0875, + "step": 26450 + }, + { + "epoch": 3.0138390568939006, + "grad_norm": 16.897953033447266, + "learning_rate": 1.9920082201164518e-05, + "loss": 3.1841, + "step": 26460 + }, + { + "epoch": 3.014978073922205, + "grad_norm": 37.791839599609375, + "learning_rate": 1.990866537275945e-05, + "loss": 3.3605, + "step": 26470 + }, + { + "epoch": 3.0161170909505097, + "grad_norm": 19.288755416870117, + "learning_rate": 1.989724854435438e-05, + "loss": 3.4918, + "step": 26480 + }, + { + "epoch": 3.0172561079788145, + "grad_norm": 20.846179962158203, + "learning_rate": 1.988583171594931e-05, + "loss": 3.2042, + "step": 26490 + }, + { + "epoch": 3.018395125007119, + "grad_norm": 19.75118064880371, + "learning_rate": 1.987441488754424e-05, + "loss": 3.3452, + "step": 26500 + }, + { + "epoch": 3.0195341420354236, + "grad_norm": 13.249592781066895, + "learning_rate": 1.9862998059139174e-05, + "loss": 3.2647, + "step": 26510 + }, + { + "epoch": 3.020673159063728, + "grad_norm": 16.227333068847656, + "learning_rate": 1.9851581230734105e-05, + "loss": 3.28, 
+ "step": 26520 + }, + { + "epoch": 3.0218121760920327, + "grad_norm": 11.824593544006348, + "learning_rate": 1.9840164402329034e-05, + "loss": 3.2526, + "step": 26530 + }, + { + "epoch": 3.022951193120337, + "grad_norm": 12.387273788452148, + "learning_rate": 1.9828747573923966e-05, + "loss": 2.9822, + "step": 26540 + }, + { + "epoch": 3.0240902101486418, + "grad_norm": 25.661296844482422, + "learning_rate": 1.9817330745518894e-05, + "loss": 3.5264, + "step": 26550 + }, + { + "epoch": 3.025229227176946, + "grad_norm": 11.836889266967773, + "learning_rate": 1.9805913917113826e-05, + "loss": 3.0826, + "step": 26560 + }, + { + "epoch": 3.026368244205251, + "grad_norm": 11.189791679382324, + "learning_rate": 1.9794497088708758e-05, + "loss": 3.3436, + "step": 26570 + }, + { + "epoch": 3.0275072612335556, + "grad_norm": 13.328079223632812, + "learning_rate": 1.978308026030369e-05, + "loss": 3.4042, + "step": 26580 + }, + { + "epoch": 3.02864627826186, + "grad_norm": 10.731392860412598, + "learning_rate": 1.9771663431898618e-05, + "loss": 3.1904, + "step": 26590 + }, + { + "epoch": 3.0297852952901647, + "grad_norm": 15.390854835510254, + "learning_rate": 1.976024660349355e-05, + "loss": 3.2487, + "step": 26600 + }, + { + "epoch": 3.030924312318469, + "grad_norm": 13.056636810302734, + "learning_rate": 1.9748829775088482e-05, + "loss": 3.2201, + "step": 26610 + }, + { + "epoch": 3.032063329346774, + "grad_norm": 16.926904678344727, + "learning_rate": 1.973741294668341e-05, + "loss": 3.5044, + "step": 26620 + }, + { + "epoch": 3.033202346375078, + "grad_norm": 23.026594161987305, + "learning_rate": 1.9725996118278342e-05, + "loss": 3.0956, + "step": 26630 + }, + { + "epoch": 3.034341363403383, + "grad_norm": 15.527753829956055, + "learning_rate": 1.9714579289873274e-05, + "loss": 3.3468, + "step": 26640 + }, + { + "epoch": 3.0354803804316877, + "grad_norm": 31.828067779541016, + "learning_rate": 1.9703162461468206e-05, + "loss": 2.9932, + "step": 26650 + }, + { + "epoch": 
3.036619397459992, + "grad_norm": 16.210556030273438, + "learning_rate": 1.9691745633063134e-05, + "loss": 3.3843, + "step": 26660 + }, + { + "epoch": 3.0377584144882968, + "grad_norm": 18.695133209228516, + "learning_rate": 1.9680328804658066e-05, + "loss": 3.4115, + "step": 26670 + }, + { + "epoch": 3.038897431516601, + "grad_norm": 10.234685897827148, + "learning_rate": 1.9668911976252998e-05, + "loss": 3.4362, + "step": 26680 + }, + { + "epoch": 3.040036448544906, + "grad_norm": 13.889457702636719, + "learning_rate": 1.965749514784793e-05, + "loss": 3.2165, + "step": 26690 + }, + { + "epoch": 3.04117546557321, + "grad_norm": 13.6388521194458, + "learning_rate": 1.9646078319442858e-05, + "loss": 3.0997, + "step": 26700 + }, + { + "epoch": 3.042314482601515, + "grad_norm": 17.444013595581055, + "learning_rate": 1.963466149103779e-05, + "loss": 3.2621, + "step": 26710 + }, + { + "epoch": 3.0434534996298193, + "grad_norm": 12.85341739654541, + "learning_rate": 1.9623244662632722e-05, + "loss": 3.0798, + "step": 26720 + }, + { + "epoch": 3.044592516658124, + "grad_norm": 23.041946411132812, + "learning_rate": 1.9611827834227654e-05, + "loss": 3.4639, + "step": 26730 + }, + { + "epoch": 3.045731533686429, + "grad_norm": 14.276169776916504, + "learning_rate": 1.9600411005822582e-05, + "loss": 3.2518, + "step": 26740 + }, + { + "epoch": 3.046870550714733, + "grad_norm": 11.501389503479004, + "learning_rate": 1.9588994177417514e-05, + "loss": 2.7835, + "step": 26750 + }, + { + "epoch": 3.048009567743038, + "grad_norm": 18.737586975097656, + "learning_rate": 1.9577577349012446e-05, + "loss": 3.1787, + "step": 26760 + }, + { + "epoch": 3.049148584771342, + "grad_norm": 12.630542755126953, + "learning_rate": 1.9566160520607378e-05, + "loss": 3.1533, + "step": 26770 + }, + { + "epoch": 3.050287601799647, + "grad_norm": 18.454057693481445, + "learning_rate": 1.9554743692202306e-05, + "loss": 2.8906, + "step": 26780 + }, + { + "epoch": 3.0514266188279513, + "grad_norm": 
14.291041374206543, + "learning_rate": 1.9543326863797238e-05, + "loss": 3.347, + "step": 26790 + }, + { + "epoch": 3.052565635856256, + "grad_norm": 29.922286987304688, + "learning_rate": 1.953191003539217e-05, + "loss": 3.139, + "step": 26800 + }, + { + "epoch": 3.053704652884561, + "grad_norm": 11.581583023071289, + "learning_rate": 1.95204932069871e-05, + "loss": 2.96, + "step": 26810 + }, + { + "epoch": 3.054843669912865, + "grad_norm": 12.896805763244629, + "learning_rate": 1.950907637858203e-05, + "loss": 3.3296, + "step": 26820 + }, + { + "epoch": 3.05598268694117, + "grad_norm": 29.834461212158203, + "learning_rate": 1.9497659550176962e-05, + "loss": 3.2579, + "step": 26830 + }, + { + "epoch": 3.0571217039694742, + "grad_norm": 12.87248420715332, + "learning_rate": 1.9486242721771894e-05, + "loss": 2.9772, + "step": 26840 + }, + { + "epoch": 3.058260720997779, + "grad_norm": 31.38313865661621, + "learning_rate": 1.9474825893366826e-05, + "loss": 3.2572, + "step": 26850 + }, + { + "epoch": 3.0593997380260833, + "grad_norm": 15.08276653289795, + "learning_rate": 1.9463409064961754e-05, + "loss": 3.1538, + "step": 26860 + }, + { + "epoch": 3.060538755054388, + "grad_norm": 11.733377456665039, + "learning_rate": 1.9451992236556686e-05, + "loss": 3.2799, + "step": 26870 + }, + { + "epoch": 3.0616777720826924, + "grad_norm": 15.852864265441895, + "learning_rate": 1.9440575408151614e-05, + "loss": 3.327, + "step": 26880 + }, + { + "epoch": 3.062816789110997, + "grad_norm": 12.521268844604492, + "learning_rate": 1.942915857974655e-05, + "loss": 3.2815, + "step": 26890 + }, + { + "epoch": 3.063955806139302, + "grad_norm": 32.56338882446289, + "learning_rate": 1.9417741751341478e-05, + "loss": 3.2574, + "step": 26900 + }, + { + "epoch": 3.0650948231676063, + "grad_norm": 13.80693244934082, + "learning_rate": 1.940632492293641e-05, + "loss": 2.9822, + "step": 26910 + }, + { + "epoch": 3.066233840195911, + "grad_norm": 16.31395149230957, + "learning_rate": 
1.9394908094531338e-05, + "loss": 3.1047, + "step": 26920 + }, + { + "epoch": 3.0673728572242154, + "grad_norm": 14.875893592834473, + "learning_rate": 1.9383491266126273e-05, + "loss": 3.2336, + "step": 26930 + }, + { + "epoch": 3.06851187425252, + "grad_norm": 23.317651748657227, + "learning_rate": 1.9372074437721202e-05, + "loss": 3.2547, + "step": 26940 + }, + { + "epoch": 3.0696508912808245, + "grad_norm": 16.346988677978516, + "learning_rate": 1.9360657609316134e-05, + "loss": 3.1503, + "step": 26950 + }, + { + "epoch": 3.0707899083091292, + "grad_norm": 17.115734100341797, + "learning_rate": 1.9349240780911062e-05, + "loss": 3.3662, + "step": 26960 + }, + { + "epoch": 3.071928925337434, + "grad_norm": 34.3673095703125, + "learning_rate": 1.9337823952505994e-05, + "loss": 3.2764, + "step": 26970 + }, + { + "epoch": 3.0730679423657383, + "grad_norm": 20.209665298461914, + "learning_rate": 1.9326407124100926e-05, + "loss": 3.6596, + "step": 26980 + }, + { + "epoch": 3.074206959394043, + "grad_norm": 16.05097770690918, + "learning_rate": 1.9314990295695858e-05, + "loss": 3.2271, + "step": 26990 + }, + { + "epoch": 3.0753459764223474, + "grad_norm": 13.981256484985352, + "learning_rate": 1.9303573467290786e-05, + "loss": 3.4841, + "step": 27000 + }, + { + "epoch": 3.076484993450652, + "grad_norm": 17.1622314453125, + "learning_rate": 1.9292156638885718e-05, + "loss": 2.9933, + "step": 27010 + }, + { + "epoch": 3.0776240104789565, + "grad_norm": 43.3394889831543, + "learning_rate": 1.928073981048065e-05, + "loss": 2.9414, + "step": 27020 + }, + { + "epoch": 3.0787630275072613, + "grad_norm": 14.771462440490723, + "learning_rate": 1.926932298207558e-05, + "loss": 3.0738, + "step": 27030 + }, + { + "epoch": 3.0799020445355656, + "grad_norm": 13.060588836669922, + "learning_rate": 1.925790615367051e-05, + "loss": 3.1623, + "step": 27040 + }, + { + "epoch": 3.0810410615638704, + "grad_norm": 30.9134521484375, + "learning_rate": 1.9246489325265442e-05, + "loss": 
3.2988, + "step": 27050 + }, + { + "epoch": 3.082180078592175, + "grad_norm": 11.042315483093262, + "learning_rate": 1.9235072496860374e-05, + "loss": 3.1594, + "step": 27060 + }, + { + "epoch": 3.0833190956204795, + "grad_norm": 18.68390655517578, + "learning_rate": 1.9223655668455302e-05, + "loss": 3.2803, + "step": 27070 + }, + { + "epoch": 3.0844581126487842, + "grad_norm": 13.154905319213867, + "learning_rate": 1.9212238840050234e-05, + "loss": 3.1246, + "step": 27080 + }, + { + "epoch": 3.0855971296770885, + "grad_norm": 29.026409149169922, + "learning_rate": 1.9200822011645166e-05, + "loss": 3.3788, + "step": 27090 + }, + { + "epoch": 3.0867361467053933, + "grad_norm": 15.003780364990234, + "learning_rate": 1.9189405183240098e-05, + "loss": 3.289, + "step": 27100 + }, + { + "epoch": 3.0878751637336976, + "grad_norm": 14.786931037902832, + "learning_rate": 1.9177988354835026e-05, + "loss": 3.0621, + "step": 27110 + }, + { + "epoch": 3.0890141807620024, + "grad_norm": 18.21460723876953, + "learning_rate": 1.9166571526429958e-05, + "loss": 3.5111, + "step": 27120 + }, + { + "epoch": 3.090153197790307, + "grad_norm": 12.12824821472168, + "learning_rate": 1.915515469802489e-05, + "loss": 3.2396, + "step": 27130 + }, + { + "epoch": 3.0912922148186115, + "grad_norm": 10.3527250289917, + "learning_rate": 1.914373786961982e-05, + "loss": 3.3413, + "step": 27140 + }, + { + "epoch": 3.0924312318469163, + "grad_norm": 11.547093391418457, + "learning_rate": 1.913232104121475e-05, + "loss": 3.2743, + "step": 27150 + }, + { + "epoch": 3.0935702488752206, + "grad_norm": 31.078540802001953, + "learning_rate": 1.9120904212809682e-05, + "loss": 2.6648, + "step": 27160 + }, + { + "epoch": 3.0947092659035254, + "grad_norm": 15.62102222442627, + "learning_rate": 1.9109487384404614e-05, + "loss": 2.994, + "step": 27170 + }, + { + "epoch": 3.0958482829318297, + "grad_norm": 10.508420944213867, + "learning_rate": 1.9098070555999546e-05, + "loss": 3.4438, + "step": 27180 + }, + { + 
"epoch": 3.0969872999601344, + "grad_norm": 23.045381546020508, + "learning_rate": 1.9086653727594474e-05, + "loss": 3.171, + "step": 27190 + }, + { + "epoch": 3.0981263169884388, + "grad_norm": 15.51526927947998, + "learning_rate": 1.9075236899189406e-05, + "loss": 3.0396, + "step": 27200 + }, + { + "epoch": 3.0992653340167435, + "grad_norm": 11.499743461608887, + "learning_rate": 1.9063820070784338e-05, + "loss": 3.1568, + "step": 27210 + }, + { + "epoch": 3.1004043510450483, + "grad_norm": 15.384307861328125, + "learning_rate": 1.905240324237927e-05, + "loss": 3.1907, + "step": 27220 + }, + { + "epoch": 3.1015433680733526, + "grad_norm": 16.944202423095703, + "learning_rate": 1.9040986413974198e-05, + "loss": 3.3019, + "step": 27230 + }, + { + "epoch": 3.1026823851016574, + "grad_norm": 19.369962692260742, + "learning_rate": 1.902956958556913e-05, + "loss": 3.1569, + "step": 27240 + }, + { + "epoch": 3.1038214021299617, + "grad_norm": 24.748016357421875, + "learning_rate": 1.901815275716406e-05, + "loss": 3.323, + "step": 27250 + }, + { + "epoch": 3.1049604191582665, + "grad_norm": 17.034259796142578, + "learning_rate": 1.9006735928758994e-05, + "loss": 2.9709, + "step": 27260 + }, + { + "epoch": 3.106099436186571, + "grad_norm": 14.272811889648438, + "learning_rate": 1.8995319100353922e-05, + "loss": 3.0763, + "step": 27270 + }, + { + "epoch": 3.1072384532148756, + "grad_norm": 16.28206443786621, + "learning_rate": 1.8983902271948854e-05, + "loss": 3.297, + "step": 27280 + }, + { + "epoch": 3.1083774702431803, + "grad_norm": 16.134042739868164, + "learning_rate": 1.8972485443543782e-05, + "loss": 3.4735, + "step": 27290 + }, + { + "epoch": 3.1095164872714847, + "grad_norm": 44.60962677001953, + "learning_rate": 1.8961068615138718e-05, + "loss": 3.4407, + "step": 27300 + }, + { + "epoch": 3.1106555042997894, + "grad_norm": 14.325160026550293, + "learning_rate": 1.8949651786733646e-05, + "loss": 3.8483, + "step": 27310 + }, + { + "epoch": 3.1117945213280938, + 
"grad_norm": 14.658393859863281, + "learning_rate": 1.8938234958328578e-05, + "loss": 2.8621, + "step": 27320 + }, + { + "epoch": 3.1129335383563985, + "grad_norm": 32.288665771484375, + "learning_rate": 1.8926818129923506e-05, + "loss": 3.3189, + "step": 27330 + }, + { + "epoch": 3.114072555384703, + "grad_norm": 15.71682357788086, + "learning_rate": 1.891540130151844e-05, + "loss": 3.2937, + "step": 27340 + }, + { + "epoch": 3.1152115724130076, + "grad_norm": 22.568588256835938, + "learning_rate": 1.890398447311337e-05, + "loss": 3.1074, + "step": 27350 + }, + { + "epoch": 3.116350589441312, + "grad_norm": 12.612749099731445, + "learning_rate": 1.8892567644708302e-05, + "loss": 3.345, + "step": 27360 + }, + { + "epoch": 3.1174896064696167, + "grad_norm": 15.754531860351562, + "learning_rate": 1.888115081630323e-05, + "loss": 3.3005, + "step": 27370 + }, + { + "epoch": 3.1186286234979215, + "grad_norm": 12.605548858642578, + "learning_rate": 1.8869733987898165e-05, + "loss": 3.2184, + "step": 27380 + }, + { + "epoch": 3.119767640526226, + "grad_norm": 12.064836502075195, + "learning_rate": 1.8858317159493094e-05, + "loss": 3.2672, + "step": 27390 + }, + { + "epoch": 3.1209066575545306, + "grad_norm": 16.371063232421875, + "learning_rate": 1.8846900331088026e-05, + "loss": 3.0152, + "step": 27400 + }, + { + "epoch": 3.122045674582835, + "grad_norm": 15.042854309082031, + "learning_rate": 1.8835483502682954e-05, + "loss": 2.9543, + "step": 27410 + }, + { + "epoch": 3.1231846916111397, + "grad_norm": 23.035600662231445, + "learning_rate": 1.8824066674277886e-05, + "loss": 3.3771, + "step": 27420 + }, + { + "epoch": 3.124323708639444, + "grad_norm": 34.90754699707031, + "learning_rate": 1.8812649845872818e-05, + "loss": 3.411, + "step": 27430 + }, + { + "epoch": 3.1254627256677487, + "grad_norm": 17.134292602539062, + "learning_rate": 1.880123301746775e-05, + "loss": 3.3458, + "step": 27440 + }, + { + "epoch": 3.1266017426960535, + "grad_norm": 41.59492874145508, + 
"learning_rate": 1.8789816189062678e-05, + "loss": 2.8683, + "step": 27450 + }, + { + "epoch": 3.127740759724358, + "grad_norm": 11.885684967041016, + "learning_rate": 1.877839936065761e-05, + "loss": 3.0729, + "step": 27460 + }, + { + "epoch": 3.1288797767526626, + "grad_norm": 18.801475524902344, + "learning_rate": 1.8766982532252542e-05, + "loss": 3.579, + "step": 27470 + }, + { + "epoch": 3.130018793780967, + "grad_norm": 18.656755447387695, + "learning_rate": 1.875556570384747e-05, + "loss": 3.1842, + "step": 27480 + }, + { + "epoch": 3.1311578108092717, + "grad_norm": 16.117191314697266, + "learning_rate": 1.8744148875442402e-05, + "loss": 3.2142, + "step": 27490 + }, + { + "epoch": 3.132296827837576, + "grad_norm": 38.7790641784668, + "learning_rate": 1.8732732047037334e-05, + "loss": 2.9492, + "step": 27500 + }, + { + "epoch": 3.133435844865881, + "grad_norm": 24.200565338134766, + "learning_rate": 1.8721315218632266e-05, + "loss": 3.2178, + "step": 27510 + }, + { + "epoch": 3.134574861894185, + "grad_norm": 17.899675369262695, + "learning_rate": 1.8709898390227194e-05, + "loss": 3.0469, + "step": 27520 + }, + { + "epoch": 3.13571387892249, + "grad_norm": 17.653701782226562, + "learning_rate": 1.8698481561822126e-05, + "loss": 3.2397, + "step": 27530 + }, + { + "epoch": 3.1368528959507946, + "grad_norm": 15.90965461730957, + "learning_rate": 1.8687064733417058e-05, + "loss": 3.3233, + "step": 27540 + }, + { + "epoch": 3.137991912979099, + "grad_norm": 30.597681045532227, + "learning_rate": 1.867564790501199e-05, + "loss": 3.1581, + "step": 27550 + }, + { + "epoch": 3.1391309300074037, + "grad_norm": 18.15818214416504, + "learning_rate": 1.8664231076606918e-05, + "loss": 3.2221, + "step": 27560 + }, + { + "epoch": 3.140269947035708, + "grad_norm": 13.555785179138184, + "learning_rate": 1.865281424820185e-05, + "loss": 3.2178, + "step": 27570 + }, + { + "epoch": 3.141408964064013, + "grad_norm": 15.18497371673584, + "learning_rate": 1.864139741979678e-05, + 
"loss": 2.8833, + "step": 27580 + }, + { + "epoch": 3.142547981092317, + "grad_norm": 20.986042022705078, + "learning_rate": 1.8629980591391714e-05, + "loss": 3.0598, + "step": 27590 + }, + { + "epoch": 3.143686998120622, + "grad_norm": 15.958553314208984, + "learning_rate": 1.8618563762986642e-05, + "loss": 3.6522, + "step": 27600 + }, + { + "epoch": 3.1448260151489267, + "grad_norm": 50.318267822265625, + "learning_rate": 1.8607146934581574e-05, + "loss": 3.1646, + "step": 27610 + }, + { + "epoch": 3.145965032177231, + "grad_norm": 15.260852813720703, + "learning_rate": 1.8595730106176502e-05, + "loss": 3.2088, + "step": 27620 + }, + { + "epoch": 3.1471040492055358, + "grad_norm": 14.715402603149414, + "learning_rate": 1.8584313277771438e-05, + "loss": 3.2356, + "step": 27630 + }, + { + "epoch": 3.14824306623384, + "grad_norm": 13.372052192687988, + "learning_rate": 1.8572896449366366e-05, + "loss": 3.4894, + "step": 27640 + }, + { + "epoch": 3.149382083262145, + "grad_norm": 41.42268371582031, + "learning_rate": 1.8561479620961298e-05, + "loss": 3.0889, + "step": 27650 + }, + { + "epoch": 3.150521100290449, + "grad_norm": 16.11638069152832, + "learning_rate": 1.8550062792556226e-05, + "loss": 3.325, + "step": 27660 + }, + { + "epoch": 3.151660117318754, + "grad_norm": 17.42380142211914, + "learning_rate": 1.853864596415116e-05, + "loss": 3.2423, + "step": 27670 + }, + { + "epoch": 3.1527991343470587, + "grad_norm": 18.778663635253906, + "learning_rate": 1.852722913574609e-05, + "loss": 3.4032, + "step": 27680 + }, + { + "epoch": 3.153938151375363, + "grad_norm": 16.600717544555664, + "learning_rate": 1.8515812307341022e-05, + "loss": 3.2928, + "step": 27690 + }, + { + "epoch": 3.155077168403668, + "grad_norm": 20.402591705322266, + "learning_rate": 1.850439547893595e-05, + "loss": 3.6319, + "step": 27700 + }, + { + "epoch": 3.156216185431972, + "grad_norm": 33.90158462524414, + "learning_rate": 1.8492978650530886e-05, + "loss": 3.2214, + "step": 27710 + }, + { + 
"epoch": 3.157355202460277, + "grad_norm": 12.001391410827637, + "learning_rate": 1.8481561822125814e-05, + "loss": 3.3975, + "step": 27720 + }, + { + "epoch": 3.1584942194885812, + "grad_norm": 16.599849700927734, + "learning_rate": 1.8470144993720746e-05, + "loss": 3.271, + "step": 27730 + }, + { + "epoch": 3.159633236516886, + "grad_norm": 17.87288475036621, + "learning_rate": 1.8458728165315674e-05, + "loss": 3.1615, + "step": 27740 + }, + { + "epoch": 3.1607722535451903, + "grad_norm": 17.997711181640625, + "learning_rate": 1.844731133691061e-05, + "loss": 3.2489, + "step": 27750 + }, + { + "epoch": 3.161911270573495, + "grad_norm": 35.24727249145508, + "learning_rate": 1.8435894508505538e-05, + "loss": 3.1077, + "step": 27760 + }, + { + "epoch": 3.1630502876018, + "grad_norm": 16.674970626831055, + "learning_rate": 1.842447768010047e-05, + "loss": 3.0481, + "step": 27770 + }, + { + "epoch": 3.164189304630104, + "grad_norm": 12.34125804901123, + "learning_rate": 1.8413060851695398e-05, + "loss": 3.324, + "step": 27780 + }, + { + "epoch": 3.165328321658409, + "grad_norm": 23.691303253173828, + "learning_rate": 1.8401644023290334e-05, + "loss": 3.254, + "step": 27790 + }, + { + "epoch": 3.1664673386867133, + "grad_norm": 33.48701477050781, + "learning_rate": 1.8390227194885262e-05, + "loss": 3.2557, + "step": 27800 + }, + { + "epoch": 3.167606355715018, + "grad_norm": 15.587418556213379, + "learning_rate": 1.8378810366480194e-05, + "loss": 3.4168, + "step": 27810 + }, + { + "epoch": 3.1687453727433224, + "grad_norm": 17.93613624572754, + "learning_rate": 1.8367393538075122e-05, + "loss": 3.1407, + "step": 27820 + }, + { + "epoch": 3.169884389771627, + "grad_norm": 23.921279907226562, + "learning_rate": 1.8355976709670054e-05, + "loss": 2.6977, + "step": 27830 + }, + { + "epoch": 3.1710234067999314, + "grad_norm": 13.405139923095703, + "learning_rate": 1.8344559881264986e-05, + "loss": 3.1785, + "step": 27840 + }, + { + "epoch": 3.172162423828236, + "grad_norm": 
11.815970420837402, + "learning_rate": 1.8333143052859918e-05, + "loss": 3.1895, + "step": 27850 + }, + { + "epoch": 3.173301440856541, + "grad_norm": 23.689008712768555, + "learning_rate": 1.8321726224454846e-05, + "loss": 3.2437, + "step": 27860 + }, + { + "epoch": 3.1744404578848453, + "grad_norm": 16.902727127075195, + "learning_rate": 1.8310309396049778e-05, + "loss": 3.4533, + "step": 27870 + }, + { + "epoch": 3.17557947491315, + "grad_norm": 42.89224624633789, + "learning_rate": 1.829889256764471e-05, + "loss": 2.8198, + "step": 27880 + }, + { + "epoch": 3.1767184919414544, + "grad_norm": 18.913209915161133, + "learning_rate": 1.8287475739239642e-05, + "loss": 2.9572, + "step": 27890 + }, + { + "epoch": 3.177857508969759, + "grad_norm": 27.72986602783203, + "learning_rate": 1.827605891083457e-05, + "loss": 3.2509, + "step": 27900 + }, + { + "epoch": 3.1789965259980635, + "grad_norm": 42.71379852294922, + "learning_rate": 1.8264642082429502e-05, + "loss": 3.1344, + "step": 27910 + }, + { + "epoch": 3.1801355430263682, + "grad_norm": 17.06867218017578, + "learning_rate": 1.8253225254024434e-05, + "loss": 2.9378, + "step": 27920 + }, + { + "epoch": 3.181274560054673, + "grad_norm": 24.895004272460938, + "learning_rate": 1.8241808425619362e-05, + "loss": 2.9792, + "step": 27930 + }, + { + "epoch": 3.1824135770829773, + "grad_norm": 41.76930618286133, + "learning_rate": 1.8230391597214294e-05, + "loss": 3.4044, + "step": 27940 + }, + { + "epoch": 3.183552594111282, + "grad_norm": 12.345658302307129, + "learning_rate": 1.8218974768809226e-05, + "loss": 3.3096, + "step": 27950 + }, + { + "epoch": 3.1846916111395864, + "grad_norm": 24.934452056884766, + "learning_rate": 1.8207557940404158e-05, + "loss": 2.6984, + "step": 27960 + }, + { + "epoch": 3.185830628167891, + "grad_norm": 18.349428176879883, + "learning_rate": 1.8196141111999086e-05, + "loss": 3.1897, + "step": 27970 + }, + { + "epoch": 3.1869696451961955, + "grad_norm": 18.513832092285156, + 
"learning_rate": 1.8184724283594018e-05, + "loss": 3.4669, + "step": 27980 + }, + { + "epoch": 3.1881086622245003, + "grad_norm": 83.0946273803711, + "learning_rate": 1.8173307455188947e-05, + "loss": 3.1882, + "step": 27990 + }, + { + "epoch": 3.189247679252805, + "grad_norm": 32.92939758300781, + "learning_rate": 1.8161890626783882e-05, + "loss": 3.4756, + "step": 28000 + }, + { + "epoch": 3.189247679252805, + "eval_loss": 6.487438678741455, + "eval_runtime": 11.7208, + "eval_samples_per_second": 1.28, + "eval_steps_per_second": 0.171, + "step": 28000 + }, + { + "epoch": 3.1903866962811094, + "grad_norm": 13.909664154052734, + "learning_rate": 1.815047379837881e-05, + "loss": 3.2001, + "step": 28010 + }, + { + "epoch": 3.191525713309414, + "grad_norm": 18.252351760864258, + "learning_rate": 1.8139056969973742e-05, + "loss": 3.3558, + "step": 28020 + }, + { + "epoch": 3.1926647303377185, + "grad_norm": 23.161178588867188, + "learning_rate": 1.812764014156867e-05, + "loss": 3.0862, + "step": 28030 + }, + { + "epoch": 3.1938037473660232, + "grad_norm": 11.257837295532227, + "learning_rate": 1.8116223313163606e-05, + "loss": 3.3629, + "step": 28040 + }, + { + "epoch": 3.1949427643943276, + "grad_norm": 53.57904815673828, + "learning_rate": 1.8104806484758534e-05, + "loss": 3.2856, + "step": 28050 + }, + { + "epoch": 3.1960817814226323, + "grad_norm": 17.169322967529297, + "learning_rate": 1.8093389656353466e-05, + "loss": 3.1074, + "step": 28060 + }, + { + "epoch": 3.1972207984509367, + "grad_norm": 12.976736068725586, + "learning_rate": 1.8081972827948394e-05, + "loss": 3.2706, + "step": 28070 + }, + { + "epoch": 3.1983598154792414, + "grad_norm": 15.092696189880371, + "learning_rate": 1.807055599954333e-05, + "loss": 3.1404, + "step": 28080 + }, + { + "epoch": 3.199498832507546, + "grad_norm": 58.73832321166992, + "learning_rate": 1.8059139171138258e-05, + "loss": 3.3886, + "step": 28090 + }, + { + "epoch": 3.2006378495358505, + "grad_norm": 19.246742248535156, + 
"learning_rate": 1.804772234273319e-05, + "loss": 3.0287, + "step": 28100 + }, + { + "epoch": 3.2017768665641553, + "grad_norm": 21.087007522583008, + "learning_rate": 1.803630551432812e-05, + "loss": 3.088, + "step": 28110 + }, + { + "epoch": 3.2029158835924596, + "grad_norm": 40.340824127197266, + "learning_rate": 1.8024888685923054e-05, + "loss": 3.0932, + "step": 28120 + }, + { + "epoch": 3.2040549006207644, + "grad_norm": 15.739103317260742, + "learning_rate": 1.8013471857517982e-05, + "loss": 3.0367, + "step": 28130 + }, + { + "epoch": 3.2051939176490687, + "grad_norm": 26.22688102722168, + "learning_rate": 1.8002055029112914e-05, + "loss": 3.0504, + "step": 28140 + }, + { + "epoch": 3.2063329346773735, + "grad_norm": 16.44894027709961, + "learning_rate": 1.7990638200707842e-05, + "loss": 3.2259, + "step": 28150 + }, + { + "epoch": 3.207471951705678, + "grad_norm": 14.065156936645508, + "learning_rate": 1.7979221372302778e-05, + "loss": 3.5373, + "step": 28160 + }, + { + "epoch": 3.2086109687339825, + "grad_norm": 16.340435028076172, + "learning_rate": 1.7967804543897706e-05, + "loss": 3.4307, + "step": 28170 + }, + { + "epoch": 3.2097499857622873, + "grad_norm": 13.92990493774414, + "learning_rate": 1.7956387715492638e-05, + "loss": 3.2959, + "step": 28180 + }, + { + "epoch": 3.2108890027905916, + "grad_norm": 13.67980670928955, + "learning_rate": 1.7944970887087566e-05, + "loss": 3.1901, + "step": 28190 + }, + { + "epoch": 3.2120280198188964, + "grad_norm": 58.88777542114258, + "learning_rate": 1.79335540586825e-05, + "loss": 3.167, + "step": 28200 + }, + { + "epoch": 3.2131670368472007, + "grad_norm": 17.12492561340332, + "learning_rate": 1.792213723027743e-05, + "loss": 3.7324, + "step": 28210 + }, + { + "epoch": 3.2143060538755055, + "grad_norm": 19.977720260620117, + "learning_rate": 1.7910720401872362e-05, + "loss": 3.1822, + "step": 28220 + }, + { + "epoch": 3.21544507090381, + "grad_norm": 21.183141708374023, + "learning_rate": 1.789930357346729e-05, 
+ "loss": 3.1884, + "step": 28230 + }, + { + "epoch": 3.2165840879321146, + "grad_norm": 20.963783264160156, + "learning_rate": 1.7887886745062226e-05, + "loss": 3.544, + "step": 28240 + }, + { + "epoch": 3.2177231049604194, + "grad_norm": 34.167781829833984, + "learning_rate": 1.7876469916657154e-05, + "loss": 3.2661, + "step": 28250 + }, + { + "epoch": 3.2188621219887237, + "grad_norm": 17.497081756591797, + "learning_rate": 1.7865053088252086e-05, + "loss": 2.9077, + "step": 28260 + }, + { + "epoch": 3.2200011390170284, + "grad_norm": 31.264101028442383, + "learning_rate": 1.7853636259847014e-05, + "loss": 3.0755, + "step": 28270 + }, + { + "epoch": 3.2211401560453328, + "grad_norm": 30.14015769958496, + "learning_rate": 1.7842219431441946e-05, + "loss": 3.9268, + "step": 28280 + }, + { + "epoch": 3.2222791730736375, + "grad_norm": 15.659712791442871, + "learning_rate": 1.7830802603036878e-05, + "loss": 3.5625, + "step": 28290 + }, + { + "epoch": 3.223418190101942, + "grad_norm": 18.148754119873047, + "learning_rate": 1.781938577463181e-05, + "loss": 2.8605, + "step": 28300 + }, + { + "epoch": 3.2245572071302466, + "grad_norm": 44.391685485839844, + "learning_rate": 1.7807968946226738e-05, + "loss": 3.3872, + "step": 28310 + }, + { + "epoch": 3.2256962241585514, + "grad_norm": 16.15254020690918, + "learning_rate": 1.779655211782167e-05, + "loss": 3.4028, + "step": 28320 + }, + { + "epoch": 3.2268352411868557, + "grad_norm": 14.51396369934082, + "learning_rate": 1.7785135289416602e-05, + "loss": 3.3886, + "step": 28330 + }, + { + "epoch": 3.2279742582151605, + "grad_norm": 28.465871810913086, + "learning_rate": 1.777371846101153e-05, + "loss": 3.0538, + "step": 28340 + }, + { + "epoch": 3.229113275243465, + "grad_norm": 15.167338371276855, + "learning_rate": 1.7762301632606462e-05, + "loss": 3.4863, + "step": 28350 + }, + { + "epoch": 3.2302522922717696, + "grad_norm": 15.840221405029297, + "learning_rate": 1.7750884804201394e-05, + "loss": 2.9079, + "step": 
28360 + }, + { + "epoch": 3.231391309300074, + "grad_norm": 17.186038970947266, + "learning_rate": 1.7739467975796326e-05, + "loss": 3.5992, + "step": 28370 + }, + { + "epoch": 3.2325303263283787, + "grad_norm": 14.156516075134277, + "learning_rate": 1.7728051147391254e-05, + "loss": 3.6097, + "step": 28380 + }, + { + "epoch": 3.233669343356683, + "grad_norm": 25.355093002319336, + "learning_rate": 1.7716634318986186e-05, + "loss": 3.3119, + "step": 28390 + }, + { + "epoch": 3.2348083603849878, + "grad_norm": 26.929956436157227, + "learning_rate": 1.7705217490581118e-05, + "loss": 3.3645, + "step": 28400 + }, + { + "epoch": 3.2359473774132925, + "grad_norm": 18.3270263671875, + "learning_rate": 1.769380066217605e-05, + "loss": 3.2608, + "step": 28410 + }, + { + "epoch": 3.237086394441597, + "grad_norm": 15.915236473083496, + "learning_rate": 1.7682383833770978e-05, + "loss": 3.1598, + "step": 28420 + }, + { + "epoch": 3.2382254114699016, + "grad_norm": 13.311419486999512, + "learning_rate": 1.767096700536591e-05, + "loss": 3.2362, + "step": 28430 + }, + { + "epoch": 3.239364428498206, + "grad_norm": 18.324195861816406, + "learning_rate": 1.765955017696084e-05, + "loss": 3.3171, + "step": 28440 + }, + { + "epoch": 3.2405034455265107, + "grad_norm": 17.85279083251953, + "learning_rate": 1.7648133348555774e-05, + "loss": 3.3206, + "step": 28450 + }, + { + "epoch": 3.241642462554815, + "grad_norm": 15.024346351623535, + "learning_rate": 1.763785820299121e-05, + "loss": 3.3791, + "step": 28460 + }, + { + "epoch": 3.24278147958312, + "grad_norm": 24.47608184814453, + "learning_rate": 1.762644137458614e-05, + "loss": 3.3351, + "step": 28470 + }, + { + "epoch": 3.243920496611424, + "grad_norm": 16.118467330932617, + "learning_rate": 1.7615024546181073e-05, + "loss": 3.241, + "step": 28480 + }, + { + "epoch": 3.245059513639729, + "grad_norm": 14.81120777130127, + "learning_rate": 1.7603607717776002e-05, + "loss": 3.3641, + "step": 28490 + }, + { + "epoch": 
3.2461985306680337, + "grad_norm": 15.765020370483398, + "learning_rate": 1.7592190889370934e-05, + "loss": 3.4755, + "step": 28500 + }, + { + "epoch": 3.247337547696338, + "grad_norm": 22.138263702392578, + "learning_rate": 1.7580774060965866e-05, + "loss": 3.1655, + "step": 28510 + }, + { + "epoch": 3.2484765647246427, + "grad_norm": 21.298864364624023, + "learning_rate": 1.7569357232560797e-05, + "loss": 3.1699, + "step": 28520 + }, + { + "epoch": 3.249615581752947, + "grad_norm": 13.472810745239258, + "learning_rate": 1.7557940404155726e-05, + "loss": 2.773, + "step": 28530 + }, + { + "epoch": 3.250754598781252, + "grad_norm": 17.579721450805664, + "learning_rate": 1.7546523575750658e-05, + "loss": 3.2553, + "step": 28540 + }, + { + "epoch": 3.251893615809556, + "grad_norm": 18.842206954956055, + "learning_rate": 1.753510674734559e-05, + "loss": 3.2994, + "step": 28550 + }, + { + "epoch": 3.253032632837861, + "grad_norm": 22.36919593811035, + "learning_rate": 1.7523689918940518e-05, + "loss": 2.9392, + "step": 28560 + }, + { + "epoch": 3.2541716498661657, + "grad_norm": 21.19603157043457, + "learning_rate": 1.751227309053545e-05, + "loss": 3.3386, + "step": 28570 + }, + { + "epoch": 3.25531066689447, + "grad_norm": 19.441781997680664, + "learning_rate": 1.750085626213038e-05, + "loss": 3.1832, + "step": 28580 + }, + { + "epoch": 3.256449683922775, + "grad_norm": 22.879182815551758, + "learning_rate": 1.7489439433725313e-05, + "loss": 3.3685, + "step": 28590 + }, + { + "epoch": 3.257588700951079, + "grad_norm": 12.05748176574707, + "learning_rate": 1.7478022605320242e-05, + "loss": 3.0341, + "step": 28600 + }, + { + "epoch": 3.258727717979384, + "grad_norm": 32.73973083496094, + "learning_rate": 1.7466605776915174e-05, + "loss": 3.0821, + "step": 28610 + }, + { + "epoch": 3.259866735007688, + "grad_norm": 18.262226104736328, + "learning_rate": 1.7455188948510102e-05, + "loss": 3.092, + "step": 28620 + }, + { + "epoch": 3.261005752035993, + "grad_norm": 
19.36309242248535, + "learning_rate": 1.7443772120105037e-05, + "loss": 3.3871, + "step": 28630 + }, + { + "epoch": 3.2621447690642977, + "grad_norm": 13.321374893188477, + "learning_rate": 1.7432355291699966e-05, + "loss": 3.2189, + "step": 28640 + }, + { + "epoch": 3.263283786092602, + "grad_norm": 16.028152465820312, + "learning_rate": 1.7420938463294898e-05, + "loss": 3.091, + "step": 28650 + }, + { + "epoch": 3.264422803120907, + "grad_norm": 15.734786987304688, + "learning_rate": 1.7409521634889826e-05, + "loss": 3.4497, + "step": 28660 + }, + { + "epoch": 3.265561820149211, + "grad_norm": 19.153057098388672, + "learning_rate": 1.739810480648476e-05, + "loss": 3.1372, + "step": 28670 + }, + { + "epoch": 3.266700837177516, + "grad_norm": 33.36358642578125, + "learning_rate": 1.738668797807969e-05, + "loss": 3.2049, + "step": 28680 + }, + { + "epoch": 3.2678398542058202, + "grad_norm": 19.029390335083008, + "learning_rate": 1.737527114967462e-05, + "loss": 3.0428, + "step": 28690 + }, + { + "epoch": 3.268978871234125, + "grad_norm": 15.155694007873535, + "learning_rate": 1.736385432126955e-05, + "loss": 3.1559, + "step": 28700 + }, + { + "epoch": 3.2701178882624298, + "grad_norm": 18.706727981567383, + "learning_rate": 1.7352437492864482e-05, + "loss": 3.0554, + "step": 28710 + }, + { + "epoch": 3.271256905290734, + "grad_norm": 17.440990447998047, + "learning_rate": 1.7341020664459414e-05, + "loss": 3.55, + "step": 28720 + }, + { + "epoch": 3.272395922319039, + "grad_norm": 23.9125919342041, + "learning_rate": 1.7329603836054346e-05, + "loss": 3.3082, + "step": 28730 + }, + { + "epoch": 3.273534939347343, + "grad_norm": 32.674766540527344, + "learning_rate": 1.7318187007649274e-05, + "loss": 3.3228, + "step": 28740 + }, + { + "epoch": 3.274673956375648, + "grad_norm": 14.836384773254395, + "learning_rate": 1.7306770179244206e-05, + "loss": 3.3278, + "step": 28750 + }, + { + "epoch": 3.2758129734039523, + "grad_norm": 18.351093292236328, + "learning_rate": 
1.7295353350839138e-05, + "loss": 3.328, + "step": 28760 + }, + { + "epoch": 3.276951990432257, + "grad_norm": 17.573938369750977, + "learning_rate": 1.728393652243407e-05, + "loss": 3.2982, + "step": 28770 + }, + { + "epoch": 3.2780910074605614, + "grad_norm": 19.292461395263672, + "learning_rate": 1.7272519694028998e-05, + "loss": 3.1923, + "step": 28780 + }, + { + "epoch": 3.279230024488866, + "grad_norm": 22.580219268798828, + "learning_rate": 1.726110286562393e-05, + "loss": 3.1601, + "step": 28790 + }, + { + "epoch": 3.2803690415171705, + "grad_norm": 24.21674346923828, + "learning_rate": 1.7249686037218862e-05, + "loss": 3.3531, + "step": 28800 + }, + { + "epoch": 3.2815080585454752, + "grad_norm": 19.79977798461914, + "learning_rate": 1.7238269208813794e-05, + "loss": 3.2324, + "step": 28810 + }, + { + "epoch": 3.28264707557378, + "grad_norm": 32.75783920288086, + "learning_rate": 1.7226852380408722e-05, + "loss": 3.0395, + "step": 28820 + }, + { + "epoch": 3.2837860926020843, + "grad_norm": 14.975643157958984, + "learning_rate": 1.7215435552003654e-05, + "loss": 3.1527, + "step": 28830 + }, + { + "epoch": 3.284925109630389, + "grad_norm": 23.69989585876465, + "learning_rate": 1.7204018723598586e-05, + "loss": 3.5149, + "step": 28840 + }, + { + "epoch": 3.2860641266586934, + "grad_norm": 15.182013511657715, + "learning_rate": 1.7192601895193518e-05, + "loss": 3.5798, + "step": 28850 + }, + { + "epoch": 3.287203143686998, + "grad_norm": 22.140403747558594, + "learning_rate": 1.7181185066788446e-05, + "loss": 3.0379, + "step": 28860 + }, + { + "epoch": 3.2883421607153025, + "grad_norm": 15.012598991394043, + "learning_rate": 1.7169768238383378e-05, + "loss": 3.3189, + "step": 28870 + }, + { + "epoch": 3.2894811777436073, + "grad_norm": 23.979196548461914, + "learning_rate": 1.715835140997831e-05, + "loss": 3.1195, + "step": 28880 + }, + { + "epoch": 3.290620194771912, + "grad_norm": 19.301538467407227, + "learning_rate": 1.714693458157324e-05, + "loss": 
3.2205, + "step": 28890 + }, + { + "epoch": 3.2917592118002164, + "grad_norm": 24.4025936126709, + "learning_rate": 1.713551775316817e-05, + "loss": 3.2443, + "step": 28900 + }, + { + "epoch": 3.292898228828521, + "grad_norm": 12.656296730041504, + "learning_rate": 1.7124100924763102e-05, + "loss": 3.0962, + "step": 28910 + }, + { + "epoch": 3.2940372458568254, + "grad_norm": 84.212890625, + "learning_rate": 1.7112684096358034e-05, + "loss": 3.1838, + "step": 28920 + }, + { + "epoch": 3.29517626288513, + "grad_norm": 19.40008544921875, + "learning_rate": 1.7101267267952965e-05, + "loss": 3.4919, + "step": 28930 + }, + { + "epoch": 3.2963152799134345, + "grad_norm": 14.742105484008789, + "learning_rate": 1.7089850439547894e-05, + "loss": 3.4597, + "step": 28940 + }, + { + "epoch": 3.2974542969417393, + "grad_norm": 27.001304626464844, + "learning_rate": 1.7078433611142826e-05, + "loss": 3.2382, + "step": 28950 + }, + { + "epoch": 3.298593313970044, + "grad_norm": 53.63911437988281, + "learning_rate": 1.7067016782737758e-05, + "loss": 3.3239, + "step": 28960 + }, + { + "epoch": 3.2997323309983484, + "grad_norm": 86.16913604736328, + "learning_rate": 1.705559995433269e-05, + "loss": 2.9274, + "step": 28970 + }, + { + "epoch": 3.300871348026653, + "grad_norm": 15.573724746704102, + "learning_rate": 1.7044183125927618e-05, + "loss": 3.355, + "step": 28980 + }, + { + "epoch": 3.3020103650549575, + "grad_norm": 17.94513702392578, + "learning_rate": 1.703276629752255e-05, + "loss": 3.3487, + "step": 28990 + }, + { + "epoch": 3.3031493820832623, + "grad_norm": 32.797203063964844, + "learning_rate": 1.702134946911748e-05, + "loss": 3.099, + "step": 29000 + }, + { + "epoch": 3.3042883991115666, + "grad_norm": 18.942058563232422, + "learning_rate": 1.700993264071241e-05, + "loss": 3.6015, + "step": 29010 + }, + { + "epoch": 3.3054274161398713, + "grad_norm": 36.88163375854492, + "learning_rate": 1.6998515812307342e-05, + "loss": 3.4283, + "step": 29020 + }, + { + "epoch": 
3.306566433168176, + "grad_norm": 14.28677749633789, + "learning_rate": 1.6987098983902274e-05, + "loss": 3.3481, + "step": 29030 + }, + { + "epoch": 3.3077054501964804, + "grad_norm": 13.87667179107666, + "learning_rate": 1.6975682155497202e-05, + "loss": 3.1402, + "step": 29040 + }, + { + "epoch": 3.308844467224785, + "grad_norm": 22.34500503540039, + "learning_rate": 1.6964265327092134e-05, + "loss": 3.1876, + "step": 29050 + }, + { + "epoch": 3.3099834842530895, + "grad_norm": 13.625529289245605, + "learning_rate": 1.6952848498687066e-05, + "loss": 3.1295, + "step": 29060 + }, + { + "epoch": 3.3111225012813943, + "grad_norm": 20.435510635375977, + "learning_rate": 1.6941431670281994e-05, + "loss": 2.9283, + "step": 29070 + }, + { + "epoch": 3.3122615183096986, + "grad_norm": 25.591533660888672, + "learning_rate": 1.6930014841876926e-05, + "loss": 3.3265, + "step": 29080 + }, + { + "epoch": 3.3134005353380034, + "grad_norm": 17.3295841217041, + "learning_rate": 1.6918598013471858e-05, + "loss": 3.1512, + "step": 29090 + }, + { + "epoch": 3.3145395523663077, + "grad_norm": 43.755069732666016, + "learning_rate": 1.690718118506679e-05, + "loss": 2.9848, + "step": 29100 + }, + { + "epoch": 3.3156785693946125, + "grad_norm": 16.596729278564453, + "learning_rate": 1.6895764356661718e-05, + "loss": 3.2103, + "step": 29110 + }, + { + "epoch": 3.316817586422917, + "grad_norm": 25.127235412597656, + "learning_rate": 1.688434752825665e-05, + "loss": 3.0511, + "step": 29120 + }, + { + "epoch": 3.3179566034512216, + "grad_norm": 24.36378288269043, + "learning_rate": 1.6872930699851582e-05, + "loss": 3.2856, + "step": 29130 + }, + { + "epoch": 3.3190956204795263, + "grad_norm": 21.73314094543457, + "learning_rate": 1.6861513871446514e-05, + "loss": 3.1421, + "step": 29140 + }, + { + "epoch": 3.3202346375078307, + "grad_norm": 17.820711135864258, + "learning_rate": 1.6850097043041442e-05, + "loss": 3.0204, + "step": 29150 + }, + { + "epoch": 3.3213736545361354, + "grad_norm": 
21.556001663208008, + "learning_rate": 1.6838680214636374e-05, + "loss": 2.9305, + "step": 29160 + }, + { + "epoch": 3.3225126715644397, + "grad_norm": 14.502067565917969, + "learning_rate": 1.6827263386231306e-05, + "loss": 3.0541, + "step": 29170 + }, + { + "epoch": 3.3236516885927445, + "grad_norm": 26.800825119018555, + "learning_rate": 1.6815846557826238e-05, + "loss": 3.1339, + "step": 29180 + }, + { + "epoch": 3.324790705621049, + "grad_norm": 17.516202926635742, + "learning_rate": 1.6804429729421166e-05, + "loss": 3.11, + "step": 29190 + }, + { + "epoch": 3.3259297226493536, + "grad_norm": 15.0630521774292, + "learning_rate": 1.6793012901016098e-05, + "loss": 3.012, + "step": 29200 + }, + { + "epoch": 3.3270687396776584, + "grad_norm": 30.267250061035156, + "learning_rate": 1.678159607261103e-05, + "loss": 3.4193, + "step": 29210 + }, + { + "epoch": 3.3282077567059627, + "grad_norm": 38.177757263183594, + "learning_rate": 1.677017924420596e-05, + "loss": 3.302, + "step": 29220 + }, + { + "epoch": 3.3293467737342675, + "grad_norm": 15.641914367675781, + "learning_rate": 1.675876241580089e-05, + "loss": 3.3494, + "step": 29230 + }, + { + "epoch": 3.330485790762572, + "grad_norm": 16.158750534057617, + "learning_rate": 1.6747345587395822e-05, + "loss": 3.3635, + "step": 29240 + }, + { + "epoch": 3.3316248077908766, + "grad_norm": 20.805484771728516, + "learning_rate": 1.6735928758990754e-05, + "loss": 3.1683, + "step": 29250 + }, + { + "epoch": 3.332763824819181, + "grad_norm": 32.77595901489258, + "learning_rate": 1.6724511930585686e-05, + "loss": 3.324, + "step": 29260 + }, + { + "epoch": 3.3339028418474856, + "grad_norm": 35.97859573364258, + "learning_rate": 1.6713095102180614e-05, + "loss": 2.9465, + "step": 29270 + }, + { + "epoch": 3.3350418588757904, + "grad_norm": 23.729339599609375, + "learning_rate": 1.6701678273775546e-05, + "loss": 3.0817, + "step": 29280 + }, + { + "epoch": 3.3361808759040947, + "grad_norm": 15.407066345214844, + "learning_rate": 
1.6690261445370478e-05, + "loss": 3.0837, + "step": 29290 + }, + { + "epoch": 3.3373198929323995, + "grad_norm": 13.66457462310791, + "learning_rate": 1.667884461696541e-05, + "loss": 3.2846, + "step": 29300 + }, + { + "epoch": 3.338458909960704, + "grad_norm": 22.89889907836914, + "learning_rate": 1.6667427788560338e-05, + "loss": 3.2511, + "step": 29310 + }, + { + "epoch": 3.3395979269890086, + "grad_norm": 131.8885498046875, + "learning_rate": 1.665601096015527e-05, + "loss": 3.0335, + "step": 29320 + }, + { + "epoch": 3.340736944017313, + "grad_norm": 20.44186019897461, + "learning_rate": 1.66445941317502e-05, + "loss": 3.2246, + "step": 29330 + }, + { + "epoch": 3.3418759610456177, + "grad_norm": 14.773825645446777, + "learning_rate": 1.6633177303345134e-05, + "loss": 3.481, + "step": 29340 + }, + { + "epoch": 3.3430149780739224, + "grad_norm": 46.8711051940918, + "learning_rate": 1.6621760474940062e-05, + "loss": 2.9988, + "step": 29350 + }, + { + "epoch": 3.3441539951022268, + "grad_norm": 22.26974868774414, + "learning_rate": 1.6610343646534994e-05, + "loss": 3.2196, + "step": 29360 + }, + { + "epoch": 3.3452930121305315, + "grad_norm": 24.883068084716797, + "learning_rate": 1.6598926818129926e-05, + "loss": 3.1028, + "step": 29370 + }, + { + "epoch": 3.346432029158836, + "grad_norm": 17.86937141418457, + "learning_rate": 1.6587509989724857e-05, + "loss": 3.3909, + "step": 29380 + }, + { + "epoch": 3.3475710461871406, + "grad_norm": 14.344905853271484, + "learning_rate": 1.6576093161319786e-05, + "loss": 3.6976, + "step": 29390 + }, + { + "epoch": 3.348710063215445, + "grad_norm": 13.908252716064453, + "learning_rate": 1.6564676332914718e-05, + "loss": 3.252, + "step": 29400 + }, + { + "epoch": 3.3498490802437497, + "grad_norm": 42.503692626953125, + "learning_rate": 1.6553259504509646e-05, + "loss": 3.5162, + "step": 29410 + }, + { + "epoch": 3.350988097272054, + "grad_norm": 23.021549224853516, + "learning_rate": 1.6541842676104578e-05, + "loss": 3.2051, 
+ "step": 29420 + }, + { + "epoch": 3.352127114300359, + "grad_norm": 13.986491203308105, + "learning_rate": 1.653042584769951e-05, + "loss": 3.0717, + "step": 29430 + }, + { + "epoch": 3.353266131328663, + "grad_norm": 23.040395736694336, + "learning_rate": 1.6519009019294442e-05, + "loss": 2.8798, + "step": 29440 + }, + { + "epoch": 3.354405148356968, + "grad_norm": 31.770923614501953, + "learning_rate": 1.650759219088937e-05, + "loss": 3.4936, + "step": 29450 + }, + { + "epoch": 3.3555441653852727, + "grad_norm": 21.78679847717285, + "learning_rate": 1.6496175362484302e-05, + "loss": 3.3232, + "step": 29460 + }, + { + "epoch": 3.356683182413577, + "grad_norm": 18.76166534423828, + "learning_rate": 1.6484758534079234e-05, + "loss": 3.3764, + "step": 29470 + }, + { + "epoch": 3.3578221994418818, + "grad_norm": 14.343433380126953, + "learning_rate": 1.6473341705674166e-05, + "loss": 3.4346, + "step": 29480 + }, + { + "epoch": 3.358961216470186, + "grad_norm": 13.465190887451172, + "learning_rate": 1.6461924877269094e-05, + "loss": 3.1158, + "step": 29490 + }, + { + "epoch": 3.360100233498491, + "grad_norm": 20.464893341064453, + "learning_rate": 1.6450508048864026e-05, + "loss": 3.3114, + "step": 29500 + }, + { + "epoch": 3.361239250526795, + "grad_norm": 14.541702270507812, + "learning_rate": 1.6439091220458958e-05, + "loss": 3.0706, + "step": 29510 + }, + { + "epoch": 3.3623782675551, + "grad_norm": 16.677627563476562, + "learning_rate": 1.6427674392053886e-05, + "loss": 3.1693, + "step": 29520 + }, + { + "epoch": 3.3635172845834047, + "grad_norm": 23.162879943847656, + "learning_rate": 1.6416257563648818e-05, + "loss": 3.3536, + "step": 29530 + }, + { + "epoch": 3.364656301611709, + "grad_norm": 17.129453659057617, + "learning_rate": 1.640484073524375e-05, + "loss": 3.1853, + "step": 29540 + }, + { + "epoch": 3.365795318640014, + "grad_norm": 11.490333557128906, + "learning_rate": 1.6393423906838682e-05, + "loss": 3.3844, + "step": 29550 + }, + { + "epoch": 
3.366934335668318, + "grad_norm": 16.73991584777832, + "learning_rate": 1.638200707843361e-05, + "loss": 3.4305, + "step": 29560 + }, + { + "epoch": 3.368073352696623, + "grad_norm": 22.227336883544922, + "learning_rate": 1.6370590250028542e-05, + "loss": 3.1781, + "step": 29570 + }, + { + "epoch": 3.369212369724927, + "grad_norm": 11.860732078552246, + "learning_rate": 1.6359173421623474e-05, + "loss": 3.0631, + "step": 29580 + }, + { + "epoch": 3.370351386753232, + "grad_norm": 24.782207489013672, + "learning_rate": 1.6347756593218406e-05, + "loss": 3.3757, + "step": 29590 + }, + { + "epoch": 3.3714904037815367, + "grad_norm": 13.964835166931152, + "learning_rate": 1.6336339764813334e-05, + "loss": 3.3611, + "step": 29600 + }, + { + "epoch": 3.372629420809841, + "grad_norm": 16.538766860961914, + "learning_rate": 1.6324922936408266e-05, + "loss": 3.3345, + "step": 29610 + }, + { + "epoch": 3.373768437838146, + "grad_norm": 13.755891799926758, + "learning_rate": 1.6313506108003198e-05, + "loss": 3.2177, + "step": 29620 + }, + { + "epoch": 3.37490745486645, + "grad_norm": 15.727559089660645, + "learning_rate": 1.630208927959813e-05, + "loss": 3.5218, + "step": 29630 + }, + { + "epoch": 3.376046471894755, + "grad_norm": 14.589932441711426, + "learning_rate": 1.6290672451193058e-05, + "loss": 3.0755, + "step": 29640 + }, + { + "epoch": 3.3771854889230593, + "grad_norm": 14.680362701416016, + "learning_rate": 1.627925562278799e-05, + "loss": 3.5294, + "step": 29650 + }, + { + "epoch": 3.378324505951364, + "grad_norm": 20.469858169555664, + "learning_rate": 1.6267838794382922e-05, + "loss": 3.106, + "step": 29660 + }, + { + "epoch": 3.379463522979669, + "grad_norm": 16.367835998535156, + "learning_rate": 1.6256421965977854e-05, + "loss": 3.355, + "step": 29670 + }, + { + "epoch": 3.380602540007973, + "grad_norm": 19.499868392944336, + "learning_rate": 1.6245005137572782e-05, + "loss": 3.2907, + "step": 29680 + }, + { + "epoch": 3.381741557036278, + "grad_norm": 
25.876239776611328, + "learning_rate": 1.6233588309167714e-05, + "loss": 2.9573, + "step": 29690 + }, + { + "epoch": 3.382880574064582, + "grad_norm": 28.356945037841797, + "learning_rate": 1.6222171480762646e-05, + "loss": 3.0147, + "step": 29700 + }, + { + "epoch": 3.384019591092887, + "grad_norm": 20.488754272460938, + "learning_rate": 1.6210754652357578e-05, + "loss": 2.8789, + "step": 29710 + }, + { + "epoch": 3.3851586081211913, + "grad_norm": 19.562116622924805, + "learning_rate": 1.6199337823952506e-05, + "loss": 3.412, + "step": 29720 + }, + { + "epoch": 3.386297625149496, + "grad_norm": 15.406684875488281, + "learning_rate": 1.6187920995547438e-05, + "loss": 3.0184, + "step": 29730 + }, + { + "epoch": 3.3874366421778004, + "grad_norm": 16.705585479736328, + "learning_rate": 1.6176504167142366e-05, + "loss": 3.0089, + "step": 29740 + }, + { + "epoch": 3.388575659206105, + "grad_norm": 17.685901641845703, + "learning_rate": 1.61650873387373e-05, + "loss": 3.8159, + "step": 29750 + }, + { + "epoch": 3.3897146762344095, + "grad_norm": 14.58100414276123, + "learning_rate": 1.615367051033223e-05, + "loss": 3.2699, + "step": 29760 + }, + { + "epoch": 3.3908536932627142, + "grad_norm": 23.760499954223633, + "learning_rate": 1.6142253681927162e-05, + "loss": 3.0052, + "step": 29770 + }, + { + "epoch": 3.391992710291019, + "grad_norm": 19.12250328063965, + "learning_rate": 1.613083685352209e-05, + "loss": 3.2497, + "step": 29780 + }, + { + "epoch": 3.3931317273193233, + "grad_norm": 13.280375480651855, + "learning_rate": 1.6119420025117026e-05, + "loss": 3.107, + "step": 29790 + }, + { + "epoch": 3.394270744347628, + "grad_norm": 17.527719497680664, + "learning_rate": 1.6108003196711954e-05, + "loss": 2.9071, + "step": 29800 + }, + { + "epoch": 3.3954097613759324, + "grad_norm": 17.869985580444336, + "learning_rate": 1.6096586368306886e-05, + "loss": 3.4961, + "step": 29810 + }, + { + "epoch": 3.396548778404237, + "grad_norm": 21.941728591918945, + "learning_rate": 
1.6085169539901814e-05, + "loss": 3.0918, + "step": 29820 + }, + { + "epoch": 3.3976877954325415, + "grad_norm": 14.9606351852417, + "learning_rate": 1.607375271149675e-05, + "loss": 3.3902, + "step": 29830 + }, + { + "epoch": 3.3988268124608463, + "grad_norm": 11.171098709106445, + "learning_rate": 1.6062335883091678e-05, + "loss": 3.2473, + "step": 29840 + }, + { + "epoch": 3.399965829489151, + "grad_norm": 11.946869850158691, + "learning_rate": 1.605091905468661e-05, + "loss": 3.0678, + "step": 29850 + }, + { + "epoch": 3.4011048465174554, + "grad_norm": 22.47144889831543, + "learning_rate": 1.6039502226281538e-05, + "loss": 3.3255, + "step": 29860 + }, + { + "epoch": 3.40224386354576, + "grad_norm": 14.231310844421387, + "learning_rate": 1.602808539787647e-05, + "loss": 3.7008, + "step": 29870 + }, + { + "epoch": 3.4033828805740645, + "grad_norm": 31.942094802856445, + "learning_rate": 1.6016668569471402e-05, + "loss": 3.1198, + "step": 29880 + }, + { + "epoch": 3.4045218976023692, + "grad_norm": 85.79116821289062, + "learning_rate": 1.6005251741066334e-05, + "loss": 3.1192, + "step": 29890 + }, + { + "epoch": 3.4056609146306736, + "grad_norm": 20.57061004638672, + "learning_rate": 1.5993834912661262e-05, + "loss": 3.2926, + "step": 29900 + }, + { + "epoch": 3.4067999316589783, + "grad_norm": 16.892438888549805, + "learning_rate": 1.5982418084256194e-05, + "loss": 3.2882, + "step": 29910 + }, + { + "epoch": 3.407938948687283, + "grad_norm": 25.468454360961914, + "learning_rate": 1.5971001255851126e-05, + "loss": 3.1861, + "step": 29920 + }, + { + "epoch": 3.4090779657155874, + "grad_norm": 18.283166885375977, + "learning_rate": 1.5959584427446054e-05, + "loss": 3.0752, + "step": 29930 + }, + { + "epoch": 3.410216982743892, + "grad_norm": 18.180814743041992, + "learning_rate": 1.5948167599040986e-05, + "loss": 3.4524, + "step": 29940 + }, + { + "epoch": 3.4113559997721965, + "grad_norm": 26.990262985229492, + "learning_rate": 1.5936750770635918e-05, + "loss": 
3.2659, + "step": 29950 + }, + { + "epoch": 3.4124950168005013, + "grad_norm": 16.079538345336914, + "learning_rate": 1.592533394223085e-05, + "loss": 3.9032, + "step": 29960 + }, + { + "epoch": 3.4136340338288056, + "grad_norm": 22.733596801757812, + "learning_rate": 1.5913917113825778e-05, + "loss": 3.2087, + "step": 29970 + }, + { + "epoch": 3.4147730508571104, + "grad_norm": 23.974925994873047, + "learning_rate": 1.590250028542071e-05, + "loss": 3.2805, + "step": 29980 + }, + { + "epoch": 3.415912067885415, + "grad_norm": 18.082353591918945, + "learning_rate": 1.5891083457015642e-05, + "loss": 3.1268, + "step": 29990 + }, + { + "epoch": 3.4170510849137194, + "grad_norm": 12.27398681640625, + "learning_rate": 1.5879666628610574e-05, + "loss": 3.3404, + "step": 30000 + }, + { + "epoch": 3.4170510849137194, + "eval_loss": 6.625226974487305, + "eval_runtime": 10.8277, + "eval_samples_per_second": 1.385, + "eval_steps_per_second": 0.185, + "step": 30000 + }, + { + "epoch": 3.418190101942024, + "grad_norm": 15.191031455993652, + "learning_rate": 1.5868249800205502e-05, + "loss": 3.0973, + "step": 30010 + }, + { + "epoch": 3.4193291189703285, + "grad_norm": 14.566495895385742, + "learning_rate": 1.5856832971800434e-05, + "loss": 3.2779, + "step": 30020 + }, + { + "epoch": 3.4204681359986333, + "grad_norm": 19.6086483001709, + "learning_rate": 1.5845416143395366e-05, + "loss": 2.934, + "step": 30030 + }, + { + "epoch": 3.4216071530269376, + "grad_norm": 23.241235733032227, + "learning_rate": 1.5833999314990298e-05, + "loss": 3.334, + "step": 30040 + }, + { + "epoch": 3.4227461700552424, + "grad_norm": 14.927157402038574, + "learning_rate": 1.5822582486585226e-05, + "loss": 3.4852, + "step": 30050 + }, + { + "epoch": 3.4238851870835467, + "grad_norm": 26.401268005371094, + "learning_rate": 1.5811165658180158e-05, + "loss": 3.1974, + "step": 30060 + }, + { + "epoch": 3.4250242041118515, + "grad_norm": 18.7486515045166, + "learning_rate": 1.5799748829775086e-05, + "loss": 
3.0255, + "step": 30070 + }, + { + "epoch": 3.426163221140156, + "grad_norm": 16.352279663085938, + "learning_rate": 1.5788332001370022e-05, + "loss": 3.306, + "step": 30080 + }, + { + "epoch": 3.4273022381684606, + "grad_norm": 16.137226104736328, + "learning_rate": 1.577691517296495e-05, + "loss": 3.5011, + "step": 30090 + }, + { + "epoch": 3.4284412551967653, + "grad_norm": 11.79498291015625, + "learning_rate": 1.5765498344559882e-05, + "loss": 2.9683, + "step": 30100 + }, + { + "epoch": 3.4295802722250697, + "grad_norm": 55.726356506347656, + "learning_rate": 1.575408151615481e-05, + "loss": 2.8664, + "step": 30110 + }, + { + "epoch": 3.4307192892533744, + "grad_norm": 13.45319652557373, + "learning_rate": 1.5742664687749746e-05, + "loss": 3.3007, + "step": 30120 + }, + { + "epoch": 3.4318583062816788, + "grad_norm": 34.02750015258789, + "learning_rate": 1.5731247859344674e-05, + "loss": 3.0453, + "step": 30130 + }, + { + "epoch": 3.4329973233099835, + "grad_norm": 36.55954360961914, + "learning_rate": 1.5719831030939606e-05, + "loss": 3.1496, + "step": 30140 + }, + { + "epoch": 3.434136340338288, + "grad_norm": 27.306396484375, + "learning_rate": 1.5708414202534534e-05, + "loss": 3.196, + "step": 30150 + }, + { + "epoch": 3.4352753573665926, + "grad_norm": 12.680956840515137, + "learning_rate": 1.569699737412947e-05, + "loss": 3.3181, + "step": 30160 + }, + { + "epoch": 3.4364143743948974, + "grad_norm": 18.007972717285156, + "learning_rate": 1.5685580545724398e-05, + "loss": 2.9668, + "step": 30170 + }, + { + "epoch": 3.4375533914232017, + "grad_norm": 19.509634017944336, + "learning_rate": 1.567416371731933e-05, + "loss": 3.5464, + "step": 30180 + }, + { + "epoch": 3.4386924084515065, + "grad_norm": 28.20089340209961, + "learning_rate": 1.566274688891426e-05, + "loss": 3.1637, + "step": 30190 + }, + { + "epoch": 3.439831425479811, + "grad_norm": 55.81390380859375, + "learning_rate": 1.5651330060509194e-05, + "loss": 3.2181, + "step": 30200 + }, + { + 
"epoch": 3.4409704425081156, + "grad_norm": 17.022470474243164, + "learning_rate": 1.5639913232104122e-05, + "loss": 3.2408, + "step": 30210 + }, + { + "epoch": 3.44210945953642, + "grad_norm": 19.492403030395508, + "learning_rate": 1.5628496403699054e-05, + "loss": 3.2508, + "step": 30220 + }, + { + "epoch": 3.4432484765647247, + "grad_norm": 22.010845184326172, + "learning_rate": 1.5617079575293982e-05, + "loss": 3.2779, + "step": 30230 + }, + { + "epoch": 3.4443874935930294, + "grad_norm": 20.154726028442383, + "learning_rate": 1.5605662746888918e-05, + "loss": 3.3812, + "step": 30240 + }, + { + "epoch": 3.4455265106213337, + "grad_norm": 17.139026641845703, + "learning_rate": 1.5594245918483846e-05, + "loss": 3.1647, + "step": 30250 + }, + { + "epoch": 3.4466655276496385, + "grad_norm": 17.56887435913086, + "learning_rate": 1.5582829090078778e-05, + "loss": 3.4022, + "step": 30260 + }, + { + "epoch": 3.447804544677943, + "grad_norm": 27.469118118286133, + "learning_rate": 1.5571412261673706e-05, + "loss": 2.9381, + "step": 30270 + }, + { + "epoch": 3.4489435617062476, + "grad_norm": 26.647045135498047, + "learning_rate": 1.555999543326864e-05, + "loss": 3.7534, + "step": 30280 + }, + { + "epoch": 3.450082578734552, + "grad_norm": 31.436521530151367, + "learning_rate": 1.554857860486357e-05, + "loss": 3.2533, + "step": 30290 + }, + { + "epoch": 3.4512215957628567, + "grad_norm": 16.83370018005371, + "learning_rate": 1.5537161776458502e-05, + "loss": 3.0931, + "step": 30300 + }, + { + "epoch": 3.4523606127911615, + "grad_norm": 16.535404205322266, + "learning_rate": 1.552574494805343e-05, + "loss": 3.1092, + "step": 30310 + }, + { + "epoch": 3.453499629819466, + "grad_norm": 30.36872673034668, + "learning_rate": 1.5514328119648362e-05, + "loss": 2.9677, + "step": 30320 + }, + { + "epoch": 3.4546386468477706, + "grad_norm": 12.996785163879395, + "learning_rate": 1.5502911291243294e-05, + "loss": 2.9729, + "step": 30330 + }, + { + "epoch": 3.455777663876075, + 
"grad_norm": 45.42363357543945, + "learning_rate": 1.5491494462838226e-05, + "loss": 2.915, + "step": 30340 + }, + { + "epoch": 3.4569166809043796, + "grad_norm": 16.04938316345215, + "learning_rate": 1.5480077634433154e-05, + "loss": 3.538, + "step": 30350 + }, + { + "epoch": 3.458055697932684, + "grad_norm": 17.409317016601562, + "learning_rate": 1.5468660806028086e-05, + "loss": 3.0676, + "step": 30360 + }, + { + "epoch": 3.4591947149609887, + "grad_norm": 26.960065841674805, + "learning_rate": 1.5457243977623018e-05, + "loss": 3.8267, + "step": 30370 + }, + { + "epoch": 3.460333731989293, + "grad_norm": 21.414134979248047, + "learning_rate": 1.5445827149217946e-05, + "loss": 3.2971, + "step": 30380 + }, + { + "epoch": 3.461472749017598, + "grad_norm": 15.664322853088379, + "learning_rate": 1.5434410320812878e-05, + "loss": 3.0403, + "step": 30390 + }, + { + "epoch": 3.462611766045902, + "grad_norm": 39.66256332397461, + "learning_rate": 1.542299349240781e-05, + "loss": 3.1763, + "step": 30400 + }, + { + "epoch": 3.463750783074207, + "grad_norm": 18.446809768676758, + "learning_rate": 1.5411576664002742e-05, + "loss": 3.4536, + "step": 30410 + }, + { + "epoch": 3.4648898001025117, + "grad_norm": 19.519460678100586, + "learning_rate": 1.540015983559767e-05, + "loss": 3.5506, + "step": 30420 + }, + { + "epoch": 3.466028817130816, + "grad_norm": 17.096614837646484, + "learning_rate": 1.5388743007192602e-05, + "loss": 3.2732, + "step": 30430 + }, + { + "epoch": 3.4671678341591208, + "grad_norm": 20.875993728637695, + "learning_rate": 1.537732617878753e-05, + "loss": 2.9171, + "step": 30440 + }, + { + "epoch": 3.468306851187425, + "grad_norm": 23.673830032348633, + "learning_rate": 1.5365909350382466e-05, + "loss": 3.0686, + "step": 30450 + }, + { + "epoch": 3.46944586821573, + "grad_norm": 25.037826538085938, + "learning_rate": 1.5354492521977394e-05, + "loss": 3.1601, + "step": 30460 + }, + { + "epoch": 3.470584885244034, + "grad_norm": 18.874675750732422, + 
"learning_rate": 1.5343075693572326e-05, + "loss": 3.2137, + "step": 30470 + }, + { + "epoch": 3.471723902272339, + "grad_norm": 20.864459991455078, + "learning_rate": 1.5331658865167255e-05, + "loss": 3.2251, + "step": 30480 + }, + { + "epoch": 3.4728629193006437, + "grad_norm": 12.102560997009277, + "learning_rate": 1.532024203676219e-05, + "loss": 3.3722, + "step": 30490 + }, + { + "epoch": 3.474001936328948, + "grad_norm": 19.17115020751953, + "learning_rate": 1.5308825208357118e-05, + "loss": 3.0327, + "step": 30500 + }, + { + "epoch": 3.475140953357253, + "grad_norm": 14.776339530944824, + "learning_rate": 1.529740837995205e-05, + "loss": 3.2387, + "step": 30510 + }, + { + "epoch": 3.476279970385557, + "grad_norm": 21.124536514282227, + "learning_rate": 1.528599155154698e-05, + "loss": 3.3011, + "step": 30520 + }, + { + "epoch": 3.477418987413862, + "grad_norm": 18.17337417602539, + "learning_rate": 1.5274574723141914e-05, + "loss": 3.2292, + "step": 30530 + }, + { + "epoch": 3.4785580044421662, + "grad_norm": 14.608512878417969, + "learning_rate": 1.5263157894736842e-05, + "loss": 3.1605, + "step": 30540 + }, + { + "epoch": 3.479697021470471, + "grad_norm": 15.62062931060791, + "learning_rate": 1.5251741066331774e-05, + "loss": 3.2276, + "step": 30550 + }, + { + "epoch": 3.4808360384987758, + "grad_norm": 39.68000411987305, + "learning_rate": 1.5240324237926704e-05, + "loss": 3.7446, + "step": 30560 + }, + { + "epoch": 3.48197505552708, + "grad_norm": 28.0665340423584, + "learning_rate": 1.5228907409521636e-05, + "loss": 2.9518, + "step": 30570 + }, + { + "epoch": 3.483114072555385, + "grad_norm": 14.938121795654297, + "learning_rate": 1.5217490581116566e-05, + "loss": 3.2497, + "step": 30580 + }, + { + "epoch": 3.484253089583689, + "grad_norm": 15.683838844299316, + "learning_rate": 1.5206073752711496e-05, + "loss": 2.9266, + "step": 30590 + }, + { + "epoch": 3.485392106611994, + "grad_norm": 36.78779602050781, + "learning_rate": 1.5194656924306428e-05, + 
"loss": 3.3181, + "step": 30600 + }, + { + "epoch": 3.4865311236402983, + "grad_norm": 21.261545181274414, + "learning_rate": 1.518324009590136e-05, + "loss": 3.148, + "step": 30610 + }, + { + "epoch": 3.487670140668603, + "grad_norm": 15.538030624389648, + "learning_rate": 1.517182326749629e-05, + "loss": 3.21, + "step": 30620 + }, + { + "epoch": 3.488809157696908, + "grad_norm": 16.829191207885742, + "learning_rate": 1.516040643909122e-05, + "loss": 3.0878, + "step": 30630 + }, + { + "epoch": 3.489948174725212, + "grad_norm": 18.444971084594727, + "learning_rate": 1.514898961068615e-05, + "loss": 3.414, + "step": 30640 + }, + { + "epoch": 3.491087191753517, + "grad_norm": 20.05096435546875, + "learning_rate": 1.5137572782281084e-05, + "loss": 3.4709, + "step": 30650 + }, + { + "epoch": 3.492226208781821, + "grad_norm": 18.415931701660156, + "learning_rate": 1.5126155953876014e-05, + "loss": 3.0899, + "step": 30660 + }, + { + "epoch": 3.493365225810126, + "grad_norm": 17.503332138061523, + "learning_rate": 1.5114739125470944e-05, + "loss": 3.2454, + "step": 30670 + }, + { + "epoch": 3.4945042428384303, + "grad_norm": 12.981982231140137, + "learning_rate": 1.5103322297065874e-05, + "loss": 3.6228, + "step": 30680 + }, + { + "epoch": 3.495643259866735, + "grad_norm": 13.89370346069336, + "learning_rate": 1.5091905468660808e-05, + "loss": 2.9744, + "step": 30690 + }, + { + "epoch": 3.4967822768950394, + "grad_norm": 17.46722984313965, + "learning_rate": 1.5080488640255738e-05, + "loss": 2.9983, + "step": 30700 + }, + { + "epoch": 3.497921293923344, + "grad_norm": 20.388866424560547, + "learning_rate": 1.5069071811850668e-05, + "loss": 3.3692, + "step": 30710 + }, + { + "epoch": 3.4990603109516485, + "grad_norm": 18.890443801879883, + "learning_rate": 1.5057654983445598e-05, + "loss": 3.1463, + "step": 30720 + }, + { + "epoch": 3.5001993279799533, + "grad_norm": 20.678749084472656, + "learning_rate": 1.5046238155040532e-05, + "loss": 3.3334, + "step": 30730 + }, + { + 
"epoch": 3.501338345008258, + "grad_norm": 35.57350158691406, + "learning_rate": 1.5034821326635462e-05, + "loss": 3.0429, + "step": 30740 + }, + { + "epoch": 3.5024773620365623, + "grad_norm": 16.265722274780273, + "learning_rate": 1.5023404498230392e-05, + "loss": 3.4358, + "step": 30750 + }, + { + "epoch": 3.503616379064867, + "grad_norm": 21.30552864074707, + "learning_rate": 1.5011987669825322e-05, + "loss": 3.4364, + "step": 30760 + }, + { + "epoch": 3.5047553960931714, + "grad_norm": 14.750267028808594, + "learning_rate": 1.5000570841420252e-05, + "loss": 3.2435, + "step": 30770 + }, + { + "epoch": 3.505894413121476, + "grad_norm": 22.640623092651367, + "learning_rate": 1.4989154013015186e-05, + "loss": 2.9251, + "step": 30780 + }, + { + "epoch": 3.5070334301497805, + "grad_norm": 15.266586303710938, + "learning_rate": 1.4977737184610116e-05, + "loss": 3.1723, + "step": 30790 + }, + { + "epoch": 3.5081724471780853, + "grad_norm": 17.580577850341797, + "learning_rate": 1.4966320356205046e-05, + "loss": 3.2163, + "step": 30800 + }, + { + "epoch": 3.50931146420639, + "grad_norm": 25.055023193359375, + "learning_rate": 1.4954903527799976e-05, + "loss": 3.2747, + "step": 30810 + }, + { + "epoch": 3.5104504812346944, + "grad_norm": 15.265680313110352, + "learning_rate": 1.494348669939491e-05, + "loss": 2.945, + "step": 30820 + }, + { + "epoch": 3.511589498262999, + "grad_norm": 43.274559020996094, + "learning_rate": 1.493206987098984e-05, + "loss": 3.0168, + "step": 30830 + }, + { + "epoch": 3.5127285152913035, + "grad_norm": 17.391170501708984, + "learning_rate": 1.492065304258477e-05, + "loss": 3.5466, + "step": 30840 + }, + { + "epoch": 3.5138675323196082, + "grad_norm": 34.3690071105957, + "learning_rate": 1.49092362141797e-05, + "loss": 3.611, + "step": 30850 + }, + { + "epoch": 3.5150065493479126, + "grad_norm": 18.427181243896484, + "learning_rate": 1.4897819385774634e-05, + "loss": 3.0353, + "step": 30860 + }, + { + "epoch": 3.5161455663762173, + 
"grad_norm": 22.536880493164062, + "learning_rate": 1.4886402557369564e-05, + "loss": 3.4574, + "step": 30870 + }, + { + "epoch": 3.517284583404522, + "grad_norm": 17.170167922973633, + "learning_rate": 1.4874985728964494e-05, + "loss": 3.3474, + "step": 30880 + }, + { + "epoch": 3.5184236004328264, + "grad_norm": 22.633150100708008, + "learning_rate": 1.4863568900559424e-05, + "loss": 3.0808, + "step": 30890 + }, + { + "epoch": 3.519562617461131, + "grad_norm": 24.105703353881836, + "learning_rate": 1.4852152072154358e-05, + "loss": 3.5359, + "step": 30900 + }, + { + "epoch": 3.5207016344894355, + "grad_norm": 16.735557556152344, + "learning_rate": 1.4840735243749288e-05, + "loss": 3.5234, + "step": 30910 + }, + { + "epoch": 3.5218406515177403, + "grad_norm": 17.64985466003418, + "learning_rate": 1.4829318415344218e-05, + "loss": 3.2298, + "step": 30920 + }, + { + "epoch": 3.5229796685460446, + "grad_norm": 33.44835662841797, + "learning_rate": 1.4817901586939148e-05, + "loss": 3.3543, + "step": 30930 + }, + { + "epoch": 3.5241186855743494, + "grad_norm": 28.6575984954834, + "learning_rate": 1.4806484758534082e-05, + "loss": 3.1647, + "step": 30940 + }, + { + "epoch": 3.525257702602654, + "grad_norm": 16.440593719482422, + "learning_rate": 1.4795067930129012e-05, + "loss": 3.3117, + "step": 30950 + }, + { + "epoch": 3.5263967196309585, + "grad_norm": 26.83173370361328, + "learning_rate": 1.4783651101723942e-05, + "loss": 3.2006, + "step": 30960 + }, + { + "epoch": 3.527535736659263, + "grad_norm": 17.046058654785156, + "learning_rate": 1.4772234273318872e-05, + "loss": 3.1932, + "step": 30970 + }, + { + "epoch": 3.5286747536875676, + "grad_norm": 32.76432418823242, + "learning_rate": 1.4760817444913804e-05, + "loss": 3.6004, + "step": 30980 + }, + { + "epoch": 3.5298137707158723, + "grad_norm": 15.658669471740723, + "learning_rate": 1.4749400616508734e-05, + "loss": 3.0127, + "step": 30990 + }, + { + "epoch": 3.5309527877441766, + "grad_norm": 24.679824829101562, 
+ "learning_rate": 1.4737983788103666e-05, + "loss": 3.3527, + "step": 31000 + }, + { + "epoch": 3.5320918047724814, + "grad_norm": 11.274155616760254, + "learning_rate": 1.4726566959698596e-05, + "loss": 2.9699, + "step": 31010 + }, + { + "epoch": 3.533230821800786, + "grad_norm": 10.21949291229248, + "learning_rate": 1.4715150131293528e-05, + "loss": 3.4614, + "step": 31020 + }, + { + "epoch": 3.5343698388290905, + "grad_norm": 13.77395248413086, + "learning_rate": 1.4703733302888458e-05, + "loss": 3.1646, + "step": 31030 + }, + { + "epoch": 3.535508855857395, + "grad_norm": 21.251293182373047, + "learning_rate": 1.4692316474483388e-05, + "loss": 3.286, + "step": 31040 + }, + { + "epoch": 3.5366478728856996, + "grad_norm": 13.558059692382812, + "learning_rate": 1.468089964607832e-05, + "loss": 3.0235, + "step": 31050 + }, + { + "epoch": 3.5377868899140044, + "grad_norm": 46.95809555053711, + "learning_rate": 1.4669482817673252e-05, + "loss": 3.0121, + "step": 31060 + }, + { + "epoch": 3.5389259069423087, + "grad_norm": 17.31770896911621, + "learning_rate": 1.4658065989268182e-05, + "loss": 3.4192, + "step": 31070 + }, + { + "epoch": 3.5400649239706135, + "grad_norm": 17.153045654296875, + "learning_rate": 1.4646649160863112e-05, + "loss": 3.0657, + "step": 31080 + }, + { + "epoch": 3.5412039409989178, + "grad_norm": 19.619625091552734, + "learning_rate": 1.4635232332458042e-05, + "loss": 2.9207, + "step": 31090 + }, + { + "epoch": 3.5423429580272225, + "grad_norm": 23.412649154663086, + "learning_rate": 1.4623815504052973e-05, + "loss": 3.4202, + "step": 31100 + }, + { + "epoch": 3.543481975055527, + "grad_norm": 24.09189224243164, + "learning_rate": 1.4612398675647906e-05, + "loss": 3.1347, + "step": 31110 + }, + { + "epoch": 3.5446209920838316, + "grad_norm": 28.67925453186035, + "learning_rate": 1.4600981847242836e-05, + "loss": 3.0364, + "step": 31120 + }, + { + "epoch": 3.5457600091121364, + "grad_norm": 25.847000122070312, + "learning_rate": 
1.4589565018837766e-05, + "loss": 3.2325, + "step": 31130 + }, + { + "epoch": 3.5468990261404407, + "grad_norm": 12.999897956848145, + "learning_rate": 1.4578148190432697e-05, + "loss": 3.3743, + "step": 31140 + }, + { + "epoch": 3.5480380431687455, + "grad_norm": 32.32277297973633, + "learning_rate": 1.456673136202763e-05, + "loss": 3.0741, + "step": 31150 + }, + { + "epoch": 3.54917706019705, + "grad_norm": 22.80971908569336, + "learning_rate": 1.455531453362256e-05, + "loss": 3.1639, + "step": 31160 + }, + { + "epoch": 3.5503160772253546, + "grad_norm": 28.205598831176758, + "learning_rate": 1.454389770521749e-05, + "loss": 3.3927, + "step": 31170 + }, + { + "epoch": 3.551455094253659, + "grad_norm": 26.13220977783203, + "learning_rate": 1.453248087681242e-05, + "loss": 3.2931, + "step": 31180 + }, + { + "epoch": 3.5525941112819637, + "grad_norm": 17.164941787719727, + "learning_rate": 1.4521064048407354e-05, + "loss": 3.5586, + "step": 31190 + }, + { + "epoch": 3.5537331283102684, + "grad_norm": 21.556045532226562, + "learning_rate": 1.4509647220002284e-05, + "loss": 3.3343, + "step": 31200 + }, + { + "epoch": 3.5548721453385728, + "grad_norm": 33.504573822021484, + "learning_rate": 1.4498230391597214e-05, + "loss": 3.0983, + "step": 31210 + }, + { + "epoch": 3.5560111623668775, + "grad_norm": 14.091229438781738, + "learning_rate": 1.4486813563192144e-05, + "loss": 2.8769, + "step": 31220 + }, + { + "epoch": 3.557150179395182, + "grad_norm": 21.432533264160156, + "learning_rate": 1.4475396734787078e-05, + "loss": 3.091, + "step": 31230 + }, + { + "epoch": 3.5582891964234866, + "grad_norm": 18.884870529174805, + "learning_rate": 1.4463979906382008e-05, + "loss": 3.3367, + "step": 31240 + }, + { + "epoch": 3.559428213451791, + "grad_norm": 16.285051345825195, + "learning_rate": 1.4452563077976938e-05, + "loss": 3.1689, + "step": 31250 + }, + { + "epoch": 3.5605672304800957, + "grad_norm": 23.484182357788086, + "learning_rate": 1.4441146249571868e-05, + "loss": 
3.1922, + "step": 31260 + }, + { + "epoch": 3.5617062475084005, + "grad_norm": 15.000371932983398, + "learning_rate": 1.4429729421166802e-05, + "loss": 3.5585, + "step": 31270 + }, + { + "epoch": 3.562845264536705, + "grad_norm": 33.48192596435547, + "learning_rate": 1.4418312592761732e-05, + "loss": 3.3127, + "step": 31280 + }, + { + "epoch": 3.563984281565009, + "grad_norm": 27.877798080444336, + "learning_rate": 1.4406895764356662e-05, + "loss": 3.4143, + "step": 31290 + }, + { + "epoch": 3.565123298593314, + "grad_norm": 22.70536994934082, + "learning_rate": 1.4395478935951592e-05, + "loss": 3.316, + "step": 31300 + }, + { + "epoch": 3.5662623156216187, + "grad_norm": 20.055557250976562, + "learning_rate": 1.4384062107546526e-05, + "loss": 3.3235, + "step": 31310 + }, + { + "epoch": 3.567401332649923, + "grad_norm": 15.23216724395752, + "learning_rate": 1.4372645279141456e-05, + "loss": 3.4314, + "step": 31320 + }, + { + "epoch": 3.5685403496782278, + "grad_norm": 22.534870147705078, + "learning_rate": 1.4361228450736386e-05, + "loss": 3.1412, + "step": 31330 + }, + { + "epoch": 3.5696793667065325, + "grad_norm": 17.03142738342285, + "learning_rate": 1.4349811622331316e-05, + "loss": 3.5653, + "step": 31340 + }, + { + "epoch": 3.570818383734837, + "grad_norm": 17.72332000732422, + "learning_rate": 1.433839479392625e-05, + "loss": 3.2854, + "step": 31350 + }, + { + "epoch": 3.571957400763141, + "grad_norm": 18.860029220581055, + "learning_rate": 1.432697796552118e-05, + "loss": 3.3297, + "step": 31360 + }, + { + "epoch": 3.573096417791446, + "grad_norm": 15.948371887207031, + "learning_rate": 1.431556113711611e-05, + "loss": 3.2231, + "step": 31370 + }, + { + "epoch": 3.5742354348197507, + "grad_norm": 14.283364295959473, + "learning_rate": 1.430414430871104e-05, + "loss": 3.0603, + "step": 31380 + }, + { + "epoch": 3.575374451848055, + "grad_norm": 23.466094970703125, + "learning_rate": 1.4292727480305972e-05, + "loss": 3.3291, + "step": 31390 + }, + { + 
"epoch": 3.57651346887636, + "grad_norm": 14.758607864379883, + "learning_rate": 1.4281310651900904e-05, + "loss": 3.2652, + "step": 31400 + }, + { + "epoch": 3.577652485904664, + "grad_norm": 29.233253479003906, + "learning_rate": 1.4269893823495834e-05, + "loss": 2.9194, + "step": 31410 + }, + { + "epoch": 3.578791502932969, + "grad_norm": 93.158935546875, + "learning_rate": 1.4258476995090764e-05, + "loss": 3.2382, + "step": 31420 + }, + { + "epoch": 3.579930519961273, + "grad_norm": 15.761682510375977, + "learning_rate": 1.4247060166685696e-05, + "loss": 3.5805, + "step": 31430 + }, + { + "epoch": 3.581069536989578, + "grad_norm": 20.996610641479492, + "learning_rate": 1.4235643338280626e-05, + "loss": 3.0408, + "step": 31440 + }, + { + "epoch": 3.5822085540178827, + "grad_norm": 38.895912170410156, + "learning_rate": 1.4224226509875558e-05, + "loss": 3.1143, + "step": 31450 + }, + { + "epoch": 3.583347571046187, + "grad_norm": 18.48572540283203, + "learning_rate": 1.4212809681470488e-05, + "loss": 3.1559, + "step": 31460 + }, + { + "epoch": 3.584486588074492, + "grad_norm": 19.490568161010742, + "learning_rate": 1.4202534535905926e-05, + "loss": 3.606, + "step": 31470 + }, + { + "epoch": 3.585625605102796, + "grad_norm": 17.260679244995117, + "learning_rate": 1.4191117707500856e-05, + "loss": 3.3253, + "step": 31480 + }, + { + "epoch": 3.586764622131101, + "grad_norm": 23.358652114868164, + "learning_rate": 1.417970087909579e-05, + "loss": 3.1187, + "step": 31490 + }, + { + "epoch": 3.5879036391594052, + "grad_norm": 23.210037231445312, + "learning_rate": 1.416828405069072e-05, + "loss": 3.2333, + "step": 31500 + }, + { + "epoch": 3.58904265618771, + "grad_norm": 18.232166290283203, + "learning_rate": 1.415686722228565e-05, + "loss": 3.274, + "step": 31510 + }, + { + "epoch": 3.590181673216015, + "grad_norm": 35.253353118896484, + "learning_rate": 1.414545039388058e-05, + "loss": 2.9219, + "step": 31520 + }, + { + "epoch": 3.591320690244319, + "grad_norm": 
14.418645858764648, + "learning_rate": 1.4134033565475514e-05, + "loss": 3.354, + "step": 31530 + }, + { + "epoch": 3.592459707272624, + "grad_norm": 13.531665802001953, + "learning_rate": 1.4122616737070444e-05, + "loss": 3.4503, + "step": 31540 + }, + { + "epoch": 3.593598724300928, + "grad_norm": 16.598588943481445, + "learning_rate": 1.4111199908665374e-05, + "loss": 3.4679, + "step": 31550 + }, + { + "epoch": 3.594737741329233, + "grad_norm": 16.628454208374023, + "learning_rate": 1.4099783080260304e-05, + "loss": 3.4018, + "step": 31560 + }, + { + "epoch": 3.5958767583575373, + "grad_norm": 16.88093376159668, + "learning_rate": 1.4088366251855234e-05, + "loss": 3.5366, + "step": 31570 + }, + { + "epoch": 3.597015775385842, + "grad_norm": 23.86626625061035, + "learning_rate": 1.4076949423450168e-05, + "loss": 3.271, + "step": 31580 + }, + { + "epoch": 3.598154792414147, + "grad_norm": 15.128466606140137, + "learning_rate": 1.4065532595045098e-05, + "loss": 3.163, + "step": 31590 + }, + { + "epoch": 3.599293809442451, + "grad_norm": 18.111583709716797, + "learning_rate": 1.4054115766640028e-05, + "loss": 3.3405, + "step": 31600 + }, + { + "epoch": 3.6004328264707555, + "grad_norm": 19.99451446533203, + "learning_rate": 1.4042698938234958e-05, + "loss": 3.4593, + "step": 31610 + }, + { + "epoch": 3.6015718434990602, + "grad_norm": 14.503628730773926, + "learning_rate": 1.4031282109829892e-05, + "loss": 3.412, + "step": 31620 + }, + { + "epoch": 3.602710860527365, + "grad_norm": 21.69760513305664, + "learning_rate": 1.4019865281424822e-05, + "loss": 3.3879, + "step": 31630 + }, + { + "epoch": 3.6038498775556693, + "grad_norm": 15.388642311096191, + "learning_rate": 1.4008448453019752e-05, + "loss": 2.9837, + "step": 31640 + }, + { + "epoch": 3.604988894583974, + "grad_norm": 20.263608932495117, + "learning_rate": 1.3997031624614682e-05, + "loss": 3.4145, + "step": 31650 + }, + { + "epoch": 3.606127911612279, + "grad_norm": 18.486692428588867, + "learning_rate": 
1.3985614796209614e-05, + "loss": 3.4104, + "step": 31660 + }, + { + "epoch": 3.607266928640583, + "grad_norm": 18.97177505493164, + "learning_rate": 1.3974197967804544e-05, + "loss": 3.341, + "step": 31670 + }, + { + "epoch": 3.6084059456688875, + "grad_norm": 24.558284759521484, + "learning_rate": 1.3962781139399476e-05, + "loss": 3.3603, + "step": 31680 + }, + { + "epoch": 3.6095449626971923, + "grad_norm": 28.811580657958984, + "learning_rate": 1.3951364310994406e-05, + "loss": 2.9241, + "step": 31690 + }, + { + "epoch": 3.610683979725497, + "grad_norm": 20.004981994628906, + "learning_rate": 1.3939947482589338e-05, + "loss": 3.2403, + "step": 31700 + }, + { + "epoch": 3.6118229967538014, + "grad_norm": 23.830997467041016, + "learning_rate": 1.3928530654184268e-05, + "loss": 3.1013, + "step": 31710 + }, + { + "epoch": 3.612962013782106, + "grad_norm": 18.13580894470215, + "learning_rate": 1.3917113825779198e-05, + "loss": 3.2296, + "step": 31720 + }, + { + "epoch": 3.6141010308104105, + "grad_norm": 38.78871536254883, + "learning_rate": 1.390569699737413e-05, + "loss": 2.7775, + "step": 31730 + }, + { + "epoch": 3.615240047838715, + "grad_norm": 19.123943328857422, + "learning_rate": 1.3894280168969062e-05, + "loss": 3.4268, + "step": 31740 + }, + { + "epoch": 3.6163790648670195, + "grad_norm": 78.86042785644531, + "learning_rate": 1.3882863340563992e-05, + "loss": 3.2857, + "step": 31750 + }, + { + "epoch": 3.6175180818953243, + "grad_norm": 15.203715324401855, + "learning_rate": 1.3871446512158922e-05, + "loss": 3.6243, + "step": 31760 + }, + { + "epoch": 3.618657098923629, + "grad_norm": 12.27120304107666, + "learning_rate": 1.3860029683753852e-05, + "loss": 3.2453, + "step": 31770 + }, + { + "epoch": 3.6197961159519334, + "grad_norm": 13.538084983825684, + "learning_rate": 1.3848612855348786e-05, + "loss": 3.5178, + "step": 31780 + }, + { + "epoch": 3.620935132980238, + "grad_norm": 14.385091781616211, + "learning_rate": 1.3837196026943716e-05, + "loss": 
3.0514, + "step": 31790 + }, + { + "epoch": 3.6220741500085425, + "grad_norm": 16.23875617980957, + "learning_rate": 1.3825779198538646e-05, + "loss": 3.1698, + "step": 31800 + }, + { + "epoch": 3.6232131670368473, + "grad_norm": 20.820890426635742, + "learning_rate": 1.3814362370133576e-05, + "loss": 3.1269, + "step": 31810 + }, + { + "epoch": 3.6243521840651516, + "grad_norm": 32.737640380859375, + "learning_rate": 1.380294554172851e-05, + "loss": 3.2673, + "step": 31820 + }, + { + "epoch": 3.6254912010934564, + "grad_norm": 40.11422348022461, + "learning_rate": 1.379152871332344e-05, + "loss": 3.286, + "step": 31830 + }, + { + "epoch": 3.626630218121761, + "grad_norm": 17.645458221435547, + "learning_rate": 1.378011188491837e-05, + "loss": 3.0182, + "step": 31840 + }, + { + "epoch": 3.6277692351500654, + "grad_norm": 26.229267120361328, + "learning_rate": 1.37686950565133e-05, + "loss": 3.103, + "step": 31850 + }, + { + "epoch": 3.62890825217837, + "grad_norm": 16.306425094604492, + "learning_rate": 1.3757278228108234e-05, + "loss": 3.0238, + "step": 31860 + }, + { + "epoch": 3.6300472692066745, + "grad_norm": 17.451791763305664, + "learning_rate": 1.3745861399703164e-05, + "loss": 3.1266, + "step": 31870 + }, + { + "epoch": 3.6311862862349793, + "grad_norm": 9.61365795135498, + "learning_rate": 1.3734444571298094e-05, + "loss": 3.0935, + "step": 31880 + }, + { + "epoch": 3.6323253032632836, + "grad_norm": 27.982622146606445, + "learning_rate": 1.3723027742893024e-05, + "loss": 3.1049, + "step": 31890 + }, + { + "epoch": 3.6334643202915884, + "grad_norm": 18.57550048828125, + "learning_rate": 1.3711610914487954e-05, + "loss": 3.1093, + "step": 31900 + }, + { + "epoch": 3.634603337319893, + "grad_norm": 13.69741153717041, + "learning_rate": 1.3700194086082888e-05, + "loss": 3.0639, + "step": 31910 + }, + { + "epoch": 3.6357423543481975, + "grad_norm": 15.091955184936523, + "learning_rate": 1.3688777257677818e-05, + "loss": 3.1179, + "step": 31920 + }, + { + 
"epoch": 3.636881371376502, + "grad_norm": 15.062325477600098, + "learning_rate": 1.3677360429272748e-05, + "loss": 3.6283, + "step": 31930 + }, + { + "epoch": 3.6380203884048066, + "grad_norm": 26.963247299194336, + "learning_rate": 1.3665943600867678e-05, + "loss": 3.4437, + "step": 31940 + }, + { + "epoch": 3.6391594054331113, + "grad_norm": 14.558040618896484, + "learning_rate": 1.3654526772462612e-05, + "loss": 3.0342, + "step": 31950 + }, + { + "epoch": 3.6402984224614157, + "grad_norm": 15.125991821289062, + "learning_rate": 1.3643109944057542e-05, + "loss": 3.0543, + "step": 31960 + }, + { + "epoch": 3.6414374394897204, + "grad_norm": 73.07426452636719, + "learning_rate": 1.3631693115652472e-05, + "loss": 2.9307, + "step": 31970 + }, + { + "epoch": 3.642576456518025, + "grad_norm": 19.4658145904541, + "learning_rate": 1.3620276287247402e-05, + "loss": 3.3872, + "step": 31980 + }, + { + "epoch": 3.6437154735463295, + "grad_norm": 54.43883514404297, + "learning_rate": 1.3608859458842336e-05, + "loss": 3.3176, + "step": 31990 + }, + { + "epoch": 3.644854490574634, + "grad_norm": 11.834235191345215, + "learning_rate": 1.3597442630437266e-05, + "loss": 3.077, + "step": 32000 + }, + { + "epoch": 3.644854490574634, + "eval_loss": 6.616151809692383, + "eval_runtime": 12.3895, + "eval_samples_per_second": 1.211, + "eval_steps_per_second": 0.161, + "step": 32000 + }, + { + "epoch": 3.6459935076029386, + "grad_norm": 16.523601531982422, + "learning_rate": 1.3586025802032196e-05, + "loss": 3.4931, + "step": 32010 + }, + { + "epoch": 3.6471325246312434, + "grad_norm": 18.468137741088867, + "learning_rate": 1.3574608973627126e-05, + "loss": 3.1046, + "step": 32020 + }, + { + "epoch": 3.6482715416595477, + "grad_norm": 22.56095314025879, + "learning_rate": 1.356319214522206e-05, + "loss": 2.8267, + "step": 32030 + }, + { + "epoch": 3.6494105586878525, + "grad_norm": 28.748476028442383, + "learning_rate": 1.355177531681699e-05, + "loss": 3.1378, + "step": 32040 + }, + { + 
"epoch": 3.650549575716157, + "grad_norm": 16.834548950195312, + "learning_rate": 1.354035848841192e-05, + "loss": 3.6769, + "step": 32050 + }, + { + "epoch": 3.6516885927444616, + "grad_norm": 18.101911544799805, + "learning_rate": 1.352894166000685e-05, + "loss": 3.1517, + "step": 32060 + }, + { + "epoch": 3.652827609772766, + "grad_norm": 16.95029067993164, + "learning_rate": 1.3517524831601782e-05, + "loss": 3.334, + "step": 32070 + }, + { + "epoch": 3.6539666268010706, + "grad_norm": 22.611167907714844, + "learning_rate": 1.3506108003196714e-05, + "loss": 3.2867, + "step": 32080 + }, + { + "epoch": 3.6551056438293754, + "grad_norm": 27.677146911621094, + "learning_rate": 1.3494691174791644e-05, + "loss": 3.1403, + "step": 32090 + }, + { + "epoch": 3.6562446608576797, + "grad_norm": 20.259828567504883, + "learning_rate": 1.3483274346386574e-05, + "loss": 3.1741, + "step": 32100 + }, + { + "epoch": 3.6573836778859845, + "grad_norm": 20.366390228271484, + "learning_rate": 1.3471857517981506e-05, + "loss": 3.4369, + "step": 32110 + }, + { + "epoch": 3.658522694914289, + "grad_norm": 33.92325210571289, + "learning_rate": 1.3460440689576436e-05, + "loss": 3.1641, + "step": 32120 + }, + { + "epoch": 3.6596617119425936, + "grad_norm": 20.80040740966797, + "learning_rate": 1.3449023861171368e-05, + "loss": 3.2044, + "step": 32130 + }, + { + "epoch": 3.660800728970898, + "grad_norm": 15.508310317993164, + "learning_rate": 1.3437607032766298e-05, + "loss": 2.8465, + "step": 32140 + }, + { + "epoch": 3.6619397459992027, + "grad_norm": 14.790544509887695, + "learning_rate": 1.342619020436123e-05, + "loss": 3.3739, + "step": 32150 + }, + { + "epoch": 3.6630787630275075, + "grad_norm": 13.493619918823242, + "learning_rate": 1.341477337595616e-05, + "loss": 3.0562, + "step": 32160 + }, + { + "epoch": 3.664217780055812, + "grad_norm": 32.97386932373047, + "learning_rate": 1.340335654755109e-05, + "loss": 3.0645, + "step": 32170 + }, + { + "epoch": 3.6653567970841165, + 
"grad_norm": 35.83689880371094, + "learning_rate": 1.339193971914602e-05, + "loss": 3.2999, + "step": 32180 + }, + { + "epoch": 3.666495814112421, + "grad_norm": 52.69847106933594, + "learning_rate": 1.3380522890740954e-05, + "loss": 2.9034, + "step": 32190 + }, + { + "epoch": 3.6676348311407256, + "grad_norm": 15.297989845275879, + "learning_rate": 1.3369106062335884e-05, + "loss": 3.6567, + "step": 32200 + }, + { + "epoch": 3.66877384816903, + "grad_norm": 22.48167610168457, + "learning_rate": 1.3357689233930814e-05, + "loss": 3.0576, + "step": 32210 + }, + { + "epoch": 3.6699128651973347, + "grad_norm": 18.812744140625, + "learning_rate": 1.3346272405525744e-05, + "loss": 2.963, + "step": 32220 + }, + { + "epoch": 3.6710518822256395, + "grad_norm": 30.261520385742188, + "learning_rate": 1.3334855577120678e-05, + "loss": 3.2873, + "step": 32230 + }, + { + "epoch": 3.672190899253944, + "grad_norm": 24.69428825378418, + "learning_rate": 1.3323438748715608e-05, + "loss": 3.2801, + "step": 32240 + }, + { + "epoch": 3.673329916282248, + "grad_norm": 16.969112396240234, + "learning_rate": 1.3312021920310538e-05, + "loss": 3.5014, + "step": 32250 + }, + { + "epoch": 3.674468933310553, + "grad_norm": 14.116311073303223, + "learning_rate": 1.3300605091905468e-05, + "loss": 3.2267, + "step": 32260 + }, + { + "epoch": 3.6756079503388577, + "grad_norm": 39.485328674316406, + "learning_rate": 1.3289188263500398e-05, + "loss": 3.3788, + "step": 32270 + }, + { + "epoch": 3.676746967367162, + "grad_norm": 11.645790100097656, + "learning_rate": 1.3277771435095332e-05, + "loss": 3.3113, + "step": 32280 + }, + { + "epoch": 3.6778859843954668, + "grad_norm": 20.604970932006836, + "learning_rate": 1.3266354606690262e-05, + "loss": 2.9979, + "step": 32290 + }, + { + "epoch": 3.6790250014237715, + "grad_norm": 15.729682922363281, + "learning_rate": 1.3254937778285192e-05, + "loss": 3.44, + "step": 32300 + }, + { + "epoch": 3.680164018452076, + "grad_norm": 25.164766311645508, + 
"learning_rate": 1.3243520949880122e-05, + "loss": 3.1505, + "step": 32310 + }, + { + "epoch": 3.68130303548038, + "grad_norm": 22.59656524658203, + "learning_rate": 1.3232104121475056e-05, + "loss": 3.4189, + "step": 32320 + }, + { + "epoch": 3.682442052508685, + "grad_norm": 22.83663558959961, + "learning_rate": 1.3220687293069986e-05, + "loss": 3.0199, + "step": 32330 + }, + { + "epoch": 3.6835810695369897, + "grad_norm": 25.320524215698242, + "learning_rate": 1.3209270464664916e-05, + "loss": 3.3938, + "step": 32340 + }, + { + "epoch": 3.684720086565294, + "grad_norm": 15.34455394744873, + "learning_rate": 1.3197853636259846e-05, + "loss": 3.2307, + "step": 32350 + }, + { + "epoch": 3.685859103593599, + "grad_norm": 16.524751663208008, + "learning_rate": 1.318643680785478e-05, + "loss": 3.0647, + "step": 32360 + }, + { + "epoch": 3.6869981206219036, + "grad_norm": 16.40989112854004, + "learning_rate": 1.317501997944971e-05, + "loss": 3.2068, + "step": 32370 + }, + { + "epoch": 3.688137137650208, + "grad_norm": 12.444294929504395, + "learning_rate": 1.316360315104464e-05, + "loss": 3.2993, + "step": 32380 + }, + { + "epoch": 3.689276154678512, + "grad_norm": 21.498640060424805, + "learning_rate": 1.315218632263957e-05, + "loss": 3.2295, + "step": 32390 + }, + { + "epoch": 3.690415171706817, + "grad_norm": 15.598748207092285, + "learning_rate": 1.3140769494234504e-05, + "loss": 3.2294, + "step": 32400 + }, + { + "epoch": 3.6915541887351218, + "grad_norm": 19.711767196655273, + "learning_rate": 1.3129352665829434e-05, + "loss": 3.517, + "step": 32410 + }, + { + "epoch": 3.692693205763426, + "grad_norm": 52.494483947753906, + "learning_rate": 1.3117935837424364e-05, + "loss": 3.6337, + "step": 32420 + }, + { + "epoch": 3.693832222791731, + "grad_norm": 16.527414321899414, + "learning_rate": 1.3106519009019294e-05, + "loss": 3.5641, + "step": 32430 + }, + { + "epoch": 3.694971239820035, + "grad_norm": 14.526664733886719, + "learning_rate": 1.3095102180614228e-05, + 
"loss": 3.2324, + "step": 32440 + }, + { + "epoch": 3.69611025684834, + "grad_norm": 22.665630340576172, + "learning_rate": 1.3083685352209158e-05, + "loss": 3.0854, + "step": 32450 + }, + { + "epoch": 3.6972492738766443, + "grad_norm": 22.589723587036133, + "learning_rate": 1.3072268523804088e-05, + "loss": 3.2505, + "step": 32460 + }, + { + "epoch": 3.698388290904949, + "grad_norm": 17.110258102416992, + "learning_rate": 1.3060851695399018e-05, + "loss": 3.3163, + "step": 32470 + }, + { + "epoch": 3.699527307933254, + "grad_norm": 16.35567855834961, + "learning_rate": 1.3049434866993952e-05, + "loss": 3.8002, + "step": 32480 + }, + { + "epoch": 3.700666324961558, + "grad_norm": 15.346707344055176, + "learning_rate": 1.3038018038588882e-05, + "loss": 3.2492, + "step": 32490 + }, + { + "epoch": 3.701805341989863, + "grad_norm": 18.525407791137695, + "learning_rate": 1.3026601210183812e-05, + "loss": 3.3648, + "step": 32500 + }, + { + "epoch": 3.702944359018167, + "grad_norm": 23.089082717895508, + "learning_rate": 1.3015184381778742e-05, + "loss": 3.1974, + "step": 32510 + }, + { + "epoch": 3.704083376046472, + "grad_norm": 14.466062545776367, + "learning_rate": 1.3003767553373674e-05, + "loss": 3.3509, + "step": 32520 + }, + { + "epoch": 3.7052223930747763, + "grad_norm": 22.863557815551758, + "learning_rate": 1.2992350724968606e-05, + "loss": 3.2939, + "step": 32530 + }, + { + "epoch": 3.706361410103081, + "grad_norm": 15.126258850097656, + "learning_rate": 1.2980933896563536e-05, + "loss": 3.5803, + "step": 32540 + }, + { + "epoch": 3.707500427131386, + "grad_norm": 25.215234756469727, + "learning_rate": 1.2969517068158466e-05, + "loss": 3.0721, + "step": 32550 + }, + { + "epoch": 3.70863944415969, + "grad_norm": 19.472930908203125, + "learning_rate": 1.2958100239753398e-05, + "loss": 2.9427, + "step": 32560 + }, + { + "epoch": 3.709778461187995, + "grad_norm": 16.894861221313477, + "learning_rate": 1.2946683411348328e-05, + "loss": 3.1013, + "step": 32570 + }, 
+ { + "epoch": 3.7109174782162992, + "grad_norm": 15.443288803100586, + "learning_rate": 1.2935266582943258e-05, + "loss": 3.204, + "step": 32580 + }, + { + "epoch": 3.712056495244604, + "grad_norm": 14.198867797851562, + "learning_rate": 1.292384975453819e-05, + "loss": 3.5278, + "step": 32590 + }, + { + "epoch": 3.7131955122729083, + "grad_norm": 38.992313385009766, + "learning_rate": 1.291243292613312e-05, + "loss": 3.1439, + "step": 32600 + }, + { + "epoch": 3.714334529301213, + "grad_norm": 24.417865753173828, + "learning_rate": 1.2901016097728052e-05, + "loss": 3.3764, + "step": 32610 + }, + { + "epoch": 3.715473546329518, + "grad_norm": 31.99386215209961, + "learning_rate": 1.2889599269322982e-05, + "loss": 3.0947, + "step": 32620 + }, + { + "epoch": 3.716612563357822, + "grad_norm": 26.731693267822266, + "learning_rate": 1.2878182440917912e-05, + "loss": 2.7709, + "step": 32630 + }, + { + "epoch": 3.7177515803861265, + "grad_norm": 18.979000091552734, + "learning_rate": 1.2866765612512844e-05, + "loss": 3.3516, + "step": 32640 + }, + { + "epoch": 3.7188905974144313, + "grad_norm": 13.394664764404297, + "learning_rate": 1.2855348784107776e-05, + "loss": 2.7926, + "step": 32650 + }, + { + "epoch": 3.720029614442736, + "grad_norm": 15.783432006835938, + "learning_rate": 1.2843931955702706e-05, + "loss": 3.0971, + "step": 32660 + }, + { + "epoch": 3.7211686314710404, + "grad_norm": 23.57678985595703, + "learning_rate": 1.2832515127297636e-05, + "loss": 3.1433, + "step": 32670 + }, + { + "epoch": 3.722307648499345, + "grad_norm": 19.5240478515625, + "learning_rate": 1.2821098298892566e-05, + "loss": 3.0789, + "step": 32680 + }, + { + "epoch": 3.72344666552765, + "grad_norm": 26.521547317504883, + "learning_rate": 1.28096814704875e-05, + "loss": 3.1122, + "step": 32690 + }, + { + "epoch": 3.7245856825559542, + "grad_norm": 22.24355125427246, + "learning_rate": 1.279826464208243e-05, + "loss": 3.1131, + "step": 32700 + }, + { + "epoch": 3.7257246995842586, + 
"grad_norm": 15.753776550292969, + "learning_rate": 1.278684781367736e-05, + "loss": 3.3566, + "step": 32710 + }, + { + "epoch": 3.7268637166125633, + "grad_norm": 19.915433883666992, + "learning_rate": 1.277543098527229e-05, + "loss": 3.3159, + "step": 32720 + }, + { + "epoch": 3.728002733640868, + "grad_norm": 23.602800369262695, + "learning_rate": 1.2764014156867224e-05, + "loss": 3.3398, + "step": 32730 + }, + { + "epoch": 3.7291417506691724, + "grad_norm": 14.813172340393066, + "learning_rate": 1.2752597328462154e-05, + "loss": 2.9828, + "step": 32740 + }, + { + "epoch": 3.730280767697477, + "grad_norm": 32.590965270996094, + "learning_rate": 1.2741180500057084e-05, + "loss": 2.825, + "step": 32750 + }, + { + "epoch": 3.7314197847257815, + "grad_norm": 23.71125602722168, + "learning_rate": 1.2729763671652014e-05, + "loss": 3.1212, + "step": 32760 + }, + { + "epoch": 3.7325588017540863, + "grad_norm": 18.269615173339844, + "learning_rate": 1.2718346843246948e-05, + "loss": 3.2646, + "step": 32770 + }, + { + "epoch": 3.7336978187823906, + "grad_norm": 13.051878929138184, + "learning_rate": 1.2706930014841878e-05, + "loss": 3.4681, + "step": 32780 + }, + { + "epoch": 3.7348368358106954, + "grad_norm": 32.750213623046875, + "learning_rate": 1.2695513186436808e-05, + "loss": 2.9616, + "step": 32790 + }, + { + "epoch": 3.735975852839, + "grad_norm": 30.352237701416016, + "learning_rate": 1.2684096358031738e-05, + "loss": 3.1487, + "step": 32800 + }, + { + "epoch": 3.7371148698673045, + "grad_norm": 16.016530990600586, + "learning_rate": 1.2672679529626672e-05, + "loss": 3.4062, + "step": 32810 + }, + { + "epoch": 3.7382538868956092, + "grad_norm": 34.221534729003906, + "learning_rate": 1.2661262701221602e-05, + "loss": 3.1771, + "step": 32820 + }, + { + "epoch": 3.7393929039239135, + "grad_norm": 22.015287399291992, + "learning_rate": 1.2649845872816532e-05, + "loss": 3.2352, + "step": 32830 + }, + { + "epoch": 3.7405319209522183, + "grad_norm": 31.06937599182129, + 
"learning_rate": 1.2638429044411462e-05, + "loss": 3.0172, + "step": 32840 + }, + { + "epoch": 3.7416709379805226, + "grad_norm": 16.67007827758789, + "learning_rate": 1.2627012216006396e-05, + "loss": 3.5108, + "step": 32850 + }, + { + "epoch": 3.7428099550088274, + "grad_norm": 14.400307655334473, + "learning_rate": 1.2615595387601326e-05, + "loss": 3.1321, + "step": 32860 + }, + { + "epoch": 3.743948972037132, + "grad_norm": 21.144777297973633, + "learning_rate": 1.2604178559196256e-05, + "loss": 3.1542, + "step": 32870 + }, + { + "epoch": 3.7450879890654365, + "grad_norm": 18.0500431060791, + "learning_rate": 1.2592761730791186e-05, + "loss": 3.8213, + "step": 32880 + }, + { + "epoch": 3.7462270060937413, + "grad_norm": 34.40842819213867, + "learning_rate": 1.258134490238612e-05, + "loss": 3.3235, + "step": 32890 + }, + { + "epoch": 3.7473660231220456, + "grad_norm": 20.057058334350586, + "learning_rate": 1.256992807398105e-05, + "loss": 3.0745, + "step": 32900 + }, + { + "epoch": 3.7485050401503504, + "grad_norm": 56.5526123046875, + "learning_rate": 1.255851124557598e-05, + "loss": 3.4115, + "step": 32910 + }, + { + "epoch": 3.7496440571786547, + "grad_norm": 20.29167938232422, + "learning_rate": 1.254709441717091e-05, + "loss": 3.3576, + "step": 32920 + }, + { + "epoch": 3.7507830742069594, + "grad_norm": 28.709518432617188, + "learning_rate": 1.253567758876584e-05, + "loss": 3.0779, + "step": 32930 + }, + { + "epoch": 3.751922091235264, + "grad_norm": 25.286155700683594, + "learning_rate": 1.2524260760360774e-05, + "loss": 2.9963, + "step": 32940 + }, + { + "epoch": 3.7530611082635685, + "grad_norm": 20.122270584106445, + "learning_rate": 1.2512843931955704e-05, + "loss": 3.1923, + "step": 32950 + }, + { + "epoch": 3.754200125291873, + "grad_norm": 46.440547943115234, + "learning_rate": 1.2501427103550634e-05, + "loss": 3.3257, + "step": 32960 + }, + { + "epoch": 3.7553391423201776, + "grad_norm": 17.568931579589844, + "learning_rate": 
1.2490010275145566e-05, + "loss": 3.293, + "step": 32970 + }, + { + "epoch": 3.7564781593484824, + "grad_norm": 16.84494972229004, + "learning_rate": 1.2478593446740496e-05, + "loss": 2.8759, + "step": 32980 + }, + { + "epoch": 3.7576171763767867, + "grad_norm": 29.088085174560547, + "learning_rate": 1.2467176618335428e-05, + "loss": 3.2411, + "step": 32990 + }, + { + "epoch": 3.7587561934050915, + "grad_norm": 18.981189727783203, + "learning_rate": 1.2455759789930358e-05, + "loss": 3.4173, + "step": 33000 + }, + { + "epoch": 3.7598952104333963, + "grad_norm": 32.48049545288086, + "learning_rate": 1.2444342961525288e-05, + "loss": 3.0813, + "step": 33010 + }, + { + "epoch": 3.7610342274617006, + "grad_norm": 14.662425994873047, + "learning_rate": 1.243292613312022e-05, + "loss": 3.2787, + "step": 33020 + }, + { + "epoch": 3.762173244490005, + "grad_norm": 59.103843688964844, + "learning_rate": 1.242150930471515e-05, + "loss": 3.1599, + "step": 33030 + }, + { + "epoch": 3.7633122615183097, + "grad_norm": 15.66943359375, + "learning_rate": 1.2410092476310082e-05, + "loss": 3.4895, + "step": 33040 + }, + { + "epoch": 3.7644512785466144, + "grad_norm": 12.428000450134277, + "learning_rate": 1.2398675647905012e-05, + "loss": 3.5344, + "step": 33050 + }, + { + "epoch": 3.7655902955749188, + "grad_norm": 16.733388900756836, + "learning_rate": 1.2387258819499942e-05, + "loss": 3.2387, + "step": 33060 + }, + { + "epoch": 3.7667293126032235, + "grad_norm": 29.713878631591797, + "learning_rate": 1.2375841991094874e-05, + "loss": 3.284, + "step": 33070 + }, + { + "epoch": 3.767868329631528, + "grad_norm": 42.232177734375, + "learning_rate": 1.2364425162689804e-05, + "loss": 3.2768, + "step": 33080 + }, + { + "epoch": 3.7690073466598326, + "grad_norm": 24.566675186157227, + "learning_rate": 1.2353008334284736e-05, + "loss": 3.3988, + "step": 33090 + }, + { + "epoch": 3.770146363688137, + "grad_norm": 17.95066261291504, + "learning_rate": 1.2341591505879666e-05, + "loss": 
3.1141, + "step": 33100 + }, + { + "epoch": 3.7712853807164417, + "grad_norm": 21.431095123291016, + "learning_rate": 1.2330174677474598e-05, + "loss": 3.4754, + "step": 33110 + }, + { + "epoch": 3.7724243977447465, + "grad_norm": 22.057865142822266, + "learning_rate": 1.2318757849069528e-05, + "loss": 3.3989, + "step": 33120 + }, + { + "epoch": 3.773563414773051, + "grad_norm": 13.715916633605957, + "learning_rate": 1.230734102066446e-05, + "loss": 2.8591, + "step": 33130 + }, + { + "epoch": 3.7747024318013556, + "grad_norm": 26.173364639282227, + "learning_rate": 1.229592419225939e-05, + "loss": 3.7371, + "step": 33140 + }, + { + "epoch": 3.77584144882966, + "grad_norm": 38.05056381225586, + "learning_rate": 1.2284507363854322e-05, + "loss": 3.2632, + "step": 33150 + }, + { + "epoch": 3.7769804658579647, + "grad_norm": 19.166316986083984, + "learning_rate": 1.2273090535449252e-05, + "loss": 3.1668, + "step": 33160 + }, + { + "epoch": 3.778119482886269, + "grad_norm": 13.285821914672852, + "learning_rate": 1.2261673707044184e-05, + "loss": 3.306, + "step": 33170 + }, + { + "epoch": 3.7792584999145737, + "grad_norm": 24.364486694335938, + "learning_rate": 1.2250256878639114e-05, + "loss": 3.0431, + "step": 33180 + }, + { + "epoch": 3.7803975169428785, + "grad_norm": 17.88330841064453, + "learning_rate": 1.2238840050234046e-05, + "loss": 3.0506, + "step": 33190 + }, + { + "epoch": 3.781536533971183, + "grad_norm": 12.778934478759766, + "learning_rate": 1.2227423221828976e-05, + "loss": 3.3124, + "step": 33200 + }, + { + "epoch": 3.7826755509994876, + "grad_norm": 19.211511611938477, + "learning_rate": 1.2216006393423908e-05, + "loss": 3.2028, + "step": 33210 + }, + { + "epoch": 3.783814568027792, + "grad_norm": 17.600019454956055, + "learning_rate": 1.2204589565018838e-05, + "loss": 3.0564, + "step": 33220 + }, + { + "epoch": 3.7849535850560967, + "grad_norm": 25.08670425415039, + "learning_rate": 1.219317273661377e-05, + "loss": 3.5895, + "step": 33230 + }, + { + 
"epoch": 3.786092602084401, + "grad_norm": 25.197296142578125, + "learning_rate": 1.21817559082087e-05, + "loss": 3.0373, + "step": 33240 + }, + { + "epoch": 3.787231619112706, + "grad_norm": 18.133596420288086, + "learning_rate": 1.2170339079803632e-05, + "loss": 3.533, + "step": 33250 + }, + { + "epoch": 3.7883706361410106, + "grad_norm": 14.349433898925781, + "learning_rate": 1.2158922251398562e-05, + "loss": 3.1921, + "step": 33260 + }, + { + "epoch": 3.789509653169315, + "grad_norm": 32.27397537231445, + "learning_rate": 1.2147505422993492e-05, + "loss": 3.0316, + "step": 33270 + }, + { + "epoch": 3.790648670197619, + "grad_norm": 29.021055221557617, + "learning_rate": 1.2136088594588424e-05, + "loss": 3.1067, + "step": 33280 + }, + { + "epoch": 3.791787687225924, + "grad_norm": 28.820280075073242, + "learning_rate": 1.2124671766183354e-05, + "loss": 3.2606, + "step": 33290 + }, + { + "epoch": 3.7929267042542287, + "grad_norm": 19.30303955078125, + "learning_rate": 1.2113254937778286e-05, + "loss": 3.1208, + "step": 33300 + }, + { + "epoch": 3.794065721282533, + "grad_norm": 24.95954132080078, + "learning_rate": 1.2101838109373216e-05, + "loss": 3.3014, + "step": 33310 + }, + { + "epoch": 3.795204738310838, + "grad_norm": 17.875837326049805, + "learning_rate": 1.2090421280968148e-05, + "loss": 3.6693, + "step": 33320 + }, + { + "epoch": 3.7963437553391426, + "grad_norm": 18.86009407043457, + "learning_rate": 1.2079004452563078e-05, + "loss": 3.3994, + "step": 33330 + }, + { + "epoch": 3.797482772367447, + "grad_norm": 20.501466751098633, + "learning_rate": 1.206758762415801e-05, + "loss": 3.1364, + "step": 33340 + }, + { + "epoch": 3.7986217893957512, + "grad_norm": 22.27264976501465, + "learning_rate": 1.205617079575294e-05, + "loss": 3.2672, + "step": 33350 + }, + { + "epoch": 3.799760806424056, + "grad_norm": 30.55479621887207, + "learning_rate": 1.2044753967347872e-05, + "loss": 3.0276, + "step": 33360 + }, + { + "epoch": 3.8008998234523608, + "grad_norm": 
11.907081604003906, + "learning_rate": 1.2033337138942802e-05, + "loss": 2.9857, + "step": 33370 + }, + { + "epoch": 3.802038840480665, + "grad_norm": 20.31851577758789, + "learning_rate": 1.2021920310537734e-05, + "loss": 3.3241, + "step": 33380 + }, + { + "epoch": 3.80317785750897, + "grad_norm": 32.05153274536133, + "learning_rate": 1.2010503482132664e-05, + "loss": 3.2727, + "step": 33390 + }, + { + "epoch": 3.804316874537274, + "grad_norm": 33.72991943359375, + "learning_rate": 1.1999086653727596e-05, + "loss": 2.8837, + "step": 33400 + }, + { + "epoch": 3.805455891565579, + "grad_norm": 17.577287673950195, + "learning_rate": 1.1987669825322526e-05, + "loss": 3.4889, + "step": 33410 + }, + { + "epoch": 3.8065949085938833, + "grad_norm": 34.42713165283203, + "learning_rate": 1.1976252996917458e-05, + "loss": 3.1066, + "step": 33420 + }, + { + "epoch": 3.807733925622188, + "grad_norm": 58.64443588256836, + "learning_rate": 1.1964836168512388e-05, + "loss": 2.9808, + "step": 33430 + }, + { + "epoch": 3.808872942650493, + "grad_norm": 19.376663208007812, + "learning_rate": 1.195341934010732e-05, + "loss": 3.0356, + "step": 33440 + }, + { + "epoch": 3.810011959678797, + "grad_norm": 18.63247299194336, + "learning_rate": 1.194200251170225e-05, + "loss": 3.4096, + "step": 33450 + }, + { + "epoch": 3.811150976707102, + "grad_norm": 16.259485244750977, + "learning_rate": 1.193058568329718e-05, + "loss": 3.2699, + "step": 33460 + }, + { + "epoch": 3.8122899937354062, + "grad_norm": 16.128490447998047, + "learning_rate": 1.1919168854892112e-05, + "loss": 3.1829, + "step": 33470 + }, + { + "epoch": 3.813429010763711, + "grad_norm": null, + "learning_rate": 1.190889370932755e-05, + "loss": 3.3376, + "step": 33480 + }, + { + "epoch": 3.8145680277920153, + "grad_norm": 25.427331924438477, + "learning_rate": 1.189747688092248e-05, + "loss": 3.2922, + "step": 33490 + }, + { + "epoch": 3.81570704482032, + "grad_norm": 30.6014461517334, + "learning_rate": 
1.1886060052517412e-05, + "loss": 2.9499, + "step": 33500 + }, + { + "epoch": 3.816846061848625, + "grad_norm": 15.999086380004883, + "learning_rate": 1.1874643224112342e-05, + "loss": 3.3593, + "step": 33510 + }, + { + "epoch": 3.817985078876929, + "grad_norm": 24.11785888671875, + "learning_rate": 1.1863226395707274e-05, + "loss": 2.8787, + "step": 33520 + }, + { + "epoch": 3.819124095905234, + "grad_norm": 15.513715744018555, + "learning_rate": 1.1851809567302204e-05, + "loss": 3.3408, + "step": 33530 + }, + { + "epoch": 3.8202631129335383, + "grad_norm": 22.851015090942383, + "learning_rate": 1.1840392738897136e-05, + "loss": 3.3966, + "step": 33540 + }, + { + "epoch": 3.821402129961843, + "grad_norm": 18.888126373291016, + "learning_rate": 1.1828975910492066e-05, + "loss": 3.1344, + "step": 33550 + }, + { + "epoch": 3.8225411469901474, + "grad_norm": 27.975624084472656, + "learning_rate": 1.1817559082086998e-05, + "loss": 3.3148, + "step": 33560 + }, + { + "epoch": 3.823680164018452, + "grad_norm": 16.57772445678711, + "learning_rate": 1.1806142253681928e-05, + "loss": 3.3026, + "step": 33570 + }, + { + "epoch": 3.824819181046757, + "grad_norm": 17.95849609375, + "learning_rate": 1.179472542527686e-05, + "loss": 3.1267, + "step": 33580 + }, + { + "epoch": 3.825958198075061, + "grad_norm": 16.972145080566406, + "learning_rate": 1.178330859687179e-05, + "loss": 3.3965, + "step": 33590 + }, + { + "epoch": 3.8270972151033655, + "grad_norm": 15.15140151977539, + "learning_rate": 1.1771891768466722e-05, + "loss": 3.3577, + "step": 33600 + }, + { + "epoch": 3.8282362321316703, + "grad_norm": 15.133687973022461, + "learning_rate": 1.1760474940061652e-05, + "loss": 3.0421, + "step": 33610 + }, + { + "epoch": 3.829375249159975, + "grad_norm": 26.999977111816406, + "learning_rate": 1.1749058111656584e-05, + "loss": 3.4199, + "step": 33620 + }, + { + "epoch": 3.8305142661882794, + "grad_norm": 17.019285202026367, + "learning_rate": 1.1737641283251514e-05, + "loss": 
3.4735, + "step": 33630 + }, + { + "epoch": 3.831653283216584, + "grad_norm": 16.389368057250977, + "learning_rate": 1.1726224454846446e-05, + "loss": 3.0721, + "step": 33640 + }, + { + "epoch": 3.832792300244889, + "grad_norm": 29.870975494384766, + "learning_rate": 1.1714807626441376e-05, + "loss": 3.2233, + "step": 33650 + }, + { + "epoch": 3.8339313172731933, + "grad_norm": 42.89168167114258, + "learning_rate": 1.1703390798036306e-05, + "loss": 3.3995, + "step": 33660 + }, + { + "epoch": 3.8350703343014976, + "grad_norm": 18.50602149963379, + "learning_rate": 1.1691973969631238e-05, + "loss": 3.0315, + "step": 33670 + }, + { + "epoch": 3.8362093513298023, + "grad_norm": 15.6813325881958, + "learning_rate": 1.1680557141226168e-05, + "loss": 3.2828, + "step": 33680 + }, + { + "epoch": 3.837348368358107, + "grad_norm": 14.051004409790039, + "learning_rate": 1.1669140312821098e-05, + "loss": 3.2009, + "step": 33690 + }, + { + "epoch": 3.8384873853864114, + "grad_norm": 17.388364791870117, + "learning_rate": 1.165772348441603e-05, + "loss": 3.1334, + "step": 33700 + }, + { + "epoch": 3.839626402414716, + "grad_norm": 43.350547790527344, + "learning_rate": 1.164630665601096e-05, + "loss": 3.4336, + "step": 33710 + }, + { + "epoch": 3.8407654194430205, + "grad_norm": 19.648052215576172, + "learning_rate": 1.1634889827605892e-05, + "loss": 3.0718, + "step": 33720 + }, + { + "epoch": 3.8419044364713253, + "grad_norm": 14.41798210144043, + "learning_rate": 1.1623472999200822e-05, + "loss": 3.2986, + "step": 33730 + }, + { + "epoch": 3.8430434534996296, + "grad_norm": 19.065940856933594, + "learning_rate": 1.1613197853636261e-05, + "loss": 3.1973, + "step": 33740 + }, + { + "epoch": 3.8441824705279344, + "grad_norm": 16.98052406311035, + "learning_rate": 1.1601781025231191e-05, + "loss": 3.0104, + "step": 33750 + }, + { + "epoch": 3.845321487556239, + "grad_norm": 20.1462459564209, + "learning_rate": 1.1590364196826123e-05, + "loss": 3.1908, + "step": 33760 + }, + { + 
"epoch": 3.8464605045845435, + "grad_norm": 36.184539794921875, + "learning_rate": 1.1578947368421053e-05, + "loss": 3.0869, + "step": 33770 + }, + { + "epoch": 3.8475995216128482, + "grad_norm": 18.652177810668945, + "learning_rate": 1.1567530540015985e-05, + "loss": 3.4309, + "step": 33780 + }, + { + "epoch": 3.8487385386411526, + "grad_norm": 19.719371795654297, + "learning_rate": 1.1556113711610915e-05, + "loss": 3.3563, + "step": 33790 + }, + { + "epoch": 3.8498775556694573, + "grad_norm": 15.144607543945312, + "learning_rate": 1.1544696883205847e-05, + "loss": 3.1038, + "step": 33800 + }, + { + "epoch": 3.8510165726977617, + "grad_norm": 33.72139358520508, + "learning_rate": 1.1533280054800777e-05, + "loss": 3.3617, + "step": 33810 + }, + { + "epoch": 3.8521555897260664, + "grad_norm": 14.42808723449707, + "learning_rate": 1.1521863226395708e-05, + "loss": 3.2339, + "step": 33820 + }, + { + "epoch": 3.853294606754371, + "grad_norm": 21.783538818359375, + "learning_rate": 1.151044639799064e-05, + "loss": 3.5843, + "step": 33830 + }, + { + "epoch": 3.8544336237826755, + "grad_norm": 29.48973846435547, + "learning_rate": 1.149902956958557e-05, + "loss": 2.9454, + "step": 33840 + }, + { + "epoch": 3.8555726408109803, + "grad_norm": 15.045869827270508, + "learning_rate": 1.1487612741180501e-05, + "loss": 3.2732, + "step": 33850 + }, + { + "epoch": 3.8567116578392846, + "grad_norm": 20.551128387451172, + "learning_rate": 1.1476195912775431e-05, + "loss": 3.0871, + "step": 33860 + }, + { + "epoch": 3.8578506748675894, + "grad_norm": 16.876062393188477, + "learning_rate": 1.1464779084370363e-05, + "loss": 3.1624, + "step": 33870 + }, + { + "epoch": 3.8589896918958937, + "grad_norm": 19.543254852294922, + "learning_rate": 1.1453362255965293e-05, + "loss": 3.0916, + "step": 33880 + }, + { + "epoch": 3.8601287089241985, + "grad_norm": 40.416439056396484, + "learning_rate": 1.1441945427560225e-05, + "loss": 3.1961, + "step": 33890 + }, + { + "epoch": 3.8612677259525032, 
+ "grad_norm": 21.792705535888672, + "learning_rate": 1.1430528599155155e-05, + "loss": 2.8603, + "step": 33900 + }, + { + "epoch": 3.8624067429808076, + "grad_norm": 19.090656280517578, + "learning_rate": 1.1419111770750086e-05, + "loss": 3.1213, + "step": 33910 + }, + { + "epoch": 3.863545760009112, + "grad_norm": 9.652124404907227, + "learning_rate": 1.1407694942345017e-05, + "loss": 3.292, + "step": 33920 + }, + { + "epoch": 3.8646847770374166, + "grad_norm": 32.77311706542969, + "learning_rate": 1.1396278113939948e-05, + "loss": 3.233, + "step": 33930 + }, + { + "epoch": 3.8658237940657214, + "grad_norm": 15.490493774414062, + "learning_rate": 1.1384861285534878e-05, + "loss": 3.4222, + "step": 33940 + }, + { + "epoch": 3.8669628110940257, + "grad_norm": 18.22795867919922, + "learning_rate": 1.137344445712981e-05, + "loss": 3.2197, + "step": 33950 + }, + { + "epoch": 3.8681018281223305, + "grad_norm": 16.72008514404297, + "learning_rate": 1.136202762872474e-05, + "loss": 3.1452, + "step": 33960 + }, + { + "epoch": 3.8692408451506353, + "grad_norm": 30.634672164916992, + "learning_rate": 1.1350610800319672e-05, + "loss": 3.3554, + "step": 33970 + }, + { + "epoch": 3.8703798621789396, + "grad_norm": 23.5197811126709, + "learning_rate": 1.1339193971914602e-05, + "loss": 3.0245, + "step": 33980 + }, + { + "epoch": 3.871518879207244, + "grad_norm": 17.863908767700195, + "learning_rate": 1.1327777143509533e-05, + "loss": 2.9671, + "step": 33990 + }, + { + "epoch": 3.8726578962355487, + "grad_norm": 16.72121238708496, + "learning_rate": 1.1316360315104464e-05, + "loss": 3.3413, + "step": 34000 + }, + { + "epoch": 3.8726578962355487, + "eval_loss": 6.661217212677002, + "eval_runtime": 11.3378, + "eval_samples_per_second": 1.323, + "eval_steps_per_second": 0.176, + "step": 34000 + }, + { + "epoch": 3.8737969132638534, + "grad_norm": 15.231101989746094, + "learning_rate": 1.1304943486699395e-05, + "loss": 3.5364, + "step": 34010 + }, + { + "epoch": 3.8749359302921578, + 
"grad_norm": 16.737051010131836, + "learning_rate": 1.1293526658294326e-05, + "loss": 3.4635, + "step": 34020 + }, + { + "epoch": 3.8760749473204625, + "grad_norm": 53.146888732910156, + "learning_rate": 1.1282109829889257e-05, + "loss": 3.1382, + "step": 34030 + }, + { + "epoch": 3.877213964348767, + "grad_norm": 14.017007827758789, + "learning_rate": 1.1270693001484188e-05, + "loss": 3.0769, + "step": 34040 + }, + { + "epoch": 3.8783529813770716, + "grad_norm": 16.23945426940918, + "learning_rate": 1.125927617307912e-05, + "loss": 2.8566, + "step": 34050 + }, + { + "epoch": 3.879491998405376, + "grad_norm": 12.588747024536133, + "learning_rate": 1.124785934467405e-05, + "loss": 3.3385, + "step": 34060 + }, + { + "epoch": 3.8806310154336807, + "grad_norm": 58.49121856689453, + "learning_rate": 1.1236442516268981e-05, + "loss": 3.1533, + "step": 34070 + }, + { + "epoch": 3.8817700324619855, + "grad_norm": 16.688541412353516, + "learning_rate": 1.1225025687863912e-05, + "loss": 3.255, + "step": 34080 + }, + { + "epoch": 3.88290904949029, + "grad_norm": 26.06526756286621, + "learning_rate": 1.1213608859458843e-05, + "loss": 2.9623, + "step": 34090 + }, + { + "epoch": 3.8840480665185946, + "grad_norm": 21.98783302307129, + "learning_rate": 1.1202192031053774e-05, + "loss": 3.0951, + "step": 34100 + }, + { + "epoch": 3.885187083546899, + "grad_norm": 28.741485595703125, + "learning_rate": 1.1190775202648705e-05, + "loss": 2.7282, + "step": 34110 + }, + { + "epoch": 3.8863261005752037, + "grad_norm": 14.933989524841309, + "learning_rate": 1.1179358374243636e-05, + "loss": 3.1487, + "step": 34120 + }, + { + "epoch": 3.887465117603508, + "grad_norm": 11.116551399230957, + "learning_rate": 1.1167941545838567e-05, + "loss": 3.4722, + "step": 34130 + }, + { + "epoch": 3.8886041346318128, + "grad_norm": 18.379390716552734, + "learning_rate": 1.1156524717433498e-05, + "loss": 3.6789, + "step": 34140 + }, + { + "epoch": 3.8897431516601175, + "grad_norm": 21.434953689575195, + 
"learning_rate": 1.114510788902843e-05, + "loss": 3.235, + "step": 34150 + }, + { + "epoch": 3.890882168688422, + "grad_norm": 17.4919490814209, + "learning_rate": 1.113369106062336e-05, + "loss": 3.1719, + "step": 34160 + }, + { + "epoch": 3.8920211857167266, + "grad_norm": 17.186697006225586, + "learning_rate": 1.112227423221829e-05, + "loss": 3.1761, + "step": 34170 + }, + { + "epoch": 3.893160202745031, + "grad_norm": 24.275035858154297, + "learning_rate": 1.1110857403813221e-05, + "loss": 3.2031, + "step": 34180 + }, + { + "epoch": 3.8942992197733357, + "grad_norm": 36.2103157043457, + "learning_rate": 1.1099440575408152e-05, + "loss": 2.9519, + "step": 34190 + }, + { + "epoch": 3.89543823680164, + "grad_norm": 30.00943374633789, + "learning_rate": 1.1088023747003083e-05, + "loss": 3.3271, + "step": 34200 + }, + { + "epoch": 3.896577253829945, + "grad_norm": 26.667987823486328, + "learning_rate": 1.1076606918598014e-05, + "loss": 3.1654, + "step": 34210 + }, + { + "epoch": 3.8977162708582496, + "grad_norm": 21.609365463256836, + "learning_rate": 1.1065190090192945e-05, + "loss": 3.1004, + "step": 34220 + }, + { + "epoch": 3.898855287886554, + "grad_norm": 25.805389404296875, + "learning_rate": 1.1053773261787876e-05, + "loss": 3.2328, + "step": 34230 + }, + { + "epoch": 3.899994304914858, + "grad_norm": 34.8479118347168, + "learning_rate": 1.1042356433382807e-05, + "loss": 3.3014, + "step": 34240 + }, + { + "epoch": 3.901133321943163, + "grad_norm": 41.24662399291992, + "learning_rate": 1.1030939604977738e-05, + "loss": 3.1267, + "step": 34250 + }, + { + "epoch": 3.9022723389714677, + "grad_norm": 43.70309829711914, + "learning_rate": 1.101952277657267e-05, + "loss": 2.963, + "step": 34260 + }, + { + "epoch": 3.903411355999772, + "grad_norm": 19.455928802490234, + "learning_rate": 1.10081059481676e-05, + "loss": 3.3216, + "step": 34270 + }, + { + "epoch": 3.904550373028077, + "grad_norm": 23.085784912109375, + "learning_rate": 1.0996689119762531e-05, + "loss": 
3.146, + "step": 34280 + }, + { + "epoch": 3.9056893900563816, + "grad_norm": 22.985118865966797, + "learning_rate": 1.0985272291357462e-05, + "loss": 3.0404, + "step": 34290 + }, + { + "epoch": 3.906828407084686, + "grad_norm": 16.12543487548828, + "learning_rate": 1.0973855462952393e-05, + "loss": 3.3122, + "step": 34300 + }, + { + "epoch": 3.9079674241129903, + "grad_norm": 15.109716415405273, + "learning_rate": 1.0962438634547323e-05, + "loss": 3.0485, + "step": 34310 + }, + { + "epoch": 3.909106441141295, + "grad_norm": 16.566967010498047, + "learning_rate": 1.0951021806142255e-05, + "loss": 3.485, + "step": 34320 + }, + { + "epoch": 3.9102454581696, + "grad_norm": 13.127080917358398, + "learning_rate": 1.0939604977737185e-05, + "loss": 3.025, + "step": 34330 + }, + { + "epoch": 3.911384475197904, + "grad_norm": 15.987772941589355, + "learning_rate": 1.0928188149332116e-05, + "loss": 3.3851, + "step": 34340 + }, + { + "epoch": 3.912523492226209, + "grad_norm": 20.04308319091797, + "learning_rate": 1.0916771320927047e-05, + "loss": 3.3652, + "step": 34350 + }, + { + "epoch": 3.913662509254513, + "grad_norm": 31.386751174926758, + "learning_rate": 1.0905354492521978e-05, + "loss": 2.9917, + "step": 34360 + }, + { + "epoch": 3.914801526282818, + "grad_norm": 25.774982452392578, + "learning_rate": 1.0893937664116908e-05, + "loss": 3.4683, + "step": 34370 + }, + { + "epoch": 3.9159405433111223, + "grad_norm": 31.10222816467285, + "learning_rate": 1.088252083571184e-05, + "loss": 3.0355, + "step": 34380 + }, + { + "epoch": 3.917079560339427, + "grad_norm": 19.318578720092773, + "learning_rate": 1.087110400730677e-05, + "loss": 2.8449, + "step": 34390 + }, + { + "epoch": 3.918218577367732, + "grad_norm": 27.47287940979004, + "learning_rate": 1.0859687178901702e-05, + "loss": 3.2546, + "step": 34400 + }, + { + "epoch": 3.919357594396036, + "grad_norm": 16.842239379882812, + "learning_rate": 1.0848270350496632e-05, + "loss": 3.4527, + "step": 34410 + }, + { + "epoch": 
3.920496611424341, + "grad_norm": 18.480390548706055, + "learning_rate": 1.0836853522091564e-05, + "loss": 3.388, + "step": 34420 + }, + { + "epoch": 3.9216356284526452, + "grad_norm": 13.238078117370605, + "learning_rate": 1.0825436693686494e-05, + "loss": 3.1253, + "step": 34430 + }, + { + "epoch": 3.92277464548095, + "grad_norm": 32.937164306640625, + "learning_rate": 1.0814019865281426e-05, + "loss": 3.0907, + "step": 34440 + }, + { + "epoch": 3.9239136625092543, + "grad_norm": 28.880056381225586, + "learning_rate": 1.0802603036876356e-05, + "loss": 3.3314, + "step": 34450 + }, + { + "epoch": 3.925052679537559, + "grad_norm": 18.199424743652344, + "learning_rate": 1.0791186208471288e-05, + "loss": 3.4384, + "step": 34460 + }, + { + "epoch": 3.926191696565864, + "grad_norm": 10.716328620910645, + "learning_rate": 1.0779769380066218e-05, + "loss": 3.1953, + "step": 34470 + }, + { + "epoch": 3.927330713594168, + "grad_norm": 17.6003475189209, + "learning_rate": 1.076835255166115e-05, + "loss": 3.1737, + "step": 34480 + }, + { + "epoch": 3.928469730622473, + "grad_norm": 18.68466567993164, + "learning_rate": 1.075693572325608e-05, + "loss": 3.4369, + "step": 34490 + }, + { + "epoch": 3.9296087476507773, + "grad_norm": 20.041231155395508, + "learning_rate": 1.0745518894851011e-05, + "loss": 3.3907, + "step": 34500 + }, + { + "epoch": 3.930747764679082, + "grad_norm": 43.14548873901367, + "learning_rate": 1.0734102066445942e-05, + "loss": 3.0981, + "step": 34510 + }, + { + "epoch": 3.9318867817073864, + "grad_norm": 21.33222198486328, + "learning_rate": 1.0722685238040872e-05, + "loss": 3.1751, + "step": 34520 + }, + { + "epoch": 3.933025798735691, + "grad_norm": 17.590347290039062, + "learning_rate": 1.0711268409635804e-05, + "loss": 3.0234, + "step": 34530 + }, + { + "epoch": 3.934164815763996, + "grad_norm": 13.048931121826172, + "learning_rate": 1.0699851581230734e-05, + "loss": 3.2729, + "step": 34540 + }, + { + "epoch": 3.9353038327923002, + "grad_norm": 
28.10319709777832, + "learning_rate": 1.0688434752825666e-05, + "loss": 3.2528, + "step": 34550 + }, + { + "epoch": 3.9364428498206046, + "grad_norm": 18.601184844970703, + "learning_rate": 1.0677017924420596e-05, + "loss": 3.2596, + "step": 34560 + }, + { + "epoch": 3.9375818668489093, + "grad_norm": 22.97974395751953, + "learning_rate": 1.0665601096015528e-05, + "loss": 3.1703, + "step": 34570 + }, + { + "epoch": 3.938720883877214, + "grad_norm": 19.660066604614258, + "learning_rate": 1.0654184267610458e-05, + "loss": 3.0291, + "step": 34580 + }, + { + "epoch": 3.9398599009055184, + "grad_norm": 28.623441696166992, + "learning_rate": 1.064276743920539e-05, + "loss": 3.5273, + "step": 34590 + }, + { + "epoch": 3.940998917933823, + "grad_norm": 27.676889419555664, + "learning_rate": 1.063135061080032e-05, + "loss": 3.2439, + "step": 34600 + }, + { + "epoch": 3.942137934962128, + "grad_norm": 15.081567764282227, + "learning_rate": 1.0619933782395252e-05, + "loss": 3.2306, + "step": 34610 + }, + { + "epoch": 3.9432769519904323, + "grad_norm": 18.153156280517578, + "learning_rate": 1.0608516953990182e-05, + "loss": 3.0342, + "step": 34620 + }, + { + "epoch": 3.9444159690187366, + "grad_norm": 54.536563873291016, + "learning_rate": 1.0597100125585113e-05, + "loss": 3.4186, + "step": 34630 + }, + { + "epoch": 3.9455549860470414, + "grad_norm": 14.300661087036133, + "learning_rate": 1.0585683297180044e-05, + "loss": 2.9953, + "step": 34640 + }, + { + "epoch": 3.946694003075346, + "grad_norm": 16.941755294799805, + "learning_rate": 1.0574266468774975e-05, + "loss": 3.047, + "step": 34650 + }, + { + "epoch": 3.9478330201036504, + "grad_norm": 19.460634231567383, + "learning_rate": 1.0562849640369906e-05, + "loss": 2.9647, + "step": 34660 + }, + { + "epoch": 3.948972037131955, + "grad_norm": 33.485992431640625, + "learning_rate": 1.0551432811964837e-05, + "loss": 3.2282, + "step": 34670 + }, + { + "epoch": 3.9501110541602595, + "grad_norm": 19.1177978515625, + 
"learning_rate": 1.0540015983559768e-05, + "loss": 3.2775, + "step": 34680 + }, + { + "epoch": 3.9512500711885643, + "grad_norm": 22.325424194335938, + "learning_rate": 1.05285991551547e-05, + "loss": 3.4052, + "step": 34690 + }, + { + "epoch": 3.9523890882168686, + "grad_norm": 16.33624267578125, + "learning_rate": 1.051718232674963e-05, + "loss": 3.2813, + "step": 34700 + }, + { + "epoch": 3.9535281052451734, + "grad_norm": 34.81819534301758, + "learning_rate": 1.0505765498344561e-05, + "loss": 3.3768, + "step": 34710 + }, + { + "epoch": 3.954667122273478, + "grad_norm": 14.993651390075684, + "learning_rate": 1.0494348669939492e-05, + "loss": 3.8334, + "step": 34720 + }, + { + "epoch": 3.9558061393017825, + "grad_norm": 22.498533248901367, + "learning_rate": 1.0482931841534423e-05, + "loss": 2.9652, + "step": 34730 + }, + { + "epoch": 3.9569451563300873, + "grad_norm": 18.665935516357422, + "learning_rate": 1.0471515013129354e-05, + "loss": 3.1048, + "step": 34740 + }, + { + "epoch": 3.9580841733583916, + "grad_norm": 28.582048416137695, + "learning_rate": 1.0460098184724285e-05, + "loss": 3.0532, + "step": 34750 + }, + { + "epoch": 3.9592231903866963, + "grad_norm": 22.957115173339844, + "learning_rate": 1.0448681356319216e-05, + "loss": 3.383, + "step": 34760 + }, + { + "epoch": 3.9603622074150007, + "grad_norm": 16.739212036132812, + "learning_rate": 1.0437264527914146e-05, + "loss": 3.292, + "step": 34770 + }, + { + "epoch": 3.9615012244433054, + "grad_norm": 33.49989700317383, + "learning_rate": 1.0425847699509078e-05, + "loss": 2.9684, + "step": 34780 + }, + { + "epoch": 3.96264024147161, + "grad_norm": 19.21895408630371, + "learning_rate": 1.0414430871104008e-05, + "loss": 2.8992, + "step": 34790 + }, + { + "epoch": 3.9637792584999145, + "grad_norm": 18.931285858154297, + "learning_rate": 1.040301404269894e-05, + "loss": 3.4763, + "step": 34800 + }, + { + "epoch": 3.9649182755282193, + "grad_norm": 16.084930419921875, + "learning_rate": 
1.039159721429387e-05, + "loss": 3.1776, + "step": 34810 + }, + { + "epoch": 3.9660572925565236, + "grad_norm": 17.007917404174805, + "learning_rate": 1.03801803858888e-05, + "loss": 3.0342, + "step": 34820 + }, + { + "epoch": 3.9671963095848284, + "grad_norm": 14.934893608093262, + "learning_rate": 1.0368763557483732e-05, + "loss": 3.2631, + "step": 34830 + }, + { + "epoch": 3.9683353266131327, + "grad_norm": 20.488367080688477, + "learning_rate": 1.0357346729078662e-05, + "loss": 3.3743, + "step": 34840 + }, + { + "epoch": 3.9694743436414375, + "grad_norm": 13.418065071105957, + "learning_rate": 1.0345929900673592e-05, + "loss": 3.1173, + "step": 34850 + }, + { + "epoch": 3.9706133606697422, + "grad_norm": 18.096406936645508, + "learning_rate": 1.0334513072268524e-05, + "loss": 3.4812, + "step": 34860 + }, + { + "epoch": 3.9717523776980466, + "grad_norm": 20.732946395874023, + "learning_rate": 1.0323096243863454e-05, + "loss": 3.0182, + "step": 34870 + }, + { + "epoch": 3.972891394726351, + "grad_norm": 19.0418758392334, + "learning_rate": 1.0311679415458386e-05, + "loss": 3.4738, + "step": 34880 + }, + { + "epoch": 3.9740304117546557, + "grad_norm": 19.997413635253906, + "learning_rate": 1.0300262587053316e-05, + "loss": 3.6017, + "step": 34890 + }, + { + "epoch": 3.9751694287829604, + "grad_norm": 16.964879989624023, + "learning_rate": 1.0288845758648248e-05, + "loss": 3.1749, + "step": 34900 + }, + { + "epoch": 3.9763084458112647, + "grad_norm": 29.300769805908203, + "learning_rate": 1.0277428930243178e-05, + "loss": 2.849, + "step": 34910 + }, + { + "epoch": 3.9774474628395695, + "grad_norm": 27.144683837890625, + "learning_rate": 1.026601210183811e-05, + "loss": 3.171, + "step": 34920 + }, + { + "epoch": 3.9785864798678743, + "grad_norm": 57.12122344970703, + "learning_rate": 1.025459527343304e-05, + "loss": 2.8867, + "step": 34930 + }, + { + "epoch": 3.9797254968961786, + "grad_norm": 30.16901397705078, + "learning_rate": 1.0243178445027972e-05, + "loss": 
3.0755, + "step": 34940 + }, + { + "epoch": 3.980864513924483, + "grad_norm": 19.454696655273438, + "learning_rate": 1.0231761616622902e-05, + "loss": 2.9216, + "step": 34950 + }, + { + "epoch": 3.9820035309527877, + "grad_norm": 22.626222610473633, + "learning_rate": 1.0220344788217834e-05, + "loss": 2.8746, + "step": 34960 + }, + { + "epoch": 3.9831425479810925, + "grad_norm": 15.998950004577637, + "learning_rate": 1.0208927959812764e-05, + "loss": 3.9033, + "step": 34970 + }, + { + "epoch": 3.984281565009397, + "grad_norm": 17.5640811920166, + "learning_rate": 1.0197511131407696e-05, + "loss": 2.998, + "step": 34980 + }, + { + "epoch": 3.9854205820377016, + "grad_norm": 21.637866973876953, + "learning_rate": 1.0186094303002626e-05, + "loss": 3.176, + "step": 34990 + }, + { + "epoch": 3.986559599066006, + "grad_norm": 15.04176139831543, + "learning_rate": 1.0174677474597558e-05, + "loss": 3.1113, + "step": 35000 + }, + { + "epoch": 3.9876986160943106, + "grad_norm": 13.209607124328613, + "learning_rate": 1.0163260646192488e-05, + "loss": 2.8275, + "step": 35010 + }, + { + "epoch": 3.988837633122615, + "grad_norm": 24.719711303710938, + "learning_rate": 1.015184381778742e-05, + "loss": 3.1036, + "step": 35020 + }, + { + "epoch": 3.9899766501509197, + "grad_norm": 18.581214904785156, + "learning_rate": 1.014042698938235e-05, + "loss": 2.9769, + "step": 35030 + }, + { + "epoch": 3.9911156671792245, + "grad_norm": 48.083858489990234, + "learning_rate": 1.0129010160977282e-05, + "loss": 3.1646, + "step": 35040 + }, + { + "epoch": 3.992254684207529, + "grad_norm": 18.242029190063477, + "learning_rate": 1.0117593332572212e-05, + "loss": 3.3108, + "step": 35050 + }, + { + "epoch": 3.9933937012358336, + "grad_norm": 28.582324981689453, + "learning_rate": 1.0106176504167144e-05, + "loss": 3.2919, + "step": 35060 + }, + { + "epoch": 3.994532718264138, + "grad_norm": 27.994962692260742, + "learning_rate": 1.0094759675762074e-05, + "loss": 3.1523, + "step": 35070 + }, + { + 
"epoch": 3.9956717352924427, + "grad_norm": 24.75602912902832, + "learning_rate": 1.0083342847357006e-05, + "loss": 3.2566, + "step": 35080 + }, + { + "epoch": 3.996810752320747, + "grad_norm": 21.267677307128906, + "learning_rate": 1.0071926018951936e-05, + "loss": 3.3682, + "step": 35090 + }, + { + "epoch": 3.9979497693490518, + "grad_norm": 30.283794403076172, + "learning_rate": 1.0060509190546868e-05, + "loss": 3.4812, + "step": 35100 + }, + { + "epoch": 3.9990887863773565, + "grad_norm": 14.295228958129883, + "learning_rate": 1.0049092362141798e-05, + "loss": 3.2986, + "step": 35110 + }, + { + "epoch": 4.000227803405661, + "grad_norm": 13.515183448791504, + "learning_rate": 1.003767553373673e-05, + "loss": 2.9587, + "step": 35120 + }, + { + "epoch": 4.001366820433965, + "grad_norm": 16.725143432617188, + "learning_rate": 1.002625870533166e-05, + "loss": 2.2084, + "step": 35130 + }, + { + "epoch": 4.00250583746227, + "grad_norm": 12.10120677947998, + "learning_rate": 1.0014841876926591e-05, + "loss": 2.2386, + "step": 35140 + }, + { + "epoch": 4.003644854490575, + "grad_norm": 36.54746627807617, + "learning_rate": 1.0003425048521522e-05, + "loss": 2.1046, + "step": 35150 + }, + { + "epoch": 4.004783871518879, + "grad_norm": 21.148271560668945, + "learning_rate": 9.992008220116453e-06, + "loss": 2.0342, + "step": 35160 + }, + { + "epoch": 4.005922888547183, + "grad_norm": 18.56048011779785, + "learning_rate": 9.980591391711384e-06, + "loss": 2.4446, + "step": 35170 + }, + { + "epoch": 4.007061905575489, + "grad_norm": 23.44548988342285, + "learning_rate": 9.969174563306315e-06, + "loss": 2.2637, + "step": 35180 + }, + { + "epoch": 4.008200922603793, + "grad_norm": 15.573701858520508, + "learning_rate": 9.957757734901246e-06, + "loss": 2.5477, + "step": 35190 + }, + { + "epoch": 4.009339939632097, + "grad_norm": 20.715803146362305, + "learning_rate": 9.946340906496176e-06, + "loss": 2.1361, + "step": 35200 + }, + { + "epoch": 4.010478956660402, + "grad_norm": 
26.810468673706055, + "learning_rate": 9.934924078091108e-06, + "loss": 1.9626, + "step": 35210 + }, + { + "epoch": 4.011617973688707, + "grad_norm": 25.35595703125, + "learning_rate": 9.923507249686038e-06, + "loss": 2.3449, + "step": 35220 + }, + { + "epoch": 4.012756990717011, + "grad_norm": 22.845022201538086, + "learning_rate": 9.91209042128097e-06, + "loss": 2.3955, + "step": 35230 + }, + { + "epoch": 4.013896007745315, + "grad_norm": 15.234258651733398, + "learning_rate": 9.9006735928759e-06, + "loss": 2.0885, + "step": 35240 + }, + { + "epoch": 4.015035024773621, + "grad_norm": 35.021820068359375, + "learning_rate": 9.88925676447083e-06, + "loss": 1.9235, + "step": 35250 + }, + { + "epoch": 4.016174041801925, + "grad_norm": 20.7735538482666, + "learning_rate": 9.877839936065762e-06, + "loss": 1.9411, + "step": 35260 + }, + { + "epoch": 4.017313058830229, + "grad_norm": 27.438777923583984, + "learning_rate": 9.866423107660692e-06, + "loss": 2.072, + "step": 35270 + }, + { + "epoch": 4.0184520758585345, + "grad_norm": 17.297277450561523, + "learning_rate": 9.855006279255622e-06, + "loss": 2.1, + "step": 35280 + }, + { + "epoch": 4.019591092886839, + "grad_norm": 58.3243408203125, + "learning_rate": 9.843589450850554e-06, + "loss": 1.9677, + "step": 35290 + }, + { + "epoch": 4.020730109915143, + "grad_norm": 29.510025024414062, + "learning_rate": 9.832172622445484e-06, + "loss": 2.5949, + "step": 35300 + }, + { + "epoch": 4.0218691269434474, + "grad_norm": 23.849496841430664, + "learning_rate": 9.820755794040416e-06, + "loss": 2.3084, + "step": 35310 + }, + { + "epoch": 4.023008143971753, + "grad_norm": 25.71555519104004, + "learning_rate": 9.809338965635346e-06, + "loss": 2.1529, + "step": 35320 + }, + { + "epoch": 4.024147161000057, + "grad_norm": 21.193737030029297, + "learning_rate": 9.797922137230278e-06, + "loss": 2.1629, + "step": 35330 + }, + { + "epoch": 4.025286178028361, + "grad_norm": 30.295127868652344, + "learning_rate": 9.786505308825208e-06, + 
"loss": 2.3461, + "step": 35340 + }, + { + "epoch": 4.0264251950566665, + "grad_norm": 24.55783462524414, + "learning_rate": 9.77508848042014e-06, + "loss": 2.0365, + "step": 35350 + }, + { + "epoch": 4.027564212084971, + "grad_norm": 21.69396209716797, + "learning_rate": 9.76367165201507e-06, + "loss": 2.0281, + "step": 35360 + }, + { + "epoch": 4.028703229113275, + "grad_norm": 40.41695022583008, + "learning_rate": 9.752254823610002e-06, + "loss": 1.9344, + "step": 35370 + }, + { + "epoch": 4.0298422461415795, + "grad_norm": 24.832134246826172, + "learning_rate": 9.740837995204932e-06, + "loss": 1.6282, + "step": 35380 + }, + { + "epoch": 4.030981263169885, + "grad_norm": 21.387557983398438, + "learning_rate": 9.729421166799864e-06, + "loss": 1.988, + "step": 35390 + }, + { + "epoch": 4.032120280198189, + "grad_norm": 25.913238525390625, + "learning_rate": 9.718004338394794e-06, + "loss": 2.2516, + "step": 35400 + }, + { + "epoch": 4.033259297226493, + "grad_norm": 13.866029739379883, + "learning_rate": 9.706587509989726e-06, + "loss": 2.3832, + "step": 35410 + }, + { + "epoch": 4.034398314254798, + "grad_norm": 35.09535598754883, + "learning_rate": 9.695170681584656e-06, + "loss": 2.0593, + "step": 35420 + }, + { + "epoch": 4.035537331283103, + "grad_norm": 15.327797889709473, + "learning_rate": 9.683753853179588e-06, + "loss": 1.729, + "step": 35430 + }, + { + "epoch": 4.036676348311407, + "grad_norm": 29.06590461730957, + "learning_rate": 9.672337024774518e-06, + "loss": 1.932, + "step": 35440 + }, + { + "epoch": 4.0378153653397115, + "grad_norm": 17.76268768310547, + "learning_rate": 9.66092019636945e-06, + "loss": 2.2326, + "step": 35450 + }, + { + "epoch": 4.038954382368017, + "grad_norm": 15.581809997558594, + "learning_rate": 9.64950336796438e-06, + "loss": 2.0454, + "step": 35460 + }, + { + "epoch": 4.040093399396321, + "grad_norm": 30.37004280090332, + "learning_rate": 9.638086539559312e-06, + "loss": 2.3074, + "step": 35470 + }, + { + "epoch": 
4.041232416424625, + "grad_norm": 30.546335220336914, + "learning_rate": 9.626669711154242e-06, + "loss": 2.4812, + "step": 35480 + }, + { + "epoch": 4.04237143345293, + "grad_norm": 23.823406219482422, + "learning_rate": 9.615252882749174e-06, + "loss": 2.0709, + "step": 35490 + }, + { + "epoch": 4.043510450481235, + "grad_norm": 25.465885162353516, + "learning_rate": 9.603836054344104e-06, + "loss": 2.1954, + "step": 35500 + }, + { + "epoch": 4.044649467509539, + "grad_norm": 28.988924026489258, + "learning_rate": 9.592419225939036e-06, + "loss": 2.2382, + "step": 35510 + }, + { + "epoch": 4.045788484537844, + "grad_norm": 18.630001068115234, + "learning_rate": 9.581002397533966e-06, + "loss": 2.0662, + "step": 35520 + }, + { + "epoch": 4.046927501566149, + "grad_norm": 19.428184509277344, + "learning_rate": 9.569585569128898e-06, + "loss": 2.2448, + "step": 35530 + }, + { + "epoch": 4.048066518594453, + "grad_norm": 22.8642635345459, + "learning_rate": 9.558168740723828e-06, + "loss": 2.1093, + "step": 35540 + }, + { + "epoch": 4.049205535622757, + "grad_norm": 37.037315368652344, + "learning_rate": 9.546751912318758e-06, + "loss": 2.1334, + "step": 35550 + }, + { + "epoch": 4.050344552651062, + "grad_norm": 18.648784637451172, + "learning_rate": 9.53533508391369e-06, + "loss": 2.0379, + "step": 35560 + }, + { + "epoch": 4.051483569679367, + "grad_norm": 22.10618782043457, + "learning_rate": 9.52391825550862e-06, + "loss": 2.2371, + "step": 35570 + }, + { + "epoch": 4.052622586707671, + "grad_norm": 56.212379455566406, + "learning_rate": 9.512501427103552e-06, + "loss": 1.821, + "step": 35580 + }, + { + "epoch": 4.053761603735976, + "grad_norm": 22.27092933654785, + "learning_rate": 9.501084598698482e-06, + "loss": 2.3711, + "step": 35590 + }, + { + "epoch": 4.054900620764281, + "grad_norm": 18.950254440307617, + "learning_rate": 9.489667770293414e-06, + "loss": 2.2074, + "step": 35600 + }, + { + "epoch": 4.056039637792585, + "grad_norm": 21.826650619506836, + 
"learning_rate": 9.478250941888344e-06, + "loss": 1.8353, + "step": 35610 + }, + { + "epoch": 4.0571786548208895, + "grad_norm": 20.334300994873047, + "learning_rate": 9.466834113483276e-06, + "loss": 2.1423, + "step": 35620 + }, + { + "epoch": 4.058317671849194, + "grad_norm": 24.380020141601562, + "learning_rate": 9.455417285078206e-06, + "loss": 2.3729, + "step": 35630 + }, + { + "epoch": 4.059456688877499, + "grad_norm": 22.84755516052246, + "learning_rate": 9.444000456673138e-06, + "loss": 1.9276, + "step": 35640 + }, + { + "epoch": 4.060595705905803, + "grad_norm": 24.157442092895508, + "learning_rate": 9.432583628268068e-06, + "loss": 2.4208, + "step": 35650 + }, + { + "epoch": 4.061734722934108, + "grad_norm": 34.433815002441406, + "learning_rate": 9.421166799863e-06, + "loss": 1.8884, + "step": 35660 + }, + { + "epoch": 4.062873739962413, + "grad_norm": 19.3491268157959, + "learning_rate": 9.40974997145793e-06, + "loss": 2.1392, + "step": 35670 + }, + { + "epoch": 4.064012756990717, + "grad_norm": 16.556428909301758, + "learning_rate": 9.39833314305286e-06, + "loss": 2.0148, + "step": 35680 + }, + { + "epoch": 4.0651517740190215, + "grad_norm": 24.482839584350586, + "learning_rate": 9.386916314647792e-06, + "loss": 1.6598, + "step": 35690 + }, + { + "epoch": 4.066290791047326, + "grad_norm": 11.901963233947754, + "learning_rate": 9.375499486242722e-06, + "loss": 2.2623, + "step": 35700 + }, + { + "epoch": 4.067429808075631, + "grad_norm": 21.31659507751465, + "learning_rate": 9.364082657837652e-06, + "loss": 2.2655, + "step": 35710 + }, + { + "epoch": 4.068568825103935, + "grad_norm": 25.287250518798828, + "learning_rate": 9.352665829432584e-06, + "loss": 2.0801, + "step": 35720 + }, + { + "epoch": 4.06970784213224, + "grad_norm": 18.50433349609375, + "learning_rate": 9.341249001027514e-06, + "loss": 2.016, + "step": 35730 + }, + { + "epoch": 4.070846859160545, + "grad_norm": 18.797529220581055, + "learning_rate": 9.329832172622446e-06, + "loss": 1.9827, + 
"step": 35740 + }, + { + "epoch": 4.071985876188849, + "grad_norm": 18.657867431640625, + "learning_rate": 9.318415344217376e-06, + "loss": 2.1304, + "step": 35750 + }, + { + "epoch": 4.0731248932171535, + "grad_norm": 33.3067741394043, + "learning_rate": 9.306998515812308e-06, + "loss": 2.0231, + "step": 35760 + }, + { + "epoch": 4.074263910245458, + "grad_norm": 17.978513717651367, + "learning_rate": 9.295581687407238e-06, + "loss": 2.0184, + "step": 35770 + }, + { + "epoch": 4.075402927273763, + "grad_norm": 28.21367073059082, + "learning_rate": 9.28416485900217e-06, + "loss": 1.8178, + "step": 35780 + }, + { + "epoch": 4.076541944302067, + "grad_norm": 17.165653228759766, + "learning_rate": 9.2727480305971e-06, + "loss": 2.25, + "step": 35790 + }, + { + "epoch": 4.077680961330372, + "grad_norm": 31.635812759399414, + "learning_rate": 9.261331202192032e-06, + "loss": 1.8658, + "step": 35800 + }, + { + "epoch": 4.078819978358676, + "grad_norm": 24.203754425048828, + "learning_rate": 9.249914373786962e-06, + "loss": 2.2087, + "step": 35810 + }, + { + "epoch": 4.079958995386981, + "grad_norm": 18.861743927001953, + "learning_rate": 9.238497545381894e-06, + "loss": 2.4518, + "step": 35820 + }, + { + "epoch": 4.081098012415286, + "grad_norm": 17.043411254882812, + "learning_rate": 9.227080716976824e-06, + "loss": 2.1363, + "step": 35830 + }, + { + "epoch": 4.08223702944359, + "grad_norm": 18.362031936645508, + "learning_rate": 9.215663888571756e-06, + "loss": 1.9629, + "step": 35840 + }, + { + "epoch": 4.083376046471895, + "grad_norm": 15.54836654663086, + "learning_rate": 9.204247060166686e-06, + "loss": 2.2237, + "step": 35850 + }, + { + "epoch": 4.084515063500199, + "grad_norm": 16.14936065673828, + "learning_rate": 9.192830231761618e-06, + "loss": 2.2361, + "step": 35860 + }, + { + "epoch": 4.085654080528504, + "grad_norm": 16.48059844970703, + "learning_rate": 9.181413403356548e-06, + "loss": 2.2346, + "step": 35870 + }, + { + "epoch": 4.086793097556808, + 
"grad_norm": 19.451807022094727, + "learning_rate": 9.169996574951478e-06, + "loss": 2.0182, + "step": 35880 + }, + { + "epoch": 4.087932114585113, + "grad_norm": 36.3916130065918, + "learning_rate": 9.15857974654641e-06, + "loss": 2.1439, + "step": 35890 + }, + { + "epoch": 4.089071131613418, + "grad_norm": 19.93400764465332, + "learning_rate": 9.14716291814134e-06, + "loss": 2.3565, + "step": 35900 + }, + { + "epoch": 4.090210148641722, + "grad_norm": 18.886568069458008, + "learning_rate": 9.135746089736272e-06, + "loss": 2.3328, + "step": 35910 + }, + { + "epoch": 4.091349165670027, + "grad_norm": 16.396909713745117, + "learning_rate": 9.124329261331202e-06, + "loss": 2.1372, + "step": 35920 + }, + { + "epoch": 4.0924881826983315, + "grad_norm": 12.956214904785156, + "learning_rate": 9.112912432926134e-06, + "loss": 2.075, + "step": 35930 + }, + { + "epoch": 4.093627199726636, + "grad_norm": 26.4219970703125, + "learning_rate": 9.101495604521064e-06, + "loss": 1.9267, + "step": 35940 + }, + { + "epoch": 4.09476621675494, + "grad_norm": 17.349000930786133, + "learning_rate": 9.090078776115996e-06, + "loss": 2.2446, + "step": 35950 + }, + { + "epoch": 4.095905233783245, + "grad_norm": 22.992813110351562, + "learning_rate": 9.078661947710926e-06, + "loss": 1.9661, + "step": 35960 + }, + { + "epoch": 4.09704425081155, + "grad_norm": 18.934776306152344, + "learning_rate": 9.067245119305858e-06, + "loss": 2.1024, + "step": 35970 + }, + { + "epoch": 4.098183267839854, + "grad_norm": 15.392464637756348, + "learning_rate": 9.055828290900788e-06, + "loss": 2.2037, + "step": 35980 + }, + { + "epoch": 4.099322284868159, + "grad_norm": 26.582096099853516, + "learning_rate": 9.04441146249572e-06, + "loss": 1.9292, + "step": 35990 + }, + { + "epoch": 4.1004613018964635, + "grad_norm": 31.57132339477539, + "learning_rate": 9.03299463409065e-06, + "loss": 2.2435, + "step": 36000 + }, + { + "epoch": 4.1004613018964635, + "eval_loss": 7.28188943862915, + "eval_runtime": 11.7066, + 
"eval_samples_per_second": 1.281, + "eval_steps_per_second": 0.171, + "step": 36000 + }, + { + "epoch": 4.101600318924768, + "grad_norm": 25.907819747924805, + "learning_rate": 9.021577805685582e-06, + "loss": 1.9451, + "step": 36010 + }, + { + "epoch": 4.102739335953072, + "grad_norm": 26.724485397338867, + "learning_rate": 9.010160977280512e-06, + "loss": 2.0658, + "step": 36020 + }, + { + "epoch": 4.103878352981377, + "grad_norm": 25.53949546813965, + "learning_rate": 8.998744148875444e-06, + "loss": 2.1806, + "step": 36030 + }, + { + "epoch": 4.105017370009682, + "grad_norm": 40.665225982666016, + "learning_rate": 8.987327320470374e-06, + "loss": 2.1045, + "step": 36040 + }, + { + "epoch": 4.106156387037986, + "grad_norm": 17.811235427856445, + "learning_rate": 8.975910492065306e-06, + "loss": 2.2806, + "step": 36050 + }, + { + "epoch": 4.10729540406629, + "grad_norm": 17.411487579345703, + "learning_rate": 8.964493663660236e-06, + "loss": 2.2619, + "step": 36060 + }, + { + "epoch": 4.108434421094596, + "grad_norm": 37.91059875488281, + "learning_rate": 8.953076835255168e-06, + "loss": 2.2467, + "step": 36070 + }, + { + "epoch": 4.1095734381229, + "grad_norm": 19.229204177856445, + "learning_rate": 8.941660006850098e-06, + "loss": 1.9757, + "step": 36080 + }, + { + "epoch": 4.110712455151204, + "grad_norm": 25.931848526000977, + "learning_rate": 8.93024317844503e-06, + "loss": 2.1152, + "step": 36090 + }, + { + "epoch": 4.111851472179509, + "grad_norm": 18.092559814453125, + "learning_rate": 8.91882635003996e-06, + "loss": 2.0801, + "step": 36100 + }, + { + "epoch": 4.112990489207814, + "grad_norm": 21.948673248291016, + "learning_rate": 8.90740952163489e-06, + "loss": 2.0873, + "step": 36110 + }, + { + "epoch": 4.114129506236118, + "grad_norm": 30.466522216796875, + "learning_rate": 8.895992693229822e-06, + "loss": 2.1621, + "step": 36120 + }, + { + "epoch": 4.115268523264422, + "grad_norm": 35.11959457397461, + "learning_rate": 8.884575864824752e-06, + 
"loss": 1.9867, + "step": 36130 + }, + { + "epoch": 4.116407540292728, + "grad_norm": 17.788955688476562, + "learning_rate": 8.873159036419684e-06, + "loss": 2.0907, + "step": 36140 + }, + { + "epoch": 4.117546557321032, + "grad_norm": 33.79847717285156, + "learning_rate": 8.861742208014614e-06, + "loss": 2.1638, + "step": 36150 + }, + { + "epoch": 4.118685574349336, + "grad_norm": 20.393226623535156, + "learning_rate": 8.850325379609544e-06, + "loss": 2.0122, + "step": 36160 + }, + { + "epoch": 4.1198245913776415, + "grad_norm": 20.93840980529785, + "learning_rate": 8.838908551204476e-06, + "loss": 2.1149, + "step": 36170 + }, + { + "epoch": 4.120963608405946, + "grad_norm": 27.537561416625977, + "learning_rate": 8.827491722799406e-06, + "loss": 2.2873, + "step": 36180 + }, + { + "epoch": 4.12210262543425, + "grad_norm": 18.87044334411621, + "learning_rate": 8.816074894394338e-06, + "loss": 2.1831, + "step": 36190 + }, + { + "epoch": 4.123241642462554, + "grad_norm": 19.3295841217041, + "learning_rate": 8.804658065989268e-06, + "loss": 2.5341, + "step": 36200 + }, + { + "epoch": 4.12438065949086, + "grad_norm": 17.20130729675293, + "learning_rate": 8.7932412375842e-06, + "loss": 2.4069, + "step": 36210 + }, + { + "epoch": 4.125519676519164, + "grad_norm": 25.431472778320312, + "learning_rate": 8.78182440917913e-06, + "loss": 2.3164, + "step": 36220 + }, + { + "epoch": 4.126658693547468, + "grad_norm": 18.839248657226562, + "learning_rate": 8.77040758077406e-06, + "loss": 2.0175, + "step": 36230 + }, + { + "epoch": 4.1277977105757735, + "grad_norm": 15.26370620727539, + "learning_rate": 8.758990752368992e-06, + "loss": 2.429, + "step": 36240 + }, + { + "epoch": 4.128936727604078, + "grad_norm": 16.536043167114258, + "learning_rate": 8.747573923963922e-06, + "loss": 1.8358, + "step": 36250 + }, + { + "epoch": 4.130075744632382, + "grad_norm": 14.378931045532227, + "learning_rate": 8.736157095558854e-06, + "loss": 2.9562, + "step": 36260 + }, + { + "epoch": 
4.1312147616606865, + "grad_norm": 16.765445709228516, + "learning_rate": 8.724740267153784e-06, + "loss": 1.9867, + "step": 36270 + }, + { + "epoch": 4.132353778688992, + "grad_norm": 15.949607849121094, + "learning_rate": 8.713323438748716e-06, + "loss": 2.3623, + "step": 36280 + }, + { + "epoch": 4.133492795717296, + "grad_norm": 26.54984474182129, + "learning_rate": 8.701906610343646e-06, + "loss": 2.2002, + "step": 36290 + }, + { + "epoch": 4.1346318127456, + "grad_norm": 27.666074752807617, + "learning_rate": 8.690489781938578e-06, + "loss": 1.8041, + "step": 36300 + }, + { + "epoch": 4.1357708297739055, + "grad_norm": 19.084428787231445, + "learning_rate": 8.679072953533508e-06, + "loss": 2.2665, + "step": 36310 + }, + { + "epoch": 4.13690984680221, + "grad_norm": 13.725714683532715, + "learning_rate": 8.66765612512844e-06, + "loss": 2.2309, + "step": 36320 + }, + { + "epoch": 4.138048863830514, + "grad_norm": 15.210906982421875, + "learning_rate": 8.65623929672337e-06, + "loss": 2.3237, + "step": 36330 + }, + { + "epoch": 4.1391878808588185, + "grad_norm": 15.349010467529297, + "learning_rate": 8.644822468318302e-06, + "loss": 1.9141, + "step": 36340 + }, + { + "epoch": 4.140326897887124, + "grad_norm": 28.416311264038086, + "learning_rate": 8.633405639913232e-06, + "loss": 2.3009, + "step": 36350 + }, + { + "epoch": 4.141465914915428, + "grad_norm": 23.310792922973633, + "learning_rate": 8.621988811508164e-06, + "loss": 1.8238, + "step": 36360 + }, + { + "epoch": 4.142604931943732, + "grad_norm": 20.92339324951172, + "learning_rate": 8.610571983103094e-06, + "loss": 2.1748, + "step": 36370 + }, + { + "epoch": 4.143743948972038, + "grad_norm": 49.91473388671875, + "learning_rate": 8.599155154698026e-06, + "loss": 2.1565, + "step": 36380 + }, + { + "epoch": 4.144882966000342, + "grad_norm": 23.245710372924805, + "learning_rate": 8.587738326292956e-06, + "loss": 2.2194, + "step": 36390 + }, + { + "epoch": 4.146021983028646, + "grad_norm": 14.820202827453613, 
+ "learning_rate": 8.576321497887888e-06, + "loss": 2.2943, + "step": 36400 + }, + { + "epoch": 4.1471610000569505, + "grad_norm": 26.18470001220703, + "learning_rate": 8.564904669482818e-06, + "loss": 1.9496, + "step": 36410 + }, + { + "epoch": 4.148300017085256, + "grad_norm": 16.023595809936523, + "learning_rate": 8.55348784107775e-06, + "loss": 2.1462, + "step": 36420 + }, + { + "epoch": 4.14943903411356, + "grad_norm": 38.762264251708984, + "learning_rate": 8.54207101267268e-06, + "loss": 2.1576, + "step": 36430 + }, + { + "epoch": 4.150578051141864, + "grad_norm": 21.413854598999023, + "learning_rate": 8.530654184267612e-06, + "loss": 2.1135, + "step": 36440 + }, + { + "epoch": 4.151717068170169, + "grad_norm": 22.012020111083984, + "learning_rate": 8.519237355862542e-06, + "loss": 2.4493, + "step": 36450 + }, + { + "epoch": 4.152856085198474, + "grad_norm": 28.255403518676758, + "learning_rate": 8.507820527457474e-06, + "loss": 1.9046, + "step": 36460 + }, + { + "epoch": 4.153995102226778, + "grad_norm": 15.050342559814453, + "learning_rate": 8.496403699052404e-06, + "loss": 2.1713, + "step": 36470 + }, + { + "epoch": 4.155134119255083, + "grad_norm": 27.207244873046875, + "learning_rate": 8.484986870647336e-06, + "loss": 1.9978, + "step": 36480 + }, + { + "epoch": 4.156273136283388, + "grad_norm": 28.73916244506836, + "learning_rate": 8.473570042242266e-06, + "loss": 2.2427, + "step": 36490 + }, + { + "epoch": 4.157412153311692, + "grad_norm": 19.30052947998047, + "learning_rate": 8.462153213837198e-06, + "loss": 1.9667, + "step": 36500 + }, + { + "epoch": 4.158551170339996, + "grad_norm": 38.25870895385742, + "learning_rate": 8.450736385432128e-06, + "loss": 2.4221, + "step": 36510 + }, + { + "epoch": 4.159690187368301, + "grad_norm": 37.73525619506836, + "learning_rate": 8.43931955702706e-06, + "loss": 2.3187, + "step": 36520 + }, + { + "epoch": 4.160829204396606, + "grad_norm": 15.060072898864746, + "learning_rate": 8.42790272862199e-06, + "loss": 2.352, 
+ "step": 36530 + }, + { + "epoch": 4.16196822142491, + "grad_norm": 15.708013534545898, + "learning_rate": 8.41648590021692e-06, + "loss": 2.5932, + "step": 36540 + }, + { + "epoch": 4.163107238453215, + "grad_norm": 18.792617797851562, + "learning_rate": 8.405069071811852e-06, + "loss": 2.4422, + "step": 36550 + }, + { + "epoch": 4.16424625548152, + "grad_norm": 22.093103408813477, + "learning_rate": 8.393652243406782e-06, + "loss": 2.1536, + "step": 36560 + }, + { + "epoch": 4.165385272509824, + "grad_norm": 14.1884765625, + "learning_rate": 8.382235415001714e-06, + "loss": 2.3499, + "step": 36570 + }, + { + "epoch": 4.1665242895381285, + "grad_norm": 17.117584228515625, + "learning_rate": 8.370818586596644e-06, + "loss": 2.2645, + "step": 36580 + }, + { + "epoch": 4.167663306566433, + "grad_norm": 73.05811309814453, + "learning_rate": 8.359401758191574e-06, + "loss": 2.1703, + "step": 36590 + }, + { + "epoch": 4.168802323594738, + "grad_norm": 31.396230697631836, + "learning_rate": 8.347984929786506e-06, + "loss": 1.6152, + "step": 36600 + }, + { + "epoch": 4.169941340623042, + "grad_norm": 76.58162689208984, + "learning_rate": 8.336568101381436e-06, + "loss": 1.9436, + "step": 36610 + }, + { + "epoch": 4.171080357651347, + "grad_norm": 18.453163146972656, + "learning_rate": 8.325151272976366e-06, + "loss": 2.1519, + "step": 36620 + }, + { + "epoch": 4.172219374679652, + "grad_norm": 26.716821670532227, + "learning_rate": 8.313734444571298e-06, + "loss": 2.083, + "step": 36630 + }, + { + "epoch": 4.173358391707956, + "grad_norm": 19.890064239501953, + "learning_rate": 8.302317616166228e-06, + "loss": 2.2614, + "step": 36640 + }, + { + "epoch": 4.1744974087362605, + "grad_norm": 23.264663696289062, + "learning_rate": 8.29090078776116e-06, + "loss": 2.2156, + "step": 36650 + }, + { + "epoch": 4.175636425764565, + "grad_norm": 28.301197052001953, + "learning_rate": 8.27948395935609e-06, + "loss": 2.2683, + "step": 36660 + }, + { + "epoch": 4.17677544279287, + 
"grad_norm": 26.214536666870117, + "learning_rate": 8.268067130951022e-06, + "loss": 2.1747, + "step": 36670 + }, + { + "epoch": 4.177914459821174, + "grad_norm": 20.721172332763672, + "learning_rate": 8.256650302545952e-06, + "loss": 2.4334, + "step": 36680 + }, + { + "epoch": 4.179053476849479, + "grad_norm": 20.722644805908203, + "learning_rate": 8.245233474140884e-06, + "loss": 2.0556, + "step": 36690 + }, + { + "epoch": 4.180192493877783, + "grad_norm": 18.149147033691406, + "learning_rate": 8.234958328576322e-06, + "loss": 2.2955, + "step": 36700 + }, + { + "epoch": 4.181331510906088, + "grad_norm": 27.724897384643555, + "learning_rate": 8.223541500171253e-06, + "loss": 2.1634, + "step": 36710 + }, + { + "epoch": 4.182470527934393, + "grad_norm": 40.385555267333984, + "learning_rate": 8.212124671766184e-06, + "loss": 2.0069, + "step": 36720 + }, + { + "epoch": 4.183609544962697, + "grad_norm": 22.849254608154297, + "learning_rate": 8.200707843361115e-06, + "loss": 2.1667, + "step": 36730 + }, + { + "epoch": 4.184748561991002, + "grad_norm": 18.224185943603516, + "learning_rate": 8.189291014956046e-06, + "loss": 1.9663, + "step": 36740 + }, + { + "epoch": 4.185887579019306, + "grad_norm": 21.56133270263672, + "learning_rate": 8.177874186550977e-06, + "loss": 2.3003, + "step": 36750 + }, + { + "epoch": 4.187026596047611, + "grad_norm": 15.947433471679688, + "learning_rate": 8.166457358145908e-06, + "loss": 2.068, + "step": 36760 + }, + { + "epoch": 4.188165613075915, + "grad_norm": 23.13700294494629, + "learning_rate": 8.15504052974084e-06, + "loss": 2.0999, + "step": 36770 + }, + { + "epoch": 4.18930463010422, + "grad_norm": 26.84025764465332, + "learning_rate": 8.14362370133577e-06, + "loss": 1.6851, + "step": 36780 + }, + { + "epoch": 4.190443647132525, + "grad_norm": 23.293354034423828, + "learning_rate": 8.1322068729307e-06, + "loss": 2.1896, + "step": 36790 + }, + { + "epoch": 4.191582664160829, + "grad_norm": 24.891653060913086, + "learning_rate": 
8.120790044525632e-06, + "loss": 2.056, + "step": 36800 + }, + { + "epoch": 4.192721681189134, + "grad_norm": 14.650490760803223, + "learning_rate": 8.109373216120562e-06, + "loss": 2.0223, + "step": 36810 + }, + { + "epoch": 4.1938606982174385, + "grad_norm": 16.309293746948242, + "learning_rate": 8.097956387715493e-06, + "loss": 2.4589, + "step": 36820 + }, + { + "epoch": 4.194999715245743, + "grad_norm": 19.08662223815918, + "learning_rate": 8.086539559310424e-06, + "loss": 2.1032, + "step": 36830 + }, + { + "epoch": 4.196138732274047, + "grad_norm": 21.592103958129883, + "learning_rate": 8.075122730905354e-06, + "loss": 2.433, + "step": 36840 + }, + { + "epoch": 4.197277749302352, + "grad_norm": 16.574871063232422, + "learning_rate": 8.063705902500286e-06, + "loss": 2.2325, + "step": 36850 + }, + { + "epoch": 4.198416766330657, + "grad_norm": 22.576526641845703, + "learning_rate": 8.052289074095216e-06, + "loss": 2.2856, + "step": 36860 + }, + { + "epoch": 4.199555783358961, + "grad_norm": 18.7548828125, + "learning_rate": 8.040872245690148e-06, + "loss": 2.1577, + "step": 36870 + }, + { + "epoch": 4.200694800387266, + "grad_norm": 28.421300888061523, + "learning_rate": 8.029455417285078e-06, + "loss": 2.0314, + "step": 36880 + }, + { + "epoch": 4.2018338174155705, + "grad_norm": 36.84746551513672, + "learning_rate": 8.01803858888001e-06, + "loss": 2.1185, + "step": 36890 + }, + { + "epoch": 4.202972834443875, + "grad_norm": 20.92447280883789, + "learning_rate": 8.00662176047494e-06, + "loss": 2.1269, + "step": 36900 + }, + { + "epoch": 4.204111851472179, + "grad_norm": 18.971689224243164, + "learning_rate": 7.995204932069872e-06, + "loss": 1.9605, + "step": 36910 + }, + { + "epoch": 4.205250868500484, + "grad_norm": 19.306598663330078, + "learning_rate": 7.983788103664802e-06, + "loss": 2.2317, + "step": 36920 + }, + { + "epoch": 4.206389885528789, + "grad_norm": 30.666160583496094, + "learning_rate": 7.972371275259734e-06, + "loss": 2.1037, + "step": 36930 + 
}, + { + "epoch": 4.207528902557093, + "grad_norm": 16.627685546875, + "learning_rate": 7.960954446854664e-06, + "loss": 2.2279, + "step": 36940 + }, + { + "epoch": 4.208667919585398, + "grad_norm": 20.389049530029297, + "learning_rate": 7.949537618449596e-06, + "loss": 1.9111, + "step": 36950 + }, + { + "epoch": 4.2098069366137025, + "grad_norm": 18.669540405273438, + "learning_rate": 7.938120790044526e-06, + "loss": 1.8168, + "step": 36960 + }, + { + "epoch": 4.210945953642007, + "grad_norm": 25.134624481201172, + "learning_rate": 7.926703961639457e-06, + "loss": 2.0177, + "step": 36970 + }, + { + "epoch": 4.212084970670311, + "grad_norm": 17.900371551513672, + "learning_rate": 7.915287133234388e-06, + "loss": 2.34, + "step": 36980 + }, + { + "epoch": 4.213223987698616, + "grad_norm": 27.747390747070312, + "learning_rate": 7.90387030482932e-06, + "loss": 2.1439, + "step": 36990 + }, + { + "epoch": 4.214363004726921, + "grad_norm": 27.470014572143555, + "learning_rate": 7.89245347642425e-06, + "loss": 2.0902, + "step": 37000 + }, + { + "epoch": 4.215502021755225, + "grad_norm": 18.357351303100586, + "learning_rate": 7.88103664801918e-06, + "loss": 2.3596, + "step": 37010 + }, + { + "epoch": 4.21664103878353, + "grad_norm": 18.42882537841797, + "learning_rate": 7.869619819614112e-06, + "loss": 2.3628, + "step": 37020 + }, + { + "epoch": 4.217780055811835, + "grad_norm": 28.50634002685547, + "learning_rate": 7.858202991209042e-06, + "loss": 2.0287, + "step": 37030 + }, + { + "epoch": 4.218919072840139, + "grad_norm": 17.44330406188965, + "learning_rate": 7.846786162803974e-06, + "loss": 1.9671, + "step": 37040 + }, + { + "epoch": 4.220058089868443, + "grad_norm": 17.362106323242188, + "learning_rate": 7.835369334398904e-06, + "loss": 2.0065, + "step": 37050 + }, + { + "epoch": 4.221197106896748, + "grad_norm": 20.788583755493164, + "learning_rate": 7.823952505993836e-06, + "loss": 2.0531, + "step": 37060 + }, + { + "epoch": 4.222336123925053, + "grad_norm": 
20.500598907470703, + "learning_rate": 7.812535677588766e-06, + "loss": 2.0393, + "step": 37070 + }, + { + "epoch": 4.223475140953357, + "grad_norm": 18.895593643188477, + "learning_rate": 7.801118849183698e-06, + "loss": 2.3467, + "step": 37080 + }, + { + "epoch": 4.224614157981661, + "grad_norm": 33.12016677856445, + "learning_rate": 7.789702020778628e-06, + "loss": 2.0312, + "step": 37090 + }, + { + "epoch": 4.225753175009967, + "grad_norm": 21.26396942138672, + "learning_rate": 7.77828519237356e-06, + "loss": 2.0708, + "step": 37100 + }, + { + "epoch": 4.226892192038271, + "grad_norm": 23.664295196533203, + "learning_rate": 7.76686836396849e-06, + "loss": 2.1109, + "step": 37110 + }, + { + "epoch": 4.228031209066575, + "grad_norm": 24.836483001708984, + "learning_rate": 7.755451535563422e-06, + "loss": 1.9681, + "step": 37120 + }, + { + "epoch": 4.2291702260948805, + "grad_norm": 26.818641662597656, + "learning_rate": 7.744034707158352e-06, + "loss": 2.4388, + "step": 37130 + }, + { + "epoch": 4.230309243123185, + "grad_norm": 28.54889488220215, + "learning_rate": 7.732617878753283e-06, + "loss": 2.3617, + "step": 37140 + }, + { + "epoch": 4.231448260151489, + "grad_norm": 19.3241024017334, + "learning_rate": 7.721201050348214e-06, + "loss": 2.3166, + "step": 37150 + }, + { + "epoch": 4.232587277179793, + "grad_norm": 17.69416046142578, + "learning_rate": 7.709784221943145e-06, + "loss": 2.1722, + "step": 37160 + }, + { + "epoch": 4.233726294208099, + "grad_norm": 19.831701278686523, + "learning_rate": 7.698367393538076e-06, + "loss": 2.0708, + "step": 37170 + }, + { + "epoch": 4.234865311236403, + "grad_norm": 18.91107177734375, + "learning_rate": 7.686950565133007e-06, + "loss": 2.6302, + "step": 37180 + }, + { + "epoch": 4.236004328264707, + "grad_norm": 24.90549087524414, + "learning_rate": 7.675533736727938e-06, + "loss": 1.9144, + "step": 37190 + }, + { + "epoch": 4.2371433452930125, + "grad_norm": 29.067604064941406, + "learning_rate": 
7.66411690832287e-06, + "loss": 2.5048, + "step": 37200 + }, + { + "epoch": 4.238282362321317, + "grad_norm": 17.568086624145508, + "learning_rate": 7.6527000799178e-06, + "loss": 2.3056, + "step": 37210 + }, + { + "epoch": 4.239421379349621, + "grad_norm": 21.351818084716797, + "learning_rate": 7.64128325151273e-06, + "loss": 2.4039, + "step": 37220 + }, + { + "epoch": 4.2405603963779255, + "grad_norm": 25.68195915222168, + "learning_rate": 7.629866423107662e-06, + "loss": 2.2759, + "step": 37230 + }, + { + "epoch": 4.241699413406231, + "grad_norm": 25.75175666809082, + "learning_rate": 7.6184495947025925e-06, + "loss": 2.1244, + "step": 37240 + }, + { + "epoch": 4.242838430434535, + "grad_norm": 20.02338218688965, + "learning_rate": 7.607032766297523e-06, + "loss": 2.2837, + "step": 37250 + }, + { + "epoch": 4.243977447462839, + "grad_norm": 13.048833847045898, + "learning_rate": 7.5956159378924545e-06, + "loss": 1.8784, + "step": 37260 + }, + { + "epoch": 4.2451164644911445, + "grad_norm": 19.95956039428711, + "learning_rate": 7.584199109487385e-06, + "loss": 2.1352, + "step": 37270 + }, + { + "epoch": 4.246255481519449, + "grad_norm": 48.02158737182617, + "learning_rate": 7.5727822810823165e-06, + "loss": 2.289, + "step": 37280 + }, + { + "epoch": 4.247394498547753, + "grad_norm": 22.00335121154785, + "learning_rate": 7.561365452677247e-06, + "loss": 2.1507, + "step": 37290 + }, + { + "epoch": 4.2485335155760575, + "grad_norm": 19.705385208129883, + "learning_rate": 7.5499486242721785e-06, + "loss": 2.4327, + "step": 37300 + }, + { + "epoch": 4.249672532604363, + "grad_norm": 20.82594871520996, + "learning_rate": 7.538531795867109e-06, + "loss": 1.9802, + "step": 37310 + }, + { + "epoch": 4.250811549632667, + "grad_norm": 22.009002685546875, + "learning_rate": 7.52711496746204e-06, + "loss": 1.8252, + "step": 37320 + }, + { + "epoch": 4.251950566660971, + "grad_norm": 22.141630172729492, + "learning_rate": 7.515698139056971e-06, + "loss": 2.2541, + "step": 
37330 + }, + { + "epoch": 4.253089583689276, + "grad_norm": 25.771196365356445, + "learning_rate": 7.504281310651902e-06, + "loss": 2.0781, + "step": 37340 + }, + { + "epoch": 4.254228600717581, + "grad_norm": 26.597488403320312, + "learning_rate": 7.492864482246833e-06, + "loss": 1.9622, + "step": 37350 + }, + { + "epoch": 4.255367617745885, + "grad_norm": 27.240440368652344, + "learning_rate": 7.481447653841763e-06, + "loss": 1.7686, + "step": 37360 + }, + { + "epoch": 4.25650663477419, + "grad_norm": 18.339662551879883, + "learning_rate": 7.470030825436694e-06, + "loss": 1.95, + "step": 37370 + }, + { + "epoch": 4.257645651802495, + "grad_norm": 31.862226486206055, + "learning_rate": 7.458613997031625e-06, + "loss": 1.864, + "step": 37380 + }, + { + "epoch": 4.258784668830799, + "grad_norm": 19.483177185058594, + "learning_rate": 7.447197168626556e-06, + "loss": 1.8942, + "step": 37390 + }, + { + "epoch": 4.259923685859103, + "grad_norm": 18.145130157470703, + "learning_rate": 7.435780340221486e-06, + "loss": 2.3554, + "step": 37400 + }, + { + "epoch": 4.261062702887408, + "grad_norm": 17.70627784729004, + "learning_rate": 7.424363511816418e-06, + "loss": 2.4099, + "step": 37410 + }, + { + "epoch": 4.262201719915713, + "grad_norm": 30.782800674438477, + "learning_rate": 7.412946683411348e-06, + "loss": 1.9665, + "step": 37420 + }, + { + "epoch": 4.263340736944017, + "grad_norm": 25.127151489257812, + "learning_rate": 7.40152985500628e-06, + "loss": 2.1923, + "step": 37430 + }, + { + "epoch": 4.264479753972322, + "grad_norm": 43.31418991088867, + "learning_rate": 7.39011302660121e-06, + "loss": 2.2469, + "step": 37440 + }, + { + "epoch": 4.265618771000627, + "grad_norm": 20.627605438232422, + "learning_rate": 7.378696198196142e-06, + "loss": 2.3459, + "step": 37450 + }, + { + "epoch": 4.266757788028931, + "grad_norm": 19.898103713989258, + "learning_rate": 7.367279369791072e-06, + "loss": 2.3236, + "step": 37460 + }, + { + "epoch": 4.2678968050572355, + 
"grad_norm": 20.90271759033203, + "learning_rate": 7.355862541386004e-06, + "loss": 1.9029, + "step": 37470 + }, + { + "epoch": 4.26903582208554, + "grad_norm": 37.733097076416016, + "learning_rate": 7.344445712980934e-06, + "loss": 2.2354, + "step": 37480 + }, + { + "epoch": 4.270174839113845, + "grad_norm": 24.870241165161133, + "learning_rate": 7.333028884575866e-06, + "loss": 2.2772, + "step": 37490 + }, + { + "epoch": 4.271313856142149, + "grad_norm": 26.055681228637695, + "learning_rate": 7.321612056170796e-06, + "loss": 1.7569, + "step": 37500 + }, + { + "epoch": 4.272452873170454, + "grad_norm": 17.234634399414062, + "learning_rate": 7.310195227765728e-06, + "loss": 2.3357, + "step": 37510 + }, + { + "epoch": 4.273591890198759, + "grad_norm": 24.080350875854492, + "learning_rate": 7.298778399360658e-06, + "loss": 2.1092, + "step": 37520 + }, + { + "epoch": 4.274730907227063, + "grad_norm": 14.734991073608398, + "learning_rate": 7.2873615709555896e-06, + "loss": 2.2117, + "step": 37530 + }, + { + "epoch": 4.2758699242553675, + "grad_norm": 23.05234718322754, + "learning_rate": 7.27594474255052e-06, + "loss": 1.9585, + "step": 37540 + }, + { + "epoch": 4.277008941283672, + "grad_norm": 19.261940002441406, + "learning_rate": 7.2645279141454515e-06, + "loss": 1.9687, + "step": 37550 + }, + { + "epoch": 4.278147958311977, + "grad_norm": 18.015884399414062, + "learning_rate": 7.253111085740382e-06, + "loss": 1.8536, + "step": 37560 + }, + { + "epoch": 4.279286975340281, + "grad_norm": 39.21778106689453, + "learning_rate": 7.241694257335313e-06, + "loss": 1.9613, + "step": 37570 + }, + { + "epoch": 4.280425992368586, + "grad_norm": 40.37881851196289, + "learning_rate": 7.230277428930244e-06, + "loss": 1.9152, + "step": 37580 + }, + { + "epoch": 4.281565009396891, + "grad_norm": 15.979982376098633, + "learning_rate": 7.218860600525175e-06, + "loss": 2.5318, + "step": 37590 + }, + { + "epoch": 4.282704026425195, + "grad_norm": 18.194150924682617, + "learning_rate": 
7.207443772120105e-06, + "loss": 2.1756, + "step": 37600 + }, + { + "epoch": 4.2838430434534995, + "grad_norm": 14.689291000366211, + "learning_rate": 7.196026943715037e-06, + "loss": 2.2388, + "step": 37610 + }, + { + "epoch": 4.284982060481804, + "grad_norm": 32.164737701416016, + "learning_rate": 7.184610115309967e-06, + "loss": 2.1866, + "step": 37620 + }, + { + "epoch": 4.286121077510109, + "grad_norm": 20.101280212402344, + "learning_rate": 7.173193286904899e-06, + "loss": 2.1098, + "step": 37630 + }, + { + "epoch": 4.287260094538413, + "grad_norm": 22.178539276123047, + "learning_rate": 7.161776458499829e-06, + "loss": 2.2009, + "step": 37640 + }, + { + "epoch": 4.288399111566718, + "grad_norm": 18.402748107910156, + "learning_rate": 7.150359630094761e-06, + "loss": 2.314, + "step": 37650 + }, + { + "epoch": 4.289538128595023, + "grad_norm": 16.527597427368164, + "learning_rate": 7.138942801689691e-06, + "loss": 2.44, + "step": 37660 + }, + { + "epoch": 4.290677145623327, + "grad_norm": 19.57364273071289, + "learning_rate": 7.127525973284623e-06, + "loss": 2.0746, + "step": 37670 + }, + { + "epoch": 4.291816162651632, + "grad_norm": 21.5516300201416, + "learning_rate": 7.116109144879553e-06, + "loss": 2.0775, + "step": 37680 + }, + { + "epoch": 4.292955179679936, + "grad_norm": 31.42508888244629, + "learning_rate": 7.1046923164744846e-06, + "loss": 2.0373, + "step": 37690 + }, + { + "epoch": 4.294094196708241, + "grad_norm": 19.42819595336914, + "learning_rate": 7.093275488069415e-06, + "loss": 2.0249, + "step": 37700 + }, + { + "epoch": 4.295233213736545, + "grad_norm": 30.202531814575195, + "learning_rate": 7.081858659664345e-06, + "loss": 2.0713, + "step": 37710 + }, + { + "epoch": 4.29637223076485, + "grad_norm": 51.16549301147461, + "learning_rate": 7.070441831259277e-06, + "loss": 1.999, + "step": 37720 + }, + { + "epoch": 4.297511247793155, + "grad_norm": 16.67644691467285, + "learning_rate": 7.059025002854207e-06, + "loss": 2.1155, + "step": 37730 + 
}, + { + "epoch": 4.298650264821459, + "grad_norm": 13.299612045288086, + "learning_rate": 7.047608174449139e-06, + "loss": 2.1001, + "step": 37740 + }, + { + "epoch": 4.299789281849764, + "grad_norm": 19.42896842956543, + "learning_rate": 7.036191346044069e-06, + "loss": 1.8753, + "step": 37750 + }, + { + "epoch": 4.300928298878068, + "grad_norm": 30.482791900634766, + "learning_rate": 7.024774517639001e-06, + "loss": 1.9218, + "step": 37760 + }, + { + "epoch": 4.302067315906373, + "grad_norm": 20.236452102661133, + "learning_rate": 7.013357689233931e-06, + "loss": 2.0553, + "step": 37770 + }, + { + "epoch": 4.3032063329346775, + "grad_norm": 26.592241287231445, + "learning_rate": 7.001940860828863e-06, + "loss": 2.1795, + "step": 37780 + }, + { + "epoch": 4.304345349962982, + "grad_norm": 19.41746711730957, + "learning_rate": 6.990524032423793e-06, + "loss": 2.1653, + "step": 37790 + }, + { + "epoch": 4.305484366991286, + "grad_norm": 17.373085021972656, + "learning_rate": 6.979107204018724e-06, + "loss": 2.061, + "step": 37800 + }, + { + "epoch": 4.306623384019591, + "grad_norm": 20.126338958740234, + "learning_rate": 6.967690375613655e-06, + "loss": 1.8904, + "step": 37810 + }, + { + "epoch": 4.307762401047896, + "grad_norm": 21.566429138183594, + "learning_rate": 6.956273547208586e-06, + "loss": 2.1074, + "step": 37820 + }, + { + "epoch": 4.3089014180762, + "grad_norm": 20.61093521118164, + "learning_rate": 6.944856718803516e-06, + "loss": 2.08, + "step": 37830 + }, + { + "epoch": 4.310040435104505, + "grad_norm": 18.914897918701172, + "learning_rate": 6.933439890398448e-06, + "loss": 1.8261, + "step": 37840 + }, + { + "epoch": 4.3111794521328095, + "grad_norm": 20.095748901367188, + "learning_rate": 6.922023061993378e-06, + "loss": 1.9611, + "step": 37850 + }, + { + "epoch": 4.312318469161114, + "grad_norm": 21.751811981201172, + "learning_rate": 6.91060623358831e-06, + "loss": 2.072, + "step": 37860 + }, + { + "epoch": 4.313457486189418, + "grad_norm": 
18.809906005859375, + "learning_rate": 6.89918940518324e-06, + "loss": 2.2382, + "step": 37870 + }, + { + "epoch": 4.314596503217723, + "grad_norm": 15.959148406982422, + "learning_rate": 6.887772576778172e-06, + "loss": 1.8978, + "step": 37880 + }, + { + "epoch": 4.315735520246028, + "grad_norm": 19.607206344604492, + "learning_rate": 6.876355748373102e-06, + "loss": 2.1253, + "step": 37890 + }, + { + "epoch": 4.316874537274332, + "grad_norm": 22.636343002319336, + "learning_rate": 6.864938919968034e-06, + "loss": 2.16, + "step": 37900 + }, + { + "epoch": 4.318013554302637, + "grad_norm": 19.403629302978516, + "learning_rate": 6.853522091562964e-06, + "loss": 2.4255, + "step": 37910 + }, + { + "epoch": 4.3191525713309415, + "grad_norm": 21.906789779663086, + "learning_rate": 6.842105263157896e-06, + "loss": 2.4363, + "step": 37920 + }, + { + "epoch": 4.320291588359246, + "grad_norm": 17.866167068481445, + "learning_rate": 6.830688434752826e-06, + "loss": 2.0562, + "step": 37930 + }, + { + "epoch": 4.32143060538755, + "grad_norm": 23.16889762878418, + "learning_rate": 6.819271606347758e-06, + "loss": 2.1514, + "step": 37940 + }, + { + "epoch": 4.322569622415855, + "grad_norm": 19.992206573486328, + "learning_rate": 6.807854777942688e-06, + "loss": 2.0091, + "step": 37950 + }, + { + "epoch": 4.32370863944416, + "grad_norm": 20.305980682373047, + "learning_rate": 6.79643794953762e-06, + "loss": 2.068, + "step": 37960 + }, + { + "epoch": 4.324847656472464, + "grad_norm": 25.374679565429688, + "learning_rate": 6.78502112113255e-06, + "loss": 1.991, + "step": 37970 + }, + { + "epoch": 4.325986673500768, + "grad_norm": 19.922895431518555, + "learning_rate": 6.773604292727482e-06, + "loss": 1.806, + "step": 37980 + }, + { + "epoch": 4.327125690529074, + "grad_norm": 18.043460845947266, + "learning_rate": 6.762187464322412e-06, + "loss": 2.3444, + "step": 37990 + }, + { + "epoch": 4.328264707557378, + "grad_norm": 29.81731414794922, + "learning_rate": 
6.750770635917343e-06, + "loss": 1.9679, + "step": 38000 + }, + { + "epoch": 4.328264707557378, + "eval_loss": 7.4151763916015625, + "eval_runtime": 10.5263, + "eval_samples_per_second": 1.425, + "eval_steps_per_second": 0.19, + "step": 38000 + }, + { + "epoch": 4.329403724585682, + "grad_norm": 16.56136703491211, + "learning_rate": 6.739353807512274e-06, + "loss": 2.167, + "step": 38010 + }, + { + "epoch": 4.3305427416139874, + "grad_norm": 22.178762435913086, + "learning_rate": 6.727936979107205e-06, + "loss": 1.9672, + "step": 38020 + }, + { + "epoch": 4.331681758642292, + "grad_norm": 37.47772216796875, + "learning_rate": 6.716520150702135e-06, + "loss": 2.2508, + "step": 38030 + }, + { + "epoch": 4.332820775670596, + "grad_norm": 17.4763240814209, + "learning_rate": 6.705103322297066e-06, + "loss": 2.1529, + "step": 38040 + }, + { + "epoch": 4.3339597926989, + "grad_norm": 19.65039825439453, + "learning_rate": 6.693686493891997e-06, + "loss": 2.1628, + "step": 38050 + }, + { + "epoch": 4.335098809727206, + "grad_norm": 15.268583297729492, + "learning_rate": 6.682269665486927e-06, + "loss": 2.3756, + "step": 38060 + }, + { + "epoch": 4.33623782675551, + "grad_norm": 15.196014404296875, + "learning_rate": 6.670852837081859e-06, + "loss": 1.937, + "step": 38070 + }, + { + "epoch": 4.337376843783814, + "grad_norm": 26.376205444335938, + "learning_rate": 6.659436008676789e-06, + "loss": 2.2766, + "step": 38080 + }, + { + "epoch": 4.3385158608121195, + "grad_norm": 40.029014587402344, + "learning_rate": 6.648019180271721e-06, + "loss": 2.1662, + "step": 38090 + }, + { + "epoch": 4.339654877840424, + "grad_norm": 19.197725296020508, + "learning_rate": 6.636602351866651e-06, + "loss": 1.9085, + "step": 38100 + }, + { + "epoch": 4.340793894868728, + "grad_norm": 23.35303497314453, + "learning_rate": 6.625185523461583e-06, + "loss": 2.1213, + "step": 38110 + }, + { + "epoch": 4.3419329118970325, + "grad_norm": 22.393056869506836, + "learning_rate": 
6.613768695056513e-06, + "loss": 1.8693, + "step": 38120 + }, + { + "epoch": 4.343071928925338, + "grad_norm": 20.846256256103516, + "learning_rate": 6.602351866651445e-06, + "loss": 2.1169, + "step": 38130 + }, + { + "epoch": 4.344210945953642, + "grad_norm": 22.0129451751709, + "learning_rate": 6.590935038246375e-06, + "loss": 1.9123, + "step": 38140 + }, + { + "epoch": 4.345349962981946, + "grad_norm": 22.14680290222168, + "learning_rate": 6.579518209841307e-06, + "loss": 2.3217, + "step": 38150 + }, + { + "epoch": 4.3464889800102515, + "grad_norm": 17.23741912841797, + "learning_rate": 6.568101381436237e-06, + "loss": 2.3661, + "step": 38160 + }, + { + "epoch": 4.347627997038556, + "grad_norm": 15.138941764831543, + "learning_rate": 6.556684553031169e-06, + "loss": 1.9312, + "step": 38170 + }, + { + "epoch": 4.34876701406686, + "grad_norm": 14.638100624084473, + "learning_rate": 6.545267724626099e-06, + "loss": 2.0386, + "step": 38180 + }, + { + "epoch": 4.3499060310951645, + "grad_norm": 21.914152145385742, + "learning_rate": 6.533850896221031e-06, + "loss": 2.1097, + "step": 38190 + }, + { + "epoch": 4.35104504812347, + "grad_norm": 19.58365249633789, + "learning_rate": 6.522434067815961e-06, + "loss": 2.0855, + "step": 38200 + }, + { + "epoch": 4.352184065151774, + "grad_norm": 20.630306243896484, + "learning_rate": 6.511017239410893e-06, + "loss": 2.4127, + "step": 38210 + }, + { + "epoch": 4.353323082180078, + "grad_norm": 23.601476669311523, + "learning_rate": 6.499600411005823e-06, + "loss": 1.9273, + "step": 38220 + }, + { + "epoch": 4.354462099208384, + "grad_norm": 29.156841278076172, + "learning_rate": 6.488183582600754e-06, + "loss": 2.0433, + "step": 38230 + }, + { + "epoch": 4.355601116236688, + "grad_norm": 18.3179931640625, + "learning_rate": 6.476766754195685e-06, + "loss": 2.139, + "step": 38240 + }, + { + "epoch": 4.356740133264992, + "grad_norm": 18.412797927856445, + "learning_rate": 6.465349925790616e-06, + "loss": 2.0943, + "step": 38250 
+ }, + { + "epoch": 4.3578791502932965, + "grad_norm": 21.92378807067871, + "learning_rate": 6.453933097385546e-06, + "loss": 1.8836, + "step": 38260 + }, + { + "epoch": 4.359018167321602, + "grad_norm": 18.41684913635254, + "learning_rate": 6.442516268980478e-06, + "loss": 2.466, + "step": 38270 + }, + { + "epoch": 4.360157184349906, + "grad_norm": 18.715137481689453, + "learning_rate": 6.431099440575408e-06, + "loss": 1.8527, + "step": 38280 + }, + { + "epoch": 4.36129620137821, + "grad_norm": 43.92091751098633, + "learning_rate": 6.41968261217034e-06, + "loss": 2.0389, + "step": 38290 + }, + { + "epoch": 4.362435218406516, + "grad_norm": 17.217544555664062, + "learning_rate": 6.40826578376527e-06, + "loss": 2.2979, + "step": 38300 + }, + { + "epoch": 4.36357423543482, + "grad_norm": 24.334396362304688, + "learning_rate": 6.396848955360202e-06, + "loss": 1.8752, + "step": 38310 + }, + { + "epoch": 4.364713252463124, + "grad_norm": 38.25008010864258, + "learning_rate": 6.385432126955132e-06, + "loss": 2.144, + "step": 38320 + }, + { + "epoch": 4.365852269491429, + "grad_norm": 28.296932220458984, + "learning_rate": 6.374015298550064e-06, + "loss": 2.1091, + "step": 38330 + }, + { + "epoch": 4.366991286519734, + "grad_norm": 15.157665252685547, + "learning_rate": 6.362598470144994e-06, + "loss": 2.3093, + "step": 38340 + }, + { + "epoch": 4.368130303548038, + "grad_norm": 19.18575096130371, + "learning_rate": 6.351181641739926e-06, + "loss": 1.8464, + "step": 38350 + }, + { + "epoch": 4.369269320576342, + "grad_norm": 20.557147979736328, + "learning_rate": 6.339764813334856e-06, + "loss": 2.1128, + "step": 38360 + }, + { + "epoch": 4.370408337604648, + "grad_norm": 17.51496696472168, + "learning_rate": 6.328347984929788e-06, + "loss": 2.5634, + "step": 38370 + }, + { + "epoch": 4.371547354632952, + "grad_norm": 39.80990982055664, + "learning_rate": 6.316931156524718e-06, + "loss": 2.31, + "step": 38380 + }, + { + "epoch": 4.372686371661256, + "grad_norm": 
29.92764663696289, + "learning_rate": 6.305514328119648e-06, + "loss": 1.8257, + "step": 38390 + }, + { + "epoch": 4.373825388689561, + "grad_norm": 17.890737533569336, + "learning_rate": 6.29409749971458e-06, + "loss": 1.9987, + "step": 38400 + }, + { + "epoch": 4.374964405717866, + "grad_norm": 42.723793029785156, + "learning_rate": 6.28268067130951e-06, + "loss": 1.5566, + "step": 38410 + }, + { + "epoch": 4.37610342274617, + "grad_norm": 20.289682388305664, + "learning_rate": 6.271263842904442e-06, + "loss": 2.295, + "step": 38420 + }, + { + "epoch": 4.3772424397744745, + "grad_norm": 21.105762481689453, + "learning_rate": 6.259847014499372e-06, + "loss": 2.2403, + "step": 38430 + }, + { + "epoch": 4.378381456802779, + "grad_norm": 29.060956954956055, + "learning_rate": 6.248430186094304e-06, + "loss": 1.8739, + "step": 38440 + }, + { + "epoch": 4.379520473831084, + "grad_norm": 22.935686111450195, + "learning_rate": 6.237013357689235e-06, + "loss": 2.3242, + "step": 38450 + }, + { + "epoch": 4.380659490859388, + "grad_norm": 27.833587646484375, + "learning_rate": 6.225596529284165e-06, + "loss": 2.0174, + "step": 38460 + }, + { + "epoch": 4.381798507887693, + "grad_norm": 30.996829986572266, + "learning_rate": 6.214179700879096e-06, + "loss": 2.025, + "step": 38470 + }, + { + "epoch": 4.382937524915998, + "grad_norm": 26.970853805541992, + "learning_rate": 6.202762872474027e-06, + "loss": 2.0856, + "step": 38480 + }, + { + "epoch": 4.384076541944302, + "grad_norm": 17.043004989624023, + "learning_rate": 6.191346044068958e-06, + "loss": 2.3329, + "step": 38490 + }, + { + "epoch": 4.3852155589726065, + "grad_norm": 15.339963912963867, + "learning_rate": 6.179929215663889e-06, + "loss": 2.4131, + "step": 38500 + }, + { + "epoch": 4.386354576000911, + "grad_norm": 19.655431747436523, + "learning_rate": 6.16851238725882e-06, + "loss": 1.9076, + "step": 38510 + }, + { + "epoch": 4.387493593029216, + "grad_norm": 19.927396774291992, + "learning_rate": 
6.157095558853751e-06, + "loss": 2.1454, + "step": 38520 + }, + { + "epoch": 4.38863261005752, + "grad_norm": 34.31972122192383, + "learning_rate": 6.145678730448682e-06, + "loss": 2.4862, + "step": 38530 + }, + { + "epoch": 4.389771627085825, + "grad_norm": 29.554445266723633, + "learning_rate": 6.134261902043613e-06, + "loss": 2.6929, + "step": 38540 + }, + { + "epoch": 4.39091064411413, + "grad_norm": 29.559202194213867, + "learning_rate": 6.122845073638544e-06, + "loss": 2.1787, + "step": 38550 + }, + { + "epoch": 4.392049661142434, + "grad_norm": 24.816953659057617, + "learning_rate": 6.111428245233474e-06, + "loss": 2.1608, + "step": 38560 + }, + { + "epoch": 4.3931886781707385, + "grad_norm": 14.350375175476074, + "learning_rate": 6.100011416828405e-06, + "loss": 2.3058, + "step": 38570 + }, + { + "epoch": 4.394327695199043, + "grad_norm": 20.322608947753906, + "learning_rate": 6.088594588423336e-06, + "loss": 2.1694, + "step": 38580 + }, + { + "epoch": 4.395466712227348, + "grad_norm": 20.23577308654785, + "learning_rate": 6.077177760018267e-06, + "loss": 1.9954, + "step": 38590 + }, + { + "epoch": 4.396605729255652, + "grad_norm": 32.12469482421875, + "learning_rate": 6.065760931613198e-06, + "loss": 2.145, + "step": 38600 + }, + { + "epoch": 4.397744746283957, + "grad_norm": 17.982051849365234, + "learning_rate": 6.054344103208129e-06, + "loss": 2.4328, + "step": 38610 + }, + { + "epoch": 4.398883763312261, + "grad_norm": 20.217981338500977, + "learning_rate": 6.04292727480306e-06, + "loss": 2.381, + "step": 38620 + }, + { + "epoch": 4.400022780340566, + "grad_norm": 27.260299682617188, + "learning_rate": 6.031510446397991e-06, + "loss": 2.041, + "step": 38630 + }, + { + "epoch": 4.401161797368871, + "grad_norm": 42.04597091674805, + "learning_rate": 6.020093617992922e-06, + "loss": 2.0535, + "step": 38640 + }, + { + "epoch": 4.402300814397175, + "grad_norm": 44.164390563964844, + "learning_rate": 6.008676789587853e-06, + "loss": 2.1642, + "step": 38650 + 
}, + { + "epoch": 4.40343983142548, + "grad_norm": 18.396827697753906, + "learning_rate": 5.997259961182784e-06, + "loss": 1.9359, + "step": 38660 + }, + { + "epoch": 4.4045788484537844, + "grad_norm": 37.188194274902344, + "learning_rate": 5.985843132777715e-06, + "loss": 2.3399, + "step": 38670 + }, + { + "epoch": 4.405717865482089, + "grad_norm": 16.267602920532227, + "learning_rate": 5.974426304372646e-06, + "loss": 2.0842, + "step": 38680 + }, + { + "epoch": 4.406856882510393, + "grad_norm": 16.498821258544922, + "learning_rate": 5.963009475967577e-06, + "loss": 2.3394, + "step": 38690 + }, + { + "epoch": 4.407995899538698, + "grad_norm": 16.602087020874023, + "learning_rate": 5.951592647562507e-06, + "loss": 2.1753, + "step": 38700 + }, + { + "epoch": 4.409134916567003, + "grad_norm": 36.543941497802734, + "learning_rate": 5.940175819157438e-06, + "loss": 1.9536, + "step": 38710 + }, + { + "epoch": 4.410273933595307, + "grad_norm": 22.962154388427734, + "learning_rate": 5.928758990752369e-06, + "loss": 2.3809, + "step": 38720 + }, + { + "epoch": 4.411412950623612, + "grad_norm": 22.821556091308594, + "learning_rate": 5.9173421623473e-06, + "loss": 2.2538, + "step": 38730 + }, + { + "epoch": 4.4125519676519165, + "grad_norm": 21.451107025146484, + "learning_rate": 5.905925333942231e-06, + "loss": 1.9444, + "step": 38740 + }, + { + "epoch": 4.413690984680221, + "grad_norm": 17.478309631347656, + "learning_rate": 5.894508505537162e-06, + "loss": 2.0681, + "step": 38750 + }, + { + "epoch": 4.414830001708525, + "grad_norm": 29.635379791259766, + "learning_rate": 5.883091677132093e-06, + "loss": 2.1441, + "step": 38760 + }, + { + "epoch": 4.41596901873683, + "grad_norm": 30.223285675048828, + "learning_rate": 5.871674848727024e-06, + "loss": 2.1695, + "step": 38770 + }, + { + "epoch": 4.417108035765135, + "grad_norm": 22.78261375427246, + "learning_rate": 5.860258020321955e-06, + "loss": 2.3462, + "step": 38780 + }, + { + "epoch": 4.418247052793439, + "grad_norm": 
23.909183502197266, + "learning_rate": 5.848841191916886e-06, + "loss": 1.9011, + "step": 38790 + }, + { + "epoch": 4.419386069821744, + "grad_norm": 19.285947799682617, + "learning_rate": 5.837424363511817e-06, + "loss": 2.24, + "step": 38800 + }, + { + "epoch": 4.4205250868500485, + "grad_norm": 17.21255111694336, + "learning_rate": 5.826007535106748e-06, + "loss": 1.9518, + "step": 38810 + }, + { + "epoch": 4.421664103878353, + "grad_norm": 26.5579776763916, + "learning_rate": 5.814590706701679e-06, + "loss": 1.8989, + "step": 38820 + }, + { + "epoch": 4.422803120906657, + "grad_norm": 40.50425720214844, + "learning_rate": 5.80317387829661e-06, + "loss": 2.0855, + "step": 38830 + }, + { + "epoch": 4.423942137934962, + "grad_norm": 22.949995040893555, + "learning_rate": 5.791757049891541e-06, + "loss": 2.0137, + "step": 38840 + }, + { + "epoch": 4.425081154963267, + "grad_norm": 87.98580169677734, + "learning_rate": 5.780340221486472e-06, + "loss": 1.9195, + "step": 38850 + }, + { + "epoch": 4.426220171991571, + "grad_norm": 47.75716018676758, + "learning_rate": 5.768923393081403e-06, + "loss": 2.1751, + "step": 38860 + }, + { + "epoch": 4.427359189019876, + "grad_norm": 24.485389709472656, + "learning_rate": 5.757506564676334e-06, + "loss": 1.9798, + "step": 38870 + }, + { + "epoch": 4.428498206048181, + "grad_norm": 17.36442756652832, + "learning_rate": 5.746089736271265e-06, + "loss": 2.2103, + "step": 38880 + }, + { + "epoch": 4.429637223076485, + "grad_norm": 22.69402313232422, + "learning_rate": 5.734672907866196e-06, + "loss": 2.167, + "step": 38890 + }, + { + "epoch": 4.430776240104789, + "grad_norm": 29.086217880249023, + "learning_rate": 5.723256079461126e-06, + "loss": 2.316, + "step": 38900 + }, + { + "epoch": 4.431915257133094, + "grad_norm": 26.8614559173584, + "learning_rate": 5.711839251056057e-06, + "loss": 2.0431, + "step": 38910 + }, + { + "epoch": 4.433054274161399, + "grad_norm": 41.009803771972656, + "learning_rate": 5.700422422650988e-06, + 
"loss": 1.9971, + "step": 38920 + }, + { + "epoch": 4.434193291189703, + "grad_norm": 23.18709373474121, + "learning_rate": 5.689005594245918e-06, + "loss": 2.2611, + "step": 38930 + }, + { + "epoch": 4.435332308218008, + "grad_norm": 24.28513526916504, + "learning_rate": 5.677588765840849e-06, + "loss": 2.0514, + "step": 38940 + }, + { + "epoch": 4.436471325246313, + "grad_norm": 25.079357147216797, + "learning_rate": 5.66617193743578e-06, + "loss": 1.6822, + "step": 38950 + }, + { + "epoch": 4.437610342274617, + "grad_norm": 25.07917594909668, + "learning_rate": 5.654755109030711e-06, + "loss": 1.9692, + "step": 38960 + }, + { + "epoch": 4.438749359302921, + "grad_norm": 21.096786499023438, + "learning_rate": 5.643338280625642e-06, + "loss": 2.0347, + "step": 38970 + }, + { + "epoch": 4.4398883763312265, + "grad_norm": 20.32276725769043, + "learning_rate": 5.631921452220573e-06, + "loss": 2.1265, + "step": 38980 + }, + { + "epoch": 4.441027393359531, + "grad_norm": 18.574169158935547, + "learning_rate": 5.620504623815504e-06, + "loss": 2.122, + "step": 38990 + }, + { + "epoch": 4.442166410387835, + "grad_norm": 16.053590774536133, + "learning_rate": 5.609087795410435e-06, + "loss": 2.316, + "step": 39000 + }, + { + "epoch": 4.44330542741614, + "grad_norm": 20.908010482788086, + "learning_rate": 5.597670967005366e-06, + "loss": 1.8513, + "step": 39010 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 33.339149475097656, + "learning_rate": 5.586254138600297e-06, + "loss": 2.1788, + "step": 39020 + }, + { + "epoch": 4.445583461472749, + "grad_norm": 34.222267150878906, + "learning_rate": 5.574837310195228e-06, + "loss": 1.8465, + "step": 39030 + }, + { + "epoch": 4.446722478501053, + "grad_norm": 16.31046485900879, + "learning_rate": 5.563420481790159e-06, + "loss": 2.1839, + "step": 39040 + }, + { + "epoch": 4.4478614955293585, + "grad_norm": 26.011871337890625, + "learning_rate": 5.55200365338509e-06, + "loss": 2.0597, + "step": 39050 + }, + { + "epoch": 
4.449000512557663, + "grad_norm": 20.108978271484375, + "learning_rate": 5.540586824980021e-06, + "loss": 2.3759, + "step": 39060 + }, + { + "epoch": 4.450139529585967, + "grad_norm": 27.879301071166992, + "learning_rate": 5.529169996574952e-06, + "loss": 2.151, + "step": 39070 + }, + { + "epoch": 4.4512785466142715, + "grad_norm": 20.855777740478516, + "learning_rate": 5.517753168169883e-06, + "loss": 1.9082, + "step": 39080 + }, + { + "epoch": 4.452417563642577, + "grad_norm": 28.832712173461914, + "learning_rate": 5.506336339764814e-06, + "loss": 2.2352, + "step": 39090 + }, + { + "epoch": 4.453556580670881, + "grad_norm": 23.974945068359375, + "learning_rate": 5.494919511359745e-06, + "loss": 2.3737, + "step": 39100 + }, + { + "epoch": 4.454695597699185, + "grad_norm": 36.511043548583984, + "learning_rate": 5.483502682954676e-06, + "loss": 1.8927, + "step": 39110 + }, + { + "epoch": 4.4558346147274905, + "grad_norm": 18.519685745239258, + "learning_rate": 5.472085854549607e-06, + "loss": 2.2206, + "step": 39120 + }, + { + "epoch": 4.456973631755795, + "grad_norm": 29.08961296081543, + "learning_rate": 5.460669026144537e-06, + "loss": 2.1868, + "step": 39130 + }, + { + "epoch": 4.458112648784099, + "grad_norm": 24.380552291870117, + "learning_rate": 5.449252197739468e-06, + "loss": 1.9501, + "step": 39140 + }, + { + "epoch": 4.4592516658124035, + "grad_norm": 22.17682456970215, + "learning_rate": 5.437835369334399e-06, + "loss": 1.7495, + "step": 39150 + }, + { + "epoch": 4.460390682840709, + "grad_norm": 33.453216552734375, + "learning_rate": 5.42641854092933e-06, + "loss": 1.9987, + "step": 39160 + }, + { + "epoch": 4.461529699869013, + "grad_norm": 19.65850257873535, + "learning_rate": 5.415001712524261e-06, + "loss": 1.96, + "step": 39170 + }, + { + "epoch": 4.462668716897317, + "grad_norm": 21.389358520507812, + "learning_rate": 5.403584884119192e-06, + "loss": 1.9344, + "step": 39180 + }, + { + "epoch": 4.463807733925623, + "grad_norm": 39.23703384399414, 
+ "learning_rate": 5.392168055714123e-06, + "loss": 2.0238, + "step": 39190 + }, + { + "epoch": 4.464946750953927, + "grad_norm": 26.289182662963867, + "learning_rate": 5.380751227309054e-06, + "loss": 1.9715, + "step": 39200 + }, + { + "epoch": 4.466085767982231, + "grad_norm": 17.090171813964844, + "learning_rate": 5.369334398903985e-06, + "loss": 1.9328, + "step": 39210 + }, + { + "epoch": 4.4672247850105355, + "grad_norm": 18.325944900512695, + "learning_rate": 5.357917570498916e-06, + "loss": 1.9771, + "step": 39220 + }, + { + "epoch": 4.468363802038841, + "grad_norm": 24.46843910217285, + "learning_rate": 5.346500742093847e-06, + "loss": 2.1634, + "step": 39230 + }, + { + "epoch": 4.469502819067145, + "grad_norm": 14.263288497924805, + "learning_rate": 5.335083913688778e-06, + "loss": 2.1152, + "step": 39240 + }, + { + "epoch": 4.470641836095449, + "grad_norm": 21.600927352905273, + "learning_rate": 5.323667085283708e-06, + "loss": 2.0534, + "step": 39250 + }, + { + "epoch": 4.471780853123754, + "grad_norm": 21.42341423034668, + "learning_rate": 5.312250256878639e-06, + "loss": 1.9169, + "step": 39260 + }, + { + "epoch": 4.472919870152059, + "grad_norm": 22.211706161499023, + "learning_rate": 5.30083342847357e-06, + "loss": 2.1014, + "step": 39270 + }, + { + "epoch": 4.474058887180363, + "grad_norm": 23.015657424926758, + "learning_rate": 5.289416600068501e-06, + "loss": 1.9975, + "step": 39280 + }, + { + "epoch": 4.475197904208668, + "grad_norm": 16.628925323486328, + "learning_rate": 5.277999771663432e-06, + "loss": 2.3775, + "step": 39290 + }, + { + "epoch": 4.476336921236973, + "grad_norm": 22.62873649597168, + "learning_rate": 5.266582943258363e-06, + "loss": 1.9998, + "step": 39300 + }, + { + "epoch": 4.477475938265277, + "grad_norm": 16.74282455444336, + "learning_rate": 5.255166114853294e-06, + "loss": 1.8707, + "step": 39310 + }, + { + "epoch": 4.4786149552935814, + "grad_norm": 48.54405975341797, + "learning_rate": 5.243749286448225e-06, + "loss": 
1.8959, + "step": 39320 + }, + { + "epoch": 4.479753972321886, + "grad_norm": 20.61815643310547, + "learning_rate": 5.232332458043156e-06, + "loss": 2.3803, + "step": 39330 + }, + { + "epoch": 4.480892989350191, + "grad_norm": 19.385465621948242, + "learning_rate": 5.220915629638087e-06, + "loss": 2.1338, + "step": 39340 + }, + { + "epoch": 4.482032006378495, + "grad_norm": 23.618255615234375, + "learning_rate": 5.209498801233018e-06, + "loss": 2.2963, + "step": 39350 + }, + { + "epoch": 4.4831710234068, + "grad_norm": 31.286651611328125, + "learning_rate": 5.198081972827949e-06, + "loss": 2.3401, + "step": 39360 + }, + { + "epoch": 4.484310040435105, + "grad_norm": 39.6640739440918, + "learning_rate": 5.186665144422879e-06, + "loss": 1.9781, + "step": 39370 + }, + { + "epoch": 4.485449057463409, + "grad_norm": 16.649465560913086, + "learning_rate": 5.17524831601781e-06, + "loss": 2.3547, + "step": 39380 + }, + { + "epoch": 4.4865880744917135, + "grad_norm": 21.966222763061523, + "learning_rate": 5.163831487612741e-06, + "loss": 2.0539, + "step": 39390 + }, + { + "epoch": 4.487727091520018, + "grad_norm": 22.4364013671875, + "learning_rate": 5.152414659207672e-06, + "loss": 2.0694, + "step": 39400 + }, + { + "epoch": 4.488866108548323, + "grad_norm": 38.82316207885742, + "learning_rate": 5.140997830802603e-06, + "loss": 2.2301, + "step": 39410 + }, + { + "epoch": 4.490005125576627, + "grad_norm": 29.10707664489746, + "learning_rate": 5.129581002397534e-06, + "loss": 2.1342, + "step": 39420 + }, + { + "epoch": 4.491144142604932, + "grad_norm": 26.438182830810547, + "learning_rate": 5.118164173992465e-06, + "loss": 2.1679, + "step": 39430 + }, + { + "epoch": 4.492283159633237, + "grad_norm": 18.471004486083984, + "learning_rate": 5.106747345587396e-06, + "loss": 2.4843, + "step": 39440 + }, + { + "epoch": 4.493422176661541, + "grad_norm": 25.324321746826172, + "learning_rate": 5.095330517182327e-06, + "loss": 2.1815, + "step": 39450 + }, + { + "epoch": 
4.4945611936898455, + "grad_norm": 14.240032196044922, + "learning_rate": 5.083913688777258e-06, + "loss": 2.2428, + "step": 39460 + }, + { + "epoch": 4.49570021071815, + "grad_norm": 26.3165225982666, + "learning_rate": 5.072496860372189e-06, + "loss": 2.1048, + "step": 39470 + }, + { + "epoch": 4.496839227746455, + "grad_norm": 17.577499389648438, + "learning_rate": 5.06108003196712e-06, + "loss": 2.3266, + "step": 39480 + }, + { + "epoch": 4.497978244774759, + "grad_norm": 29.622272491455078, + "learning_rate": 5.049663203562051e-06, + "loss": 2.2516, + "step": 39490 + }, + { + "epoch": 4.499117261803064, + "grad_norm": 23.893728256225586, + "learning_rate": 5.038246375156982e-06, + "loss": 2.2597, + "step": 39500 + }, + { + "epoch": 4.500256278831369, + "grad_norm": 19.502521514892578, + "learning_rate": 5.026829546751913e-06, + "loss": 2.4378, + "step": 39510 + }, + { + "epoch": 4.501395295859673, + "grad_norm": 31.023860931396484, + "learning_rate": 5.015412718346844e-06, + "loss": 2.1901, + "step": 39520 + }, + { + "epoch": 4.502534312887978, + "grad_norm": 26.63571548461914, + "learning_rate": 5.003995889941775e-06, + "loss": 2.0757, + "step": 39530 + }, + { + "epoch": 4.503673329916282, + "grad_norm": 25.747509002685547, + "learning_rate": 4.992579061536706e-06, + "loss": 2.2147, + "step": 39540 + }, + { + "epoch": 4.504812346944587, + "grad_norm": 26.80502700805664, + "learning_rate": 4.981162233131637e-06, + "loss": 2.2475, + "step": 39550 + }, + { + "epoch": 4.505951363972891, + "grad_norm": 20.84671974182129, + "learning_rate": 4.969745404726568e-06, + "loss": 2.2552, + "step": 39560 + }, + { + "epoch": 4.507090381001196, + "grad_norm": 22.3966121673584, + "learning_rate": 4.958328576321498e-06, + "loss": 1.9651, + "step": 39570 + }, + { + "epoch": 4.508229398029501, + "grad_norm": 22.270471572875977, + "learning_rate": 4.946911747916429e-06, + "loss": 2.1574, + "step": 39580 + }, + { + "epoch": 4.509368415057805, + "grad_norm": 21.653453826904297, + 
"learning_rate": 4.93549491951136e-06, + "loss": 2.1189, + "step": 39590 + }, + { + "epoch": 4.51050743208611, + "grad_norm": 19.522645950317383, + "learning_rate": 4.92407809110629e-06, + "loss": 2.1881, + "step": 39600 + }, + { + "epoch": 4.511646449114414, + "grad_norm": 23.786989212036133, + "learning_rate": 4.912661262701221e-06, + "loss": 1.7446, + "step": 39610 + }, + { + "epoch": 4.512785466142719, + "grad_norm": 16.9265079498291, + "learning_rate": 4.901244434296152e-06, + "loss": 2.0346, + "step": 39620 + }, + { + "epoch": 4.5139244831710235, + "grad_norm": 21.18515396118164, + "learning_rate": 4.889827605891083e-06, + "loss": 2.1891, + "step": 39630 + }, + { + "epoch": 4.515063500199328, + "grad_norm": 25.072799682617188, + "learning_rate": 4.878410777486014e-06, + "loss": 1.8711, + "step": 39640 + }, + { + "epoch": 4.516202517227633, + "grad_norm": 96.05767059326172, + "learning_rate": 4.866993949080945e-06, + "loss": 1.9648, + "step": 39650 + }, + { + "epoch": 4.517341534255937, + "grad_norm": 18.524805068969727, + "learning_rate": 4.855577120675876e-06, + "loss": 1.9572, + "step": 39660 + }, + { + "epoch": 4.518480551284242, + "grad_norm": 33.299713134765625, + "learning_rate": 4.844160292270807e-06, + "loss": 2.3018, + "step": 39670 + }, + { + "epoch": 4.519619568312546, + "grad_norm": 19.663288116455078, + "learning_rate": 4.832743463865738e-06, + "loss": 2.2142, + "step": 39680 + }, + { + "epoch": 4.520758585340851, + "grad_norm": 24.79319953918457, + "learning_rate": 4.821326635460669e-06, + "loss": 2.4767, + "step": 39690 + }, + { + "epoch": 4.5218976023691555, + "grad_norm": 43.95841979980469, + "learning_rate": 4.8099098070556e-06, + "loss": 2.5435, + "step": 39700 + }, + { + "epoch": 4.52303661939746, + "grad_norm": 22.800437927246094, + "learning_rate": 4.798492978650531e-06, + "loss": 1.9495, + "step": 39710 + }, + { + "epoch": 4.524175636425765, + "grad_norm": 19.733867645263672, + "learning_rate": 4.787076150245462e-06, + "loss": 1.9048, + 
"step": 39720 + }, + { + "epoch": 4.525314653454069, + "grad_norm": 19.744220733642578, + "learning_rate": 4.775659321840393e-06, + "loss": 1.9691, + "step": 39730 + }, + { + "epoch": 4.526453670482374, + "grad_norm": 20.982288360595703, + "learning_rate": 4.764242493435324e-06, + "loss": 2.0679, + "step": 39740 + }, + { + "epoch": 4.527592687510678, + "grad_norm": 18.651309967041016, + "learning_rate": 4.752825665030255e-06, + "loss": 1.9237, + "step": 39750 + }, + { + "epoch": 4.528731704538983, + "grad_norm": 24.48828887939453, + "learning_rate": 4.741408836625186e-06, + "loss": 2.1189, + "step": 39760 + }, + { + "epoch": 4.5298707215672875, + "grad_norm": 26.69980812072754, + "learning_rate": 4.729992008220117e-06, + "loss": 2.1254, + "step": 39770 + }, + { + "epoch": 4.531009738595592, + "grad_norm": 29.579418182373047, + "learning_rate": 4.718575179815048e-06, + "loss": 1.9043, + "step": 39780 + }, + { + "epoch": 4.532148755623897, + "grad_norm": 20.64593505859375, + "learning_rate": 4.707158351409979e-06, + "loss": 1.7404, + "step": 39790 + }, + { + "epoch": 4.533287772652201, + "grad_norm": 37.285213470458984, + "learning_rate": 4.695741523004909e-06, + "loss": 1.9959, + "step": 39800 + }, + { + "epoch": 4.534426789680506, + "grad_norm": 23.39985466003418, + "learning_rate": 4.68432469459984e-06, + "loss": 2.39, + "step": 39810 + }, + { + "epoch": 4.53556580670881, + "grad_norm": 21.869203567504883, + "learning_rate": 4.672907866194771e-06, + "loss": 1.9239, + "step": 39820 + }, + { + "epoch": 4.536704823737115, + "grad_norm": 23.45218849182129, + "learning_rate": 4.661491037789702e-06, + "loss": 2.1167, + "step": 39830 + }, + { + "epoch": 4.53784384076542, + "grad_norm": 17.65717124938965, + "learning_rate": 4.650074209384633e-06, + "loss": 2.3905, + "step": 39840 + }, + { + "epoch": 4.538982857793724, + "grad_norm": 37.165042877197266, + "learning_rate": 4.638657380979564e-06, + "loss": 2.1858, + "step": 39850 + }, + { + "epoch": 4.540121874822028, + 
"grad_norm": 21.526466369628906, + "learning_rate": 4.627240552574495e-06, + "loss": 2.0824, + "step": 39860 + }, + { + "epoch": 4.541260891850333, + "grad_norm": 22.57565689086914, + "learning_rate": 4.615823724169426e-06, + "loss": 2.0321, + "step": 39870 + }, + { + "epoch": 4.542399908878638, + "grad_norm": 23.4373779296875, + "learning_rate": 4.604406895764357e-06, + "loss": 2.0746, + "step": 39880 + }, + { + "epoch": 4.543538925906942, + "grad_norm": 42.485294342041016, + "learning_rate": 4.592990067359288e-06, + "loss": 2.0481, + "step": 39890 + }, + { + "epoch": 4.544677942935246, + "grad_norm": 22.70050048828125, + "learning_rate": 4.581573238954219e-06, + "loss": 2.2605, + "step": 39900 + }, + { + "epoch": 4.545816959963552, + "grad_norm": 20.14665985107422, + "learning_rate": 4.57015641054915e-06, + "loss": 2.5017, + "step": 39910 + }, + { + "epoch": 4.546955976991856, + "grad_norm": 21.76075553894043, + "learning_rate": 4.558739582144081e-06, + "loss": 2.5991, + "step": 39920 + }, + { + "epoch": 4.54809499402016, + "grad_norm": 22.6814022064209, + "learning_rate": 4.547322753739011e-06, + "loss": 2.0445, + "step": 39930 + }, + { + "epoch": 4.5492340110484655, + "grad_norm": 30.271133422851562, + "learning_rate": 4.535905925333942e-06, + "loss": 2.19, + "step": 39940 + }, + { + "epoch": 4.55037302807677, + "grad_norm": 16.300058364868164, + "learning_rate": 4.524489096928873e-06, + "loss": 1.9438, + "step": 39950 + }, + { + "epoch": 4.551512045105074, + "grad_norm": 48.02622604370117, + "learning_rate": 4.513072268523804e-06, + "loss": 1.894, + "step": 39960 + }, + { + "epoch": 4.5526510621333784, + "grad_norm": 34.9205207824707, + "learning_rate": 4.501655440118735e-06, + "loss": 2.188, + "step": 39970 + }, + { + "epoch": 4.553790079161684, + "grad_norm": 23.83540153503418, + "learning_rate": 4.490238611713666e-06, + "loss": 1.9468, + "step": 39980 + }, + { + "epoch": 4.554929096189988, + "grad_norm": 20.996261596679688, + "learning_rate": 
4.478821783308597e-06, + "loss": 1.9533, + "step": 39990 + }, + { + "epoch": 4.556068113218292, + "grad_norm": 36.46937561035156, + "learning_rate": 4.467404954903528e-06, + "loss": 1.9744, + "step": 40000 + }, + { + "epoch": 4.556068113218292, + "eval_loss": 7.430954933166504, + "eval_runtime": 11.5765, + "eval_samples_per_second": 1.296, + "eval_steps_per_second": 0.173, + "step": 40000 + }, + { + "epoch": 4.5572071302465975, + "grad_norm": 20.28118896484375, + "learning_rate": 4.455988126498459e-06, + "loss": 2.1172, + "step": 40010 + }, + { + "epoch": 4.558346147274902, + "grad_norm": 21.155750274658203, + "learning_rate": 4.44457129809339e-06, + "loss": 2.1316, + "step": 40020 + }, + { + "epoch": 4.559485164303206, + "grad_norm": 34.09183883666992, + "learning_rate": 4.433154469688321e-06, + "loss": 1.937, + "step": 40030 + }, + { + "epoch": 4.5606241813315105, + "grad_norm": 36.812095642089844, + "learning_rate": 4.421737641283251e-06, + "loss": 1.9069, + "step": 40040 + }, + { + "epoch": 4.561763198359816, + "grad_norm": 23.156200408935547, + "learning_rate": 4.410320812878182e-06, + "loss": 2.0837, + "step": 40050 + }, + { + "epoch": 4.56290221538812, + "grad_norm": 14.93704891204834, + "learning_rate": 4.398903984473113e-06, + "loss": 2.2254, + "step": 40060 + }, + { + "epoch": 4.564041232416424, + "grad_norm": 18.46076202392578, + "learning_rate": 4.387487156068044e-06, + "loss": 2.3518, + "step": 40070 + }, + { + "epoch": 4.5651802494447296, + "grad_norm": 16.568811416625977, + "learning_rate": 4.376070327662975e-06, + "loss": 2.3556, + "step": 40080 + }, + { + "epoch": 4.566319266473034, + "grad_norm": 23.833480834960938, + "learning_rate": 4.364653499257906e-06, + "loss": 2.0857, + "step": 40090 + }, + { + "epoch": 4.567458283501338, + "grad_norm": 35.65871047973633, + "learning_rate": 4.353236670852837e-06, + "loss": 2.2537, + "step": 40100 + }, + { + "epoch": 4.5685973005296425, + "grad_norm": 20.436752319335938, + "learning_rate": 
4.341819842447768e-06, + "loss": 2.4209, + "step": 40110 + }, + { + "epoch": 4.569736317557948, + "grad_norm": 15.269256591796875, + "learning_rate": 4.330403014042699e-06, + "loss": 2.1318, + "step": 40120 + }, + { + "epoch": 4.570875334586252, + "grad_norm": 16.96086311340332, + "learning_rate": 4.31898618563763e-06, + "loss": 2.0403, + "step": 40130 + }, + { + "epoch": 4.572014351614556, + "grad_norm": 29.454742431640625, + "learning_rate": 4.307569357232561e-06, + "loss": 2.0296, + "step": 40140 + }, + { + "epoch": 4.573153368642862, + "grad_norm": 24.629676818847656, + "learning_rate": 4.296152528827492e-06, + "loss": 2.0486, + "step": 40150 + }, + { + "epoch": 4.574292385671166, + "grad_norm": 19.61836051940918, + "learning_rate": 4.284735700422423e-06, + "loss": 2.0452, + "step": 40160 + }, + { + "epoch": 4.57543140269947, + "grad_norm": 17.39374542236328, + "learning_rate": 4.273318872017354e-06, + "loss": 2.3243, + "step": 40170 + }, + { + "epoch": 4.576570419727775, + "grad_norm": 17.936925888061523, + "learning_rate": 4.261902043612285e-06, + "loss": 1.9864, + "step": 40180 + }, + { + "epoch": 4.57770943675608, + "grad_norm": 22.40192985534668, + "learning_rate": 4.250485215207216e-06, + "loss": 2.1939, + "step": 40190 + }, + { + "epoch": 4.578848453784384, + "grad_norm": 26.560382843017578, + "learning_rate": 4.239068386802147e-06, + "loss": 1.9444, + "step": 40200 + }, + { + "epoch": 4.579987470812688, + "grad_norm": 42.311832427978516, + "learning_rate": 4.227651558397078e-06, + "loss": 1.9118, + "step": 40210 + }, + { + "epoch": 4.581126487840994, + "grad_norm": 25.300073623657227, + "learning_rate": 4.216234729992009e-06, + "loss": 2.0767, + "step": 40220 + }, + { + "epoch": 4.582265504869298, + "grad_norm": 21.18925666809082, + "learning_rate": 4.204817901586939e-06, + "loss": 1.9787, + "step": 40230 + }, + { + "epoch": 4.583404521897602, + "grad_norm": 22.46856689453125, + "learning_rate": 4.19340107318187e-06, + "loss": 2.285, + "step": 40240 + 
}, + { + "epoch": 4.584543538925907, + "grad_norm": 33.84992980957031, + "learning_rate": 4.181984244776801e-06, + "loss": 2.0946, + "step": 40250 + }, + { + "epoch": 4.585682555954212, + "grad_norm": 18.506057739257812, + "learning_rate": 4.170567416371732e-06, + "loss": 2.0465, + "step": 40260 + }, + { + "epoch": 4.586821572982516, + "grad_norm": 25.495540618896484, + "learning_rate": 4.159150587966663e-06, + "loss": 1.8954, + "step": 40270 + }, + { + "epoch": 4.5879605900108205, + "grad_norm": 17.841520309448242, + "learning_rate": 4.147733759561593e-06, + "loss": 2.3456, + "step": 40280 + }, + { + "epoch": 4.589099607039126, + "grad_norm": 27.994001388549805, + "learning_rate": 4.136316931156524e-06, + "loss": 1.9023, + "step": 40290 + }, + { + "epoch": 4.59023862406743, + "grad_norm": 19.02427101135254, + "learning_rate": 4.124900102751455e-06, + "loss": 2.3646, + "step": 40300 + }, + { + "epoch": 4.591377641095734, + "grad_norm": 26.314666748046875, + "learning_rate": 4.113483274346386e-06, + "loss": 2.0795, + "step": 40310 + }, + { + "epoch": 4.592516658124039, + "grad_norm": 16.474031448364258, + "learning_rate": 4.102066445941317e-06, + "loss": 2.2428, + "step": 40320 + }, + { + "epoch": 4.593655675152344, + "grad_norm": 21.25417137145996, + "learning_rate": 4.090649617536248e-06, + "loss": 1.7103, + "step": 40330 + }, + { + "epoch": 4.594794692180648, + "grad_norm": 84.85806274414062, + "learning_rate": 4.079232789131179e-06, + "loss": 2.3072, + "step": 40340 + }, + { + "epoch": 4.5959337092089525, + "grad_norm": 25.656068801879883, + "learning_rate": 4.06781596072611e-06, + "loss": 1.8255, + "step": 40350 + }, + { + "epoch": 4.597072726237258, + "grad_norm": 26.108144760131836, + "learning_rate": 4.056399132321041e-06, + "loss": 2.3118, + "step": 40360 + }, + { + "epoch": 4.598211743265562, + "grad_norm": 15.46116828918457, + "learning_rate": 4.044982303915972e-06, + "loss": 2.4082, + "step": 40370 + }, + { + "epoch": 4.599350760293866, + "grad_norm": 
25.873424530029297, + "learning_rate": 4.033565475510903e-06, + "loss": 1.9807, + "step": 40380 + }, + { + "epoch": 4.600489777322171, + "grad_norm": 42.28086471557617, + "learning_rate": 4.022148647105834e-06, + "loss": 2.0327, + "step": 40390 + }, + { + "epoch": 4.601628794350476, + "grad_norm": 24.92437744140625, + "learning_rate": 4.010731818700765e-06, + "loss": 2.2466, + "step": 40400 + }, + { + "epoch": 4.60276781137878, + "grad_norm": 35.483253479003906, + "learning_rate": 3.999314990295696e-06, + "loss": 2.2527, + "step": 40410 + }, + { + "epoch": 4.6039068284070845, + "grad_norm": 38.308040618896484, + "learning_rate": 3.987898161890627e-06, + "loss": 1.9487, + "step": 40420 + }, + { + "epoch": 4.60504584543539, + "grad_norm": 34.87318801879883, + "learning_rate": 3.976481333485558e-06, + "loss": 2.0648, + "step": 40430 + }, + { + "epoch": 4.606184862463694, + "grad_norm": 23.286548614501953, + "learning_rate": 3.965064505080489e-06, + "loss": 2.2833, + "step": 40440 + }, + { + "epoch": 4.607323879491998, + "grad_norm": 26.81108856201172, + "learning_rate": 3.95364767667542e-06, + "loss": 1.922, + "step": 40450 + }, + { + "epoch": 4.608462896520303, + "grad_norm": 14.274992942810059, + "learning_rate": 3.942230848270351e-06, + "loss": 2.4131, + "step": 40460 + }, + { + "epoch": 4.609601913548608, + "grad_norm": 22.713680267333984, + "learning_rate": 3.930814019865281e-06, + "loss": 2.2478, + "step": 40470 + }, + { + "epoch": 4.610740930576912, + "grad_norm": 29.16513442993164, + "learning_rate": 3.919397191460212e-06, + "loss": 2.2684, + "step": 40480 + }, + { + "epoch": 4.611879947605217, + "grad_norm": 19.15232276916504, + "learning_rate": 3.907980363055143e-06, + "loss": 2.2575, + "step": 40490 + }, + { + "epoch": 4.613018964633521, + "grad_norm": 20.6922607421875, + "learning_rate": 3.896563534650074e-06, + "loss": 2.1838, + "step": 40500 + }, + { + "epoch": 4.614157981661826, + "grad_norm": 21.437711715698242, + "learning_rate": 
3.885146706245005e-06, + "loss": 2.1168, + "step": 40510 + }, + { + "epoch": 4.61529699869013, + "grad_norm": 22.834095001220703, + "learning_rate": 3.873729877839936e-06, + "loss": 1.8947, + "step": 40520 + }, + { + "epoch": 4.616436015718435, + "grad_norm": 28.153804779052734, + "learning_rate": 3.862313049434867e-06, + "loss": 1.9246, + "step": 40530 + }, + { + "epoch": 4.617575032746739, + "grad_norm": 22.463167190551758, + "learning_rate": 3.850896221029798e-06, + "loss": 1.8048, + "step": 40540 + }, + { + "epoch": 4.618714049775044, + "grad_norm": 37.0423698425293, + "learning_rate": 3.839479392624729e-06, + "loss": 2.1945, + "step": 40550 + }, + { + "epoch": 4.619853066803349, + "grad_norm": 23.73196029663086, + "learning_rate": 3.82806256421966e-06, + "loss": 1.9045, + "step": 40560 + }, + { + "epoch": 4.620992083831653, + "grad_norm": 65.1719970703125, + "learning_rate": 3.816645735814591e-06, + "loss": 2.0436, + "step": 40570 + }, + { + "epoch": 4.622131100859958, + "grad_norm": 19.48525047302246, + "learning_rate": 3.805228907409522e-06, + "loss": 2.1875, + "step": 40580 + }, + { + "epoch": 4.6232701178882625, + "grad_norm": 19.362098693847656, + "learning_rate": 3.793812079004453e-06, + "loss": 2.5043, + "step": 40590 + }, + { + "epoch": 4.624409134916567, + "grad_norm": 19.829975128173828, + "learning_rate": 3.782395250599384e-06, + "loss": 1.8309, + "step": 40600 + }, + { + "epoch": 4.625548151944871, + "grad_norm": 21.545780181884766, + "learning_rate": 3.770978422194315e-06, + "loss": 2.4094, + "step": 40610 + }, + { + "epoch": 4.626687168973176, + "grad_norm": 26.796812057495117, + "learning_rate": 3.7595615937892453e-06, + "loss": 1.9475, + "step": 40620 + }, + { + "epoch": 4.627826186001481, + "grad_norm": 15.550358772277832, + "learning_rate": 3.7481447653841763e-06, + "loss": 2.1046, + "step": 40630 + }, + { + "epoch": 4.628965203029785, + "grad_norm": 17.344297409057617, + "learning_rate": 3.7367279369791073e-06, + "loss": 1.9718, + "step": 
40640 + }, + { + "epoch": 4.63010422005809, + "grad_norm": 29.06273651123047, + "learning_rate": 3.725311108574038e-06, + "loss": 2.0265, + "step": 40650 + }, + { + "epoch": 4.6312432370863945, + "grad_norm": 31.21932029724121, + "learning_rate": 3.713894280168969e-06, + "loss": 2.0185, + "step": 40660 + }, + { + "epoch": 4.632382254114699, + "grad_norm": 21.523311614990234, + "learning_rate": 3.7024774517639e-06, + "loss": 2.0155, + "step": 40670 + }, + { + "epoch": 4.633521271143003, + "grad_norm": 74.56355285644531, + "learning_rate": 3.691060623358831e-06, + "loss": 2.1667, + "step": 40680 + }, + { + "epoch": 4.634660288171308, + "grad_norm": 14.338943481445312, + "learning_rate": 3.6796437949537618e-06, + "loss": 2.4834, + "step": 40690 + }, + { + "epoch": 4.635799305199613, + "grad_norm": 17.994932174682617, + "learning_rate": 3.6682269665486928e-06, + "loss": 2.1538, + "step": 40700 + }, + { + "epoch": 4.636938322227917, + "grad_norm": 21.936603546142578, + "learning_rate": 3.6568101381436238e-06, + "loss": 2.1427, + "step": 40710 + }, + { + "epoch": 4.638077339256222, + "grad_norm": 21.66320037841797, + "learning_rate": 3.6453933097385548e-06, + "loss": 2.1901, + "step": 40720 + }, + { + "epoch": 4.6392163562845266, + "grad_norm": 28.3718204498291, + "learning_rate": 3.6339764813334857e-06, + "loss": 2.5934, + "step": 40730 + }, + { + "epoch": 4.640355373312831, + "grad_norm": 22.614913940429688, + "learning_rate": 3.6225596529284167e-06, + "loss": 1.8701, + "step": 40740 + }, + { + "epoch": 4.641494390341135, + "grad_norm": 28.130701065063477, + "learning_rate": 3.6111428245233473e-06, + "loss": 2.3998, + "step": 40750 + }, + { + "epoch": 4.64263340736944, + "grad_norm": 17.29996109008789, + "learning_rate": 3.5997259961182783e-06, + "loss": 2.3135, + "step": 40760 + }, + { + "epoch": 4.643772424397745, + "grad_norm": 22.761098861694336, + "learning_rate": 3.5883091677132093e-06, + "loss": 1.9697, + "step": 40770 + }, + { + "epoch": 4.644911441426049, + 
"grad_norm": 42.505210876464844, + "learning_rate": 3.5768923393081403e-06, + "loss": 2.2177, + "step": 40780 + }, + { + "epoch": 4.646050458454354, + "grad_norm": 23.098838806152344, + "learning_rate": 3.5654755109030713e-06, + "loss": 2.2823, + "step": 40790 + }, + { + "epoch": 4.647189475482659, + "grad_norm": 17.053728103637695, + "learning_rate": 3.5540586824980023e-06, + "loss": 2.113, + "step": 40800 + }, + { + "epoch": 4.648328492510963, + "grad_norm": 25.77522850036621, + "learning_rate": 3.5426418540929332e-06, + "loss": 1.8903, + "step": 40810 + }, + { + "epoch": 4.649467509539267, + "grad_norm": 24.276954650878906, + "learning_rate": 3.5312250256878642e-06, + "loss": 1.7794, + "step": 40820 + }, + { + "epoch": 4.6506065265675725, + "grad_norm": 20.4326229095459, + "learning_rate": 3.5198081972827952e-06, + "loss": 1.8593, + "step": 40830 + }, + { + "epoch": 4.651745543595877, + "grad_norm": 14.628107070922852, + "learning_rate": 3.5083913688777262e-06, + "loss": 1.9423, + "step": 40840 + }, + { + "epoch": 4.652884560624181, + "grad_norm": 35.9196662902832, + "learning_rate": 3.4969745404726568e-06, + "loss": 2.2244, + "step": 40850 + }, + { + "epoch": 4.654023577652486, + "grad_norm": 25.056962966918945, + "learning_rate": 3.4855577120675878e-06, + "loss": 2.2057, + "step": 40860 + }, + { + "epoch": 4.655162594680791, + "grad_norm": 48.19633102416992, + "learning_rate": 3.4741408836625188e-06, + "loss": 2.0837, + "step": 40870 + }, + { + "epoch": 4.656301611709095, + "grad_norm": 23.737659454345703, + "learning_rate": 3.4627240552574498e-06, + "loss": 1.9354, + "step": 40880 + }, + { + "epoch": 4.657440628737399, + "grad_norm": 27.149599075317383, + "learning_rate": 3.4513072268523807e-06, + "loss": 1.7452, + "step": 40890 + }, + { + "epoch": 4.6585796457657045, + "grad_norm": 20.849716186523438, + "learning_rate": 3.4398903984473117e-06, + "loss": 2.224, + "step": 40900 + }, + { + "epoch": 4.659718662794009, + "grad_norm": 15.656664848327637, + 
"learning_rate": 3.4284735700422427e-06, + "loss": 2.2818, + "step": 40910 + }, + { + "epoch": 4.660857679822313, + "grad_norm": 20.1951904296875, + "learning_rate": 3.4170567416371737e-06, + "loss": 2.4189, + "step": 40920 + }, + { + "epoch": 4.661996696850618, + "grad_norm": 17.477018356323242, + "learning_rate": 3.4056399132321047e-06, + "loss": 2.012, + "step": 40930 + }, + { + "epoch": 4.663135713878923, + "grad_norm": 18.36666488647461, + "learning_rate": 3.3942230848270357e-06, + "loss": 2.1151, + "step": 40940 + }, + { + "epoch": 4.664274730907227, + "grad_norm": 15.96693229675293, + "learning_rate": 3.3828062564219663e-06, + "loss": 2.1247, + "step": 40950 + }, + { + "epoch": 4.665413747935531, + "grad_norm": 19.108427047729492, + "learning_rate": 3.371389428016897e-06, + "loss": 2.2783, + "step": 40960 + }, + { + "epoch": 4.6665527649638365, + "grad_norm": 41.34756851196289, + "learning_rate": 3.359972599611828e-06, + "loss": 2.0349, + "step": 40970 + }, + { + "epoch": 4.667691781992141, + "grad_norm": 20.5064754486084, + "learning_rate": 3.348555771206759e-06, + "loss": 2.1168, + "step": 40980 + }, + { + "epoch": 4.668830799020445, + "grad_norm": 18.09624481201172, + "learning_rate": 3.3371389428016894e-06, + "loss": 1.9942, + "step": 40990 + }, + { + "epoch": 4.66996981604875, + "grad_norm": 29.125307083129883, + "learning_rate": 3.3257221143966204e-06, + "loss": 1.8895, + "step": 41000 + }, + { + "epoch": 4.671108833077055, + "grad_norm": 33.69413757324219, + "learning_rate": 3.3143052859915514e-06, + "loss": 2.4466, + "step": 41010 + }, + { + "epoch": 4.672247850105359, + "grad_norm": 31.16204261779785, + "learning_rate": 3.3028884575864823e-06, + "loss": 2.0407, + "step": 41020 + }, + { + "epoch": 4.673386867133663, + "grad_norm": 19.57454490661621, + "learning_rate": 3.2914716291814133e-06, + "loss": 2.1742, + "step": 41030 + }, + { + "epoch": 4.674525884161969, + "grad_norm": 27.449176788330078, + "learning_rate": 3.2800548007763443e-06, + "loss": 
2.1887, + "step": 41040 + }, + { + "epoch": 4.675664901190273, + "grad_norm": 18.539522171020508, + "learning_rate": 3.2686379723712753e-06, + "loss": 2.3035, + "step": 41050 + }, + { + "epoch": 4.676803918218577, + "grad_norm": 16.271577835083008, + "learning_rate": 3.2572211439662063e-06, + "loss": 1.8919, + "step": 41060 + }, + { + "epoch": 4.677942935246882, + "grad_norm": 17.847782135009766, + "learning_rate": 3.2458043155611373e-06, + "loss": 1.9281, + "step": 41070 + }, + { + "epoch": 4.679081952275187, + "grad_norm": 23.48549461364746, + "learning_rate": 3.2343874871560683e-06, + "loss": 2.0102, + "step": 41080 + }, + { + "epoch": 4.680220969303491, + "grad_norm": 24.687597274780273, + "learning_rate": 3.222970658750999e-06, + "loss": 2.2642, + "step": 41090 + }, + { + "epoch": 4.681359986331795, + "grad_norm": 50.728939056396484, + "learning_rate": 3.21155383034593e-06, + "loss": 2.0515, + "step": 41100 + }, + { + "epoch": 4.682499003360101, + "grad_norm": 23.774539947509766, + "learning_rate": 3.200137001940861e-06, + "loss": 2.0903, + "step": 41110 + }, + { + "epoch": 4.683638020388405, + "grad_norm": 18.725481033325195, + "learning_rate": 3.188720173535792e-06, + "loss": 1.9324, + "step": 41120 + }, + { + "epoch": 4.684777037416709, + "grad_norm": 21.664182662963867, + "learning_rate": 3.177303345130723e-06, + "loss": 2.1899, + "step": 41130 + }, + { + "epoch": 4.685916054445014, + "grad_norm": 21.2465763092041, + "learning_rate": 3.165886516725654e-06, + "loss": 2.3188, + "step": 41140 + }, + { + "epoch": 4.687055071473319, + "grad_norm": 54.11538314819336, + "learning_rate": 3.154469688320585e-06, + "loss": 1.9397, + "step": 41150 + }, + { + "epoch": 4.688194088501623, + "grad_norm": 22.550443649291992, + "learning_rate": 3.143052859915516e-06, + "loss": 2.1569, + "step": 41160 + }, + { + "epoch": 4.689333105529927, + "grad_norm": 26.590730667114258, + "learning_rate": 3.1316360315104468e-06, + "loss": 2.462, + "step": 41170 + }, + { + "epoch": 
4.690472122558232, + "grad_norm": 21.50946807861328, + "learning_rate": 3.1202192031053778e-06, + "loss": 1.9697, + "step": 41180 + }, + { + "epoch": 4.691611139586537, + "grad_norm": 23.662263870239258, + "learning_rate": 3.1088023747003083e-06, + "loss": 2.1653, + "step": 41190 + }, + { + "epoch": 4.692750156614841, + "grad_norm": 33.12200164794922, + "learning_rate": 3.0973855462952393e-06, + "loss": 1.9513, + "step": 41200 + }, + { + "epoch": 4.693889173643146, + "grad_norm": 101.58634948730469, + "learning_rate": 3.0859687178901703e-06, + "loss": 2.2152, + "step": 41210 + }, + { + "epoch": 4.695028190671451, + "grad_norm": 21.84265899658203, + "learning_rate": 3.074551889485101e-06, + "loss": 2.3828, + "step": 41220 + }, + { + "epoch": 4.696167207699755, + "grad_norm": 21.596426010131836, + "learning_rate": 3.063135061080032e-06, + "loss": 2.1586, + "step": 41230 + }, + { + "epoch": 4.6973062247280595, + "grad_norm": 17.411102294921875, + "learning_rate": 3.051718232674963e-06, + "loss": 1.9603, + "step": 41240 + }, + { + "epoch": 4.698445241756364, + "grad_norm": 32.5643310546875, + "learning_rate": 3.040301404269894e-06, + "loss": 1.5046, + "step": 41250 + }, + { + "epoch": 4.699584258784669, + "grad_norm": 28.38106346130371, + "learning_rate": 3.028884575864825e-06, + "loss": 2.0875, + "step": 41260 + }, + { + "epoch": 4.700723275812973, + "grad_norm": 13.871651649475098, + "learning_rate": 3.017467747459756e-06, + "loss": 2.2842, + "step": 41270 + }, + { + "epoch": 4.701862292841278, + "grad_norm": 35.43418884277344, + "learning_rate": 3.006050919054687e-06, + "loss": 1.9962, + "step": 41280 + }, + { + "epoch": 4.703001309869583, + "grad_norm": 18.605632781982422, + "learning_rate": 2.994634090649618e-06, + "loss": 2.2737, + "step": 41290 + }, + { + "epoch": 4.704140326897887, + "grad_norm": 35.511756896972656, + "learning_rate": 2.983217262244549e-06, + "loss": 2.1054, + "step": 41300 + }, + { + "epoch": 4.7052793439261915, + "grad_norm": 
15.427621841430664, + "learning_rate": 2.9718004338394794e-06, + "loss": 2.1165, + "step": 41310 + }, + { + "epoch": 4.706418360954496, + "grad_norm": 11.331278800964355, + "learning_rate": 2.9603836054344104e-06, + "loss": 1.9587, + "step": 41320 + }, + { + "epoch": 4.707557377982801, + "grad_norm": 16.417999267578125, + "learning_rate": 2.9489667770293414e-06, + "loss": 2.0266, + "step": 41330 + }, + { + "epoch": 4.708696395011105, + "grad_norm": 15.927863121032715, + "learning_rate": 2.9375499486242723e-06, + "loss": 2.4389, + "step": 41340 + }, + { + "epoch": 4.70983541203941, + "grad_norm": 18.896665573120117, + "learning_rate": 2.9261331202192033e-06, + "loss": 1.792, + "step": 41350 + }, + { + "epoch": 4.710974429067715, + "grad_norm": 14.645793914794922, + "learning_rate": 2.9147162918141343e-06, + "loss": 2.1785, + "step": 41360 + }, + { + "epoch": 4.712113446096019, + "grad_norm": 21.595312118530273, + "learning_rate": 2.9032994634090653e-06, + "loss": 2.1541, + "step": 41370 + }, + { + "epoch": 4.7132524631243236, + "grad_norm": 19.345293045043945, + "learning_rate": 2.8918826350039963e-06, + "loss": 2.0505, + "step": 41380 + }, + { + "epoch": 4.714391480152628, + "grad_norm": 18.720809936523438, + "learning_rate": 2.880465806598927e-06, + "loss": 2.1777, + "step": 41390 + }, + { + "epoch": 4.715530497180933, + "grad_norm": 14.518261909484863, + "learning_rate": 2.869048978193858e-06, + "loss": 2.0527, + "step": 41400 + }, + { + "epoch": 4.716669514209237, + "grad_norm": 20.90743064880371, + "learning_rate": 2.857632149788789e-06, + "loss": 1.8769, + "step": 41410 + }, + { + "epoch": 4.717808531237542, + "grad_norm": 23.603450775146484, + "learning_rate": 2.8462153213837194e-06, + "loss": 2.2908, + "step": 41420 + }, + { + "epoch": 4.718947548265847, + "grad_norm": 27.330276489257812, + "learning_rate": 2.8347984929786504e-06, + "loss": 2.0999, + "step": 41430 + }, + { + "epoch": 4.720086565294151, + "grad_norm": 19.17913055419922, + "learning_rate": 
2.8233816645735814e-06, + "loss": 2.2534, + "step": 41440 + }, + { + "epoch": 4.721225582322456, + "grad_norm": 21.937158584594727, + "learning_rate": 2.8119648361685124e-06, + "loss": 2.1447, + "step": 41450 + }, + { + "epoch": 4.72236459935076, + "grad_norm": 31.000289916992188, + "learning_rate": 2.8005480077634434e-06, + "loss": 2.2022, + "step": 41460 + }, + { + "epoch": 4.723503616379065, + "grad_norm": 30.576736450195312, + "learning_rate": 2.7891311793583744e-06, + "loss": 1.882, + "step": 41470 + }, + { + "epoch": 4.7246426334073695, + "grad_norm": 28.018278121948242, + "learning_rate": 2.7777143509533054e-06, + "loss": 2.1176, + "step": 41480 + }, + { + "epoch": 4.725781650435674, + "grad_norm": 26.066553115844727, + "learning_rate": 2.7662975225482364e-06, + "loss": 2.0295, + "step": 41490 + }, + { + "epoch": 4.726920667463979, + "grad_norm": 25.97825813293457, + "learning_rate": 2.7548806941431673e-06, + "loss": 2.534, + "step": 41500 + }, + { + "epoch": 4.728059684492283, + "grad_norm": 22.897573471069336, + "learning_rate": 2.7434638657380983e-06, + "loss": 2.1461, + "step": 41510 + }, + { + "epoch": 4.729198701520588, + "grad_norm": 15.641851425170898, + "learning_rate": 2.732047037333029e-06, + "loss": 2.1254, + "step": 41520 + }, + { + "epoch": 4.730337718548892, + "grad_norm": 20.667016983032227, + "learning_rate": 2.72063020892796e-06, + "loss": 2.0765, + "step": 41530 + }, + { + "epoch": 4.731476735577197, + "grad_norm": 19.728759765625, + "learning_rate": 2.709213380522891e-06, + "loss": 2.2298, + "step": 41540 + }, + { + "epoch": 4.7326157526055015, + "grad_norm": 23.560476303100586, + "learning_rate": 2.697796552117822e-06, + "loss": 1.9045, + "step": 41550 + }, + { + "epoch": 4.733754769633806, + "grad_norm": 25.78218650817871, + "learning_rate": 2.686379723712753e-06, + "loss": 2.0516, + "step": 41560 + }, + { + "epoch": 4.734893786662111, + "grad_norm": 30.02228546142578, + "learning_rate": 2.6749628953076834e-06, + "loss": 1.9688, + 
"step": 41570 + }, + { + "epoch": 4.736032803690415, + "grad_norm": 22.432594299316406, + "learning_rate": 2.6635460669026144e-06, + "loss": 2.0243, + "step": 41580 + }, + { + "epoch": 4.73717182071872, + "grad_norm": 23.2596435546875, + "learning_rate": 2.6521292384975454e-06, + "loss": 2.1927, + "step": 41590 + }, + { + "epoch": 4.738310837747024, + "grad_norm": 38.232418060302734, + "learning_rate": 2.6407124100924764e-06, + "loss": 1.9766, + "step": 41600 + }, + { + "epoch": 4.739449854775329, + "grad_norm": 20.036319732666016, + "learning_rate": 2.6292955816874074e-06, + "loss": 2.1516, + "step": 41610 + }, + { + "epoch": 4.7405888718036335, + "grad_norm": 32.12515640258789, + "learning_rate": 2.6178787532823384e-06, + "loss": 1.7411, + "step": 41620 + }, + { + "epoch": 4.741727888831938, + "grad_norm": 26.23710823059082, + "learning_rate": 2.6064619248772694e-06, + "loss": 2.4096, + "step": 41630 + }, + { + "epoch": 4.742866905860243, + "grad_norm": 41.13633346557617, + "learning_rate": 2.5950450964722e-06, + "loss": 2.1761, + "step": 41640 + }, + { + "epoch": 4.744005922888547, + "grad_norm": 20.02508544921875, + "learning_rate": 2.583628268067131e-06, + "loss": 2.1074, + "step": 41650 + }, + { + "epoch": 4.745144939916852, + "grad_norm": 22.41956329345703, + "learning_rate": 2.572211439662062e-06, + "loss": 2.3696, + "step": 41660 + }, + { + "epoch": 4.746283956945156, + "grad_norm": 18.06878089904785, + "learning_rate": 2.560794611256993e-06, + "loss": 2.2291, + "step": 41670 + }, + { + "epoch": 4.747422973973461, + "grad_norm": 33.769981384277344, + "learning_rate": 2.549377782851924e-06, + "loss": 2.0704, + "step": 41680 + }, + { + "epoch": 4.748561991001766, + "grad_norm": 24.32868003845215, + "learning_rate": 2.537960954446855e-06, + "loss": 1.9299, + "step": 41690 + }, + { + "epoch": 4.74970100803007, + "grad_norm": 31.78925323486328, + "learning_rate": 2.526544126041786e-06, + "loss": 1.8738, + "step": 41700 + }, + { + "epoch": 4.750840025058375, + 
"grad_norm": 19.25320053100586, + "learning_rate": 2.515127297636717e-06, + "loss": 2.0319, + "step": 41710 + }, + { + "epoch": 4.751979042086679, + "grad_norm": 21.71365737915039, + "learning_rate": 2.503710469231648e-06, + "loss": 2.0099, + "step": 41720 + }, + { + "epoch": 4.753118059114984, + "grad_norm": 15.033337593078613, + "learning_rate": 2.492293640826579e-06, + "loss": 2.1962, + "step": 41730 + }, + { + "epoch": 4.754257076143288, + "grad_norm": 33.91618347167969, + "learning_rate": 2.4808768124215094e-06, + "loss": 2.1071, + "step": 41740 + }, + { + "epoch": 4.755396093171593, + "grad_norm": 31.09442710876465, + "learning_rate": 2.4694599840164404e-06, + "loss": 2.1084, + "step": 41750 + }, + { + "epoch": 4.756535110199898, + "grad_norm": 39.78629684448242, + "learning_rate": 2.458043155611371e-06, + "loss": 1.7375, + "step": 41760 + }, + { + "epoch": 4.757674127228202, + "grad_norm": 16.560260772705078, + "learning_rate": 2.446626327206302e-06, + "loss": 1.9215, + "step": 41770 + }, + { + "epoch": 4.758813144256506, + "grad_norm": 14.8968505859375, + "learning_rate": 2.435209498801233e-06, + "loss": 2.0154, + "step": 41780 + }, + { + "epoch": 4.7599521612848115, + "grad_norm": 28.653608322143555, + "learning_rate": 2.423792670396164e-06, + "loss": 2.2211, + "step": 41790 + }, + { + "epoch": 4.761091178313116, + "grad_norm": 19.598594665527344, + "learning_rate": 2.412375841991095e-06, + "loss": 2.2394, + "step": 41800 + }, + { + "epoch": 4.76223019534142, + "grad_norm": 66.2208480834961, + "learning_rate": 2.400959013586026e-06, + "loss": 1.9773, + "step": 41810 + }, + { + "epoch": 4.763369212369724, + "grad_norm": 30.057828903198242, + "learning_rate": 2.389542185180957e-06, + "loss": 2.1213, + "step": 41820 + }, + { + "epoch": 4.76450822939803, + "grad_norm": 25.836355209350586, + "learning_rate": 2.378125356775888e-06, + "loss": 1.9836, + "step": 41830 + }, + { + "epoch": 4.765647246426334, + "grad_norm": 21.258834838867188, + "learning_rate": 
2.366708528370819e-06, + "loss": 1.9822, + "step": 41840 + }, + { + "epoch": 4.766786263454638, + "grad_norm": 22.759042739868164, + "learning_rate": 2.35529169996575e-06, + "loss": 2.1389, + "step": 41850 + }, + { + "epoch": 4.7679252804829435, + "grad_norm": 31.323827743530273, + "learning_rate": 2.3438748715606805e-06, + "loss": 1.8518, + "step": 41860 + }, + { + "epoch": 4.769064297511248, + "grad_norm": 43.38692092895508, + "learning_rate": 2.3324580431556115e-06, + "loss": 2.2424, + "step": 41870 + }, + { + "epoch": 4.770203314539552, + "grad_norm": 22.54583740234375, + "learning_rate": 2.3210412147505424e-06, + "loss": 2.0768, + "step": 41880 + }, + { + "epoch": 4.7713423315678565, + "grad_norm": 21.27452278137207, + "learning_rate": 2.3096243863454734e-06, + "loss": 2.0261, + "step": 41890 + }, + { + "epoch": 4.772481348596162, + "grad_norm": 22.538942337036133, + "learning_rate": 2.2982075579404044e-06, + "loss": 1.8942, + "step": 41900 + }, + { + "epoch": 4.773620365624466, + "grad_norm": 17.049379348754883, + "learning_rate": 2.286790729535335e-06, + "loss": 2.3636, + "step": 41910 + }, + { + "epoch": 4.77475938265277, + "grad_norm": 18.71116828918457, + "learning_rate": 2.275373901130266e-06, + "loss": 1.8524, + "step": 41920 + }, + { + "epoch": 4.7758983996810755, + "grad_norm": 33.6178092956543, + "learning_rate": 2.263957072725197e-06, + "loss": 2.1832, + "step": 41930 + }, + { + "epoch": 4.77703741670938, + "grad_norm": 24.332134246826172, + "learning_rate": 2.252540244320128e-06, + "loss": 2.0503, + "step": 41940 + }, + { + "epoch": 4.778176433737684, + "grad_norm": 20.971229553222656, + "learning_rate": 2.241123415915059e-06, + "loss": 1.6425, + "step": 41950 + }, + { + "epoch": 4.7793154507659885, + "grad_norm": 62.76225280761719, + "learning_rate": 2.22970658750999e-06, + "loss": 2.136, + "step": 41960 + }, + { + "epoch": 4.780454467794294, + "grad_norm": 20.96953010559082, + "learning_rate": 2.218289759104921e-06, + "loss": 2.1829, + "step": 
41970 + }, + { + "epoch": 4.781593484822598, + "grad_norm": 15.353999137878418, + "learning_rate": 2.2068729306998515e-06, + "loss": 2.1151, + "step": 41980 + }, + { + "epoch": 4.782732501850902, + "grad_norm": 23.195850372314453, + "learning_rate": 2.1954561022947825e-06, + "loss": 1.9683, + "step": 41990 + }, + { + "epoch": 4.783871518879208, + "grad_norm": 50.11465835571289, + "learning_rate": 2.1840392738897135e-06, + "loss": 2.446, + "step": 42000 + }, + { + "epoch": 4.783871518879208, + "eval_loss": 7.4625725746154785, + "eval_runtime": 12.0502, + "eval_samples_per_second": 1.245, + "eval_steps_per_second": 0.166, + "step": 42000 + }, + { + "epoch": 4.785010535907512, + "grad_norm": 16.312358856201172, + "learning_rate": 2.1726224454846445e-06, + "loss": 1.9566, + "step": 42010 + }, + { + "epoch": 4.786149552935816, + "grad_norm": 18.50370979309082, + "learning_rate": 2.1612056170795755e-06, + "loss": 2.1195, + "step": 42020 + }, + { + "epoch": 4.7872885699641206, + "grad_norm": 17.659156799316406, + "learning_rate": 2.1497887886745065e-06, + "loss": 2.2324, + "step": 42030 + }, + { + "epoch": 4.788427586992426, + "grad_norm": 19.813390731811523, + "learning_rate": 2.1383719602694374e-06, + "loss": 2.1722, + "step": 42040 + }, + { + "epoch": 4.78956660402073, + "grad_norm": 21.918062210083008, + "learning_rate": 2.1269551318643684e-06, + "loss": 2.2727, + "step": 42050 + }, + { + "epoch": 4.790705621049034, + "grad_norm": 16.98636245727539, + "learning_rate": 2.1155383034592994e-06, + "loss": 2.1152, + "step": 42060 + }, + { + "epoch": 4.79184463807734, + "grad_norm": 24.551387786865234, + "learning_rate": 2.10412147505423e-06, + "loss": 2.1638, + "step": 42070 + }, + { + "epoch": 4.792983655105644, + "grad_norm": 20.222158432006836, + "learning_rate": 2.092704646649161e-06, + "loss": 2.2903, + "step": 42080 + }, + { + "epoch": 4.794122672133948, + "grad_norm": 28.18195343017578, + "learning_rate": 2.0812878182440915e-06, + "loss": 2.304, + "step": 42090 + }, 
+ { + "epoch": 4.795261689162253, + "grad_norm": 59.049991607666016, + "learning_rate": 2.0698709898390225e-06, + "loss": 2.1866, + "step": 42100 + }, + { + "epoch": 4.796400706190558, + "grad_norm": 18.300731658935547, + "learning_rate": 2.0584541614339535e-06, + "loss": 2.0335, + "step": 42110 + }, + { + "epoch": 4.797539723218862, + "grad_norm": 25.122678756713867, + "learning_rate": 2.0470373330288845e-06, + "loss": 2.1305, + "step": 42120 + }, + { + "epoch": 4.7986787402471665, + "grad_norm": 18.272083282470703, + "learning_rate": 2.0356205046238155e-06, + "loss": 2.1805, + "step": 42130 + }, + { + "epoch": 4.799817757275472, + "grad_norm": 26.403356552124023, + "learning_rate": 2.0242036762187465e-06, + "loss": 1.8508, + "step": 42140 + }, + { + "epoch": 4.800956774303776, + "grad_norm": 19.124774932861328, + "learning_rate": 2.0127868478136775e-06, + "loss": 2.2891, + "step": 42150 + }, + { + "epoch": 4.80209579133208, + "grad_norm": 22.077056884765625, + "learning_rate": 2.0013700194086085e-06, + "loss": 2.1782, + "step": 42160 + }, + { + "epoch": 4.803234808360385, + "grad_norm": 22.749500274658203, + "learning_rate": 1.9899531910035395e-06, + "loss": 2.129, + "step": 42170 + }, + { + "epoch": 4.80437382538869, + "grad_norm": 27.834880828857422, + "learning_rate": 1.9785363625984705e-06, + "loss": 2.2433, + "step": 42180 + }, + { + "epoch": 4.805512842416994, + "grad_norm": 19.033899307250977, + "learning_rate": 1.967119534193401e-06, + "loss": 2.2115, + "step": 42190 + }, + { + "epoch": 4.8066518594452985, + "grad_norm": 26.05759620666504, + "learning_rate": 1.955702705788332e-06, + "loss": 1.7755, + "step": 42200 + }, + { + "epoch": 4.807790876473604, + "grad_norm": 37.59040069580078, + "learning_rate": 1.944285877383263e-06, + "loss": 2.0998, + "step": 42210 + }, + { + "epoch": 4.808929893501908, + "grad_norm": 36.5812873840332, + "learning_rate": 1.932869048978194e-06, + "loss": 1.6679, + "step": 42220 + }, + { + "epoch": 4.810068910530212, + 
"grad_norm": 20.356979370117188, + "learning_rate": 1.921452220573125e-06, + "loss": 1.9155, + "step": 42230 + }, + { + "epoch": 4.811207927558517, + "grad_norm": 13.821524620056152, + "learning_rate": 1.910035392168056e-06, + "loss": 2.6134, + "step": 42240 + }, + { + "epoch": 4.812346944586822, + "grad_norm": 27.23172950744629, + "learning_rate": 1.8986185637629865e-06, + "loss": 2.2042, + "step": 42250 + }, + { + "epoch": 4.813485961615126, + "grad_norm": 64.11741638183594, + "learning_rate": 1.8872017353579175e-06, + "loss": 1.9247, + "step": 42260 + }, + { + "epoch": 4.8146249786434305, + "grad_norm": 27.220752716064453, + "learning_rate": 1.8757849069528485e-06, + "loss": 2.1754, + "step": 42270 + }, + { + "epoch": 4.815763995671736, + "grad_norm": 26.3674373626709, + "learning_rate": 1.8643680785477793e-06, + "loss": 2.0802, + "step": 42280 + }, + { + "epoch": 4.81690301270004, + "grad_norm": 26.886674880981445, + "learning_rate": 1.8529512501427103e-06, + "loss": 2.0704, + "step": 42290 + }, + { + "epoch": 4.818042029728344, + "grad_norm": 30.28145980834961, + "learning_rate": 1.8415344217376413e-06, + "loss": 1.7818, + "step": 42300 + }, + { + "epoch": 4.819181046756649, + "grad_norm": 23.630435943603516, + "learning_rate": 1.8301175933325723e-06, + "loss": 2.2348, + "step": 42310 + }, + { + "epoch": 4.820320063784954, + "grad_norm": 21.809415817260742, + "learning_rate": 1.8187007649275033e-06, + "loss": 2.1084, + "step": 42320 + }, + { + "epoch": 4.821459080813258, + "grad_norm": 27.316287994384766, + "learning_rate": 1.807283936522434e-06, + "loss": 1.747, + "step": 42330 + }, + { + "epoch": 4.822598097841563, + "grad_norm": 13.889946937561035, + "learning_rate": 1.795867108117365e-06, + "loss": 2.4892, + "step": 42340 + }, + { + "epoch": 4.823737114869868, + "grad_norm": 22.583810806274414, + "learning_rate": 1.784450279712296e-06, + "loss": 2.5241, + "step": 42350 + }, + { + "epoch": 4.824876131898172, + "grad_norm": 37.866661071777344, + 
"learning_rate": 1.773033451307227e-06, + "loss": 2.2539, + "step": 42360 + }, + { + "epoch": 4.826015148926476, + "grad_norm": 48.19717788696289, + "learning_rate": 1.761616622902158e-06, + "loss": 1.9788, + "step": 42370 + }, + { + "epoch": 4.827154165954781, + "grad_norm": 26.61407470703125, + "learning_rate": 1.7501997944970888e-06, + "loss": 2.1119, + "step": 42380 + }, + { + "epoch": 4.828293182983086, + "grad_norm": 30.52083969116211, + "learning_rate": 1.7387829660920198e-06, + "loss": 1.7713, + "step": 42390 + }, + { + "epoch": 4.82943220001139, + "grad_norm": 24.027544021606445, + "learning_rate": 1.7273661376869508e-06, + "loss": 2.1212, + "step": 42400 + }, + { + "epoch": 4.830571217039695, + "grad_norm": 15.251887321472168, + "learning_rate": 1.7159493092818818e-06, + "loss": 2.1417, + "step": 42410 + }, + { + "epoch": 4.831710234067999, + "grad_norm": 21.110891342163086, + "learning_rate": 1.7045324808768123e-06, + "loss": 2.0248, + "step": 42420 + }, + { + "epoch": 4.832849251096304, + "grad_norm": 27.054014205932617, + "learning_rate": 1.6931156524717433e-06, + "loss": 1.8808, + "step": 42430 + }, + { + "epoch": 4.8339882681246085, + "grad_norm": 20.270843505859375, + "learning_rate": 1.6816988240666743e-06, + "loss": 2.4809, + "step": 42440 + }, + { + "epoch": 4.835127285152913, + "grad_norm": 17.14975357055664, + "learning_rate": 1.670281995661605e-06, + "loss": 2.0296, + "step": 42450 + }, + { + "epoch": 4.836266302181217, + "grad_norm": 18.961040496826172, + "learning_rate": 1.658865167256536e-06, + "loss": 2.2228, + "step": 42460 + }, + { + "epoch": 4.837405319209522, + "grad_norm": 17.892587661743164, + "learning_rate": 1.647448338851467e-06, + "loss": 1.8035, + "step": 42470 + }, + { + "epoch": 4.838544336237827, + "grad_norm": 26.437068939208984, + "learning_rate": 1.636031510446398e-06, + "loss": 2.0168, + "step": 42480 + }, + { + "epoch": 4.839683353266131, + "grad_norm": 17.080066680908203, + "learning_rate": 1.624614682041329e-06, + 
"loss": 1.8538, + "step": 42490 + }, + { + "epoch": 4.840822370294436, + "grad_norm": 35.432491302490234, + "learning_rate": 1.6131978536362598e-06, + "loss": 2.2317, + "step": 42500 + }, + { + "epoch": 4.8419613873227405, + "grad_norm": 30.46700668334961, + "learning_rate": 1.6017810252311908e-06, + "loss": 1.6325, + "step": 42510 + }, + { + "epoch": 4.843100404351045, + "grad_norm": 26.486875534057617, + "learning_rate": 1.5903641968261218e-06, + "loss": 2.109, + "step": 42520 + }, + { + "epoch": 4.844239421379349, + "grad_norm": 21.433719635009766, + "learning_rate": 1.5789473684210528e-06, + "loss": 2.2081, + "step": 42530 + }, + { + "epoch": 4.845378438407654, + "grad_norm": 28.510398864746094, + "learning_rate": 1.5675305400159838e-06, + "loss": 1.6051, + "step": 42540 + }, + { + "epoch": 4.846517455435959, + "grad_norm": 15.205029487609863, + "learning_rate": 1.5561137116109146e-06, + "loss": 2.1267, + "step": 42550 + }, + { + "epoch": 4.847656472464263, + "grad_norm": 21.264787673950195, + "learning_rate": 1.5446968832058453e-06, + "loss": 2.393, + "step": 42560 + }, + { + "epoch": 4.848795489492568, + "grad_norm": 36.28373336791992, + "learning_rate": 1.5332800548007763e-06, + "loss": 2.0963, + "step": 42570 + }, + { + "epoch": 4.8499345065208725, + "grad_norm": 16.20631980895996, + "learning_rate": 1.5218632263957073e-06, + "loss": 2.0634, + "step": 42580 + }, + { + "epoch": 4.851073523549177, + "grad_norm": 25.005062103271484, + "learning_rate": 1.5104463979906383e-06, + "loss": 1.7097, + "step": 42590 + }, + { + "epoch": 4.852212540577481, + "grad_norm": 22.628549575805664, + "learning_rate": 1.4990295695855693e-06, + "loss": 2.3418, + "step": 42600 + }, + { + "epoch": 4.853351557605786, + "grad_norm": 21.939167022705078, + "learning_rate": 1.4876127411805e-06, + "loss": 2.1178, + "step": 42610 + }, + { + "epoch": 4.854490574634091, + "grad_norm": 21.38382339477539, + "learning_rate": 1.476195912775431e-06, + "loss": 2.0266, + "step": 42620 + }, + { + 
"epoch": 4.855629591662395, + "grad_norm": 21.068513870239258, + "learning_rate": 1.464779084370362e-06, + "loss": 2.3151, + "step": 42630 + }, + { + "epoch": 4.8567686086907, + "grad_norm": 18.45485496520996, + "learning_rate": 1.4533622559652928e-06, + "loss": 1.9822, + "step": 42640 + }, + { + "epoch": 4.857907625719005, + "grad_norm": 20.9517879486084, + "learning_rate": 1.4419454275602238e-06, + "loss": 1.8991, + "step": 42650 + }, + { + "epoch": 4.859046642747309, + "grad_norm": 17.37051773071289, + "learning_rate": 1.4305285991551548e-06, + "loss": 1.9226, + "step": 42660 + }, + { + "epoch": 4.860185659775613, + "grad_norm": 29.61225128173828, + "learning_rate": 1.4191117707500856e-06, + "loss": 2.3288, + "step": 42670 + }, + { + "epoch": 4.861324676803918, + "grad_norm": 27.416385650634766, + "learning_rate": 1.4076949423450166e-06, + "loss": 1.9653, + "step": 42680 + }, + { + "epoch": 4.862463693832223, + "grad_norm": 23.183807373046875, + "learning_rate": 1.3962781139399476e-06, + "loss": 2.122, + "step": 42690 + }, + { + "epoch": 4.863602710860527, + "grad_norm": 26.133235931396484, + "learning_rate": 1.3848612855348786e-06, + "loss": 2.1107, + "step": 42700 + }, + { + "epoch": 4.864741727888832, + "grad_norm": 31.365652084350586, + "learning_rate": 1.3734444571298096e-06, + "loss": 2.2315, + "step": 42710 + }, + { + "epoch": 4.865880744917137, + "grad_norm": 23.70299530029297, + "learning_rate": 1.3620276287247403e-06, + "loss": 2.0635, + "step": 42720 + }, + { + "epoch": 4.867019761945441, + "grad_norm": 19.910024642944336, + "learning_rate": 1.3506108003196711e-06, + "loss": 2.0115, + "step": 42730 + }, + { + "epoch": 4.868158778973745, + "grad_norm": 19.81175422668457, + "learning_rate": 1.340335654755109e-06, + "loss": 2.0302, + "step": 42740 + }, + { + "epoch": 4.8692977960020505, + "grad_norm": 17.099885940551758, + "learning_rate": 1.32891882635004e-06, + "loss": 2.5206, + "step": 42750 + }, + { + "epoch": 4.870436813030355, + "grad_norm": 
17.398080825805664, + "learning_rate": 1.317501997944971e-06, + "loss": 2.4282, + "step": 42760 + }, + { + "epoch": 4.871575830058659, + "grad_norm": 34.670310974121094, + "learning_rate": 1.306085169539902e-06, + "loss": 2.0474, + "step": 42770 + }, + { + "epoch": 4.872714847086964, + "grad_norm": 21.286638259887695, + "learning_rate": 1.2946683411348327e-06, + "loss": 1.9928, + "step": 42780 + }, + { + "epoch": 4.873853864115269, + "grad_norm": 16.5836124420166, + "learning_rate": 1.2832515127297637e-06, + "loss": 2.0706, + "step": 42790 + }, + { + "epoch": 4.874992881143573, + "grad_norm": 24.905271530151367, + "learning_rate": 1.2718346843246947e-06, + "loss": 1.9871, + "step": 42800 + }, + { + "epoch": 4.876131898171877, + "grad_norm": 23.023340225219727, + "learning_rate": 1.2604178559196257e-06, + "loss": 2.0595, + "step": 42810 + }, + { + "epoch": 4.8772709152001825, + "grad_norm": 24.938383102416992, + "learning_rate": 1.2501427103550635e-06, + "loss": 2.4151, + "step": 42820 + }, + { + "epoch": 4.878409932228487, + "grad_norm": 30.464200973510742, + "learning_rate": 1.2387258819499943e-06, + "loss": 2.1528, + "step": 42830 + }, + { + "epoch": 4.879548949256791, + "grad_norm": 28.341678619384766, + "learning_rate": 1.2273090535449253e-06, + "loss": 2.3776, + "step": 42840 + }, + { + "epoch": 4.880687966285096, + "grad_norm": 23.928951263427734, + "learning_rate": 1.215892225139856e-06, + "loss": 2.1533, + "step": 42850 + }, + { + "epoch": 4.881826983313401, + "grad_norm": 16.908361434936523, + "learning_rate": 1.204475396734787e-06, + "loss": 1.9074, + "step": 42860 + }, + { + "epoch": 4.882966000341705, + "grad_norm": 16.132740020751953, + "learning_rate": 1.193058568329718e-06, + "loss": 2.1446, + "step": 42870 + }, + { + "epoch": 4.884105017370009, + "grad_norm": 28.46498680114746, + "learning_rate": 1.181641739924649e-06, + "loss": 1.9904, + "step": 42880 + }, + { + "epoch": 4.885244034398315, + "grad_norm": 26.09286117553711, + "learning_rate": 
1.1702249115195798e-06, + "loss": 2.4157, + "step": 42890 + }, + { + "epoch": 4.886383051426619, + "grad_norm": 14.928512573242188, + "learning_rate": 1.1588080831145108e-06, + "loss": 2.5158, + "step": 42900 + }, + { + "epoch": 4.887522068454923, + "grad_norm": 21.614879608154297, + "learning_rate": 1.1473912547094418e-06, + "loss": 2.0684, + "step": 42910 + }, + { + "epoch": 4.888661085483228, + "grad_norm": 20.160425186157227, + "learning_rate": 1.1359744263043728e-06, + "loss": 1.6731, + "step": 42920 + }, + { + "epoch": 4.889800102511533, + "grad_norm": 18.106813430786133, + "learning_rate": 1.1245575978993036e-06, + "loss": 2.0349, + "step": 42930 + }, + { + "epoch": 4.890939119539837, + "grad_norm": 19.25848388671875, + "learning_rate": 1.1131407694942346e-06, + "loss": 2.2406, + "step": 42940 + }, + { + "epoch": 4.892078136568141, + "grad_norm": 73.29264831542969, + "learning_rate": 1.1017239410891654e-06, + "loss": 1.5453, + "step": 42950 + }, + { + "epoch": 4.893217153596447, + "grad_norm": 22.873788833618164, + "learning_rate": 1.0903071126840963e-06, + "loss": 2.2902, + "step": 42960 + }, + { + "epoch": 4.894356170624751, + "grad_norm": 23.639333724975586, + "learning_rate": 1.0788902842790273e-06, + "loss": 1.8889, + "step": 42970 + }, + { + "epoch": 4.895495187653055, + "grad_norm": 39.6263427734375, + "learning_rate": 1.0674734558739583e-06, + "loss": 2.2199, + "step": 42980 + }, + { + "epoch": 4.8966342046813605, + "grad_norm": 16.764909744262695, + "learning_rate": 1.0560566274688893e-06, + "loss": 1.9415, + "step": 42990 + }, + { + "epoch": 4.897773221709665, + "grad_norm": 25.162059783935547, + "learning_rate": 1.04463979906382e-06, + "loss": 2.105, + "step": 43000 + }, + { + "epoch": 4.898912238737969, + "grad_norm": 24.410005569458008, + "learning_rate": 1.033222970658751e-06, + "loss": 2.4185, + "step": 43010 + }, + { + "epoch": 4.900051255766273, + "grad_norm": 18.527870178222656, + "learning_rate": 1.0218061422536819e-06, + "loss": 2.0419, + 
"step": 43020 + }, + { + "epoch": 4.901190272794579, + "grad_norm": 27.03840446472168, + "learning_rate": 1.0103893138486129e-06, + "loss": 1.9704, + "step": 43030 + }, + { + "epoch": 4.902329289822883, + "grad_norm": 21.27702522277832, + "learning_rate": 9.989724854435438e-07, + "loss": 1.9872, + "step": 43040 + }, + { + "epoch": 4.903468306851187, + "grad_norm": 13.977709770202637, + "learning_rate": 9.875556570384748e-07, + "loss": 2.2585, + "step": 43050 + }, + { + "epoch": 4.9046073238794925, + "grad_norm": 22.648279190063477, + "learning_rate": 9.761388286334056e-07, + "loss": 2.1437, + "step": 43060 + }, + { + "epoch": 4.905746340907797, + "grad_norm": 24.588411331176758, + "learning_rate": 9.647220002283366e-07, + "loss": 2.0577, + "step": 43070 + }, + { + "epoch": 4.906885357936101, + "grad_norm": 29.299461364746094, + "learning_rate": 9.533051718232676e-07, + "loss": 2.333, + "step": 43080 + }, + { + "epoch": 4.9080243749644055, + "grad_norm": 31.78087615966797, + "learning_rate": 9.418883434181986e-07, + "loss": 1.8565, + "step": 43090 + }, + { + "epoch": 4.909163391992711, + "grad_norm": 30.73528289794922, + "learning_rate": 9.304715150131294e-07, + "loss": 1.8478, + "step": 43100 + }, + { + "epoch": 4.910302409021015, + "grad_norm": 16.780263900756836, + "learning_rate": 9.190546866080602e-07, + "loss": 1.9653, + "step": 43110 + }, + { + "epoch": 4.911441426049319, + "grad_norm": 25.756961822509766, + "learning_rate": 9.076378582029912e-07, + "loss": 2.2747, + "step": 43120 + }, + { + "epoch": 4.912580443077624, + "grad_norm": 28.987548828125, + "learning_rate": 8.962210297979221e-07, + "loss": 2.2953, + "step": 43130 + }, + { + "epoch": 4.913719460105929, + "grad_norm": 21.594642639160156, + "learning_rate": 8.848042013928531e-07, + "loss": 2.1243, + "step": 43140 + }, + { + "epoch": 4.914858477134233, + "grad_norm": 21.871192932128906, + "learning_rate": 8.73387372987784e-07, + "loss": 2.3823, + "step": 43150 + }, + { + "epoch": 4.9159974941625375, + 
"grad_norm": 29.624345779418945, + "learning_rate": 8.61970544582715e-07, + "loss": 2.1485, + "step": 43160 + }, + { + "epoch": 4.917136511190842, + "grad_norm": 30.46200180053711, + "learning_rate": 8.50553716177646e-07, + "loss": 1.9375, + "step": 43170 + }, + { + "epoch": 4.918275528219147, + "grad_norm": 20.667417526245117, + "learning_rate": 8.391368877725769e-07, + "loss": 2.0556, + "step": 43180 + }, + { + "epoch": 4.919414545247451, + "grad_norm": 18.79863929748535, + "learning_rate": 8.277200593675076e-07, + "loss": 2.121, + "step": 43190 + }, + { + "epoch": 4.920553562275756, + "grad_norm": 18.315258026123047, + "learning_rate": 8.163032309624386e-07, + "loss": 2.2413, + "step": 43200 + }, + { + "epoch": 4.921692579304061, + "grad_norm": 16.006052017211914, + "learning_rate": 8.048864025573695e-07, + "loss": 2.4275, + "step": 43210 + }, + { + "epoch": 4.922831596332365, + "grad_norm": 20.375164031982422, + "learning_rate": 7.934695741523005e-07, + "loss": 1.9574, + "step": 43220 + }, + { + "epoch": 4.9239706133606695, + "grad_norm": 24.44364356994629, + "learning_rate": 7.820527457472315e-07, + "loss": 2.0301, + "step": 43230 + }, + { + "epoch": 4.925109630388974, + "grad_norm": 27.508493423461914, + "learning_rate": 7.706359173421624e-07, + "loss": 2.0369, + "step": 43240 + }, + { + "epoch": 4.926248647417279, + "grad_norm": 22.325647354125977, + "learning_rate": 7.592190889370933e-07, + "loss": 2.217, + "step": 43250 + }, + { + "epoch": 4.927387664445583, + "grad_norm": 18.97045135498047, + "learning_rate": 7.478022605320243e-07, + "loss": 2.3761, + "step": 43260 + }, + { + "epoch": 4.928526681473888, + "grad_norm": 20.178020477294922, + "learning_rate": 7.363854321269551e-07, + "loss": 1.9986, + "step": 43270 + }, + { + "epoch": 4.929665698502193, + "grad_norm": 22.980409622192383, + "learning_rate": 7.249686037218861e-07, + "loss": 1.9192, + "step": 43280 + }, + { + "epoch": 4.930804715530497, + "grad_norm": 30.173433303833008, + "learning_rate": 
7.13551775316817e-07, + "loss": 2.4624, + "step": 43290 + }, + { + "epoch": 4.931943732558802, + "grad_norm": 17.252126693725586, + "learning_rate": 7.021349469117479e-07, + "loss": 2.1132, + "step": 43300 + }, + { + "epoch": 4.933082749587106, + "grad_norm": 20.48082160949707, + "learning_rate": 6.907181185066789e-07, + "loss": 2.045, + "step": 43310 + }, + { + "epoch": 4.934221766615411, + "grad_norm": 66.91496276855469, + "learning_rate": 6.793012901016098e-07, + "loss": 2.173, + "step": 43320 + }, + { + "epoch": 4.935360783643715, + "grad_norm": 22.771718978881836, + "learning_rate": 6.678844616965408e-07, + "loss": 2.4102, + "step": 43330 + }, + { + "epoch": 4.93649980067202, + "grad_norm": 49.082252502441406, + "learning_rate": 6.564676332914717e-07, + "loss": 1.8165, + "step": 43340 + }, + { + "epoch": 4.937638817700325, + "grad_norm": 22.63939094543457, + "learning_rate": 6.450508048864025e-07, + "loss": 1.9551, + "step": 43350 + }, + { + "epoch": 4.938777834728629, + "grad_norm": 21.749906539916992, + "learning_rate": 6.336339764813335e-07, + "loss": 2.1161, + "step": 43360 + }, + { + "epoch": 4.939916851756934, + "grad_norm": 18.062833786010742, + "learning_rate": 6.222171480762645e-07, + "loss": 2.3725, + "step": 43370 + }, + { + "epoch": 4.941055868785238, + "grad_norm": 23.986589431762695, + "learning_rate": 6.108003196711953e-07, + "loss": 2.0771, + "step": 43380 + }, + { + "epoch": 4.942194885813543, + "grad_norm": 34.163490295410156, + "learning_rate": 5.993834912661263e-07, + "loss": 2.2228, + "step": 43390 + }, + { + "epoch": 4.9433339028418475, + "grad_norm": 55.43227767944336, + "learning_rate": 5.879666628610573e-07, + "loss": 1.9449, + "step": 43400 + }, + { + "epoch": 4.944472919870152, + "grad_norm": 21.36100959777832, + "learning_rate": 5.765498344559882e-07, + "loss": 2.0822, + "step": 43410 + }, + { + "epoch": 4.945611936898457, + "grad_norm": 21.50360870361328, + "learning_rate": 5.651330060509192e-07, + "loss": 2.1207, + "step": 43420 + 
}, + { + "epoch": 4.946750953926761, + "grad_norm": 19.970691680908203, + "learning_rate": 5.5371617764585e-07, + "loss": 2.066, + "step": 43430 + }, + { + "epoch": 4.947889970955066, + "grad_norm": 25.23279571533203, + "learning_rate": 5.422993492407809e-07, + "loss": 2.0744, + "step": 43440 + }, + { + "epoch": 4.94902898798337, + "grad_norm": 22.18399429321289, + "learning_rate": 5.308825208357119e-07, + "loss": 1.8488, + "step": 43450 + }, + { + "epoch": 4.950168005011675, + "grad_norm": 35.336830139160156, + "learning_rate": 5.194656924306428e-07, + "loss": 2.0321, + "step": 43460 + }, + { + "epoch": 4.9513070220399795, + "grad_norm": 19.688838958740234, + "learning_rate": 5.080488640255737e-07, + "loss": 1.9023, + "step": 43470 + }, + { + "epoch": 4.952446039068284, + "grad_norm": 19.8863468170166, + "learning_rate": 4.966320356205047e-07, + "loss": 2.2773, + "step": 43480 + }, + { + "epoch": 4.953585056096589, + "grad_norm": 18.23637580871582, + "learning_rate": 4.852152072154356e-07, + "loss": 2.4237, + "step": 43490 + }, + { + "epoch": 4.954724073124893, + "grad_norm": 29.53800392150879, + "learning_rate": 4.7379837881036655e-07, + "loss": 2.137, + "step": 43500 + }, + { + "epoch": 4.955863090153198, + "grad_norm": 26.814315795898438, + "learning_rate": 4.623815504052974e-07, + "loss": 2.0225, + "step": 43510 + }, + { + "epoch": 4.957002107181502, + "grad_norm": 23.36654281616211, + "learning_rate": 4.5096472200022837e-07, + "loss": 1.99, + "step": 43520 + }, + { + "epoch": 4.958141124209807, + "grad_norm": 33.0721549987793, + "learning_rate": 4.395478935951593e-07, + "loss": 1.704, + "step": 43530 + }, + { + "epoch": 4.959280141238112, + "grad_norm": 16.49612045288086, + "learning_rate": 4.2813106519009024e-07, + "loss": 1.9406, + "step": 43540 + }, + { + "epoch": 4.960419158266416, + "grad_norm": 53.89641189575195, + "learning_rate": 4.167142367850212e-07, + "loss": 1.9357, + "step": 43550 + }, + { + "epoch": 4.961558175294721, + "grad_norm": 
17.75715446472168, + "learning_rate": 4.0529740837995206e-07, + "loss": 2.1469, + "step": 43560 + }, + { + "epoch": 4.962697192323025, + "grad_norm": 19.26421546936035, + "learning_rate": 3.93880579974883e-07, + "loss": 2.3053, + "step": 43570 + }, + { + "epoch": 4.96383620935133, + "grad_norm": 42.125301361083984, + "learning_rate": 3.8246375156981394e-07, + "loss": 2.6552, + "step": 43580 + }, + { + "epoch": 4.964975226379634, + "grad_norm": 19.557138442993164, + "learning_rate": 3.710469231647448e-07, + "loss": 2.0609, + "step": 43590 + }, + { + "epoch": 4.966114243407939, + "grad_norm": 17.626657485961914, + "learning_rate": 3.596300947596758e-07, + "loss": 2.2335, + "step": 43600 + }, + { + "epoch": 4.967253260436244, + "grad_norm": 22.223346710205078, + "learning_rate": 3.482132663546067e-07, + "loss": 2.2536, + "step": 43610 + }, + { + "epoch": 4.968392277464548, + "grad_norm": 22.907625198364258, + "learning_rate": 3.3679643794953764e-07, + "loss": 2.0249, + "step": 43620 + }, + { + "epoch": 4.969531294492853, + "grad_norm": 23.45240592956543, + "learning_rate": 3.2537960954446857e-07, + "loss": 2.1345, + "step": 43630 + }, + { + "epoch": 4.9706703115211575, + "grad_norm": 22.90519142150879, + "learning_rate": 3.139627811393995e-07, + "loss": 1.8825, + "step": 43640 + }, + { + "epoch": 4.971809328549462, + "grad_norm": 34.165626525878906, + "learning_rate": 3.025459527343304e-07, + "loss": 2.0895, + "step": 43650 + }, + { + "epoch": 4.972948345577766, + "grad_norm": 14.791748046875, + "learning_rate": 2.9112912432926133e-07, + "loss": 2.1784, + "step": 43660 + }, + { + "epoch": 4.974087362606071, + "grad_norm": 24.21409797668457, + "learning_rate": 2.7971229592419227e-07, + "loss": 2.3927, + "step": 43670 + }, + { + "epoch": 4.975226379634376, + "grad_norm": 31.54176139831543, + "learning_rate": 2.682954675191232e-07, + "loss": 2.4405, + "step": 43680 + }, + { + "epoch": 4.97636539666268, + "grad_norm": 19.191059112548828, + "learning_rate": 
2.568786391140541e-07, + "loss": 1.9045, + "step": 43690 + }, + { + "epoch": 4.977504413690985, + "grad_norm": 55.505306243896484, + "learning_rate": 2.4546181070898503e-07, + "loss": 2.2202, + "step": 43700 + }, + { + "epoch": 4.9786434307192895, + "grad_norm": 16.878713607788086, + "learning_rate": 2.34044982303916e-07, + "loss": 1.9728, + "step": 43710 + }, + { + "epoch": 4.979782447747594, + "grad_norm": 19.552278518676758, + "learning_rate": 2.226281538988469e-07, + "loss": 2.2127, + "step": 43720 + }, + { + "epoch": 4.980921464775898, + "grad_norm": 20.828571319580078, + "learning_rate": 2.1121132549377784e-07, + "loss": 1.9799, + "step": 43730 + }, + { + "epoch": 4.982060481804203, + "grad_norm": 37.50254821777344, + "learning_rate": 1.9979449708870875e-07, + "loss": 2.1735, + "step": 43740 + }, + { + "epoch": 4.983199498832508, + "grad_norm": 42.67470169067383, + "learning_rate": 1.883776686836397e-07, + "loss": 1.9895, + "step": 43750 + }, + { + "epoch": 4.984338515860812, + "grad_norm": 27.682607650756836, + "learning_rate": 1.769608402785706e-07, + "loss": 1.9217, + "step": 43760 + }, + { + "epoch": 4.985477532889116, + "grad_norm": 20.88173484802246, + "learning_rate": 1.6554401187350157e-07, + "loss": 1.8065, + "step": 43770 + }, + { + "epoch": 4.9866165499174215, + "grad_norm": 17.5195369720459, + "learning_rate": 1.5412718346843248e-07, + "loss": 2.0011, + "step": 43780 + }, + { + "epoch": 4.987755566945726, + "grad_norm": 16.822736740112305, + "learning_rate": 1.4271035506336341e-07, + "loss": 2.0502, + "step": 43790 + }, + { + "epoch": 4.98889458397403, + "grad_norm": 36.95533752441406, + "learning_rate": 1.3129352665829435e-07, + "loss": 1.9952, + "step": 43800 + }, + { + "epoch": 4.9900336010023345, + "grad_norm": 29.974193572998047, + "learning_rate": 1.1987669825322526e-07, + "loss": 1.7213, + "step": 43810 + }, + { + "epoch": 4.99117261803064, + "grad_norm": 19.002042770385742, + "learning_rate": 1.0845986984815619e-07, + "loss": 2.1767, + 
"step": 43820 + }, + { + "epoch": 4.992311635058944, + "grad_norm": 18.42278480529785, + "learning_rate": 9.704304144308711e-08, + "loss": 1.8299, + "step": 43830 + }, + { + "epoch": 4.993450652087248, + "grad_norm": 15.805915832519531, + "learning_rate": 8.562621303801805e-08, + "loss": 2.1852, + "step": 43840 + }, + { + "epoch": 4.994589669115554, + "grad_norm": 24.35159683227539, + "learning_rate": 7.420938463294897e-08, + "loss": 2.0954, + "step": 43850 + }, + { + "epoch": 4.995728686143858, + "grad_norm": 21.857868194580078, + "learning_rate": 6.27925562278799e-08, + "loss": 2.0298, + "step": 43860 + }, + { + "epoch": 4.996867703172162, + "grad_norm": 24.798961639404297, + "learning_rate": 5.137572782281082e-08, + "loss": 1.8788, + "step": 43870 + }, + { + "epoch": 4.9980067202004665, + "grad_norm": 18.57286834716797, + "learning_rate": 3.995889941774175e-08, + "loss": 2.333, + "step": 43880 + }, + { + "epoch": 4.999145737228772, + "grad_norm": 22.775592803955078, + "learning_rate": 2.854207101267268e-08, + "loss": 1.992, + "step": 43890 + }, + { + "epoch": 4.999715245742924, + "step": 43895, + "total_flos": 0.0, + "train_loss": 4.220472787352355, + "train_runtime": 63883.0517, + "train_samples_per_second": 5.497, + "train_steps_per_second": 0.687 + } + ], + "logging_steps": 10, + "max_steps": 43895, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 4000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}