{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.644854490574634, "eval_steps": 2000, "global_step": 32000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 9.391961097717285, "eval_runtime": 9.2079, "eval_samples_per_second": 1.629, "eval_steps_per_second": 0.217, "step": 0 }, { "epoch": 0.001139017028304573, "grad_norm": 33.790897369384766, "learning_rate": 3.5000000000000004e-06, "loss": 8.9613, "step": 10 }, { "epoch": 0.002278034056609146, "grad_norm": 35.671512603759766, "learning_rate": 8.000000000000001e-06, "loss": 8.507, "step": 20 }, { "epoch": 0.0034170510849137197, "grad_norm": 24.397680282592773, "learning_rate": 1.3000000000000001e-05, "loss": 7.9465, "step": 30 }, { "epoch": 0.004556068113218292, "grad_norm": 15.802092552185059, "learning_rate": 1.8e-05, "loss": 7.3319, "step": 40 }, { "epoch": 0.005695085141522865, "grad_norm": 14.520373344421387, "learning_rate": 2.3000000000000003e-05, "loss": 7.3491, "step": 50 }, { "epoch": 0.006834102169827439, "grad_norm": 14.86996841430664, "learning_rate": 2.8000000000000003e-05, "loss": 6.8979, "step": 60 }, { "epoch": 0.007973119198132012, "grad_norm": 14.0488920211792, "learning_rate": 3.3e-05, "loss": 6.9052, "step": 70 }, { "epoch": 0.009112136226436585, "grad_norm": 13.32886028289795, "learning_rate": 3.8e-05, "loss": 7.3002, "step": 80 }, { "epoch": 0.010251153254741158, "grad_norm": 11.834951400756836, "learning_rate": 4.3e-05, "loss": 6.9231, "step": 90 }, { "epoch": 0.01139017028304573, "grad_norm": 17.501251220703125, "learning_rate": 4.8e-05, "loss": 6.6756, "step": 100 }, { "epoch": 0.012529187311350304, "grad_norm": 10.640851020812988, "learning_rate": 4.999314990295696e-05, "loss": 6.861, "step": 110 }, { "epoch": 0.013668204339654879, "grad_norm": 58.233848571777344, "learning_rate": 4.998173307455189e-05, "loss": 7.1125, "step": 120 }, { "epoch": 0.014807221367959452, "grad_norm": 15.781530380249023, "learning_rate": 4.997031624614683e-05, "loss": 7.0402, "step": 130 }, { "epoch": 0.015946238396264023, "grad_norm": 10.17588996887207, "learning_rate": 4.995889941774175e-05, "loss": 6.9334, "step": 140 }, { "epoch": 0.017085255424568596, "grad_norm": 14.133599281311035, "learning_rate": 4.9947482589336685e-05, "loss": 6.7971, "step": 150 }, { "epoch": 0.01822427245287317, "grad_norm": 10.439510345458984, "learning_rate": 4.993606576093162e-05, "loss": 7.2431, "step": 160 }, { "epoch": 0.019363289481177742, "grad_norm": 10.945755004882812, "learning_rate": 4.992464893252655e-05, "loss": 6.794, "step": 170 }, { "epoch": 0.020502306509482315, "grad_norm": 14.820952415466309, "learning_rate": 4.9913232104121474e-05, "loss": 6.6915, "step": 180 }, { "epoch": 0.02164132353778689, "grad_norm": 13.331741333007812, "learning_rate": 4.9901815275716405e-05, "loss": 6.9742, "step": 190 }, { "epoch": 0.02278034056609146, "grad_norm": 10.086663246154785, "learning_rate": 4.989039844731134e-05, "loss": 6.8214, "step": 200 }, { "epoch": 0.023919357594396035, "grad_norm": 8.214897155761719, "learning_rate": 4.987898161890627e-05, "loss": 6.899, "step": 210 }, { "epoch": 0.025058374622700608, "grad_norm": 9.853551864624023, "learning_rate": 4.98675647905012e-05, "loss": 6.893, "step": 220 }, { "epoch": 0.026197391651005184, "grad_norm": 8.813733100891113, "learning_rate": 4.985614796209613e-05, "loss": 6.6315, "step": 230 }, { "epoch": 0.027336408679309757, "grad_norm": 15.19206714630127, "learning_rate": 4.9844731133691065e-05, "loss": 6.2331, "step": 240 }, { "epoch": 0.02847542570761433, "grad_norm": 11.389547348022461, "learning_rate": 4.9833314305285996e-05, "loss": 6.5654, "step": 250 }, { "epoch": 0.029614442735918903, "grad_norm": 99.71858978271484, "learning_rate": 4.982418084256194e-05, "loss": 7.0021, "step": 260 }, { "epoch": 0.030753459764223477, "grad_norm": 8.42121696472168, "learning_rate": 4.981276401415687e-05, "loss": 7.3346, "step": 270 }, { "epoch": 0.031892476792528046, "grad_norm": 8.211444854736328, "learning_rate": 4.98013471857518e-05, "loss": 6.4807, "step": 280 }, { "epoch": 0.03303149382083262, "grad_norm": 32.79572296142578, "learning_rate": 4.978993035734673e-05, "loss": 6.1654, "step": 290 }, { "epoch": 0.03417051084913719, "grad_norm": 11.948236465454102, "learning_rate": 4.9778513528941664e-05, "loss": 6.9469, "step": 300 }, { "epoch": 0.035309527877441765, "grad_norm": 8.02177619934082, "learning_rate": 4.976709670053659e-05, "loss": 6.8169, "step": 310 }, { "epoch": 0.03644854490574634, "grad_norm": 7.713070392608643, "learning_rate": 4.975567987213152e-05, "loss": 6.528, "step": 320 }, { "epoch": 0.03758756193405091, "grad_norm": 8.871659278869629, "learning_rate": 4.974426304372646e-05, "loss": 6.445, "step": 330 }, { "epoch": 0.038726578962355485, "grad_norm": 9.975062370300293, "learning_rate": 4.973284621532139e-05, "loss": 6.7275, "step": 340 }, { "epoch": 0.03986559599066006, "grad_norm": 9.243513107299805, "learning_rate": 4.9721429386916316e-05, "loss": 6.3645, "step": 350 }, { "epoch": 0.04100461301896463, "grad_norm": 7.769169807434082, "learning_rate": 4.971001255851125e-05, "loss": 6.568, "step": 360 }, { "epoch": 0.042143630047269204, "grad_norm": 11.605182647705078, "learning_rate": 4.969859573010618e-05, "loss": 6.6352, "step": 370 }, { "epoch": 0.04328264707557378, "grad_norm": 8.109936714172363, "learning_rate": 4.968717890170111e-05, "loss": 6.7942, "step": 380 }, { "epoch": 0.04442166410387835, "grad_norm": 11.485625267028809, "learning_rate": 4.967576207329604e-05, "loss": 6.6572, "step": 390 }, { "epoch": 0.04556068113218292, "grad_norm": 10.854982376098633, "learning_rate": 4.966434524489097e-05, "loss": 6.4459, "step": 400 }, { "epoch": 0.046699698160487496, "grad_norm": 13.574692726135254, "learning_rate": 4.96529284164859e-05, "loss": 6.4014, "step": 410 }, { "epoch": 0.04783871518879207, "grad_norm": 10.07735824584961, "learning_rate": 4.964151158808084e-05, "loss": 6.5581, "step": 420 }, { "epoch": 0.04897773221709664, "grad_norm": 11.299982070922852, "learning_rate": 4.9630094759675764e-05, "loss": 6.6429, "step": 430 }, { "epoch": 0.050116749245401215, "grad_norm": 12.302263259887695, "learning_rate": 4.9618677931270696e-05, "loss": 6.9176, "step": 440 }, { "epoch": 0.051255766273705795, "grad_norm": 280.8349304199219, "learning_rate": 4.960726110286563e-05, "loss": 6.115, "step": 450 }, { "epoch": 0.05239478330201037, "grad_norm": 11.660781860351562, "learning_rate": 4.959584427446056e-05, "loss": 6.8566, "step": 460 }, { "epoch": 0.05353380033031494, "grad_norm": 9.093023300170898, "learning_rate": 4.9584427446055485e-05, "loss": 6.8361, "step": 470 }, { "epoch": 0.054672817358619515, "grad_norm": 11.05521297454834, "learning_rate": 4.9573010617650417e-05, "loss": 6.4291, "step": 480 }, { "epoch": 0.05581183438692409, "grad_norm": 21.002239227294922, "learning_rate": 4.956159378924535e-05, "loss": 6.4612, "step": 490 }, { "epoch": 0.05695085141522866, "grad_norm": 10.657567024230957, "learning_rate": 4.955017696084028e-05, "loss": 6.3579, "step": 500 }, { "epoch": 0.058089868443533234, "grad_norm": 11.03357219696045, "learning_rate": 4.953876013243521e-05, "loss": 6.4549, "step": 510 }, { "epoch": 0.05922888547183781, "grad_norm": 9.404083251953125, "learning_rate": 4.9527343304030144e-05, "loss": 6.0746, "step": 520 }, { "epoch": 0.06036790250014238, "grad_norm": 7.945953845977783, "learning_rate": 4.9515926475625076e-05, "loss": 6.3596, "step": 530 }, { "epoch": 0.06150691952844695, "grad_norm": 7.898467063903809, "learning_rate": 4.950450964722001e-05, "loss": 5.6804, "step": 540 }, { "epoch": 0.06264593655675152, "grad_norm": 15.345819473266602, "learning_rate": 4.949309281881493e-05, "loss": 6.6344, "step": 550 }, { "epoch": 0.06378495358505609, "grad_norm": 10.704774856567383, "learning_rate": 4.9481675990409864e-05, "loss": 6.2971, "step": 560 }, { "epoch": 0.06492397061336067, "grad_norm": 18.340755462646484, "learning_rate": 4.9470259162004796e-05, "loss": 6.5508, "step": 570 }, { "epoch": 0.06606298764166524, "grad_norm": 15.063714981079102, "learning_rate": 4.945884233359973e-05, "loss": 6.291, "step": 580 }, { "epoch": 0.06720200466996981, "grad_norm": 11.307854652404785, "learning_rate": 4.944742550519466e-05, "loss": 6.2575, "step": 590 }, { "epoch": 0.06834102169827438, "grad_norm": 9.267541885375977, "learning_rate": 4.943600867678959e-05, "loss": 6.1397, "step": 600 }, { "epoch": 0.06948003872657896, "grad_norm": 8.097043991088867, "learning_rate": 4.9424591848384524e-05, "loss": 6.4305, "step": 610 }, { "epoch": 0.07061905575488353, "grad_norm": 14.195074081420898, "learning_rate": 4.941317501997945e-05, "loss": 6.1038, "step": 620 }, { "epoch": 0.0717580727831881, "grad_norm": 7.2899980545043945, "learning_rate": 4.940175819157438e-05, "loss": 6.5986, "step": 630 }, { "epoch": 0.07289708981149268, "grad_norm": 7.332784175872803, "learning_rate": 4.939034136316931e-05, "loss": 6.5741, "step": 640 }, { "epoch": 0.07403610683979725, "grad_norm": 17.372455596923828, "learning_rate": 4.9378924534764244e-05, "loss": 6.4233, "step": 650 }, { "epoch": 0.07517512386810182, "grad_norm": 6.290081024169922, "learning_rate": 4.9367507706359176e-05, "loss": 6.1218, "step": 660 }, { "epoch": 0.0763141408964064, "grad_norm": 11.476178169250488, "learning_rate": 4.935609087795411e-05, "loss": 6.3903, "step": 670 }, { "epoch": 0.07745315792471097, "grad_norm": 9.72933578491211, "learning_rate": 4.934467404954904e-05, "loss": 6.2056, "step": 680 }, { "epoch": 0.07859217495301554, "grad_norm": 13.388916015625, "learning_rate": 4.933325722114397e-05, "loss": 6.3394, "step": 690 }, { "epoch": 0.07973119198132012, "grad_norm": 9.48354434967041, "learning_rate": 4.93218403927389e-05, "loss": 6.3088, "step": 700 }, { "epoch": 0.08087020900962469, "grad_norm": 8.708009719848633, "learning_rate": 4.931042356433383e-05, "loss": 5.4808, "step": 710 }, { "epoch": 0.08200922603792926, "grad_norm": 10.767496109008789, "learning_rate": 4.929900673592876e-05, "loss": 6.7952, "step": 720 }, { "epoch": 0.08314824306623383, "grad_norm": 20.067209243774414, "learning_rate": 4.928758990752369e-05, "loss": 6.3908, "step": 730 }, { "epoch": 0.08428726009453841, "grad_norm": 9.885820388793945, "learning_rate": 4.927617307911862e-05, "loss": 6.3423, "step": 740 }, { "epoch": 0.08542627712284298, "grad_norm": 19.079978942871094, "learning_rate": 4.9264756250713556e-05, "loss": 6.742, "step": 750 }, { "epoch": 0.08656529415114755, "grad_norm": 8.339012145996094, "learning_rate": 4.925333942230849e-05, "loss": 6.5489, "step": 760 }, { "epoch": 0.08770431117945213, "grad_norm": 6.8558125495910645, "learning_rate": 4.924192259390342e-05, "loss": 6.2821, "step": 770 }, { "epoch": 0.0888433282077567, "grad_norm": 9.564237594604492, "learning_rate": 4.9230505765498345e-05, "loss": 6.4705, "step": 780 }, { "epoch": 0.08998234523606127, "grad_norm": 8.423482894897461, "learning_rate": 4.9219088937093276e-05, "loss": 6.277, "step": 790 }, { "epoch": 0.09112136226436585, "grad_norm": 11.994951248168945, "learning_rate": 4.920767210868821e-05, "loss": 6.2951, "step": 800 }, { "epoch": 0.09226037929267042, "grad_norm": 6.857611179351807, "learning_rate": 4.919625528028314e-05, "loss": 6.2825, "step": 810 }, { "epoch": 0.09339939632097499, "grad_norm": 10.049439430236816, "learning_rate": 4.9184838451878065e-05, "loss": 6.6446, "step": 820 }, { "epoch": 0.09453841334927957, "grad_norm": 10.375741958618164, "learning_rate": 4.9173421623473004e-05, "loss": 6.4659, "step": 830 }, { "epoch": 0.09567743037758414, "grad_norm": 16.795923233032227, "learning_rate": 4.9162004795067936e-05, "loss": 6.0484, "step": 840 }, { "epoch": 0.09681644740588871, "grad_norm": 9.408246994018555, "learning_rate": 4.915058796666287e-05, "loss": 6.2876, "step": 850 }, { "epoch": 0.09795546443419328, "grad_norm": 9.622159004211426, "learning_rate": 4.913917113825779e-05, "loss": 6.2234, "step": 860 }, { "epoch": 0.09909448146249786, "grad_norm": 6.719747543334961, "learning_rate": 4.9127754309852724e-05, "loss": 6.6025, "step": 870 }, { "epoch": 0.10023349849080243, "grad_norm": 7.316214561462402, "learning_rate": 4.9116337481447656e-05, "loss": 6.0127, "step": 880 }, { "epoch": 0.101372515519107, "grad_norm": 8.370978355407715, "learning_rate": 4.910492065304259e-05, "loss": 6.3505, "step": 890 }, { "epoch": 0.10251153254741159, "grad_norm": 9.227510452270508, "learning_rate": 4.909350382463751e-05, "loss": 6.6696, "step": 900 }, { "epoch": 0.10365054957571616, "grad_norm": 8.848276138305664, "learning_rate": 4.908208699623245e-05, "loss": 6.5989, "step": 910 }, { "epoch": 0.10478956660402074, "grad_norm": 6.87146520614624, "learning_rate": 4.9070670167827383e-05, "loss": 6.1757, "step": 920 }, { "epoch": 0.10592858363232531, "grad_norm": 12.845023155212402, "learning_rate": 4.9059253339422315e-05, "loss": 6.155, "step": 930 }, { "epoch": 0.10706760066062988, "grad_norm": 8.775230407714844, "learning_rate": 4.904783651101724e-05, "loss": 6.6463, "step": 940 }, { "epoch": 0.10820661768893446, "grad_norm": 6.301644802093506, "learning_rate": 4.903641968261217e-05, "loss": 6.3834, "step": 950 }, { "epoch": 0.10934563471723903, "grad_norm": 6.351837635040283, "learning_rate": 4.9025002854207104e-05, "loss": 6.0652, "step": 960 }, { "epoch": 0.1104846517455436, "grad_norm": 12.989243507385254, "learning_rate": 4.9013586025802036e-05, "loss": 6.133, "step": 970 }, { "epoch": 0.11162366877384818, "grad_norm": 10.382040977478027, "learning_rate": 4.900216919739696e-05, "loss": 6.0956, "step": 980 }, { "epoch": 0.11276268580215275, "grad_norm": 10.52625560760498, "learning_rate": 4.89907523689919e-05, "loss": 6.7882, "step": 990 }, { "epoch": 0.11390170283045732, "grad_norm": 7.049230098724365, "learning_rate": 4.897933554058683e-05, "loss": 6.3438, "step": 1000 }, { "epoch": 0.1150407198587619, "grad_norm": 6.671297073364258, "learning_rate": 4.8967918712181756e-05, "loss": 6.4511, "step": 1010 }, { "epoch": 0.11617973688706647, "grad_norm": 6.536120891571045, "learning_rate": 4.895650188377669e-05, "loss": 6.1108, "step": 1020 }, { "epoch": 0.11731875391537104, "grad_norm": 13.030671119689941, "learning_rate": 4.894508505537162e-05, "loss": 6.251, "step": 1030 }, { "epoch": 0.11845777094367561, "grad_norm": 6.829502582550049, "learning_rate": 4.893366822696655e-05, "loss": 6.0828, "step": 1040 }, { "epoch": 0.11959678797198019, "grad_norm": 7.369673252105713, "learning_rate": 4.8922251398561484e-05, "loss": 6.0556, "step": 1050 }, { "epoch": 0.12073580500028476, "grad_norm": 8.547041893005371, "learning_rate": 4.891083457015641e-05, "loss": 6.1014, "step": 1060 }, { "epoch": 0.12187482202858933, "grad_norm": 15.62175464630127, "learning_rate": 4.889941774175134e-05, "loss": 6.2186, "step": 1070 }, { "epoch": 0.1230138390568939, "grad_norm": 12.673507690429688, "learning_rate": 4.888800091334628e-05, "loss": 6.6656, "step": 1080 }, { "epoch": 0.12415285608519848, "grad_norm": 6.221761226654053, "learning_rate": 4.8876584084941204e-05, "loss": 6.1597, "step": 1090 }, { "epoch": 0.12529187311350304, "grad_norm": 6.281069755554199, "learning_rate": 4.8865167256536136e-05, "loss": 6.8511, "step": 1100 }, { "epoch": 0.1264308901418076, "grad_norm": 10.124175071716309, "learning_rate": 4.885375042813107e-05, "loss": 6.1205, "step": 1110 }, { "epoch": 0.12756990717011218, "grad_norm": 12.35534381866455, "learning_rate": 4.8842333599726e-05, "loss": 6.2648, "step": 1120 }, { "epoch": 0.12870892419841676, "grad_norm": 8.898721694946289, "learning_rate": 4.8830916771320925e-05, "loss": 6.4631, "step": 1130 }, { "epoch": 0.12984794122672133, "grad_norm": 5.725305557250977, "learning_rate": 4.881949994291586e-05, "loss": 6.5257, "step": 1140 }, { "epoch": 0.1309869582550259, "grad_norm": 6.875019550323486, "learning_rate": 4.880808311451079e-05, "loss": 6.4072, "step": 1150 }, { "epoch": 0.13212597528333048, "grad_norm": 6.568986415863037, "learning_rate": 4.879666628610573e-05, "loss": 6.2429, "step": 1160 }, { "epoch": 0.13326499231163505, "grad_norm": 6.279566287994385, "learning_rate": 4.878524945770065e-05, "loss": 6.1154, "step": 1170 }, { "epoch": 0.13440400933993962, "grad_norm": 6.139650344848633, "learning_rate": 4.8773832629295584e-05, "loss": 6.4415, "step": 1180 }, { "epoch": 0.1355430263682442, "grad_norm": 15.926244735717773, "learning_rate": 4.8762415800890516e-05, "loss": 5.7659, "step": 1190 }, { "epoch": 0.13668204339654877, "grad_norm": 8.014056205749512, "learning_rate": 4.875099897248545e-05, "loss": 6.0903, "step": 1200 }, { "epoch": 0.13782106042485334, "grad_norm": 8.850829124450684, "learning_rate": 4.873958214408037e-05, "loss": 6.1817, "step": 1210 }, { "epoch": 0.13896007745315792, "grad_norm": 38.34330368041992, "learning_rate": 4.8728165315675305e-05, "loss": 6.6666, "step": 1220 }, { "epoch": 0.1400990944814625, "grad_norm": 11.398346900939941, "learning_rate": 4.8716748487270237e-05, "loss": 6.2435, "step": 1230 }, { "epoch": 0.14123811150976706, "grad_norm": 9.657853126525879, "learning_rate": 4.8705331658865175e-05, "loss": 5.9659, "step": 1240 }, { "epoch": 0.14237712853807163, "grad_norm": 16.08867645263672, "learning_rate": 4.86939148304601e-05, "loss": 6.1505, "step": 1250 }, { "epoch": 0.1435161455663762, "grad_norm": 6.016851425170898, "learning_rate": 4.868249800205503e-05, "loss": 6.1347, "step": 1260 }, { "epoch": 0.14465516259468078, "grad_norm": 7.4131855964660645, "learning_rate": 4.8671081173649964e-05, "loss": 6.1454, "step": 1270 }, { "epoch": 0.14579417962298535, "grad_norm": 27.70649528503418, "learning_rate": 4.8659664345244896e-05, "loss": 6.2655, "step": 1280 }, { "epoch": 0.14693319665128993, "grad_norm": 23.359214782714844, "learning_rate": 4.864824751683982e-05, "loss": 5.9975, "step": 1290 }, { "epoch": 0.1480722136795945, "grad_norm": 7.8880510330200195, "learning_rate": 4.863683068843475e-05, "loss": 6.373, "step": 1300 }, { "epoch": 0.14921123070789907, "grad_norm": 8.129782676696777, "learning_rate": 4.8625413860029684e-05, "loss": 6.0534, "step": 1310 }, { "epoch": 0.15035024773620365, "grad_norm": 5.611935615539551, "learning_rate": 4.8613997031624616e-05, "loss": 6.4988, "step": 1320 }, { "epoch": 0.15148926476450822, "grad_norm": 6.0331597328186035, "learning_rate": 4.860258020321955e-05, "loss": 5.8687, "step": 1330 }, { "epoch": 0.1526282817928128, "grad_norm": 6.858291149139404, "learning_rate": 4.859116337481448e-05, "loss": 6.0291, "step": 1340 }, { "epoch": 0.15376729882111737, "grad_norm": 6.007782936096191, "learning_rate": 4.857974654640941e-05, "loss": 5.9691, "step": 1350 }, { "epoch": 0.15490631584942194, "grad_norm": 5.932456970214844, "learning_rate": 4.8568329718004344e-05, "loss": 6.3551, "step": 1360 }, { "epoch": 0.1560453328777265, "grad_norm": 6.164700984954834, "learning_rate": 4.855691288959927e-05, "loss": 6.0607, "step": 1370 }, { "epoch": 0.15718434990603108, "grad_norm": 6.078372478485107, "learning_rate": 4.85454960611942e-05, "loss": 6.1052, "step": 1380 }, { "epoch": 0.15832336693433566, "grad_norm": 9.424004554748535, "learning_rate": 4.853407923278913e-05, "loss": 6.2598, "step": 1390 }, { "epoch": 0.15946238396264023, "grad_norm": 14.353684425354004, "learning_rate": 4.8522662404384064e-05, "loss": 6.1377, "step": 1400 }, { "epoch": 0.1606014009909448, "grad_norm": 8.278711318969727, "learning_rate": 4.8511245575978996e-05, "loss": 6.2278, "step": 1410 }, { "epoch": 0.16174041801924938, "grad_norm": 9.254868507385254, "learning_rate": 4.849982874757393e-05, "loss": 6.0705, "step": 1420 }, { "epoch": 0.16287943504755395, "grad_norm": 9.009742736816406, "learning_rate": 4.848841191916886e-05, "loss": 6.3962, "step": 1430 }, { "epoch": 0.16401845207585852, "grad_norm": 9.298644065856934, "learning_rate": 4.8476995090763785e-05, "loss": 6.2293, "step": 1440 }, { "epoch": 0.1651574691041631, "grad_norm": 10.836934089660645, "learning_rate": 4.846557826235872e-05, "loss": 6.0921, "step": 1450 }, { "epoch": 0.16629648613246767, "grad_norm": 19.63772201538086, "learning_rate": 4.845416143395365e-05, "loss": 6.4046, "step": 1460 }, { "epoch": 0.16743550316077224, "grad_norm": 15.014579772949219, "learning_rate": 4.844274460554858e-05, "loss": 6.4525, "step": 1470 }, { "epoch": 0.16857452018907682, "grad_norm": 7.850244522094727, "learning_rate": 4.843132777714351e-05, "loss": 6.0932, "step": 1480 }, { "epoch": 0.1697135372173814, "grad_norm": 7.637505531311035, "learning_rate": 4.8419910948738444e-05, "loss": 6.0807, "step": 1490 }, { "epoch": 0.17085255424568596, "grad_norm": 5.155618667602539, "learning_rate": 4.8408494120333376e-05, "loss": 6.2082, "step": 1500 }, { "epoch": 0.17199157127399053, "grad_norm": 8.438528060913086, "learning_rate": 4.839707729192831e-05, "loss": 6.0389, "step": 1510 }, { "epoch": 0.1731305883022951, "grad_norm": 31.926239013671875, "learning_rate": 4.838566046352323e-05, "loss": 5.9333, "step": 1520 }, { "epoch": 0.17426960533059968, "grad_norm": 5.836406707763672, "learning_rate": 4.8374243635118165e-05, "loss": 6.0942, "step": 1530 }, { "epoch": 0.17540862235890425, "grad_norm": 8.088964462280273, "learning_rate": 4.8362826806713096e-05, "loss": 6.2754, "step": 1540 }, { "epoch": 0.17654763938720883, "grad_norm": 6.764881610870361, "learning_rate": 4.835140997830803e-05, "loss": 5.8569, "step": 1550 }, { "epoch": 0.1776866564155134, "grad_norm": 4.88889217376709, "learning_rate": 4.833999314990296e-05, "loss": 6.378, "step": 1560 }, { "epoch": 0.17882567344381797, "grad_norm": 8.175009727478027, "learning_rate": 4.832857632149789e-05, "loss": 6.2938, "step": 1570 }, { "epoch": 0.17996469047212255, "grad_norm": 7.189762115478516, "learning_rate": 4.8317159493092824e-05, "loss": 6.1562, "step": 1580 }, { "epoch": 0.18110370750042712, "grad_norm": 6.389930248260498, "learning_rate": 4.8305742664687756e-05, "loss": 6.2245, "step": 1590 }, { "epoch": 0.1822427245287317, "grad_norm": 9.998733520507812, "learning_rate": 4.829432583628268e-05, "loss": 5.7088, "step": 1600 }, { "epoch": 0.18338174155703627, "grad_norm": 9.267230033874512, "learning_rate": 4.828290900787761e-05, "loss": 5.9327, "step": 1610 }, { "epoch": 0.18452075858534084, "grad_norm": 7.547935485839844, "learning_rate": 4.8271492179472544e-05, "loss": 5.8324, "step": 1620 }, { "epoch": 0.1856597756136454, "grad_norm": 10.875968933105469, "learning_rate": 4.8260075351067476e-05, "loss": 6.0532, "step": 1630 }, { "epoch": 0.18679879264194998, "grad_norm": 7.93349027633667, "learning_rate": 4.82486585226624e-05, "loss": 6.4827, "step": 1640 }, { "epoch": 0.18793780967025456, "grad_norm": 7.426550388336182, "learning_rate": 4.823724169425734e-05, "loss": 6.1161, "step": 1650 }, { "epoch": 0.18907682669855913, "grad_norm": 5.887362003326416, "learning_rate": 4.822582486585227e-05, "loss": 6.1616, "step": 1660 }, { "epoch": 0.1902158437268637, "grad_norm": 9.981441497802734, "learning_rate": 4.8214408037447204e-05, "loss": 6.0732, "step": 1670 }, { "epoch": 0.19135486075516828, "grad_norm": 7.465058326721191, "learning_rate": 4.820299120904213e-05, "loss": 6.1012, "step": 1680 }, { "epoch": 0.19249387778347285, "grad_norm": 14.853341102600098, "learning_rate": 4.819157438063706e-05, "loss": 6.6688, "step": 1690 }, { "epoch": 0.19363289481177742, "grad_norm": 9.4944486618042, "learning_rate": 4.818015755223199e-05, "loss": 6.0464, "step": 1700 }, { "epoch": 0.194771911840082, "grad_norm": 6.113044261932373, "learning_rate": 4.8168740723826924e-05, "loss": 6.0747, "step": 1710 }, { "epoch": 0.19591092886838657, "grad_norm": 11.900188446044922, "learning_rate": 4.815732389542185e-05, "loss": 6.1851, "step": 1720 }, { "epoch": 0.19704994589669114, "grad_norm": 8.415531158447266, "learning_rate": 4.814590706701679e-05, "loss": 5.9874, "step": 1730 }, { "epoch": 0.19818896292499572, "grad_norm": 5.007725238800049, "learning_rate": 4.813449023861172e-05, "loss": 6.3246, "step": 1740 }, { "epoch": 0.1993279799533003, "grad_norm": 7.706382751464844, "learning_rate": 4.812307341020665e-05, "loss": 6.3739, "step": 1750 }, { "epoch": 0.20046699698160486, "grad_norm": 6.697893142700195, "learning_rate": 4.8111656581801577e-05, "loss": 6.3933, "step": 1760 }, { "epoch": 0.20160601400990943, "grad_norm": 8.579668998718262, "learning_rate": 4.810023975339651e-05, "loss": 5.9253, "step": 1770 }, { "epoch": 0.202745031038214, "grad_norm": 7.312700271606445, "learning_rate": 4.808882292499144e-05, "loss": 5.995, "step": 1780 }, { "epoch": 0.20388404806651858, "grad_norm": 12.99907112121582, "learning_rate": 4.807740609658637e-05, "loss": 6.023, "step": 1790 }, { "epoch": 0.20502306509482318, "grad_norm": 7.628302097320557, "learning_rate": 4.80659892681813e-05, "loss": 6.1802, "step": 1800 }, { "epoch": 0.20616208212312775, "grad_norm": 8.317530632019043, "learning_rate": 4.805457243977623e-05, "loss": 6.2685, "step": 1810 }, { "epoch": 0.20730109915143233, "grad_norm": 14.762248039245605, "learning_rate": 4.804315561137117e-05, "loss": 6.1036, "step": 1820 }, { "epoch": 0.2084401161797369, "grad_norm": 6.367677688598633, "learning_rate": 4.803173878296609e-05, "loss": 5.9443, "step": 1830 }, { "epoch": 0.20957913320804147, "grad_norm": 8.055495262145996, "learning_rate": 4.8020321954561024e-05, "loss": 6.0813, "step": 1840 }, { "epoch": 0.21071815023634605, "grad_norm": 8.737330436706543, "learning_rate": 4.8008905126155956e-05, "loss": 6.2417, "step": 1850 }, { "epoch": 0.21185716726465062, "grad_norm": 7.428030490875244, "learning_rate": 4.799748829775089e-05, "loss": 5.9609, "step": 1860 }, { "epoch": 0.2129961842929552, "grad_norm": 6.4950480461120605, "learning_rate": 4.798607146934582e-05, "loss": 6.1071, "step": 1870 }, { "epoch": 0.21413520132125977, "grad_norm": 13.422469139099121, "learning_rate": 4.7974654640940745e-05, "loss": 6.174, "step": 1880 }, { "epoch": 0.21527421834956434, "grad_norm": 5.788820743560791, "learning_rate": 4.796323781253568e-05, "loss": 6.2645, "step": 1890 }, { "epoch": 0.2164132353778689, "grad_norm": 8.139408111572266, "learning_rate": 4.7951820984130615e-05, "loss": 6.165, "step": 1900 }, { "epoch": 0.21755225240617349, "grad_norm": 5.652318954467773, "learning_rate": 4.794040415572554e-05, "loss": 6.0668, "step": 1910 }, { "epoch": 0.21869126943447806, "grad_norm": 7.127692699432373, "learning_rate": 4.792898732732047e-05, "loss": 6.0703, "step": 1920 }, { "epoch": 0.21983028646278263, "grad_norm": 8.72547435760498, "learning_rate": 4.7917570498915404e-05, "loss": 5.8692, "step": 1930 }, { "epoch": 0.2209693034910872, "grad_norm": 11.749885559082031, "learning_rate": 4.7906153670510336e-05, "loss": 6.355, "step": 1940 }, { "epoch": 0.22210832051939178, "grad_norm": 9.093997955322266, "learning_rate": 4.789473684210526e-05, "loss": 6.0883, "step": 1950 }, { "epoch": 0.22324733754769635, "grad_norm": 7.588223457336426, "learning_rate": 4.788332001370019e-05, "loss": 6.3683, "step": 1960 }, { "epoch": 0.22438635457600092, "grad_norm": 7.856176376342773, "learning_rate": 4.7871903185295125e-05, "loss": 5.9866, "step": 1970 }, { "epoch": 0.2255253716043055, "grad_norm": 7.1797404289245605, "learning_rate": 4.7860486356890063e-05, "loss": 6.1741, "step": 1980 }, { "epoch": 0.22666438863261007, "grad_norm": 7.769150733947754, "learning_rate": 4.784906952848499e-05, "loss": 6.0692, "step": 1990 }, { "epoch": 0.22780340566091464, "grad_norm": 5.899435997009277, "learning_rate": 4.783765270007992e-05, "loss": 5.9925, "step": 2000 }, { "epoch": 0.22780340566091464, "eval_loss": 6.29394006729126, "eval_runtime": 11.0939, "eval_samples_per_second": 1.352, "eval_steps_per_second": 0.18, "step": 2000 }, { "epoch": 0.22894242268921922, "grad_norm": 7.498287677764893, "learning_rate": 4.782623587167485e-05, "loss": 5.915, "step": 2010 }, { "epoch": 0.2300814397175238, "grad_norm": 8.568222045898438, "learning_rate": 4.7814819043269784e-05, "loss": 6.0853, "step": 2020 }, { "epoch": 0.23122045674582836, "grad_norm": 6.620724201202393, "learning_rate": 4.780340221486471e-05, "loss": 5.7249, "step": 2030 }, { "epoch": 0.23235947377413294, "grad_norm": 10.718255996704102, "learning_rate": 4.779198538645964e-05, "loss": 6.023, "step": 2040 }, { "epoch": 0.2334984908024375, "grad_norm": 5.444962024688721, "learning_rate": 4.778056855805457e-05, "loss": 6.0138, "step": 2050 }, { "epoch": 0.23463750783074208, "grad_norm": 7.1495041847229, "learning_rate": 4.776915172964951e-05, "loss": 6.0932, "step": 2060 }, { "epoch": 0.23577652485904665, "grad_norm": 8.905622482299805, "learning_rate": 4.7757734901244436e-05, "loss": 6.1093, "step": 2070 }, { "epoch": 0.23691554188735123, "grad_norm": 6.666223526000977, "learning_rate": 4.774631807283937e-05, "loss": 5.9837, "step": 2080 }, { "epoch": 0.2380545589156558, "grad_norm": 11.08031940460205, "learning_rate": 4.77349012444343e-05, "loss": 6.0752, "step": 2090 }, { "epoch": 0.23919357594396037, "grad_norm": 7.527054786682129, "learning_rate": 4.772348441602923e-05, "loss": 5.9818, "step": 2100 }, { "epoch": 0.24033259297226495, "grad_norm": 6.239260673522949, "learning_rate": 4.771206758762416e-05, "loss": 6.0345, "step": 2110 }, { "epoch": 0.24147161000056952, "grad_norm": 13.21486759185791, "learning_rate": 4.770065075921909e-05, "loss": 5.9158, "step": 2120 }, { "epoch": 0.2426106270288741, "grad_norm": 6.5013322830200195, "learning_rate": 4.768923393081402e-05, "loss": 6.1012, "step": 2130 }, { "epoch": 0.24374964405717867, "grad_norm": 7.801065444946289, "learning_rate": 4.767781710240895e-05, "loss": 5.8506, "step": 2140 }, { "epoch": 0.24488866108548324, "grad_norm": 11.960895538330078, "learning_rate": 4.7666400274003884e-05, "loss": 6.1912, "step": 2150 }, { "epoch": 0.2460276781137878, "grad_norm": 5.73373556137085, "learning_rate": 4.7654983445598816e-05, "loss": 6.1602, "step": 2160 }, { "epoch": 0.24716669514209239, "grad_norm": 6.799802303314209, "learning_rate": 4.764356661719375e-05, "loss": 6.1822, "step": 2170 }, { "epoch": 0.24830571217039696, "grad_norm": 10.62204360961914, "learning_rate": 4.763214978878868e-05, "loss": 6.0969, "step": 2180 }, { "epoch": 0.24944472919870153, "grad_norm": 16.851665496826172, "learning_rate": 4.7620732960383605e-05, "loss": 6.0222, "step": 2190 }, { "epoch": 0.2505837462270061, "grad_norm": 12.263028144836426, "learning_rate": 4.760931613197854e-05, "loss": 6.2636, "step": 2200 }, { "epoch": 0.2517227632553107, "grad_norm": 5.9286370277404785, "learning_rate": 4.759789930357347e-05, "loss": 6.2338, "step": 2210 }, { "epoch": 0.2528617802836152, "grad_norm": 4.939316272735596, "learning_rate": 4.75864824751684e-05, "loss": 6.1357, "step": 2220 }, { "epoch": 0.2540007973119198, "grad_norm": 6.26075553894043, "learning_rate": 4.757506564676333e-05, "loss": 6.0128, "step": 2230 }, { "epoch": 0.25513981434022437, "grad_norm": 6.404390811920166, "learning_rate": 4.7563648818358264e-05, "loss": 6.0365, "step": 2240 }, { "epoch": 0.25627883136852897, "grad_norm": 8.060712814331055, "learning_rate": 4.7552231989953196e-05, "loss": 5.8035, "step": 2250 }, { "epoch": 0.2574178483968335, "grad_norm": 7.4588751792907715, "learning_rate": 4.754081516154813e-05, "loss": 6.0485, "step": 2260 }, { "epoch": 0.2585568654251381, "grad_norm": 9.267732620239258, "learning_rate": 4.752939833314305e-05, "loss": 6.0969, "step": 2270 }, { "epoch": 0.25969588245344266, "grad_norm": 6.651428699493408, "learning_rate": 4.7517981504737985e-05, "loss": 6.4401, "step": 2280 }, { "epoch": 0.26083489948174726, "grad_norm": 13.987489700317383, "learning_rate": 4.7506564676332916e-05, "loss": 6.0228, "step": 2290 }, { "epoch": 0.2619739165100518, "grad_norm": 8.124069213867188, "learning_rate": 4.749514784792785e-05, "loss": 6.1741, "step": 2300 }, { "epoch": 0.2631129335383564, "grad_norm": 5.7567524909973145, "learning_rate": 4.748373101952278e-05, "loss": 6.6153, "step": 2310 }, { "epoch": 0.26425195056666095, "grad_norm": 8.237354278564453, "learning_rate": 4.747231419111771e-05, "loss": 6.9511, "step": 2320 }, { "epoch": 0.26539096759496555, "grad_norm": 9.20639705657959, "learning_rate": 4.7460897362712644e-05, "loss": 6.3056, "step": 2330 }, { "epoch": 0.2665299846232701, "grad_norm": 5.040071964263916, "learning_rate": 4.744948053430757e-05, "loss": 6.1363, "step": 2340 }, { "epoch": 0.2676690016515747, "grad_norm": 6.3060808181762695, "learning_rate": 4.74380637059025e-05, "loss": 6.3329, "step": 2350 }, { "epoch": 0.26880801867987925, "grad_norm": 5.8059306144714355, "learning_rate": 4.742664687749743e-05, "loss": 6.1995, "step": 2360 }, { "epoch": 0.26994703570818385, "grad_norm": 6.453045845031738, "learning_rate": 4.7415230049092364e-05, "loss": 5.8708, "step": 2370 }, { "epoch": 0.2710860527364884, "grad_norm": 10.587589263916016, "learning_rate": 4.7403813220687296e-05, "loss": 5.8575, "step": 2380 }, { "epoch": 0.272225069764793, "grad_norm": 9.980673789978027, "learning_rate": 4.739239639228223e-05, "loss": 6.0367, "step": 2390 }, { "epoch": 0.27336408679309754, "grad_norm": 7.341554164886475, "learning_rate": 4.738097956387716e-05, "loss": 6.0386, "step": 2400 }, { "epoch": 0.27450310382140214, "grad_norm": 6.8661627769470215, "learning_rate": 4.736956273547209e-05, "loss": 6.1036, "step": 2410 }, { "epoch": 0.2756421208497067, "grad_norm": 5.913212299346924, "learning_rate": 4.735814590706702e-05, "loss": 5.9632, "step": 2420 }, { "epoch": 0.2767811378780113, "grad_norm": 6.647164821624756, "learning_rate": 4.734672907866195e-05, "loss": 5.5648, "step": 2430 }, { "epoch": 0.27792015490631583, "grad_norm": 10.95175552368164, "learning_rate": 4.733531225025688e-05, "loss": 5.9986, "step": 2440 }, { "epoch": 0.27905917193462043, "grad_norm": 6.893737316131592, "learning_rate": 4.732389542185181e-05, "loss": 6.2376, "step": 2450 }, { "epoch": 0.280198188962925, "grad_norm": 7.816811561584473, "learning_rate": 4.731247859344674e-05, "loss": 5.9885, "step": 2460 }, { "epoch": 0.2813372059912296, "grad_norm": 6.8919782638549805, "learning_rate": 4.730106176504167e-05, "loss": 5.9962, "step": 2470 }, { "epoch": 0.2824762230195341, "grad_norm": 5.464484214782715, "learning_rate": 4.728964493663661e-05, "loss": 6.105, "step": 2480 }, { "epoch": 0.2836152400478387, "grad_norm": 7.527377605438232, "learning_rate": 4.727822810823154e-05, "loss": 6.2536, "step": 2490 }, { "epoch": 0.28475425707614327, "grad_norm": 6.423839569091797, "learning_rate": 4.7266811279826465e-05, "loss": 5.8717, "step": 2500 }, { "epoch": 0.28589327410444787, "grad_norm": 6.446660041809082, "learning_rate": 4.7255394451421397e-05, "loss": 6.3294, "step": 2510 }, { "epoch": 0.2870322911327524, "grad_norm": 14.388496398925781, "learning_rate": 4.724397762301633e-05, "loss": 6.1426, "step": 2520 }, { "epoch": 0.288171308161057, "grad_norm": 6.65110445022583, "learning_rate": 4.723256079461126e-05, "loss": 5.8097, "step": 2530 }, { "epoch": 0.28931032518936156, "grad_norm": 10.555778503417969, "learning_rate": 4.7221143966206185e-05, "loss": 6.128, "step": 2540 }, { "epoch": 0.29044934221766616, "grad_norm": 11.9190092086792, "learning_rate": 4.720972713780112e-05, "loss": 6.2016, "step": 2550 }, { "epoch": 0.2915883592459707, "grad_norm": 7.404531955718994, "learning_rate": 4.7198310309396056e-05, "loss": 5.8386, "step": 2560 }, { "epoch": 0.2927273762742753, "grad_norm": 14.28420639038086, "learning_rate": 4.718689348099099e-05, "loss": 5.8982, "step": 2570 }, { "epoch": 0.29386639330257985, "grad_norm": 8.460867881774902, "learning_rate": 4.717547665258591e-05, "loss": 6.0927, "step": 2580 }, { "epoch": 0.29500541033088445, "grad_norm": 6.630770683288574, "learning_rate": 4.7164059824180844e-05, "loss": 6.0046, "step": 2590 }, { "epoch": 0.296144427359189, "grad_norm": 15.487954139709473, "learning_rate": 4.7152642995775776e-05, "loss": 6.0306, "step": 2600 }, { "epoch": 0.2972834443874936, "grad_norm": 8.283817291259766, "learning_rate": 4.714122616737071e-05, "loss": 5.7432, "step": 2610 }, { "epoch": 0.29842246141579815, "grad_norm": 6.828223705291748, "learning_rate": 4.712980933896563e-05, "loss": 6.3253, "step": 2620 }, { "epoch": 0.29956147844410275, "grad_norm": 5.298694610595703, "learning_rate": 4.7118392510560565e-05, "loss": 6.0055, "step": 2630 }, { "epoch": 0.3007004954724073, "grad_norm": 14.596810340881348, "learning_rate": 4.7106975682155504e-05, "loss": 6.2159, "step": 2640 }, { "epoch": 0.3018395125007119, "grad_norm": 10.004966735839844, "learning_rate": 4.7095558853750436e-05, "loss": 5.9085, "step": 2650 }, { "epoch": 0.30297852952901644, "grad_norm": 23.831050872802734, "learning_rate": 4.708414202534536e-05, "loss": 6.4296, "step": 2660 }, { "epoch": 0.30411754655732104, "grad_norm": 13.838797569274902, "learning_rate": 4.707272519694029e-05, "loss": 6.0649, "step": 2670 }, { "epoch": 0.3052565635856256, "grad_norm": 6.441380023956299, "learning_rate": 4.7061308368535224e-05, "loss": 5.6616, "step": 2680 }, { "epoch": 0.3063955806139302, "grad_norm": 7.748492240905762, "learning_rate": 4.7049891540130156e-05, "loss": 5.9879, "step": 2690 }, { "epoch": 0.30753459764223473, "grad_norm": 6.974376201629639, "learning_rate": 4.703847471172508e-05, "loss": 5.8898, "step": 2700 }, { "epoch": 0.30867361467053933, "grad_norm": 9.72148323059082, "learning_rate": 4.702705788332001e-05, "loss": 5.9921, "step": 2710 }, { "epoch": 0.3098126316988439, "grad_norm": 6.207435131072998, "learning_rate": 4.701564105491495e-05, "loss": 5.8311, "step": 2720 }, { "epoch": 0.3109516487271485, "grad_norm": 8.65808391571045, "learning_rate": 4.700422422650988e-05, "loss": 5.8449, "step": 2730 }, { "epoch": 0.312090665755453, "grad_norm": 5.72158145904541, "learning_rate": 4.699280739810481e-05, "loss": 5.8683, "step": 2740 }, { "epoch": 0.3132296827837576, "grad_norm": 10.032368659973145, "learning_rate": 4.698139056969974e-05, "loss": 5.9892, "step": 2750 }, { "epoch": 0.31436869981206217, "grad_norm": 4.539613723754883, "learning_rate": 4.696997374129467e-05, "loss": 6.9231, "step": 2760 }, { "epoch": 0.31550771684036677, "grad_norm": 6.816250801086426, "learning_rate": 4.6958556912889604e-05, "loss": 5.9322, "step": 2770 }, { "epoch": 0.3166467338686713, "grad_norm": 7.319843292236328, "learning_rate": 4.694714008448453e-05, "loss": 6.1575, "step": 2780 }, { "epoch": 0.3177857508969759, "grad_norm": 8.64138126373291, "learning_rate": 4.693572325607946e-05, "loss": 6.1458, "step": 2790 }, { "epoch": 0.31892476792528046, "grad_norm": 25.99393081665039, "learning_rate": 4.692430642767439e-05, "loss": 5.9521, "step": 2800 }, { "epoch": 0.32006378495358506, "grad_norm": 5.670660018920898, "learning_rate": 4.6912889599269325e-05, "loss": 5.9455, "step": 2810 }, { "epoch": 0.3212028019818896, "grad_norm": 5.817745208740234, "learning_rate": 4.6901472770864256e-05, "loss": 5.7616, "step": 2820 }, { "epoch": 0.3223418190101942, "grad_norm": 6.353646755218506, "learning_rate": 4.689005594245919e-05, "loss": 6.0848, "step": 2830 }, { "epoch": 0.32348083603849875, "grad_norm": 6.609086513519287, "learning_rate": 4.687863911405412e-05, "loss": 6.3497, "step": 2840 }, { "epoch": 0.32461985306680335, "grad_norm": 6.302474498748779, "learning_rate": 4.6867222285649045e-05, "loss": 5.9555, "step": 2850 }, { "epoch": 0.3257588700951079, "grad_norm": 4.505662441253662, "learning_rate": 4.685580545724398e-05, "loss": 6.1585, "step": 2860 }, { "epoch": 0.3268978871234125, "grad_norm": 7.946392059326172, "learning_rate": 4.684438862883891e-05, "loss": 5.8707, "step": 2870 }, { "epoch": 0.32803690415171705, "grad_norm": 5.582015037536621, "learning_rate": 4.683297180043384e-05, "loss": 6.0129, "step": 2880 }, { "epoch": 0.32917592118002165, "grad_norm": 11.491278648376465, "learning_rate": 4.682155497202877e-05, "loss": 6.0328, "step": 2890 }, { "epoch": 0.3303149382083262, "grad_norm": 5.488585948944092, "learning_rate": 4.6810138143623704e-05, "loss": 6.3148, "step": 2900 }, { "epoch": 0.3314539552366308, "grad_norm": 7.975997447967529, "learning_rate": 4.6798721315218636e-05, "loss": 5.7636, "step": 2910 }, { "epoch": 0.33259297226493534, "grad_norm": 7.250860214233398, "learning_rate": 4.678730448681357e-05, "loss": 5.8292, "step": 2920 }, { "epoch": 0.33373198929323994, "grad_norm": 5.324447154998779, "learning_rate": 4.677588765840849e-05, "loss": 5.905, "step": 2930 }, { "epoch": 0.3348710063215445, "grad_norm": 9.724185943603516, "learning_rate": 4.6764470830003425e-05, "loss": 6.2338, "step": 2940 }, { "epoch": 0.3360100233498491, "grad_norm": 6.259258270263672, "learning_rate": 4.675305400159836e-05, "loss": 5.9365, "step": 2950 }, { "epoch": 0.33714904037815363, "grad_norm": 13.341094017028809, "learning_rate": 4.674163717319329e-05, "loss": 5.9414, "step": 2960 }, { "epoch": 0.33828805740645823, "grad_norm": 8.9069185256958, "learning_rate": 4.673022034478822e-05, "loss": 5.9498, "step": 2970 }, { "epoch": 0.3394270744347628, "grad_norm": 8.157875061035156, "learning_rate": 4.671880351638315e-05, "loss": 5.8834, "step": 2980 }, { "epoch": 0.3405660914630674, "grad_norm": 15.065528869628906, "learning_rate": 4.6707386687978084e-05, "loss": 5.9062, "step": 2990 }, { "epoch": 0.3417051084913719, "grad_norm": 7.683790683746338, "learning_rate": 4.6695969859573016e-05, "loss": 5.7754, "step": 3000 }, { "epoch": 0.3428441255196765, "grad_norm": 6.353583335876465, "learning_rate": 4.668455303116794e-05, "loss": 6.0532, "step": 3010 }, { "epoch": 0.34398314254798107, "grad_norm": 5.4052629470825195, "learning_rate": 4.667313620276287e-05, "loss": 5.9156, "step": 3020 }, { "epoch": 0.34512215957628567, "grad_norm": 9.244784355163574, "learning_rate": 4.6661719374357805e-05, "loss": 6.04, "step": 3030 }, { "epoch": 0.3462611766045902, "grad_norm": 5.430369853973389, "learning_rate": 4.6650302545952737e-05, "loss": 5.9407, "step": 3040 }, { "epoch": 0.3474001936328948, "grad_norm": 12.265975952148438, "learning_rate": 4.663888571754767e-05, "loss": 6.3226, "step": 3050 }, { "epoch": 0.34853921066119936, "grad_norm": 6.474460601806641, "learning_rate": 4.66274688891426e-05, "loss": 6.2523, "step": 3060 }, { "epoch": 0.34967822768950396, "grad_norm": 13.527132034301758, "learning_rate": 4.661605206073753e-05, "loss": 6.0565, "step": 3070 }, { "epoch": 0.3508172447178085, "grad_norm": 6.607095241546631, "learning_rate": 4.6604635232332464e-05, "loss": 6.5046, "step": 3080 }, { "epoch": 0.3519562617461131, "grad_norm": 6.4882354736328125, "learning_rate": 4.659321840392739e-05, "loss": 6.1059, "step": 3090 }, { "epoch": 0.35309527877441765, "grad_norm": 10.648322105407715, "learning_rate": 4.658180157552232e-05, "loss": 5.9012, "step": 3100 }, { "epoch": 0.35423429580272225, "grad_norm": 7.89484977722168, "learning_rate": 4.657038474711725e-05, "loss": 6.2262, "step": 3110 }, { "epoch": 0.3553733128310268, "grad_norm": 6.660143852233887, "learning_rate": 4.6558967918712184e-05, "loss": 5.8987, "step": 3120 }, { "epoch": 0.3565123298593314, "grad_norm": 6.835406303405762, "learning_rate": 4.654755109030711e-05, "loss": 5.7315, "step": 3130 }, { "epoch": 0.35765134688763595, "grad_norm": 10.808911323547363, "learning_rate": 4.653613426190205e-05, "loss": 5.8631, "step": 3140 }, { "epoch": 0.35879036391594055, "grad_norm": 6.554528713226318, "learning_rate": 4.652471743349698e-05, "loss": 6.1735, "step": 3150 }, { "epoch": 0.3599293809442451, "grad_norm": 6.966010570526123, "learning_rate": 4.651330060509191e-05, "loss": 6.2185, "step": 3160 }, { "epoch": 0.3610683979725497, "grad_norm": 5.658500671386719, "learning_rate": 4.650188377668684e-05, "loss": 6.3764, "step": 3170 }, { "epoch": 0.36220741500085424, "grad_norm": 6.941939353942871, "learning_rate": 4.649046694828177e-05, "loss": 5.8302, "step": 3180 }, { "epoch": 0.36334643202915884, "grad_norm": 10.871665954589844, "learning_rate": 4.64790501198767e-05, "loss": 5.7199, "step": 3190 }, { "epoch": 0.3644854490574634, "grad_norm": 9.147170066833496, "learning_rate": 4.646763329147163e-05, "loss": 5.851, "step": 3200 }, { "epoch": 0.365624466085768, "grad_norm": 5.908941268920898, "learning_rate": 4.645621646306656e-05, "loss": 6.2582, "step": 3210 }, { "epoch": 0.36676348311407253, "grad_norm": 5.497340202331543, "learning_rate": 4.6444799634661496e-05, "loss": 6.0959, "step": 3220 }, { "epoch": 0.36790250014237713, "grad_norm": 7.915394306182861, "learning_rate": 4.643338280625643e-05, "loss": 6.0134, "step": 3230 }, { "epoch": 0.3690415171706817, "grad_norm": 6.310519218444824, "learning_rate": 4.642196597785135e-05, "loss": 5.519, "step": 3240 }, { "epoch": 0.3701805341989863, "grad_norm": 6.008285045623779, "learning_rate": 4.6410549149446285e-05, "loss": 6.2746, "step": 3250 }, { "epoch": 0.3713195512272908, "grad_norm": 6.441627502441406, "learning_rate": 4.6399132321041217e-05, "loss": 6.4277, "step": 3260 }, { "epoch": 0.3724585682555954, "grad_norm": 5.518830299377441, "learning_rate": 4.638771549263615e-05, "loss": 5.963, "step": 3270 }, { "epoch": 0.37359758528389997, "grad_norm": 5.965517997741699, "learning_rate": 4.637629866423108e-05, "loss": 5.7866, "step": 3280 }, { "epoch": 0.37473660231220457, "grad_norm": 6.320878028869629, "learning_rate": 4.6364881835826005e-05, "loss": 6.1882, "step": 3290 }, { "epoch": 0.3758756193405091, "grad_norm": 13.0669584274292, "learning_rate": 4.6353465007420944e-05, "loss": 6.1091, "step": 3300 }, { "epoch": 0.3770146363688137, "grad_norm": 5.964664459228516, "learning_rate": 4.6342048179015876e-05, "loss": 6.0334, "step": 3310 }, { "epoch": 0.37815365339711826, "grad_norm": 7.827390670776367, "learning_rate": 4.63306313506108e-05, "loss": 6.022, "step": 3320 }, { "epoch": 0.37929267042542286, "grad_norm": 12.887112617492676, "learning_rate": 4.631921452220573e-05, "loss": 5.9171, "step": 3330 }, { "epoch": 0.3804316874537274, "grad_norm": 8.092065811157227, "learning_rate": 4.6307797693800665e-05, "loss": 5.986, "step": 3340 }, { "epoch": 0.381570704482032, "grad_norm": 6.128257751464844, "learning_rate": 4.6296380865395596e-05, "loss": 5.9844, "step": 3350 }, { "epoch": 0.38270972151033655, "grad_norm": 8.12193775177002, "learning_rate": 4.628496403699052e-05, "loss": 5.9676, "step": 3360 }, { "epoch": 0.38384873853864115, "grad_norm": 8.535385131835938, "learning_rate": 4.627354720858545e-05, "loss": 5.8463, "step": 3370 }, { "epoch": 0.3849877555669457, "grad_norm": 8.127837181091309, "learning_rate": 4.626213038018039e-05, "loss": 6.1955, "step": 3380 }, { "epoch": 0.3861267725952503, "grad_norm": 7.3196330070495605, "learning_rate": 4.6250713551775324e-05, "loss": 6.3398, "step": 3390 }, { "epoch": 0.38726578962355485, "grad_norm": 9.154827117919922, "learning_rate": 4.623929672337025e-05, "loss": 5.9425, "step": 3400 }, { "epoch": 0.38840480665185945, "grad_norm": 16.89331817626953, "learning_rate": 4.622787989496518e-05, "loss": 5.8076, "step": 3410 }, { "epoch": 0.389543823680164, "grad_norm": 7.123322010040283, "learning_rate": 4.621646306656011e-05, "loss": 5.8576, "step": 3420 }, { "epoch": 0.3906828407084686, "grad_norm": 10.912338256835938, "learning_rate": 4.6205046238155044e-05, "loss": 6.2962, "step": 3430 }, { "epoch": 0.39182185773677314, "grad_norm": 37.93599319458008, "learning_rate": 4.619362940974997e-05, "loss": 5.8181, "step": 3440 }, { "epoch": 0.39296087476507774, "grad_norm": 8.63291072845459, "learning_rate": 4.61822125813449e-05, "loss": 5.85, "step": 3450 }, { "epoch": 0.3940998917933823, "grad_norm": 7.839759349822998, "learning_rate": 4.617079575293984e-05, "loss": 6.1772, "step": 3460 }, { "epoch": 0.3952389088216869, "grad_norm": 11.179443359375, "learning_rate": 4.615937892453477e-05, "loss": 6.0357, "step": 3470 }, { "epoch": 0.39637792584999143, "grad_norm": 5.794097423553467, "learning_rate": 4.61479620961297e-05, "loss": 6.2638, "step": 3480 }, { "epoch": 0.39751694287829603, "grad_norm": 6.903919696807861, "learning_rate": 4.613654526772463e-05, "loss": 5.9867, "step": 3490 }, { "epoch": 0.3986559599066006, "grad_norm": 7.100025177001953, "learning_rate": 4.612512843931956e-05, "loss": 5.8719, "step": 3500 }, { "epoch": 0.3997949769349052, "grad_norm": 8.104240417480469, "learning_rate": 4.611371161091449e-05, "loss": 5.8909, "step": 3510 }, { "epoch": 0.4009339939632097, "grad_norm": 6.986795902252197, "learning_rate": 4.610229478250942e-05, "loss": 6.195, "step": 3520 }, { "epoch": 0.4020730109915143, "grad_norm": 7.941359043121338, "learning_rate": 4.609087795410435e-05, "loss": 6.2972, "step": 3530 }, { "epoch": 0.40321202801981887, "grad_norm": 8.151006698608398, "learning_rate": 4.607946112569928e-05, "loss": 6.0135, "step": 3540 }, { "epoch": 0.40435104504812347, "grad_norm": 4.726245403289795, "learning_rate": 4.606804429729421e-05, "loss": 6.3568, "step": 3550 }, { "epoch": 0.405490062076428, "grad_norm": 5.867665767669678, "learning_rate": 4.6056627468889145e-05, "loss": 6.1011, "step": 3560 }, { "epoch": 0.4066290791047326, "grad_norm": 8.400825500488281, "learning_rate": 4.6045210640484076e-05, "loss": 6.172, "step": 3570 }, { "epoch": 0.40776809613303716, "grad_norm": 4.860127925872803, "learning_rate": 4.603379381207901e-05, "loss": 6.6524, "step": 3580 }, { "epoch": 0.40890711316134176, "grad_norm": 8.082280158996582, "learning_rate": 4.602237698367394e-05, "loss": 5.8083, "step": 3590 }, { "epoch": 0.41004613018964636, "grad_norm": 6.055807113647461, "learning_rate": 4.6010960155268865e-05, "loss": 6.0123, "step": 3600 }, { "epoch": 0.4111851472179509, "grad_norm": 10.244037628173828, "learning_rate": 4.59995433268638e-05, "loss": 6.172, "step": 3610 }, { "epoch": 0.4123241642462555, "grad_norm": 4.998210906982422, "learning_rate": 4.598812649845873e-05, "loss": 5.9164, "step": 3620 }, { "epoch": 0.41346318127456005, "grad_norm": 5.7088823318481445, "learning_rate": 4.597670967005366e-05, "loss": 5.7632, "step": 3630 }, { "epoch": 0.41460219830286466, "grad_norm": 3.9780211448669434, "learning_rate": 4.596529284164859e-05, "loss": 6.2047, "step": 3640 }, { "epoch": 0.4157412153311692, "grad_norm": 7.240362644195557, "learning_rate": 4.5953876013243524e-05, "loss": 5.8884, "step": 3650 }, { "epoch": 0.4168802323594738, "grad_norm": 8.113978385925293, "learning_rate": 4.5942459184838456e-05, "loss": 5.9598, "step": 3660 }, { "epoch": 0.41801924938777835, "grad_norm": 4.712223052978516, "learning_rate": 4.593104235643339e-05, "loss": 6.242, "step": 3670 }, { "epoch": 0.41915826641608295, "grad_norm": 4.892892360687256, "learning_rate": 4.591962552802831e-05, "loss": 5.8129, "step": 3680 }, { "epoch": 0.4202972834443875, "grad_norm": 5.622137546539307, "learning_rate": 4.5908208699623245e-05, "loss": 5.8886, "step": 3690 }, { "epoch": 0.4214363004726921, "grad_norm": 5.635571479797363, "learning_rate": 4.589679187121818e-05, "loss": 6.3696, "step": 3700 }, { "epoch": 0.42257531750099664, "grad_norm": 10.80083179473877, "learning_rate": 4.588537504281311e-05, "loss": 5.5764, "step": 3710 }, { "epoch": 0.42371433452930124, "grad_norm": 9.474900245666504, "learning_rate": 4.587395821440804e-05, "loss": 6.0212, "step": 3720 }, { "epoch": 0.4248533515576058, "grad_norm": 5.499372959136963, "learning_rate": 4.586254138600297e-05, "loss": 5.8542, "step": 3730 }, { "epoch": 0.4259923685859104, "grad_norm": 6.5461554527282715, "learning_rate": 4.5851124557597904e-05, "loss": 6.0441, "step": 3740 }, { "epoch": 0.42713138561421493, "grad_norm": 15.075446128845215, "learning_rate": 4.583970772919283e-05, "loss": 5.9956, "step": 3750 }, { "epoch": 0.42827040264251953, "grad_norm": 6.579662322998047, "learning_rate": 4.582829090078776e-05, "loss": 5.7716, "step": 3760 }, { "epoch": 0.4294094196708241, "grad_norm": 4.892009258270264, "learning_rate": 4.581687407238269e-05, "loss": 5.5447, "step": 3770 }, { "epoch": 0.4305484366991287, "grad_norm": 5.967264175415039, "learning_rate": 4.5805457243977625e-05, "loss": 5.9216, "step": 3780 }, { "epoch": 0.4316874537274332, "grad_norm": 7.942027568817139, "learning_rate": 4.5794040415572557e-05, "loss": 5.809, "step": 3790 }, { "epoch": 0.4328264707557378, "grad_norm": 5.483458042144775, "learning_rate": 4.578262358716749e-05, "loss": 5.7505, "step": 3800 }, { "epoch": 0.43396548778404237, "grad_norm": 6.87880277633667, "learning_rate": 4.577120675876242e-05, "loss": 5.8504, "step": 3810 }, { "epoch": 0.43510450481234697, "grad_norm": 6.670889854431152, "learning_rate": 4.575978993035735e-05, "loss": 6.173, "step": 3820 }, { "epoch": 0.4362435218406515, "grad_norm": 6.609583377838135, "learning_rate": 4.574837310195228e-05, "loss": 5.7217, "step": 3830 }, { "epoch": 0.4373825388689561, "grad_norm": 7.475062370300293, "learning_rate": 4.573695627354721e-05, "loss": 5.9074, "step": 3840 }, { "epoch": 0.43852155589726066, "grad_norm": 8.09111499786377, "learning_rate": 4.572553944514214e-05, "loss": 6.1935, "step": 3850 }, { "epoch": 0.43966057292556526, "grad_norm": 6.0317559242248535, "learning_rate": 4.571412261673707e-05, "loss": 5.9718, "step": 3860 }, { "epoch": 0.4407995899538698, "grad_norm": 5.886181831359863, "learning_rate": 4.5702705788332e-05, "loss": 5.9667, "step": 3870 }, { "epoch": 0.4419386069821744, "grad_norm": 7.264452934265137, "learning_rate": 4.5691288959926936e-05, "loss": 5.6558, "step": 3880 }, { "epoch": 0.44307762401047895, "grad_norm": 7.599318504333496, "learning_rate": 4.567987213152187e-05, "loss": 6.0186, "step": 3890 }, { "epoch": 0.44421664103878356, "grad_norm": 10.22423267364502, "learning_rate": 4.56684553031168e-05, "loss": 6.089, "step": 3900 }, { "epoch": 0.4453556580670881, "grad_norm": 8.207062721252441, "learning_rate": 4.5657038474711725e-05, "loss": 5.8384, "step": 3910 }, { "epoch": 0.4464946750953927, "grad_norm": 13.894556999206543, "learning_rate": 4.564562164630666e-05, "loss": 5.4603, "step": 3920 }, { "epoch": 0.44763369212369725, "grad_norm": 5.356435775756836, "learning_rate": 4.563420481790159e-05, "loss": 6.0264, "step": 3930 }, { "epoch": 0.44877270915200185, "grad_norm": 5.479897499084473, "learning_rate": 4.562278798949652e-05, "loss": 6.7273, "step": 3940 }, { "epoch": 0.4499117261803064, "grad_norm": 5.720917701721191, "learning_rate": 4.5611371161091446e-05, "loss": 5.7317, "step": 3950 }, { "epoch": 0.451050743208611, "grad_norm": 19.18890380859375, "learning_rate": 4.5599954332686384e-05, "loss": 5.9488, "step": 3960 }, { "epoch": 0.45218976023691554, "grad_norm": 8.365082740783691, "learning_rate": 4.5588537504281316e-05, "loss": 5.7803, "step": 3970 }, { "epoch": 0.45332877726522014, "grad_norm": 10.296485900878906, "learning_rate": 4.557712067587625e-05, "loss": 5.9118, "step": 3980 }, { "epoch": 0.4544677942935247, "grad_norm": 9.0343656539917, "learning_rate": 4.556570384747117e-05, "loss": 5.6937, "step": 3990 }, { "epoch": 0.4556068113218293, "grad_norm": 6.224181175231934, "learning_rate": 4.5554287019066105e-05, "loss": 6.0464, "step": 4000 }, { "epoch": 0.4556068113218293, "eval_loss": 6.078274250030518, "eval_runtime": 11.2491, "eval_samples_per_second": 1.333, "eval_steps_per_second": 0.178, "step": 4000 }, { "epoch": 0.45674582835013383, "grad_norm": 5.356078624725342, "learning_rate": 4.554287019066104e-05, "loss": 5.8924, "step": 4010 }, { "epoch": 0.45788484537843843, "grad_norm": 8.025609970092773, "learning_rate": 4.553145336225597e-05, "loss": 5.9475, "step": 4020 }, { "epoch": 0.459023862406743, "grad_norm": 5.004885196685791, "learning_rate": 4.5520036533850894e-05, "loss": 5.6254, "step": 4030 }, { "epoch": 0.4601628794350476, "grad_norm": 5.545079231262207, "learning_rate": 4.550861970544583e-05, "loss": 6.1783, "step": 4040 }, { "epoch": 0.4613018964633521, "grad_norm": 6.245349884033203, "learning_rate": 4.5497202877040764e-05, "loss": 6.0623, "step": 4050 }, { "epoch": 0.4624409134916567, "grad_norm": 6.350508213043213, "learning_rate": 4.548578604863569e-05, "loss": 6.0522, "step": 4060 }, { "epoch": 0.46357993051996127, "grad_norm": 9.772076606750488, "learning_rate": 4.547436922023062e-05, "loss": 6.064, "step": 4070 }, { "epoch": 0.46471894754826587, "grad_norm": 7.155641078948975, "learning_rate": 4.546295239182555e-05, "loss": 6.0838, "step": 4080 }, { "epoch": 0.4658579645765704, "grad_norm": 5.971468925476074, "learning_rate": 4.5451535563420485e-05, "loss": 6.037, "step": 4090 }, { "epoch": 0.466996981604875, "grad_norm": 8.513012886047363, "learning_rate": 4.5440118735015416e-05, "loss": 6.2869, "step": 4100 }, { "epoch": 0.46813599863317956, "grad_norm": 7.186007022857666, "learning_rate": 4.542870190661034e-05, "loss": 6.1033, "step": 4110 }, { "epoch": 0.46927501566148416, "grad_norm": 8.911818504333496, "learning_rate": 4.541728507820528e-05, "loss": 5.9273, "step": 4120 }, { "epoch": 0.4704140326897887, "grad_norm": 6.835193634033203, "learning_rate": 4.540586824980021e-05, "loss": 5.9101, "step": 4130 }, { "epoch": 0.4715530497180933, "grad_norm": 8.617008209228516, "learning_rate": 4.539445142139514e-05, "loss": 5.8752, "step": 4140 }, { "epoch": 0.47269206674639785, "grad_norm": 6.705082893371582, "learning_rate": 4.538303459299007e-05, "loss": 5.8514, "step": 4150 }, { "epoch": 0.47383108377470246, "grad_norm": 4.137232303619385, "learning_rate": 4.5371617764585e-05, "loss": 6.1345, "step": 4160 }, { "epoch": 0.474970100803007, "grad_norm": 5.951109886169434, "learning_rate": 4.536020093617993e-05, "loss": 5.8639, "step": 4170 }, { "epoch": 0.4761091178313116, "grad_norm": 9.148282051086426, "learning_rate": 4.5348784107774864e-05, "loss": 5.7085, "step": 4180 }, { "epoch": 0.47724813485961615, "grad_norm": 6.89546537399292, "learning_rate": 4.533736727936979e-05, "loss": 5.8127, "step": 4190 }, { "epoch": 0.47838715188792075, "grad_norm": 6.99493932723999, "learning_rate": 4.532595045096472e-05, "loss": 6.6861, "step": 4200 }, { "epoch": 0.4795261689162253, "grad_norm": 5.6303019523620605, "learning_rate": 4.531453362255966e-05, "loss": 6.3338, "step": 4210 }, { "epoch": 0.4806651859445299, "grad_norm": 6.7159953117370605, "learning_rate": 4.5303116794154585e-05, "loss": 5.8867, "step": 4220 }, { "epoch": 0.48180420297283444, "grad_norm": 5.279120922088623, "learning_rate": 4.529169996574952e-05, "loss": 5.9261, "step": 4230 }, { "epoch": 0.48294322000113904, "grad_norm": 6.267137050628662, "learning_rate": 4.528028313734445e-05, "loss": 5.7475, "step": 4240 }, { "epoch": 0.4840822370294436, "grad_norm": 11.94001293182373, "learning_rate": 4.526886630893938e-05, "loss": 5.7601, "step": 4250 }, { "epoch": 0.4852212540577482, "grad_norm": 7.740771293640137, "learning_rate": 4.5257449480534305e-05, "loss": 5.8693, "step": 4260 }, { "epoch": 0.48636027108605273, "grad_norm": 5.90120792388916, "learning_rate": 4.524603265212924e-05, "loss": 5.894, "step": 4270 }, { "epoch": 0.48749928811435733, "grad_norm": 5.867739677429199, "learning_rate": 4.523461582372417e-05, "loss": 5.8287, "step": 4280 }, { "epoch": 0.4886383051426619, "grad_norm": 5.900161266326904, "learning_rate": 4.522319899531911e-05, "loss": 6.0611, "step": 4290 }, { "epoch": 0.4897773221709665, "grad_norm": 5.506997585296631, "learning_rate": 4.521178216691403e-05, "loss": 5.8383, "step": 4300 }, { "epoch": 0.490916339199271, "grad_norm": 5.639462947845459, "learning_rate": 4.5200365338508965e-05, "loss": 5.7852, "step": 4310 }, { "epoch": 0.4920553562275756, "grad_norm": 15.822317123413086, "learning_rate": 4.5188948510103897e-05, "loss": 5.77, "step": 4320 }, { "epoch": 0.49319437325588017, "grad_norm": 6.525705337524414, "learning_rate": 4.517753168169883e-05, "loss": 6.3061, "step": 4330 }, { "epoch": 0.49433339028418477, "grad_norm": 5.3117876052856445, "learning_rate": 4.5166114853293753e-05, "loss": 5.7736, "step": 4340 }, { "epoch": 0.4954724073124893, "grad_norm": 5.8776397705078125, "learning_rate": 4.5154698024888685e-05, "loss": 5.8458, "step": 4350 }, { "epoch": 0.4966114243407939, "grad_norm": 11.251127243041992, "learning_rate": 4.514328119648362e-05, "loss": 5.9588, "step": 4360 }, { "epoch": 0.49775044136909846, "grad_norm": 22.83697509765625, "learning_rate": 4.5131864368078556e-05, "loss": 5.963, "step": 4370 }, { "epoch": 0.49888945839740306, "grad_norm": 6.44933557510376, "learning_rate": 4.512044753967348e-05, "loss": 5.9385, "step": 4380 }, { "epoch": 0.5000284754257076, "grad_norm": 7.210160732269287, "learning_rate": 4.510903071126841e-05, "loss": 5.9247, "step": 4390 }, { "epoch": 0.5011674924540122, "grad_norm": 6.42350959777832, "learning_rate": 4.5097613882863344e-05, "loss": 6.1627, "step": 4400 }, { "epoch": 0.5023065094823168, "grad_norm": 8.11135196685791, "learning_rate": 4.5086197054458276e-05, "loss": 6.4086, "step": 4410 }, { "epoch": 0.5034455265106214, "grad_norm": 7.448008060455322, "learning_rate": 4.50747802260532e-05, "loss": 6.0661, "step": 4420 }, { "epoch": 0.5045845435389259, "grad_norm": 7.212705135345459, "learning_rate": 4.506336339764813e-05, "loss": 5.766, "step": 4430 }, { "epoch": 0.5057235605672304, "grad_norm": 6.085208892822266, "learning_rate": 4.5051946569243065e-05, "loss": 5.7313, "step": 4440 }, { "epoch": 0.5068625775955351, "grad_norm": 8.44282054901123, "learning_rate": 4.5040529740838e-05, "loss": 5.7597, "step": 4450 }, { "epoch": 0.5080015946238396, "grad_norm": 5.492551326751709, "learning_rate": 4.502911291243293e-05, "loss": 5.7822, "step": 4460 }, { "epoch": 0.5091406116521442, "grad_norm": 8.480428695678711, "learning_rate": 4.501769608402786e-05, "loss": 5.6644, "step": 4470 }, { "epoch": 0.5102796286804487, "grad_norm": 9.871870040893555, "learning_rate": 4.500627925562279e-05, "loss": 5.8488, "step": 4480 }, { "epoch": 0.5114186457087534, "grad_norm": 22.278566360473633, "learning_rate": 4.4994862427217724e-05, "loss": 5.7774, "step": 4490 }, { "epoch": 0.5125576627370579, "grad_norm": 6.218201160430908, "learning_rate": 4.498344559881265e-05, "loss": 5.6653, "step": 4500 }, { "epoch": 0.5136966797653625, "grad_norm": 7.463952541351318, "learning_rate": 4.497202877040758e-05, "loss": 5.8319, "step": 4510 }, { "epoch": 0.514835696793667, "grad_norm": 7.079387664794922, "learning_rate": 4.496061194200251e-05, "loss": 5.7116, "step": 4520 }, { "epoch": 0.5159747138219717, "grad_norm": 6.294861793518066, "learning_rate": 4.4949195113597445e-05, "loss": 5.8526, "step": 4530 }, { "epoch": 0.5171137308502762, "grad_norm": 7.084829807281494, "learning_rate": 4.4937778285192377e-05, "loss": 5.6832, "step": 4540 }, { "epoch": 0.5182527478785808, "grad_norm": 4.318167686462402, "learning_rate": 4.492636145678731e-05, "loss": 6.1293, "step": 4550 }, { "epoch": 0.5193917649068853, "grad_norm": 9.80423641204834, "learning_rate": 4.491494462838224e-05, "loss": 5.8818, "step": 4560 }, { "epoch": 0.52053078193519, "grad_norm": 11.214982032775879, "learning_rate": 4.4903527799977165e-05, "loss": 5.8895, "step": 4570 }, { "epoch": 0.5216697989634945, "grad_norm": 8.01582145690918, "learning_rate": 4.48921109715721e-05, "loss": 6.233, "step": 4580 }, { "epoch": 0.5228088159917991, "grad_norm": 6.428025722503662, "learning_rate": 4.488069414316703e-05, "loss": 6.1618, "step": 4590 }, { "epoch": 0.5239478330201036, "grad_norm": 4.905664443969727, "learning_rate": 4.486927731476196e-05, "loss": 5.9563, "step": 4600 }, { "epoch": 0.5250868500484083, "grad_norm": 5.810403823852539, "learning_rate": 4.485786048635689e-05, "loss": 5.7832, "step": 4610 }, { "epoch": 0.5262258670767128, "grad_norm": 5.713348388671875, "learning_rate": 4.4846443657951825e-05, "loss": 6.0745, "step": 4620 }, { "epoch": 0.5273648841050174, "grad_norm": 5.19990348815918, "learning_rate": 4.4835026829546756e-05, "loss": 5.8194, "step": 4630 }, { "epoch": 0.5285039011333219, "grad_norm": 5.478442668914795, "learning_rate": 4.482361000114169e-05, "loss": 5.7365, "step": 4640 }, { "epoch": 0.5296429181616266, "grad_norm": 6.2245378494262695, "learning_rate": 4.481219317273661e-05, "loss": 6.3179, "step": 4650 }, { "epoch": 0.5307819351899311, "grad_norm": 6.621606826782227, "learning_rate": 4.4800776344331545e-05, "loss": 5.7806, "step": 4660 }, { "epoch": 0.5319209522182357, "grad_norm": 5.53688907623291, "learning_rate": 4.478935951592648e-05, "loss": 6.0046, "step": 4670 }, { "epoch": 0.5330599692465402, "grad_norm": 5.000534534454346, "learning_rate": 4.477794268752141e-05, "loss": 5.8763, "step": 4680 }, { "epoch": 0.5341989862748449, "grad_norm": 4.869490623474121, "learning_rate": 4.476652585911634e-05, "loss": 6.3533, "step": 4690 }, { "epoch": 0.5353380033031494, "grad_norm": 6.470453262329102, "learning_rate": 4.475510903071127e-05, "loss": 6.0611, "step": 4700 }, { "epoch": 0.536477020331454, "grad_norm": 5.445845127105713, "learning_rate": 4.4743692202306204e-05, "loss": 5.8835, "step": 4710 }, { "epoch": 0.5376160373597585, "grad_norm": 6.2387237548828125, "learning_rate": 4.4732275373901136e-05, "loss": 5.8729, "step": 4720 }, { "epoch": 0.5387550543880631, "grad_norm": 6.605109691619873, "learning_rate": 4.472085854549606e-05, "loss": 6.14, "step": 4730 }, { "epoch": 0.5398940714163677, "grad_norm": 5.047166347503662, "learning_rate": 4.470944171709099e-05, "loss": 5.8988, "step": 4740 }, { "epoch": 0.5410330884446722, "grad_norm": 5.175670623779297, "learning_rate": 4.4698024888685925e-05, "loss": 5.9424, "step": 4750 }, { "epoch": 0.5421721054729768, "grad_norm": 5.9300079345703125, "learning_rate": 4.468660806028086e-05, "loss": 6.2061, "step": 4760 }, { "epoch": 0.5433111225012814, "grad_norm": 6.683525085449219, "learning_rate": 4.467519123187578e-05, "loss": 5.7376, "step": 4770 }, { "epoch": 0.544450139529586, "grad_norm": 14.139119148254395, "learning_rate": 4.466377440347072e-05, "loss": 6.0151, "step": 4780 }, { "epoch": 0.5455891565578905, "grad_norm": 4.422637939453125, "learning_rate": 4.465235757506565e-05, "loss": 5.7256, "step": 4790 }, { "epoch": 0.5467281735861951, "grad_norm": 4.97141170501709, "learning_rate": 4.4640940746660584e-05, "loss": 5.7168, "step": 4800 }, { "epoch": 0.5478671906144997, "grad_norm": 4.695651531219482, "learning_rate": 4.462952391825551e-05, "loss": 5.7933, "step": 4810 }, { "epoch": 0.5490062076428043, "grad_norm": 5.186215877532959, "learning_rate": 4.461810708985044e-05, "loss": 6.041, "step": 4820 }, { "epoch": 0.5501452246711088, "grad_norm": 6.2940545082092285, "learning_rate": 4.460669026144537e-05, "loss": 6.0442, "step": 4830 }, { "epoch": 0.5512842416994134, "grad_norm": 9.648452758789062, "learning_rate": 4.4595273433040305e-05, "loss": 6.0486, "step": 4840 }, { "epoch": 0.552423258727718, "grad_norm": 6.279506206512451, "learning_rate": 4.458385660463523e-05, "loss": 5.9373, "step": 4850 }, { "epoch": 0.5535622757560226, "grad_norm": 11.7589750289917, "learning_rate": 4.457243977623016e-05, "loss": 5.8724, "step": 4860 }, { "epoch": 0.5547012927843271, "grad_norm": 5.745841979980469, "learning_rate": 4.45610229478251e-05, "loss": 6.1764, "step": 4870 }, { "epoch": 0.5558403098126317, "grad_norm": 10.151200294494629, "learning_rate": 4.454960611942003e-05, "loss": 5.812, "step": 4880 }, { "epoch": 0.5569793268409363, "grad_norm": 6.251079082489014, "learning_rate": 4.453818929101496e-05, "loss": 5.699, "step": 4890 }, { "epoch": 0.5581183438692409, "grad_norm": 7.862876892089844, "learning_rate": 4.452677246260989e-05, "loss": 6.012, "step": 4900 }, { "epoch": 0.5592573608975454, "grad_norm": 6.622585296630859, "learning_rate": 4.451535563420482e-05, "loss": 5.964, "step": 4910 }, { "epoch": 0.56039637792585, "grad_norm": 7.121626377105713, "learning_rate": 4.450393880579975e-05, "loss": 5.8795, "step": 4920 }, { "epoch": 0.5615353949541546, "grad_norm": 7.391700267791748, "learning_rate": 4.449252197739468e-05, "loss": 6.24, "step": 4930 }, { "epoch": 0.5626744119824592, "grad_norm": 5.3456549644470215, "learning_rate": 4.448110514898961e-05, "loss": 6.1187, "step": 4940 }, { "epoch": 0.5638134290107637, "grad_norm": 5.86979866027832, "learning_rate": 4.446968832058455e-05, "loss": 6.0705, "step": 4950 }, { "epoch": 0.5649524460390682, "grad_norm": 17.474056243896484, "learning_rate": 4.445827149217947e-05, "loss": 5.9015, "step": 4960 }, { "epoch": 0.5660914630673729, "grad_norm": 6.048776149749756, "learning_rate": 4.4446854663774405e-05, "loss": 5.78, "step": 4970 }, { "epoch": 0.5672304800956774, "grad_norm": 6.961145877838135, "learning_rate": 4.443543783536934e-05, "loss": 6.045, "step": 4980 }, { "epoch": 0.568369497123982, "grad_norm": 8.503111839294434, "learning_rate": 4.442402100696427e-05, "loss": 5.932, "step": 4990 }, { "epoch": 0.5695085141522865, "grad_norm": 5.485167503356934, "learning_rate": 4.44126041785592e-05, "loss": 5.7518, "step": 5000 }, { "epoch": 0.5706475311805912, "grad_norm": 6.443273544311523, "learning_rate": 4.4401187350154126e-05, "loss": 5.9175, "step": 5010 }, { "epoch": 0.5717865482088957, "grad_norm": 6.290263652801514, "learning_rate": 4.438977052174906e-05, "loss": 5.6413, "step": 5020 }, { "epoch": 0.5729255652372003, "grad_norm": 6.004288196563721, "learning_rate": 4.4378353693343996e-05, "loss": 5.9629, "step": 5030 }, { "epoch": 0.5740645822655048, "grad_norm": 5.7330145835876465, "learning_rate": 4.436693686493892e-05, "loss": 5.9903, "step": 5040 }, { "epoch": 0.5752035992938095, "grad_norm": 6.661035060882568, "learning_rate": 4.435552003653385e-05, "loss": 5.8785, "step": 5050 }, { "epoch": 0.576342616322114, "grad_norm": 8.097799301147461, "learning_rate": 4.4344103208128785e-05, "loss": 5.7706, "step": 5060 }, { "epoch": 0.5774816333504186, "grad_norm": 6.515287399291992, "learning_rate": 4.4332686379723717e-05, "loss": 5.9882, "step": 5070 }, { "epoch": 0.5786206503787231, "grad_norm": 16.282241821289062, "learning_rate": 4.432126955131864e-05, "loss": 5.8521, "step": 5080 }, { "epoch": 0.5797596674070278, "grad_norm": 4.604179382324219, "learning_rate": 4.4309852722913573e-05, "loss": 6.2113, "step": 5090 }, { "epoch": 0.5808986844353323, "grad_norm": 5.025040626525879, "learning_rate": 4.4298435894508505e-05, "loss": 6.7721, "step": 5100 }, { "epoch": 0.5820377014636369, "grad_norm": 7.755475997924805, "learning_rate": 4.4287019066103444e-05, "loss": 5.8835, "step": 5110 }, { "epoch": 0.5831767184919414, "grad_norm": 6.028952598571777, "learning_rate": 4.427560223769837e-05, "loss": 5.909, "step": 5120 }, { "epoch": 0.5843157355202461, "grad_norm": 6.185368537902832, "learning_rate": 4.42641854092933e-05, "loss": 5.6625, "step": 5130 }, { "epoch": 0.5854547525485506, "grad_norm": 8.831610679626465, "learning_rate": 4.425276858088823e-05, "loss": 6.078, "step": 5140 }, { "epoch": 0.5865937695768552, "grad_norm": 7.380201816558838, "learning_rate": 4.4241351752483164e-05, "loss": 5.996, "step": 5150 }, { "epoch": 0.5877327866051597, "grad_norm": 14.59043025970459, "learning_rate": 4.422993492407809e-05, "loss": 6.0636, "step": 5160 }, { "epoch": 0.5888718036334644, "grad_norm": 8.660425186157227, "learning_rate": 4.421851809567302e-05, "loss": 6.1363, "step": 5170 }, { "epoch": 0.5900108206617689, "grad_norm": 6.343860626220703, "learning_rate": 4.420710126726795e-05, "loss": 6.0228, "step": 5180 }, { "epoch": 0.5911498376900735, "grad_norm": 8.988828659057617, "learning_rate": 4.4195684438862885e-05, "loss": 6.1271, "step": 5190 }, { "epoch": 0.592288854718378, "grad_norm": 4.080019474029541, "learning_rate": 4.418426761045782e-05, "loss": 5.717, "step": 5200 }, { "epoch": 0.5934278717466827, "grad_norm": 8.057381629943848, "learning_rate": 4.417285078205275e-05, "loss": 5.9801, "step": 5210 }, { "epoch": 0.5945668887749872, "grad_norm": 6.668858528137207, "learning_rate": 4.416143395364768e-05, "loss": 5.9643, "step": 5220 }, { "epoch": 0.5957059058032917, "grad_norm": 15.661836624145508, "learning_rate": 4.415001712524261e-05, "loss": 5.9624, "step": 5230 }, { "epoch": 0.5968449228315963, "grad_norm": 7.527164936065674, "learning_rate": 4.413860029683754e-05, "loss": 6.7508, "step": 5240 }, { "epoch": 0.597983939859901, "grad_norm": 7.868592262268066, "learning_rate": 4.412718346843247e-05, "loss": 5.7929, "step": 5250 }, { "epoch": 0.5991229568882055, "grad_norm": 9.12926197052002, "learning_rate": 4.41157666400274e-05, "loss": 5.7517, "step": 5260 }, { "epoch": 0.60026197391651, "grad_norm": 7.015052318572998, "learning_rate": 4.410434981162233e-05, "loss": 5.7492, "step": 5270 }, { "epoch": 0.6014009909448146, "grad_norm": 6.547445774078369, "learning_rate": 4.4092932983217265e-05, "loss": 5.6785, "step": 5280 }, { "epoch": 0.6025400079731192, "grad_norm": 6.565978527069092, "learning_rate": 4.40815161548122e-05, "loss": 5.8961, "step": 5290 }, { "epoch": 0.6036790250014238, "grad_norm": 6.1269097328186035, "learning_rate": 4.407009932640713e-05, "loss": 6.0278, "step": 5300 }, { "epoch": 0.6048180420297283, "grad_norm": 4.784152507781982, "learning_rate": 4.405868249800206e-05, "loss": 5.7792, "step": 5310 }, { "epoch": 0.6059570590580329, "grad_norm": 4.875153541564941, "learning_rate": 4.4047265669596985e-05, "loss": 6.0277, "step": 5320 }, { "epoch": 0.6070960760863375, "grad_norm": 4.904726982116699, "learning_rate": 4.403584884119192e-05, "loss": 5.5104, "step": 5330 }, { "epoch": 0.6082350931146421, "grad_norm": 16.596614837646484, "learning_rate": 4.402443201278685e-05, "loss": 5.7808, "step": 5340 }, { "epoch": 0.6093741101429466, "grad_norm": 11.208609580993652, "learning_rate": 4.401301518438178e-05, "loss": 5.7724, "step": 5350 }, { "epoch": 0.6105131271712512, "grad_norm": 7.170653820037842, "learning_rate": 4.400159835597671e-05, "loss": 5.8673, "step": 5360 }, { "epoch": 0.6116521441995558, "grad_norm": 9.264379501342773, "learning_rate": 4.3990181527571645e-05, "loss": 6.0324, "step": 5370 }, { "epoch": 0.6127911612278604, "grad_norm": 5.418370246887207, "learning_rate": 4.3978764699166576e-05, "loss": 5.8417, "step": 5380 }, { "epoch": 0.6139301782561649, "grad_norm": 10.881325721740723, "learning_rate": 4.396734787076151e-05, "loss": 5.6967, "step": 5390 }, { "epoch": 0.6150691952844695, "grad_norm": 21.59162712097168, "learning_rate": 4.395593104235643e-05, "loss": 5.9382, "step": 5400 }, { "epoch": 0.6162082123127741, "grad_norm": 26.581167221069336, "learning_rate": 4.3944514213951365e-05, "loss": 6.0368, "step": 5410 }, { "epoch": 0.6173472293410787, "grad_norm": 9.81440544128418, "learning_rate": 4.39330973855463e-05, "loss": 5.8343, "step": 5420 }, { "epoch": 0.6184862463693832, "grad_norm": 7.034490585327148, "learning_rate": 4.392168055714123e-05, "loss": 5.8832, "step": 5430 }, { "epoch": 0.6196252633976878, "grad_norm": 12.727472305297852, "learning_rate": 4.391026372873616e-05, "loss": 5.4909, "step": 5440 }, { "epoch": 0.6207642804259924, "grad_norm": 5.399456024169922, "learning_rate": 4.389884690033109e-05, "loss": 6.0521, "step": 5450 }, { "epoch": 0.621903297454297, "grad_norm": 15.499351501464844, "learning_rate": 4.3887430071926024e-05, "loss": 6.1447, "step": 5460 }, { "epoch": 0.6230423144826015, "grad_norm": 6.282449722290039, "learning_rate": 4.387601324352095e-05, "loss": 6.1408, "step": 5470 }, { "epoch": 0.624181331510906, "grad_norm": 5.330649375915527, "learning_rate": 4.386459641511588e-05, "loss": 5.7458, "step": 5480 }, { "epoch": 0.6253203485392107, "grad_norm": 5.244607448577881, "learning_rate": 4.385317958671081e-05, "loss": 5.878, "step": 5490 }, { "epoch": 0.6264593655675152, "grad_norm": 6.8278489112854, "learning_rate": 4.3841762758305745e-05, "loss": 5.7342, "step": 5500 }, { "epoch": 0.6275983825958198, "grad_norm": 5.953369617462158, "learning_rate": 4.383034592990068e-05, "loss": 6.0004, "step": 5510 }, { "epoch": 0.6287373996241243, "grad_norm": 4.974438667297363, "learning_rate": 4.381892910149561e-05, "loss": 5.9253, "step": 5520 }, { "epoch": 0.629876416652429, "grad_norm": 4.46335506439209, "learning_rate": 4.380751227309054e-05, "loss": 5.7667, "step": 5530 }, { "epoch": 0.6310154336807335, "grad_norm": 7.2519211769104, "learning_rate": 4.379609544468547e-05, "loss": 5.8392, "step": 5540 }, { "epoch": 0.6321544507090381, "grad_norm": 5.6725053787231445, "learning_rate": 4.37846786162804e-05, "loss": 6.4619, "step": 5550 }, { "epoch": 0.6332934677373426, "grad_norm": 6.412236213684082, "learning_rate": 4.377326178787533e-05, "loss": 6.3128, "step": 5560 }, { "epoch": 0.6344324847656473, "grad_norm": 7.52258825302124, "learning_rate": 4.376184495947026e-05, "loss": 5.7141, "step": 5570 }, { "epoch": 0.6355715017939518, "grad_norm": 6.420504570007324, "learning_rate": 4.375042813106519e-05, "loss": 5.9488, "step": 5580 }, { "epoch": 0.6367105188222564, "grad_norm": 7.359799861907959, "learning_rate": 4.373901130266012e-05, "loss": 5.8208, "step": 5590 }, { "epoch": 0.6378495358505609, "grad_norm": 6.431293487548828, "learning_rate": 4.372759447425505e-05, "loss": 6.0522, "step": 5600 }, { "epoch": 0.6389885528788656, "grad_norm": 7.792017936706543, "learning_rate": 4.371617764584999e-05, "loss": 5.8313, "step": 5610 }, { "epoch": 0.6401275699071701, "grad_norm": 5.352195739746094, "learning_rate": 4.370476081744492e-05, "loss": 5.8553, "step": 5620 }, { "epoch": 0.6412665869354747, "grad_norm": 5.225295066833496, "learning_rate": 4.3693343989039845e-05, "loss": 5.6983, "step": 5630 }, { "epoch": 0.6424056039637792, "grad_norm": 7.8425493240356445, "learning_rate": 4.368192716063478e-05, "loss": 6.0586, "step": 5640 }, { "epoch": 0.6435446209920839, "grad_norm": 5.940848350524902, "learning_rate": 4.367051033222971e-05, "loss": 5.8525, "step": 5650 }, { "epoch": 0.6446836380203884, "grad_norm": 5.690766334533691, "learning_rate": 4.365909350382464e-05, "loss": 5.6485, "step": 5660 }, { "epoch": 0.645822655048693, "grad_norm": 6.9173102378845215, "learning_rate": 4.3647676675419566e-05, "loss": 5.6241, "step": 5670 }, { "epoch": 0.6469616720769975, "grad_norm": 10.341431617736816, "learning_rate": 4.36362598470145e-05, "loss": 5.6291, "step": 5680 }, { "epoch": 0.6481006891053022, "grad_norm": 6.144728660583496, "learning_rate": 4.3624843018609436e-05, "loss": 5.5985, "step": 5690 }, { "epoch": 0.6492397061336067, "grad_norm": 10.256707191467285, "learning_rate": 4.361342619020437e-05, "loss": 5.7659, "step": 5700 }, { "epoch": 0.6503787231619113, "grad_norm": 8.072504997253418, "learning_rate": 4.360200936179929e-05, "loss": 5.7809, "step": 5710 }, { "epoch": 0.6515177401902158, "grad_norm": 6.1178178787231445, "learning_rate": 4.3590592533394225e-05, "loss": 5.7802, "step": 5720 }, { "epoch": 0.6526567572185205, "grad_norm": 10.861801147460938, "learning_rate": 4.357917570498916e-05, "loss": 6.0257, "step": 5730 }, { "epoch": 0.653795774246825, "grad_norm": 7.259403705596924, "learning_rate": 4.356775887658409e-05, "loss": 5.8374, "step": 5740 }, { "epoch": 0.6549347912751295, "grad_norm": 5.754521369934082, "learning_rate": 4.3556342048179014e-05, "loss": 6.006, "step": 5750 }, { "epoch": 0.6560738083034341, "grad_norm": 10.971464157104492, "learning_rate": 4.3544925219773946e-05, "loss": 6.0546, "step": 5760 }, { "epoch": 0.6572128253317387, "grad_norm": 4.653876781463623, "learning_rate": 4.3533508391368884e-05, "loss": 5.6189, "step": 5770 }, { "epoch": 0.6583518423600433, "grad_norm": 14.550445556640625, "learning_rate": 4.352209156296381e-05, "loss": 6.1425, "step": 5780 }, { "epoch": 0.6594908593883478, "grad_norm": 6.240013599395752, "learning_rate": 4.351067473455874e-05, "loss": 5.9321, "step": 5790 }, { "epoch": 0.6606298764166524, "grad_norm": 10.119818687438965, "learning_rate": 4.349925790615367e-05, "loss": 5.9247, "step": 5800 }, { "epoch": 0.661768893444957, "grad_norm": 8.448205947875977, "learning_rate": 4.3487841077748605e-05, "loss": 6.3629, "step": 5810 }, { "epoch": 0.6629079104732616, "grad_norm": 8.628769874572754, "learning_rate": 4.3476424249343537e-05, "loss": 6.2197, "step": 5820 }, { "epoch": 0.6640469275015661, "grad_norm": 5.994671821594238, "learning_rate": 4.346500742093846e-05, "loss": 6.1132, "step": 5830 }, { "epoch": 0.6651859445298707, "grad_norm": 26.20197868347168, "learning_rate": 4.3453590592533393e-05, "loss": 5.7933, "step": 5840 }, { "epoch": 0.6663249615581753, "grad_norm": 10.80951976776123, "learning_rate": 4.344217376412833e-05, "loss": 5.7691, "step": 5850 }, { "epoch": 0.6674639785864799, "grad_norm": 20.93010139465332, "learning_rate": 4.343075693572326e-05, "loss": 5.7721, "step": 5860 }, { "epoch": 0.6686029956147844, "grad_norm": 7.294173240661621, "learning_rate": 4.341934010731819e-05, "loss": 5.8856, "step": 5870 }, { "epoch": 0.669742012643089, "grad_norm": 7.660623073577881, "learning_rate": 4.340792327891312e-05, "loss": 5.8282, "step": 5880 }, { "epoch": 0.6708810296713936, "grad_norm": 7.517411708831787, "learning_rate": 4.339650645050805e-05, "loss": 5.9608, "step": 5890 }, { "epoch": 0.6720200466996982, "grad_norm": 8.141441345214844, "learning_rate": 4.3385089622102985e-05, "loss": 6.053, "step": 5900 }, { "epoch": 0.6731590637280027, "grad_norm": 5.962698936462402, "learning_rate": 4.337367279369791e-05, "loss": 5.9031, "step": 5910 }, { "epoch": 0.6742980807563073, "grad_norm": 5.588809013366699, "learning_rate": 4.336225596529284e-05, "loss": 5.7685, "step": 5920 }, { "epoch": 0.6754370977846119, "grad_norm": 9.669516563415527, "learning_rate": 4.335083913688777e-05, "loss": 5.9723, "step": 5930 }, { "epoch": 0.6765761148129165, "grad_norm": 7.2432756423950195, "learning_rate": 4.3339422308482705e-05, "loss": 5.8062, "step": 5940 }, { "epoch": 0.677715131841221, "grad_norm": 8.40849494934082, "learning_rate": 4.332800548007764e-05, "loss": 5.6209, "step": 5950 }, { "epoch": 0.6788541488695256, "grad_norm": 17.449857711791992, "learning_rate": 4.331658865167257e-05, "loss": 5.8675, "step": 5960 }, { "epoch": 0.6799931658978302, "grad_norm": 4.8293328285217285, "learning_rate": 4.33051718232675e-05, "loss": 5.8157, "step": 5970 }, { "epoch": 0.6811321829261348, "grad_norm": 11.033587455749512, "learning_rate": 4.3293754994862426e-05, "loss": 5.847, "step": 5980 }, { "epoch": 0.6822711999544393, "grad_norm": 6.28988790512085, "learning_rate": 4.328233816645736e-05, "loss": 5.7486, "step": 5990 }, { "epoch": 0.6834102169827438, "grad_norm": 6.424103736877441, "learning_rate": 4.327092133805229e-05, "loss": 5.9153, "step": 6000 }, { "epoch": 0.6834102169827438, "eval_loss": 6.000249862670898, "eval_runtime": 11.8864, "eval_samples_per_second": 1.262, "eval_steps_per_second": 0.168, "step": 6000 }, { "epoch": 0.6845492340110485, "grad_norm": 6.374187469482422, "learning_rate": 4.325950450964722e-05, "loss": 5.7432, "step": 6010 }, { "epoch": 0.685688251039353, "grad_norm": 8.024246215820312, "learning_rate": 4.324808768124215e-05, "loss": 6.0022, "step": 6020 }, { "epoch": 0.6868272680676576, "grad_norm": 9.164938926696777, "learning_rate": 4.3236670852837085e-05, "loss": 6.0697, "step": 6030 }, { "epoch": 0.6879662850959621, "grad_norm": 11.665236473083496, "learning_rate": 4.322525402443202e-05, "loss": 5.997, "step": 6040 }, { "epoch": 0.6891053021242668, "grad_norm": 6.842959403991699, "learning_rate": 4.321383719602695e-05, "loss": 5.3779, "step": 6050 }, { "epoch": 0.6902443191525713, "grad_norm": 6.97825288772583, "learning_rate": 4.3202420367621874e-05, "loss": 5.7943, "step": 6060 }, { "epoch": 0.6913833361808759, "grad_norm": 7.083444118499756, "learning_rate": 4.3191003539216805e-05, "loss": 5.707, "step": 6070 }, { "epoch": 0.6925223532091804, "grad_norm": 6.394343852996826, "learning_rate": 4.317958671081174e-05, "loss": 6.012, "step": 6080 }, { "epoch": 0.6936613702374851, "grad_norm": 6.833849906921387, "learning_rate": 4.316816988240667e-05, "loss": 6.1103, "step": 6090 }, { "epoch": 0.6948003872657896, "grad_norm": 5.809621810913086, "learning_rate": 4.31567530540016e-05, "loss": 5.968, "step": 6100 }, { "epoch": 0.6959394042940942, "grad_norm": 6.108304023742676, "learning_rate": 4.314533622559653e-05, "loss": 5.9533, "step": 6110 }, { "epoch": 0.6970784213223987, "grad_norm": 8.843610763549805, "learning_rate": 4.3133919397191465e-05, "loss": 5.7717, "step": 6120 }, { "epoch": 0.6982174383507034, "grad_norm": 6.577777862548828, "learning_rate": 4.3122502568786396e-05, "loss": 5.9296, "step": 6130 }, { "epoch": 0.6993564553790079, "grad_norm": 6.981563091278076, "learning_rate": 4.311108574038132e-05, "loss": 6.0679, "step": 6140 }, { "epoch": 0.7004954724073125, "grad_norm": 7.8999433517456055, "learning_rate": 4.309966891197625e-05, "loss": 5.8404, "step": 6150 }, { "epoch": 0.701634489435617, "grad_norm": 5.662416934967041, "learning_rate": 4.3088252083571185e-05, "loss": 6.1544, "step": 6160 }, { "epoch": 0.7027735064639217, "grad_norm": 12.454471588134766, "learning_rate": 4.307683525516612e-05, "loss": 5.7012, "step": 6170 }, { "epoch": 0.7039125234922262, "grad_norm": 11.734405517578125, "learning_rate": 4.306541842676105e-05, "loss": 5.7161, "step": 6180 }, { "epoch": 0.7050515405205308, "grad_norm": 7.174385070800781, "learning_rate": 4.305400159835598e-05, "loss": 5.6746, "step": 6190 }, { "epoch": 0.7061905575488353, "grad_norm": 5.351472854614258, "learning_rate": 4.304258476995091e-05, "loss": 5.8431, "step": 6200 }, { "epoch": 0.70732957457714, "grad_norm": 5.916141986846924, "learning_rate": 4.3031167941545844e-05, "loss": 5.9033, "step": 6210 }, { "epoch": 0.7084685916054445, "grad_norm": 7.05497932434082, "learning_rate": 4.301975111314077e-05, "loss": 5.6192, "step": 6220 }, { "epoch": 0.709607608633749, "grad_norm": 14.515453338623047, "learning_rate": 4.30083342847357e-05, "loss": 6.2751, "step": 6230 }, { "epoch": 0.7107466256620536, "grad_norm": 9.84923267364502, "learning_rate": 4.299691745633063e-05, "loss": 5.9913, "step": 6240 }, { "epoch": 0.7118856426903583, "grad_norm": 8.738007545471191, "learning_rate": 4.2985500627925565e-05, "loss": 5.8785, "step": 6250 }, { "epoch": 0.7130246597186628, "grad_norm": 5.9236860275268555, "learning_rate": 4.297408379952049e-05, "loss": 5.8098, "step": 6260 }, { "epoch": 0.7141636767469673, "grad_norm": 6.191288471221924, "learning_rate": 4.296266697111543e-05, "loss": 6.0675, "step": 6270 }, { "epoch": 0.7153026937752719, "grad_norm": 5.791018486022949, "learning_rate": 4.295125014271036e-05, "loss": 6.1648, "step": 6280 }, { "epoch": 0.7164417108035765, "grad_norm": 10.756135940551758, "learning_rate": 4.2939833314305286e-05, "loss": 6.0626, "step": 6290 }, { "epoch": 0.7175807278318811, "grad_norm": 7.2194976806640625, "learning_rate": 4.292841648590022e-05, "loss": 5.7357, "step": 6300 }, { "epoch": 0.7187197448601856, "grad_norm": 6.4318318367004395, "learning_rate": 4.291699965749515e-05, "loss": 5.7417, "step": 6310 }, { "epoch": 0.7198587618884902, "grad_norm": 12.609630584716797, "learning_rate": 4.290558282909008e-05, "loss": 5.7873, "step": 6320 }, { "epoch": 0.7209977789167948, "grad_norm": 6.773333549499512, "learning_rate": 4.289416600068501e-05, "loss": 6.0641, "step": 6330 }, { "epoch": 0.7221367959450994, "grad_norm": 7.28901481628418, "learning_rate": 4.288274917227994e-05, "loss": 5.7901, "step": 6340 }, { "epoch": 0.7232758129734039, "grad_norm": 7.068140029907227, "learning_rate": 4.2871332343874877e-05, "loss": 5.9178, "step": 6350 }, { "epoch": 0.7244148300017085, "grad_norm": 6.572689533233643, "learning_rate": 4.285991551546981e-05, "loss": 5.7508, "step": 6360 }, { "epoch": 0.7255538470300131, "grad_norm": 7.898672580718994, "learning_rate": 4.2848498687064733e-05, "loss": 6.1836, "step": 6370 }, { "epoch": 0.7266928640583177, "grad_norm": 5.110747337341309, "learning_rate": 4.2837081858659665e-05, "loss": 5.6782, "step": 6380 }, { "epoch": 0.7278318810866222, "grad_norm": 10.442312240600586, "learning_rate": 4.28256650302546e-05, "loss": 6.0124, "step": 6390 }, { "epoch": 0.7289708981149268, "grad_norm": 9.792623519897461, "learning_rate": 4.281424820184953e-05, "loss": 5.7531, "step": 6400 }, { "epoch": 0.7301099151432314, "grad_norm": 12.85150146484375, "learning_rate": 4.280283137344446e-05, "loss": 5.7829, "step": 6410 }, { "epoch": 0.731248932171536, "grad_norm": 9.494611740112305, "learning_rate": 4.2791414545039386e-05, "loss": 6.0707, "step": 6420 }, { "epoch": 0.7323879491998405, "grad_norm": 8.019498825073242, "learning_rate": 4.2779997716634324e-05, "loss": 5.9736, "step": 6430 }, { "epoch": 0.7335269662281451, "grad_norm": 6.833261013031006, "learning_rate": 4.2768580888229256e-05, "loss": 5.6632, "step": 6440 }, { "epoch": 0.7346659832564497, "grad_norm": 6.260386943817139, "learning_rate": 4.275716405982418e-05, "loss": 5.7989, "step": 6450 }, { "epoch": 0.7358050002847543, "grad_norm": 11.511929512023926, "learning_rate": 4.274574723141911e-05, "loss": 6.0158, "step": 6460 }, { "epoch": 0.7369440173130588, "grad_norm": 6.045806407928467, "learning_rate": 4.2734330403014045e-05, "loss": 5.6529, "step": 6470 }, { "epoch": 0.7380830343413634, "grad_norm": 5.465932369232178, "learning_rate": 4.272291357460898e-05, "loss": 6.1358, "step": 6480 }, { "epoch": 0.739222051369668, "grad_norm": 4.64259147644043, "learning_rate": 4.27114967462039e-05, "loss": 5.8446, "step": 6490 }, { "epoch": 0.7403610683979726, "grad_norm": 11.950940132141113, "learning_rate": 4.2700079917798834e-05, "loss": 5.5124, "step": 6500 }, { "epoch": 0.7415000854262771, "grad_norm": 11.80821418762207, "learning_rate": 4.268866308939377e-05, "loss": 5.3712, "step": 6510 }, { "epoch": 0.7426391024545816, "grad_norm": 7.256740093231201, "learning_rate": 4.2677246260988704e-05, "loss": 5.6585, "step": 6520 }, { "epoch": 0.7437781194828863, "grad_norm": 9.645147323608398, "learning_rate": 4.266582943258363e-05, "loss": 6.0998, "step": 6530 }, { "epoch": 0.7449171365111908, "grad_norm": 6.811032295227051, "learning_rate": 4.265441260417856e-05, "loss": 5.983, "step": 6540 }, { "epoch": 0.7460561535394954, "grad_norm": 7.274852752685547, "learning_rate": 4.264299577577349e-05, "loss": 5.8492, "step": 6550 }, { "epoch": 0.7471951705677999, "grad_norm": 14.114151954650879, "learning_rate": 4.2631578947368425e-05, "loss": 5.8024, "step": 6560 }, { "epoch": 0.7483341875961046, "grad_norm": 5.180044651031494, "learning_rate": 4.262016211896335e-05, "loss": 5.8085, "step": 6570 }, { "epoch": 0.7494732046244091, "grad_norm": 7.047656536102295, "learning_rate": 4.260874529055828e-05, "loss": 5.9068, "step": 6580 }, { "epoch": 0.7506122216527137, "grad_norm": 5.658132553100586, "learning_rate": 4.2597328462153214e-05, "loss": 5.7752, "step": 6590 }, { "epoch": 0.7517512386810182, "grad_norm": 7.832466125488281, "learning_rate": 4.258591163374815e-05, "loss": 5.9246, "step": 6600 }, { "epoch": 0.7528902557093229, "grad_norm": 7.179642200469971, "learning_rate": 4.257449480534308e-05, "loss": 5.8667, "step": 6610 }, { "epoch": 0.7540292727376274, "grad_norm": 7.623144149780273, "learning_rate": 4.256307797693801e-05, "loss": 6.0546, "step": 6620 }, { "epoch": 0.755168289765932, "grad_norm": 8.365649223327637, "learning_rate": 4.255166114853294e-05, "loss": 5.8384, "step": 6630 }, { "epoch": 0.7563073067942365, "grad_norm": 5.382843494415283, "learning_rate": 4.254024432012787e-05, "loss": 5.6101, "step": 6640 }, { "epoch": 0.7574463238225412, "grad_norm": 6.739308834075928, "learning_rate": 4.25288274917228e-05, "loss": 5.5094, "step": 6650 }, { "epoch": 0.7585853408508457, "grad_norm": 14.110841751098633, "learning_rate": 4.251741066331773e-05, "loss": 5.6044, "step": 6660 }, { "epoch": 0.7597243578791503, "grad_norm": 11.060261726379395, "learning_rate": 4.250599383491266e-05, "loss": 5.8924, "step": 6670 }, { "epoch": 0.7608633749074548, "grad_norm": 10.84189510345459, "learning_rate": 4.249457700650759e-05, "loss": 5.867, "step": 6680 }, { "epoch": 0.7620023919357595, "grad_norm": 8.165534019470215, "learning_rate": 4.2483160178102525e-05, "loss": 5.894, "step": 6690 }, { "epoch": 0.763141408964064, "grad_norm": 13.278603553771973, "learning_rate": 4.247174334969746e-05, "loss": 5.7121, "step": 6700 }, { "epoch": 0.7642804259923686, "grad_norm": 4.628503799438477, "learning_rate": 4.246032652129239e-05, "loss": 6.1128, "step": 6710 }, { "epoch": 0.7654194430206731, "grad_norm": 12.247485160827637, "learning_rate": 4.244890969288732e-05, "loss": 5.8867, "step": 6720 }, { "epoch": 0.7665584600489778, "grad_norm": 7.217540740966797, "learning_rate": 4.2437492864482246e-05, "loss": 5.7733, "step": 6730 }, { "epoch": 0.7676974770772823, "grad_norm": 7.513230800628662, "learning_rate": 4.242607603607718e-05, "loss": 6.0099, "step": 6740 }, { "epoch": 0.7688364941055869, "grad_norm": 7.3117499351501465, "learning_rate": 4.241465920767211e-05, "loss": 5.6005, "step": 6750 }, { "epoch": 0.7699755111338914, "grad_norm": 6.87970495223999, "learning_rate": 4.240324237926704e-05, "loss": 6.2145, "step": 6760 }, { "epoch": 0.7711145281621961, "grad_norm": 22.215946197509766, "learning_rate": 4.239182555086197e-05, "loss": 5.6321, "step": 6770 }, { "epoch": 0.7722535451905006, "grad_norm": 8.393705368041992, "learning_rate": 4.2380408722456905e-05, "loss": 5.8547, "step": 6780 }, { "epoch": 0.7733925622188051, "grad_norm": 6.210604190826416, "learning_rate": 4.236899189405184e-05, "loss": 5.6622, "step": 6790 }, { "epoch": 0.7745315792471097, "grad_norm": 8.281871795654297, "learning_rate": 4.235757506564676e-05, "loss": 5.5941, "step": 6800 }, { "epoch": 0.7756705962754143, "grad_norm": 8.00438404083252, "learning_rate": 4.2346158237241694e-05, "loss": 6.1111, "step": 6810 }, { "epoch": 0.7768096133037189, "grad_norm": 16.651893615722656, "learning_rate": 4.2334741408836625e-05, "loss": 6.0269, "step": 6820 }, { "epoch": 0.7779486303320234, "grad_norm": 8.054128646850586, "learning_rate": 4.232332458043156e-05, "loss": 5.5274, "step": 6830 }, { "epoch": 0.779087647360328, "grad_norm": 6.785789489746094, "learning_rate": 4.231190775202649e-05, "loss": 6.1035, "step": 6840 }, { "epoch": 0.7802266643886326, "grad_norm": 8.805694580078125, "learning_rate": 4.230049092362142e-05, "loss": 6.0284, "step": 6850 }, { "epoch": 0.7813656814169372, "grad_norm": 8.542845726013184, "learning_rate": 4.228907409521635e-05, "loss": 5.5808, "step": 6860 }, { "epoch": 0.7825046984452417, "grad_norm": 10.117310523986816, "learning_rate": 4.2277657266811285e-05, "loss": 5.6974, "step": 6870 }, { "epoch": 0.7836437154735463, "grad_norm": 7.026234149932861, "learning_rate": 4.226624043840621e-05, "loss": 5.6559, "step": 6880 }, { "epoch": 0.7847827325018509, "grad_norm": 6.190097332000732, "learning_rate": 4.225482361000114e-05, "loss": 5.6631, "step": 6890 }, { "epoch": 0.7859217495301555, "grad_norm": 5.212761878967285, "learning_rate": 4.2243406781596073e-05, "loss": 6.198, "step": 6900 }, { "epoch": 0.78706076655846, "grad_norm": 4.255821228027344, "learning_rate": 4.2231989953191005e-05, "loss": 5.9244, "step": 6910 }, { "epoch": 0.7881997835867646, "grad_norm": 3.5964879989624023, "learning_rate": 4.222057312478594e-05, "loss": 6.0321, "step": 6920 }, { "epoch": 0.7893388006150692, "grad_norm": 5.330949783325195, "learning_rate": 4.220915629638087e-05, "loss": 5.7435, "step": 6930 }, { "epoch": 0.7904778176433738, "grad_norm": 17.729000091552734, "learning_rate": 4.21977394679758e-05, "loss": 5.7756, "step": 6940 }, { "epoch": 0.7916168346716783, "grad_norm": 4.763799667358398, "learning_rate": 4.218632263957073e-05, "loss": 5.8928, "step": 6950 }, { "epoch": 0.7927558516999829, "grad_norm": 6.26491641998291, "learning_rate": 4.217490581116566e-05, "loss": 5.7309, "step": 6960 }, { "epoch": 0.7938948687282875, "grad_norm": 6.810174465179443, "learning_rate": 4.216348898276059e-05, "loss": 5.6357, "step": 6970 }, { "epoch": 0.7950338857565921, "grad_norm": 7.7829718589782715, "learning_rate": 4.215207215435552e-05, "loss": 5.7724, "step": 6980 }, { "epoch": 0.7961729027848966, "grad_norm": 15.339445114135742, "learning_rate": 4.214065532595045e-05, "loss": 5.5577, "step": 6990 }, { "epoch": 0.7973119198132012, "grad_norm": 9.3477201461792, "learning_rate": 4.212923849754538e-05, "loss": 6.085, "step": 7000 }, { "epoch": 0.7984509368415058, "grad_norm": 5.963769435882568, "learning_rate": 4.211782166914032e-05, "loss": 5.9804, "step": 7010 }, { "epoch": 0.7995899538698104, "grad_norm": 7.498968124389648, "learning_rate": 4.210640484073525e-05, "loss": 5.8361, "step": 7020 }, { "epoch": 0.8007289708981149, "grad_norm": 15.94110107421875, "learning_rate": 4.209498801233018e-05, "loss": 5.8782, "step": 7030 }, { "epoch": 0.8018679879264194, "grad_norm": 6.359627723693848, "learning_rate": 4.2083571183925106e-05, "loss": 5.9358, "step": 7040 }, { "epoch": 0.8030070049547241, "grad_norm": 4.368448257446289, "learning_rate": 4.207215435552004e-05, "loss": 5.7184, "step": 7050 }, { "epoch": 0.8041460219830286, "grad_norm": 8.287890434265137, "learning_rate": 4.206073752711497e-05, "loss": 5.7268, "step": 7060 }, { "epoch": 0.8052850390113332, "grad_norm": 3.759228467941284, "learning_rate": 4.20493206987099e-05, "loss": 6.0169, "step": 7070 }, { "epoch": 0.8064240560396377, "grad_norm": 7.766053199768066, "learning_rate": 4.2037903870304826e-05, "loss": 5.9249, "step": 7080 }, { "epoch": 0.8075630730679424, "grad_norm": 5.910696029663086, "learning_rate": 4.2026487041899765e-05, "loss": 5.6922, "step": 7090 }, { "epoch": 0.8087020900962469, "grad_norm": 11.534826278686523, "learning_rate": 4.2015070213494697e-05, "loss": 5.8633, "step": 7100 }, { "epoch": 0.8098411071245515, "grad_norm": 11.53891372680664, "learning_rate": 4.200365338508963e-05, "loss": 6.0148, "step": 7110 }, { "epoch": 0.810980124152856, "grad_norm": 9.300012588500977, "learning_rate": 4.1992236556684553e-05, "loss": 5.6454, "step": 7120 }, { "epoch": 0.8121191411811607, "grad_norm": 10.440338134765625, "learning_rate": 4.1980819728279485e-05, "loss": 5.6689, "step": 7130 }, { "epoch": 0.8132581582094652, "grad_norm": 5.3286542892456055, "learning_rate": 4.196940289987442e-05, "loss": 5.7747, "step": 7140 }, { "epoch": 0.8143971752377698, "grad_norm": 6.505975723266602, "learning_rate": 4.195798607146935e-05, "loss": 5.6918, "step": 7150 }, { "epoch": 0.8155361922660743, "grad_norm": 4.70256233215332, "learning_rate": 4.1946569243064274e-05, "loss": 5.6079, "step": 7160 }, { "epoch": 0.816675209294379, "grad_norm": 4.606108665466309, "learning_rate": 4.193515241465921e-05, "loss": 5.7555, "step": 7170 }, { "epoch": 0.8178142263226835, "grad_norm": 5.840761661529541, "learning_rate": 4.1923735586254145e-05, "loss": 5.9702, "step": 7180 }, { "epoch": 0.8189532433509881, "grad_norm": 7.519806385040283, "learning_rate": 4.191231875784907e-05, "loss": 5.8691, "step": 7190 }, { "epoch": 0.8200922603792927, "grad_norm": 9.685820579528809, "learning_rate": 4.1900901929444e-05, "loss": 5.7984, "step": 7200 }, { "epoch": 0.8212312774075973, "grad_norm": 5.0011515617370605, "learning_rate": 4.188948510103893e-05, "loss": 5.6861, "step": 7210 }, { "epoch": 0.8223702944359018, "grad_norm": 7.479933738708496, "learning_rate": 4.1878068272633865e-05, "loss": 5.8211, "step": 7220 }, { "epoch": 0.8235093114642064, "grad_norm": 24.248559951782227, "learning_rate": 4.18666514442288e-05, "loss": 6.106, "step": 7230 }, { "epoch": 0.824648328492511, "grad_norm": 5.33579158782959, "learning_rate": 4.185523461582372e-05, "loss": 5.8561, "step": 7240 }, { "epoch": 0.8257873455208156, "grad_norm": 5.8180341720581055, "learning_rate": 4.1844959470259165e-05, "loss": 5.671, "step": 7250 }, { "epoch": 0.8269263625491201, "grad_norm": 7.822412967681885, "learning_rate": 4.1833542641854097e-05, "loss": 5.7731, "step": 7260 }, { "epoch": 0.8280653795774247, "grad_norm": 4.881489276885986, "learning_rate": 4.182212581344903e-05, "loss": 6.17, "step": 7270 }, { "epoch": 0.8292043966057293, "grad_norm": 9.92574691772461, "learning_rate": 4.1810708985043953e-05, "loss": 5.6865, "step": 7280 }, { "epoch": 0.8303434136340339, "grad_norm": 6.111751556396484, "learning_rate": 4.1799292156638885e-05, "loss": 5.9666, "step": 7290 }, { "epoch": 0.8314824306623384, "grad_norm": 16.988025665283203, "learning_rate": 4.178787532823382e-05, "loss": 5.5669, "step": 7300 }, { "epoch": 0.832621447690643, "grad_norm": 7.006514549255371, "learning_rate": 4.177645849982875e-05, "loss": 5.8763, "step": 7310 }, { "epoch": 0.8337604647189476, "grad_norm": 6.63994836807251, "learning_rate": 4.176504167142368e-05, "loss": 5.6928, "step": 7320 }, { "epoch": 0.8348994817472521, "grad_norm": 7.609856128692627, "learning_rate": 4.175362484301861e-05, "loss": 6.0222, "step": 7330 }, { "epoch": 0.8360384987755567, "grad_norm": 6.351384162902832, "learning_rate": 4.1742208014613544e-05, "loss": 6.1061, "step": 7340 }, { "epoch": 0.8371775158038612, "grad_norm": 9.336108207702637, "learning_rate": 4.1730791186208476e-05, "loss": 5.681, "step": 7350 }, { "epoch": 0.8383165328321659, "grad_norm": 6.0924272537231445, "learning_rate": 4.17193743578034e-05, "loss": 6.0066, "step": 7360 }, { "epoch": 0.8394555498604704, "grad_norm": 9.09196662902832, "learning_rate": 4.170795752939833e-05, "loss": 6.15, "step": 7370 }, { "epoch": 0.840594566888775, "grad_norm": 5.710347652435303, "learning_rate": 4.1696540700993265e-05, "loss": 5.6897, "step": 7380 }, { "epoch": 0.8417335839170795, "grad_norm": 5.6023688316345215, "learning_rate": 4.16851238725882e-05, "loss": 5.6246, "step": 7390 }, { "epoch": 0.8428726009453842, "grad_norm": 10.401467323303223, "learning_rate": 4.167370704418313e-05, "loss": 5.9081, "step": 7400 }, { "epoch": 0.8440116179736887, "grad_norm": 11.755331039428711, "learning_rate": 4.166229021577806e-05, "loss": 5.9648, "step": 7410 }, { "epoch": 0.8451506350019933, "grad_norm": 5.7525248527526855, "learning_rate": 4.165087338737299e-05, "loss": 5.7465, "step": 7420 }, { "epoch": 0.8462896520302978, "grad_norm": 8.131318092346191, "learning_rate": 4.1639456558967924e-05, "loss": 5.8959, "step": 7430 }, { "epoch": 0.8474286690586025, "grad_norm": 6.65851354598999, "learning_rate": 4.162803973056285e-05, "loss": 5.5865, "step": 7440 }, { "epoch": 0.848567686086907, "grad_norm": 5.441812992095947, "learning_rate": 4.161662290215778e-05, "loss": 5.6765, "step": 7450 }, { "epoch": 0.8497067031152116, "grad_norm": 11.929362297058105, "learning_rate": 4.160520607375271e-05, "loss": 5.6566, "step": 7460 }, { "epoch": 0.8508457201435161, "grad_norm": 10.595314025878906, "learning_rate": 4.1593789245347645e-05, "loss": 5.9156, "step": 7470 }, { "epoch": 0.8519847371718208, "grad_norm": 25.11834144592285, "learning_rate": 4.158237241694258e-05, "loss": 5.8055, "step": 7480 }, { "epoch": 0.8531237542001253, "grad_norm": 5.37290620803833, "learning_rate": 4.157095558853751e-05, "loss": 5.8243, "step": 7490 }, { "epoch": 0.8542627712284299, "grad_norm": 10.526466369628906, "learning_rate": 4.155953876013244e-05, "loss": 5.9527, "step": 7500 }, { "epoch": 0.8554017882567344, "grad_norm": 7.707073211669922, "learning_rate": 4.1548121931727365e-05, "loss": 5.6988, "step": 7510 }, { "epoch": 0.8565408052850391, "grad_norm": 7.538547992706299, "learning_rate": 4.15367051033223e-05, "loss": 5.7646, "step": 7520 }, { "epoch": 0.8576798223133436, "grad_norm": 9.679366111755371, "learning_rate": 4.152528827491723e-05, "loss": 5.7077, "step": 7530 }, { "epoch": 0.8588188393416482, "grad_norm": 10.528566360473633, "learning_rate": 4.151387144651216e-05, "loss": 5.9138, "step": 7540 }, { "epoch": 0.8599578563699527, "grad_norm": 7.699685573577881, "learning_rate": 4.150245461810709e-05, "loss": 5.7155, "step": 7550 }, { "epoch": 0.8610968733982574, "grad_norm": 5.603579998016357, "learning_rate": 4.1491037789702025e-05, "loss": 5.5373, "step": 7560 }, { "epoch": 0.8622358904265619, "grad_norm": 6.505054950714111, "learning_rate": 4.1479620961296956e-05, "loss": 5.6922, "step": 7570 }, { "epoch": 0.8633749074548664, "grad_norm": 6.861274242401123, "learning_rate": 4.146820413289189e-05, "loss": 6.0833, "step": 7580 }, { "epoch": 0.864513924483171, "grad_norm": 6.467966079711914, "learning_rate": 4.145678730448681e-05, "loss": 5.8859, "step": 7590 }, { "epoch": 0.8656529415114756, "grad_norm": 6.942263603210449, "learning_rate": 4.1445370476081745e-05, "loss": 5.8636, "step": 7600 }, { "epoch": 0.8667919585397802, "grad_norm": 6.266470432281494, "learning_rate": 4.143395364767668e-05, "loss": 5.7906, "step": 7610 }, { "epoch": 0.8679309755680847, "grad_norm": 10.374220848083496, "learning_rate": 4.142253681927161e-05, "loss": 5.7794, "step": 7620 }, { "epoch": 0.8690699925963893, "grad_norm": 8.868586540222168, "learning_rate": 4.1411119990866534e-05, "loss": 6.1445, "step": 7630 }, { "epoch": 0.8702090096246939, "grad_norm": 7.694916725158691, "learning_rate": 4.139970316246147e-05, "loss": 5.7188, "step": 7640 }, { "epoch": 0.8713480266529985, "grad_norm": 9.376993179321289, "learning_rate": 4.1388286334056404e-05, "loss": 5.5129, "step": 7650 }, { "epoch": 0.872487043681303, "grad_norm": 7.108951568603516, "learning_rate": 4.1376869505651336e-05, "loss": 6.0777, "step": 7660 }, { "epoch": 0.8736260607096076, "grad_norm": 5.349836349487305, "learning_rate": 4.136545267724626e-05, "loss": 5.6746, "step": 7670 }, { "epoch": 0.8747650777379122, "grad_norm": 23.05253028869629, "learning_rate": 4.135403584884119e-05, "loss": 5.6119, "step": 7680 }, { "epoch": 0.8759040947662168, "grad_norm": 8.042037963867188, "learning_rate": 4.1342619020436125e-05, "loss": 6.0481, "step": 7690 }, { "epoch": 0.8770431117945213, "grad_norm": 7.190995216369629, "learning_rate": 4.133120219203106e-05, "loss": 5.6291, "step": 7700 }, { "epoch": 0.8781821288228259, "grad_norm": 6.2798871994018555, "learning_rate": 4.131978536362598e-05, "loss": 5.8272, "step": 7710 }, { "epoch": 0.8793211458511305, "grad_norm": 6.415441036224365, "learning_rate": 4.1308368535220914e-05, "loss": 5.9067, "step": 7720 }, { "epoch": 0.8804601628794351, "grad_norm": 9.83820915222168, "learning_rate": 4.129695170681585e-05, "loss": 6.2126, "step": 7730 }, { "epoch": 0.8815991799077396, "grad_norm": 6.543364524841309, "learning_rate": 4.1285534878410784e-05, "loss": 5.7905, "step": 7740 }, { "epoch": 0.8827381969360442, "grad_norm": 5.862452507019043, "learning_rate": 4.127411805000571e-05, "loss": 5.6371, "step": 7750 }, { "epoch": 0.8838772139643488, "grad_norm": 6.425840377807617, "learning_rate": 4.126270122160064e-05, "loss": 5.6843, "step": 7760 }, { "epoch": 0.8850162309926534, "grad_norm": 7.781904697418213, "learning_rate": 4.125128439319557e-05, "loss": 5.6806, "step": 7770 }, { "epoch": 0.8861552480209579, "grad_norm": 6.408961772918701, "learning_rate": 4.1239867564790505e-05, "loss": 6.0425, "step": 7780 }, { "epoch": 0.8872942650492625, "grad_norm": 6.187387943267822, "learning_rate": 4.122845073638543e-05, "loss": 6.662, "step": 7790 }, { "epoch": 0.8884332820775671, "grad_norm": 5.426062107086182, "learning_rate": 4.121703390798036e-05, "loss": 5.5295, "step": 7800 }, { "epoch": 0.8895722991058717, "grad_norm": 6.168425559997559, "learning_rate": 4.12056170795753e-05, "loss": 5.8367, "step": 7810 }, { "epoch": 0.8907113161341762, "grad_norm": 6.9254679679870605, "learning_rate": 4.1194200251170225e-05, "loss": 5.6099, "step": 7820 }, { "epoch": 0.8918503331624807, "grad_norm": 4.471135139465332, "learning_rate": 4.118278342276516e-05, "loss": 5.8812, "step": 7830 }, { "epoch": 0.8929893501907854, "grad_norm": 11.914435386657715, "learning_rate": 4.117136659436009e-05, "loss": 5.7133, "step": 7840 }, { "epoch": 0.89412836721909, "grad_norm": 6.855250835418701, "learning_rate": 4.115994976595502e-05, "loss": 5.5045, "step": 7850 }, { "epoch": 0.8952673842473945, "grad_norm": 4.5126190185546875, "learning_rate": 4.114853293754995e-05, "loss": 6.1189, "step": 7860 }, { "epoch": 0.896406401275699, "grad_norm": 5.44443941116333, "learning_rate": 4.113711610914488e-05, "loss": 5.6838, "step": 7870 }, { "epoch": 0.8975454183040037, "grad_norm": 6.447963237762451, "learning_rate": 4.112569928073981e-05, "loss": 5.7742, "step": 7880 }, { "epoch": 0.8986844353323082, "grad_norm": 9.784255981445312, "learning_rate": 4.111428245233475e-05, "loss": 5.6421, "step": 7890 }, { "epoch": 0.8998234523606128, "grad_norm": 4.79591178894043, "learning_rate": 4.110286562392967e-05, "loss": 5.8867, "step": 7900 }, { "epoch": 0.9009624693889173, "grad_norm": 10.118879318237305, "learning_rate": 4.1091448795524605e-05, "loss": 5.7606, "step": 7910 }, { "epoch": 0.902101486417222, "grad_norm": 4.7326483726501465, "learning_rate": 4.108003196711954e-05, "loss": 5.8425, "step": 7920 }, { "epoch": 0.9032405034455265, "grad_norm": 9.731658935546875, "learning_rate": 4.106861513871447e-05, "loss": 5.6166, "step": 7930 }, { "epoch": 0.9043795204738311, "grad_norm": 9.106302261352539, "learning_rate": 4.1057198310309394e-05, "loss": 5.9631, "step": 7940 }, { "epoch": 0.9055185375021356, "grad_norm": 8.407746315002441, "learning_rate": 4.1045781481904326e-05, "loss": 5.9826, "step": 7950 }, { "epoch": 0.9066575545304403, "grad_norm": 7.38493537902832, "learning_rate": 4.103436465349926e-05, "loss": 5.3733, "step": 7960 }, { "epoch": 0.9077965715587448, "grad_norm": 4.594963550567627, "learning_rate": 4.1022947825094196e-05, "loss": 5.5044, "step": 7970 }, { "epoch": 0.9089355885870494, "grad_norm": 6.537250995635986, "learning_rate": 4.101153099668912e-05, "loss": 5.8437, "step": 7980 }, { "epoch": 0.9100746056153539, "grad_norm": 8.928478240966797, "learning_rate": 4.100011416828405e-05, "loss": 5.8224, "step": 7990 }, { "epoch": 0.9112136226436586, "grad_norm": 6.1974968910217285, "learning_rate": 4.0988697339878985e-05, "loss": 5.7735, "step": 8000 }, { "epoch": 0.9112136226436586, "eval_loss": 5.963014125823975, "eval_runtime": 12.0216, "eval_samples_per_second": 1.248, "eval_steps_per_second": 0.166, "step": 8000 }, { "epoch": 0.9123526396719631, "grad_norm": 16.432716369628906, "learning_rate": 4.0977280511473917e-05, "loss": 5.3713, "step": 8010 }, { "epoch": 0.9134916567002677, "grad_norm": 8.134661674499512, "learning_rate": 4.096586368306884e-05, "loss": 5.7447, "step": 8020 }, { "epoch": 0.9146306737285722, "grad_norm": 6.499509334564209, "learning_rate": 4.0954446854663773e-05, "loss": 5.5174, "step": 8030 }, { "epoch": 0.9157696907568769, "grad_norm": 6.545501708984375, "learning_rate": 4.0943030026258705e-05, "loss": 6.1282, "step": 8040 }, { "epoch": 0.9169087077851814, "grad_norm": 5.5157976150512695, "learning_rate": 4.093161319785364e-05, "loss": 6.0457, "step": 8050 }, { "epoch": 0.918047724813486, "grad_norm": 6.671703338623047, "learning_rate": 4.092019636944857e-05, "loss": 5.6853, "step": 8060 }, { "epoch": 0.9191867418417905, "grad_norm": 8.327787399291992, "learning_rate": 4.09087795410435e-05, "loss": 5.6512, "step": 8070 }, { "epoch": 0.9203257588700952, "grad_norm": 5.751533031463623, "learning_rate": 4.089736271263843e-05, "loss": 5.5488, "step": 8080 }, { "epoch": 0.9214647758983997, "grad_norm": 5.360276222229004, "learning_rate": 4.0885945884233365e-05, "loss": 5.8269, "step": 8090 }, { "epoch": 0.9226037929267042, "grad_norm": 9.118461608886719, "learning_rate": 4.087452905582829e-05, "loss": 5.783, "step": 8100 }, { "epoch": 0.9237428099550088, "grad_norm": 6.865748882293701, "learning_rate": 4.086311222742322e-05, "loss": 5.4973, "step": 8110 }, { "epoch": 0.9248818269833134, "grad_norm": 5.93018102645874, "learning_rate": 4.085169539901815e-05, "loss": 5.8153, "step": 8120 }, { "epoch": 0.926020844011618, "grad_norm": 4.858203887939453, "learning_rate": 4.0840278570613085e-05, "loss": 5.6685, "step": 8130 }, { "epoch": 0.9271598610399225, "grad_norm": 17.164108276367188, "learning_rate": 4.082886174220802e-05, "loss": 6.1546, "step": 8140 }, { "epoch": 0.9282988780682271, "grad_norm": 7.045877456665039, "learning_rate": 4.081744491380295e-05, "loss": 5.5681, "step": 8150 }, { "epoch": 0.9294378950965317, "grad_norm": 7.193668365478516, "learning_rate": 4.080602808539788e-05, "loss": 5.7963, "step": 8160 }, { "epoch": 0.9305769121248363, "grad_norm": 6.006307601928711, "learning_rate": 4.079461125699281e-05, "loss": 5.7842, "step": 8170 }, { "epoch": 0.9317159291531408, "grad_norm": 5.786032199859619, "learning_rate": 4.078319442858774e-05, "loss": 5.7106, "step": 8180 }, { "epoch": 0.9328549461814454, "grad_norm": 8.014665603637695, "learning_rate": 4.077177760018267e-05, "loss": 5.6216, "step": 8190 }, { "epoch": 0.93399396320975, "grad_norm": 7.127926826477051, "learning_rate": 4.07603607717776e-05, "loss": 5.6205, "step": 8200 }, { "epoch": 0.9351329802380546, "grad_norm": 7.680981159210205, "learning_rate": 4.074894394337253e-05, "loss": 5.5081, "step": 8210 }, { "epoch": 0.9362719972663591, "grad_norm": 7.980518817901611, "learning_rate": 4.0737527114967465e-05, "loss": 5.7191, "step": 8220 }, { "epoch": 0.9374110142946637, "grad_norm": 6.019864559173584, "learning_rate": 4.07261102865624e-05, "loss": 6.177, "step": 8230 }, { "epoch": 0.9385500313229683, "grad_norm": 5.620800018310547, "learning_rate": 4.071469345815733e-05, "loss": 5.6332, "step": 8240 }, { "epoch": 0.9396890483512729, "grad_norm": 6.723660945892334, "learning_rate": 4.070327662975226e-05, "loss": 5.5879, "step": 8250 }, { "epoch": 0.9408280653795774, "grad_norm": 9.479137420654297, "learning_rate": 4.0691859801347185e-05, "loss": 5.8988, "step": 8260 }, { "epoch": 0.941967082407882, "grad_norm": 5.0746989250183105, "learning_rate": 4.068044297294212e-05, "loss": 5.6945, "step": 8270 }, { "epoch": 0.9431060994361866, "grad_norm": 8.483436584472656, "learning_rate": 4.066902614453705e-05, "loss": 5.6908, "step": 8280 }, { "epoch": 0.9442451164644912, "grad_norm": 5.951892852783203, "learning_rate": 4.065760931613198e-05, "loss": 5.6396, "step": 8290 }, { "epoch": 0.9453841334927957, "grad_norm": 6.2564496994018555, "learning_rate": 4.064619248772691e-05, "loss": 5.7533, "step": 8300 }, { "epoch": 0.9465231505211003, "grad_norm": 6.395124435424805, "learning_rate": 4.0634775659321845e-05, "loss": 5.7885, "step": 8310 }, { "epoch": 0.9476621675494049, "grad_norm": 4.897542953491211, "learning_rate": 4.0623358830916776e-05, "loss": 5.8378, "step": 8320 }, { "epoch": 0.9488011845777095, "grad_norm": 7.198699951171875, "learning_rate": 4.06119420025117e-05, "loss": 5.9325, "step": 8330 }, { "epoch": 0.949940201606014, "grad_norm": 5.802608966827393, "learning_rate": 4.060052517410663e-05, "loss": 5.9556, "step": 8340 }, { "epoch": 0.9510792186343185, "grad_norm": 6.417184352874756, "learning_rate": 4.0589108345701565e-05, "loss": 5.5368, "step": 8350 }, { "epoch": 0.9522182356626232, "grad_norm": 6.399797439575195, "learning_rate": 4.05776915172965e-05, "loss": 5.6566, "step": 8360 }, { "epoch": 0.9533572526909277, "grad_norm": 6.135841369628906, "learning_rate": 4.056627468889143e-05, "loss": 5.5519, "step": 8370 }, { "epoch": 0.9544962697192323, "grad_norm": 5.759917736053467, "learning_rate": 4.055485786048636e-05, "loss": 5.9158, "step": 8380 }, { "epoch": 0.9556352867475368, "grad_norm": 4.6408796310424805, "learning_rate": 4.054344103208129e-05, "loss": 5.7527, "step": 8390 }, { "epoch": 0.9567743037758415, "grad_norm": 7.432102203369141, "learning_rate": 4.0532024203676224e-05, "loss": 5.9422, "step": 8400 }, { "epoch": 0.957913320804146, "grad_norm": 8.108816146850586, "learning_rate": 4.052060737527115e-05, "loss": 5.5413, "step": 8410 }, { "epoch": 0.9590523378324506, "grad_norm": 5.005612850189209, "learning_rate": 4.050919054686608e-05, "loss": 5.7046, "step": 8420 }, { "epoch": 0.9601913548607551, "grad_norm": 15.327434539794922, "learning_rate": 4.049777371846101e-05, "loss": 5.7096, "step": 8430 }, { "epoch": 0.9613303718890598, "grad_norm": 5.380997180938721, "learning_rate": 4.0486356890055945e-05, "loss": 5.6903, "step": 8440 }, { "epoch": 0.9624693889173643, "grad_norm": 8.846467971801758, "learning_rate": 4.047494006165087e-05, "loss": 5.732, "step": 8450 }, { "epoch": 0.9636084059456689, "grad_norm": 4.473547458648682, "learning_rate": 4.04635232332458e-05, "loss": 5.9855, "step": 8460 }, { "epoch": 0.9647474229739734, "grad_norm": 4.686957359313965, "learning_rate": 4.045210640484074e-05, "loss": 5.7405, "step": 8470 }, { "epoch": 0.9658864400022781, "grad_norm": 8.524946212768555, "learning_rate": 4.044068957643567e-05, "loss": 5.7864, "step": 8480 }, { "epoch": 0.9670254570305826, "grad_norm": 8.493307113647461, "learning_rate": 4.04292727480306e-05, "loss": 6.0512, "step": 8490 }, { "epoch": 0.9681644740588872, "grad_norm": 4.178739547729492, "learning_rate": 4.041785591962553e-05, "loss": 5.9716, "step": 8500 }, { "epoch": 0.9693034910871917, "grad_norm": 10.36215591430664, "learning_rate": 4.040643909122046e-05, "loss": 5.7114, "step": 8510 }, { "epoch": 0.9704425081154964, "grad_norm": 20.450754165649414, "learning_rate": 4.039502226281539e-05, "loss": 5.5858, "step": 8520 }, { "epoch": 0.9715815251438009, "grad_norm": 10.467815399169922, "learning_rate": 4.038360543441032e-05, "loss": 5.8745, "step": 8530 }, { "epoch": 0.9727205421721055, "grad_norm": 4.771965026855469, "learning_rate": 4.037218860600525e-05, "loss": 5.937, "step": 8540 }, { "epoch": 0.97385955920041, "grad_norm": 5.959897994995117, "learning_rate": 4.036077177760019e-05, "loss": 6.067, "step": 8550 }, { "epoch": 0.9749985762287147, "grad_norm": 5.705534934997559, "learning_rate": 4.034935494919512e-05, "loss": 5.6886, "step": 8560 }, { "epoch": 0.9761375932570192, "grad_norm": 6.315032005310059, "learning_rate": 4.0337938120790045e-05, "loss": 5.8835, "step": 8570 }, { "epoch": 0.9772766102853238, "grad_norm": 14.058731079101562, "learning_rate": 4.032652129238498e-05, "loss": 5.9582, "step": 8580 }, { "epoch": 0.9784156273136283, "grad_norm": 8.842247009277344, "learning_rate": 4.031510446397991e-05, "loss": 5.7506, "step": 8590 }, { "epoch": 0.979554644341933, "grad_norm": 6.420351982116699, "learning_rate": 4.030368763557484e-05, "loss": 5.7064, "step": 8600 }, { "epoch": 0.9806936613702375, "grad_norm": 6.479267120361328, "learning_rate": 4.0292270807169766e-05, "loss": 5.5813, "step": 8610 }, { "epoch": 0.981832678398542, "grad_norm": 13.055333137512207, "learning_rate": 4.02808539787647e-05, "loss": 5.4908, "step": 8620 }, { "epoch": 0.9829716954268466, "grad_norm": 4.641709327697754, "learning_rate": 4.0269437150359636e-05, "loss": 6.0172, "step": 8630 }, { "epoch": 0.9841107124551512, "grad_norm": 10.869007110595703, "learning_rate": 4.025802032195457e-05, "loss": 6.0174, "step": 8640 }, { "epoch": 0.9852497294834558, "grad_norm": 6.283684253692627, "learning_rate": 4.024660349354949e-05, "loss": 6.0299, "step": 8650 }, { "epoch": 0.9863887465117603, "grad_norm": 6.0538859367370605, "learning_rate": 4.0235186665144425e-05, "loss": 5.7183, "step": 8660 }, { "epoch": 0.9875277635400649, "grad_norm": 6.773647785186768, "learning_rate": 4.022376983673936e-05, "loss": 5.9058, "step": 8670 }, { "epoch": 0.9886667805683695, "grad_norm": 7.833893775939941, "learning_rate": 4.021235300833429e-05, "loss": 5.6367, "step": 8680 }, { "epoch": 0.9898057975966741, "grad_norm": 7.423435688018799, "learning_rate": 4.0200936179929214e-05, "loss": 6.4342, "step": 8690 }, { "epoch": 0.9909448146249786, "grad_norm": 6.709625244140625, "learning_rate": 4.0189519351524146e-05, "loss": 5.9904, "step": 8700 }, { "epoch": 0.9920838316532832, "grad_norm": 4.907162666320801, "learning_rate": 4.0178102523119084e-05, "loss": 5.7625, "step": 8710 }, { "epoch": 0.9932228486815878, "grad_norm": 7.05642557144165, "learning_rate": 4.016668569471401e-05, "loss": 5.6486, "step": 8720 }, { "epoch": 0.9943618657098924, "grad_norm": 7.265207290649414, "learning_rate": 4.015526886630894e-05, "loss": 5.658, "step": 8730 }, { "epoch": 0.9955008827381969, "grad_norm": 5.687889099121094, "learning_rate": 4.014385203790387e-05, "loss": 5.6476, "step": 8740 }, { "epoch": 0.9966398997665015, "grad_norm": 11.792612075805664, "learning_rate": 4.0132435209498805e-05, "loss": 5.5635, "step": 8750 }, { "epoch": 0.9977789167948061, "grad_norm": 6.816971778869629, "learning_rate": 4.012101838109374e-05, "loss": 5.8865, "step": 8760 }, { "epoch": 0.9989179338231107, "grad_norm": 7.3285932540893555, "learning_rate": 4.010960155268866e-05, "loss": 5.7944, "step": 8770 }, { "epoch": 1.0000569508514152, "grad_norm": 6.628791809082031, "learning_rate": 4.0098184724283594e-05, "loss": 5.5504, "step": 8780 }, { "epoch": 1.0011959678797198, "grad_norm": 5.887302875518799, "learning_rate": 4.0086767895878525e-05, "loss": 5.254, "step": 8790 }, { "epoch": 1.0023349849080243, "grad_norm": 4.866805076599121, "learning_rate": 4.007535106747346e-05, "loss": 5.3222, "step": 8800 }, { "epoch": 1.0034740019363289, "grad_norm": 7.485427379608154, "learning_rate": 4.006393423906839e-05, "loss": 5.4493, "step": 8810 }, { "epoch": 1.0046130189646336, "grad_norm": 7.071866035461426, "learning_rate": 4.005251741066332e-05, "loss": 5.3074, "step": 8820 }, { "epoch": 1.0057520359929382, "grad_norm": 17.301889419555664, "learning_rate": 4.004110058225825e-05, "loss": 5.4764, "step": 8830 }, { "epoch": 1.0068910530212427, "grad_norm": 13.042938232421875, "learning_rate": 4.002968375385318e-05, "loss": 5.1558, "step": 8840 }, { "epoch": 1.0080300700495473, "grad_norm": 22.195083618164062, "learning_rate": 4.001826692544811e-05, "loss": 5.2606, "step": 8850 }, { "epoch": 1.0091690870778518, "grad_norm": 7.411557674407959, "learning_rate": 4.000685009704304e-05, "loss": 5.4076, "step": 8860 }, { "epoch": 1.0103081041061563, "grad_norm": 8.647936820983887, "learning_rate": 3.999543326863797e-05, "loss": 5.4122, "step": 8870 }, { "epoch": 1.011447121134461, "grad_norm": 6.743743419647217, "learning_rate": 3.9984016440232905e-05, "loss": 5.7691, "step": 8880 }, { "epoch": 1.0125861381627654, "grad_norm": 7.781901836395264, "learning_rate": 3.997259961182784e-05, "loss": 5.4218, "step": 8890 }, { "epoch": 1.0137251551910702, "grad_norm": 15.083586692810059, "learning_rate": 3.996118278342277e-05, "loss": 5.2873, "step": 8900 }, { "epoch": 1.0148641722193747, "grad_norm": 12.412664413452148, "learning_rate": 3.99497659550177e-05, "loss": 5.3881, "step": 8910 }, { "epoch": 1.0160031892476793, "grad_norm": 7.3285722732543945, "learning_rate": 3.9938349126612626e-05, "loss": 5.2282, "step": 8920 }, { "epoch": 1.0171422062759838, "grad_norm": 6.983195781707764, "learning_rate": 3.992693229820756e-05, "loss": 5.2699, "step": 8930 }, { "epoch": 1.0182812233042884, "grad_norm": 5.575123310089111, "learning_rate": 3.991551546980249e-05, "loss": 5.2499, "step": 8940 }, { "epoch": 1.019420240332593, "grad_norm": 6.207281589508057, "learning_rate": 3.990409864139742e-05, "loss": 5.3387, "step": 8950 }, { "epoch": 1.0205592573608975, "grad_norm": 9.925410270690918, "learning_rate": 3.989268181299235e-05, "loss": 5.0628, "step": 8960 }, { "epoch": 1.021698274389202, "grad_norm": 8.906091690063477, "learning_rate": 3.9881264984587285e-05, "loss": 5.3465, "step": 8970 }, { "epoch": 1.0228372914175068, "grad_norm": 7.547214031219482, "learning_rate": 3.986984815618222e-05, "loss": 5.2823, "step": 8980 }, { "epoch": 1.0239763084458113, "grad_norm": 7.977101802825928, "learning_rate": 3.985843132777715e-05, "loss": 5.5895, "step": 8990 }, { "epoch": 1.0251153254741159, "grad_norm": 9.279818534851074, "learning_rate": 3.9847014499372074e-05, "loss": 6.1008, "step": 9000 }, { "epoch": 1.0262543425024204, "grad_norm": 6.907566547393799, "learning_rate": 3.9835597670967005e-05, "loss": 5.1312, "step": 9010 }, { "epoch": 1.027393359530725, "grad_norm": 15.964417457580566, "learning_rate": 3.982418084256194e-05, "loss": 5.2919, "step": 9020 }, { "epoch": 1.0285323765590295, "grad_norm": 7.658446311950684, "learning_rate": 3.981276401415687e-05, "loss": 5.3453, "step": 9030 }, { "epoch": 1.029671393587334, "grad_norm": 5.22704553604126, "learning_rate": 3.98013471857518e-05, "loss": 5.5017, "step": 9040 }, { "epoch": 1.0308104106156386, "grad_norm": 5.284582614898682, "learning_rate": 3.978993035734673e-05, "loss": 5.3824, "step": 9050 }, { "epoch": 1.0319494276439434, "grad_norm": 6.4581170082092285, "learning_rate": 3.9778513528941665e-05, "loss": 6.1145, "step": 9060 }, { "epoch": 1.033088444672248, "grad_norm": 8.916783332824707, "learning_rate": 3.9767096700536596e-05, "loss": 5.3733, "step": 9070 }, { "epoch": 1.0342274617005525, "grad_norm": 6.466397762298584, "learning_rate": 3.975567987213152e-05, "loss": 5.15, "step": 9080 }, { "epoch": 1.035366478728857, "grad_norm": 5.718016147613525, "learning_rate": 3.9744263043726453e-05, "loss": 5.0449, "step": 9090 }, { "epoch": 1.0365054957571616, "grad_norm": 8.794554710388184, "learning_rate": 3.9732846215321385e-05, "loss": 5.2021, "step": 9100 }, { "epoch": 1.037644512785466, "grad_norm": 8.841976165771484, "learning_rate": 3.972142938691632e-05, "loss": 5.1749, "step": 9110 }, { "epoch": 1.0387835298137706, "grad_norm": 6.140927791595459, "learning_rate": 3.971001255851124e-05, "loss": 5.2475, "step": 9120 }, { "epoch": 1.0399225468420752, "grad_norm": 5.292937755584717, "learning_rate": 3.969859573010618e-05, "loss": 5.1817, "step": 9130 }, { "epoch": 1.04106156387038, "grad_norm": 6.455714702606201, "learning_rate": 3.968717890170111e-05, "loss": 5.3106, "step": 9140 }, { "epoch": 1.0422005808986845, "grad_norm": 23.862918853759766, "learning_rate": 3.9675762073296044e-05, "loss": 5.5527, "step": 9150 }, { "epoch": 1.043339597926989, "grad_norm": 6.65964412689209, "learning_rate": 3.966434524489097e-05, "loss": 5.2139, "step": 9160 }, { "epoch": 1.0444786149552936, "grad_norm": 23.59825325012207, "learning_rate": 3.96529284164859e-05, "loss": 5.4358, "step": 9170 }, { "epoch": 1.0456176319835981, "grad_norm": 6.902249813079834, "learning_rate": 3.964151158808083e-05, "loss": 5.2541, "step": 9180 }, { "epoch": 1.0467566490119027, "grad_norm": 7.1851043701171875, "learning_rate": 3.9630094759675765e-05, "loss": 5.3055, "step": 9190 }, { "epoch": 1.0478956660402072, "grad_norm": 8.189417839050293, "learning_rate": 3.961867793127069e-05, "loss": 6.0163, "step": 9200 }, { "epoch": 1.0490346830685118, "grad_norm": 6.41946268081665, "learning_rate": 3.960726110286563e-05, "loss": 5.2336, "step": 9210 }, { "epoch": 1.0501737000968165, "grad_norm": 9.957685470581055, "learning_rate": 3.959584427446056e-05, "loss": 5.4844, "step": 9220 }, { "epoch": 1.051312717125121, "grad_norm": 14.342864036560059, "learning_rate": 3.9584427446055486e-05, "loss": 5.3257, "step": 9230 }, { "epoch": 1.0524517341534256, "grad_norm": 6.256680488586426, "learning_rate": 3.957301061765042e-05, "loss": 5.7186, "step": 9240 }, { "epoch": 1.0535907511817302, "grad_norm": 6.000284671783447, "learning_rate": 3.956159378924535e-05, "loss": 5.0134, "step": 9250 }, { "epoch": 1.0547297682100347, "grad_norm": 7.8761396408081055, "learning_rate": 3.955017696084028e-05, "loss": 5.1664, "step": 9260 }, { "epoch": 1.0558687852383393, "grad_norm": 9.180042266845703, "learning_rate": 3.953876013243521e-05, "loss": 5.2035, "step": 9270 }, { "epoch": 1.0570078022666438, "grad_norm": 9.206707000732422, "learning_rate": 3.952734330403014e-05, "loss": 5.3212, "step": 9280 }, { "epoch": 1.0581468192949484, "grad_norm": 7.057497501373291, "learning_rate": 3.9515926475625077e-05, "loss": 5.0746, "step": 9290 }, { "epoch": 1.0592858363232531, "grad_norm": 8.683945655822754, "learning_rate": 3.950450964722001e-05, "loss": 5.3396, "step": 9300 }, { "epoch": 1.0604248533515577, "grad_norm": 7.847707748413086, "learning_rate": 3.9493092818814933e-05, "loss": 5.1268, "step": 9310 }, { "epoch": 1.0615638703798622, "grad_norm": 10.160079956054688, "learning_rate": 3.9481675990409865e-05, "loss": 5.3413, "step": 9320 }, { "epoch": 1.0627028874081668, "grad_norm": 8.487902641296387, "learning_rate": 3.94702591620048e-05, "loss": 5.1777, "step": 9330 }, { "epoch": 1.0638419044364713, "grad_norm": 11.462549209594727, "learning_rate": 3.945884233359973e-05, "loss": 5.3486, "step": 9340 }, { "epoch": 1.0649809214647759, "grad_norm": 10.42354965209961, "learning_rate": 3.9447425505194654e-05, "loss": 5.3129, "step": 9350 }, { "epoch": 1.0661199384930804, "grad_norm": 7.207667827606201, "learning_rate": 3.9436008676789586e-05, "loss": 5.2524, "step": 9360 }, { "epoch": 1.067258955521385, "grad_norm": 5.92431116104126, "learning_rate": 3.9424591848384525e-05, "loss": 5.0543, "step": 9370 }, { "epoch": 1.0683979725496897, "grad_norm": 7.101565837860107, "learning_rate": 3.9413175019979456e-05, "loss": 5.7859, "step": 9380 }, { "epoch": 1.0695369895779943, "grad_norm": 6.356846809387207, "learning_rate": 3.940175819157438e-05, "loss": 5.1667, "step": 9390 }, { "epoch": 1.0706760066062988, "grad_norm": 7.87912654876709, "learning_rate": 3.939034136316931e-05, "loss": 5.1314, "step": 9400 }, { "epoch": 1.0718150236346033, "grad_norm": 11.108393669128418, "learning_rate": 3.9378924534764245e-05, "loss": 5.2929, "step": 9410 }, { "epoch": 1.072954040662908, "grad_norm": 9.422391891479492, "learning_rate": 3.936750770635918e-05, "loss": 5.6755, "step": 9420 }, { "epoch": 1.0740930576912124, "grad_norm": 8.584696769714355, "learning_rate": 3.93560908779541e-05, "loss": 5.4, "step": 9430 }, { "epoch": 1.075232074719517, "grad_norm": 8.486468315124512, "learning_rate": 3.9344674049549034e-05, "loss": 5.8283, "step": 9440 }, { "epoch": 1.0763710917478215, "grad_norm": 5.808884620666504, "learning_rate": 3.9333257221143966e-05, "loss": 5.3142, "step": 9450 }, { "epoch": 1.0775101087761263, "grad_norm": 7.499875068664551, "learning_rate": 3.9321840392738904e-05, "loss": 5.6757, "step": 9460 }, { "epoch": 1.0786491258044308, "grad_norm": 6.805802345275879, "learning_rate": 3.931042356433383e-05, "loss": 5.4205, "step": 9470 }, { "epoch": 1.0797881428327354, "grad_norm": 7.800095558166504, "learning_rate": 3.929900673592876e-05, "loss": 5.3918, "step": 9480 }, { "epoch": 1.08092715986104, "grad_norm": 7.894876956939697, "learning_rate": 3.928758990752369e-05, "loss": 5.4332, "step": 9490 }, { "epoch": 1.0820661768893445, "grad_norm": 10.799897193908691, "learning_rate": 3.9276173079118625e-05, "loss": 5.1566, "step": 9500 }, { "epoch": 1.083205193917649, "grad_norm": 5.901568412780762, "learning_rate": 3.926475625071355e-05, "loss": 5.2164, "step": 9510 }, { "epoch": 1.0843442109459536, "grad_norm": 8.62924575805664, "learning_rate": 3.925333942230848e-05, "loss": 5.3436, "step": 9520 }, { "epoch": 1.0854832279742581, "grad_norm": 7.14743709564209, "learning_rate": 3.9241922593903414e-05, "loss": 5.6523, "step": 9530 }, { "epoch": 1.0866222450025629, "grad_norm": 6.542008876800537, "learning_rate": 3.9230505765498345e-05, "loss": 5.4717, "step": 9540 }, { "epoch": 1.0877612620308674, "grad_norm": 6.9539713859558105, "learning_rate": 3.921908893709328e-05, "loss": 5.4214, "step": 9550 }, { "epoch": 1.088900279059172, "grad_norm": 7.2530927658081055, "learning_rate": 3.920767210868821e-05, "loss": 5.1499, "step": 9560 }, { "epoch": 1.0900392960874765, "grad_norm": 8.217451095581055, "learning_rate": 3.919625528028314e-05, "loss": 5.3176, "step": 9570 }, { "epoch": 1.091178313115781, "grad_norm": 7.244843482971191, "learning_rate": 3.918483845187807e-05, "loss": 5.1277, "step": 9580 }, { "epoch": 1.0923173301440856, "grad_norm": 10.915613174438477, "learning_rate": 3.9173421623473e-05, "loss": 5.5916, "step": 9590 }, { "epoch": 1.0934563471723902, "grad_norm": 9.508447647094727, "learning_rate": 3.916200479506793e-05, "loss": 5.055, "step": 9600 }, { "epoch": 1.0945953642006947, "grad_norm": 8.85869312286377, "learning_rate": 3.915058796666286e-05, "loss": 5.7979, "step": 9610 }, { "epoch": 1.0957343812289995, "grad_norm": 7.686898231506348, "learning_rate": 3.913917113825779e-05, "loss": 5.5152, "step": 9620 }, { "epoch": 1.096873398257304, "grad_norm": 6.5317277908325195, "learning_rate": 3.9127754309852725e-05, "loss": 5.1522, "step": 9630 }, { "epoch": 1.0980124152856086, "grad_norm": 16.195051193237305, "learning_rate": 3.911633748144766e-05, "loss": 5.2707, "step": 9640 }, { "epoch": 1.099151432313913, "grad_norm": 11.292452812194824, "learning_rate": 3.910492065304259e-05, "loss": 5.1579, "step": 9650 }, { "epoch": 1.1002904493422176, "grad_norm": 9.894390106201172, "learning_rate": 3.909350382463752e-05, "loss": 5.3674, "step": 9660 }, { "epoch": 1.1014294663705222, "grad_norm": 8.536913871765137, "learning_rate": 3.9082086996232446e-05, "loss": 5.2427, "step": 9670 }, { "epoch": 1.1025684833988267, "grad_norm": 6.5137786865234375, "learning_rate": 3.907067016782738e-05, "loss": 5.3871, "step": 9680 }, { "epoch": 1.1037075004271313, "grad_norm": 11.244919776916504, "learning_rate": 3.905925333942231e-05, "loss": 5.1847, "step": 9690 }, { "epoch": 1.104846517455436, "grad_norm": 12.385736465454102, "learning_rate": 3.904783651101724e-05, "loss": 5.5966, "step": 9700 }, { "epoch": 1.1059855344837406, "grad_norm": 5.9896016120910645, "learning_rate": 3.903641968261217e-05, "loss": 5.2129, "step": 9710 }, { "epoch": 1.1071245515120451, "grad_norm": 7.364066123962402, "learning_rate": 3.9025002854207105e-05, "loss": 5.1776, "step": 9720 }, { "epoch": 1.1082635685403497, "grad_norm": 8.434669494628906, "learning_rate": 3.901358602580204e-05, "loss": 5.2653, "step": 9730 }, { "epoch": 1.1094025855686542, "grad_norm": 8.104005813598633, "learning_rate": 3.900216919739696e-05, "loss": 5.2718, "step": 9740 }, { "epoch": 1.1105416025969588, "grad_norm": 8.960792541503906, "learning_rate": 3.8990752368991894e-05, "loss": 5.1523, "step": 9750 }, { "epoch": 1.1116806196252633, "grad_norm": 10.277594566345215, "learning_rate": 3.8979335540586826e-05, "loss": 5.0689, "step": 9760 }, { "epoch": 1.1128196366535679, "grad_norm": 15.712716102600098, "learning_rate": 3.896791871218176e-05, "loss": 5.2961, "step": 9770 }, { "epoch": 1.1139586536818726, "grad_norm": 7.332018852233887, "learning_rate": 3.895650188377669e-05, "loss": 5.4115, "step": 9780 }, { "epoch": 1.1150976707101772, "grad_norm": 10.549822807312012, "learning_rate": 3.894508505537162e-05, "loss": 5.397, "step": 9790 }, { "epoch": 1.1162366877384817, "grad_norm": 6.6530375480651855, "learning_rate": 3.893366822696655e-05, "loss": 5.7133, "step": 9800 }, { "epoch": 1.1173757047667863, "grad_norm": 8.9078369140625, "learning_rate": 3.8922251398561485e-05, "loss": 4.867, "step": 9810 }, { "epoch": 1.1185147217950908, "grad_norm": 8.685425758361816, "learning_rate": 3.891083457015641e-05, "loss": 5.1863, "step": 9820 }, { "epoch": 1.1196537388233954, "grad_norm": 12.476484298706055, "learning_rate": 3.889941774175134e-05, "loss": 5.121, "step": 9830 }, { "epoch": 1.1207927558517, "grad_norm": 8.18492317199707, "learning_rate": 3.8888000913346273e-05, "loss": 5.3926, "step": 9840 }, { "epoch": 1.1219317728800045, "grad_norm": 12.082232475280762, "learning_rate": 3.8876584084941205e-05, "loss": 5.4308, "step": 9850 }, { "epoch": 1.1230707899083092, "grad_norm": 6.331710338592529, "learning_rate": 3.886516725653613e-05, "loss": 5.1105, "step": 9860 }, { "epoch": 1.1242098069366138, "grad_norm": 29.657867431640625, "learning_rate": 3.885375042813107e-05, "loss": 5.1145, "step": 9870 }, { "epoch": 1.1253488239649183, "grad_norm": 6.528309345245361, "learning_rate": 3.8842333599726e-05, "loss": 5.356, "step": 9880 }, { "epoch": 1.1264878409932229, "grad_norm": 8.70510196685791, "learning_rate": 3.883091677132093e-05, "loss": 5.1216, "step": 9890 }, { "epoch": 1.1276268580215274, "grad_norm": 11.015145301818848, "learning_rate": 3.881949994291586e-05, "loss": 5.1805, "step": 9900 }, { "epoch": 1.128765875049832, "grad_norm": 8.160832405090332, "learning_rate": 3.880808311451079e-05, "loss": 5.4908, "step": 9910 }, { "epoch": 1.1299048920781365, "grad_norm": 6.731886386871338, "learning_rate": 3.879666628610572e-05, "loss": 5.1552, "step": 9920 }, { "epoch": 1.1310439091064413, "grad_norm": 9.394970893859863, "learning_rate": 3.878524945770065e-05, "loss": 5.6384, "step": 9930 }, { "epoch": 1.1321829261347458, "grad_norm": 6.584465980529785, "learning_rate": 3.877383262929558e-05, "loss": 5.4019, "step": 9940 }, { "epoch": 1.1333219431630503, "grad_norm": 9.031838417053223, "learning_rate": 3.876241580089052e-05, "loss": 5.308, "step": 9950 }, { "epoch": 1.134460960191355, "grad_norm": 17.675561904907227, "learning_rate": 3.875099897248545e-05, "loss": 5.4112, "step": 9960 }, { "epoch": 1.1355999772196594, "grad_norm": 8.194103240966797, "learning_rate": 3.873958214408038e-05, "loss": 5.294, "step": 9970 }, { "epoch": 1.136738994247964, "grad_norm": 8.301799774169922, "learning_rate": 3.8728165315675306e-05, "loss": 5.3232, "step": 9980 }, { "epoch": 1.1378780112762685, "grad_norm": 9.098228454589844, "learning_rate": 3.871674848727024e-05, "loss": 5.6319, "step": 9990 }, { "epoch": 1.139017028304573, "grad_norm": 4.738903999328613, "learning_rate": 3.870533165886517e-05, "loss": 5.2591, "step": 10000 }, { "epoch": 1.139017028304573, "eval_loss": 5.98829460144043, "eval_runtime": 11.6367, "eval_samples_per_second": 1.289, "eval_steps_per_second": 0.172, "step": 10000 }, { "epoch": 1.1401560453328776, "grad_norm": 8.139161109924316, "learning_rate": 3.86939148304601e-05, "loss": 5.36, "step": 10010 }, { "epoch": 1.1412950623611824, "grad_norm": 5.483173847198486, "learning_rate": 3.8682498002055026e-05, "loss": 5.2019, "step": 10020 }, { "epoch": 1.142434079389487, "grad_norm": 9.01739501953125, "learning_rate": 3.8671081173649965e-05, "loss": 5.138, "step": 10030 }, { "epoch": 1.1435730964177915, "grad_norm": 8.21419906616211, "learning_rate": 3.86596643452449e-05, "loss": 5.5423, "step": 10040 }, { "epoch": 1.144712113446096, "grad_norm": 6.223106384277344, "learning_rate": 3.864824751683982e-05, "loss": 5.555, "step": 10050 }, { "epoch": 1.1458511304744006, "grad_norm": 8.907463073730469, "learning_rate": 3.8636830688434754e-05, "loss": 5.257, "step": 10060 }, { "epoch": 1.1469901475027051, "grad_norm": 13.769535064697266, "learning_rate": 3.8625413860029685e-05, "loss": 5.3491, "step": 10070 }, { "epoch": 1.1481291645310097, "grad_norm": 6.513309478759766, "learning_rate": 3.861399703162462e-05, "loss": 5.2318, "step": 10080 }, { "epoch": 1.1492681815593144, "grad_norm": 7.231842041015625, "learning_rate": 3.860258020321955e-05, "loss": 5.3576, "step": 10090 }, { "epoch": 1.150407198587619, "grad_norm": 7.973701000213623, "learning_rate": 3.8591163374814474e-05, "loss": 5.3865, "step": 10100 }, { "epoch": 1.1515462156159235, "grad_norm": 10.383562088012695, "learning_rate": 3.8579746546409406e-05, "loss": 5.3199, "step": 10110 }, { "epoch": 1.152685232644228, "grad_norm": 10.173810958862305, "learning_rate": 3.8568329718004345e-05, "loss": 5.4949, "step": 10120 }, { "epoch": 1.1538242496725326, "grad_norm": 10.03913688659668, "learning_rate": 3.855691288959927e-05, "loss": 5.437, "step": 10130 }, { "epoch": 1.1549632667008372, "grad_norm": 15.847329139709473, "learning_rate": 3.85454960611942e-05, "loss": 5.1987, "step": 10140 }, { "epoch": 1.1561022837291417, "grad_norm": 8.826560020446777, "learning_rate": 3.853407923278913e-05, "loss": 5.2763, "step": 10150 }, { "epoch": 1.1572413007574462, "grad_norm": 11.843634605407715, "learning_rate": 3.8522662404384065e-05, "loss": 5.0479, "step": 10160 }, { "epoch": 1.1583803177857508, "grad_norm": 11.732421875, "learning_rate": 3.8511245575979e-05, "loss": 5.0428, "step": 10170 }, { "epoch": 1.1595193348140556, "grad_norm": 8.850831985473633, "learning_rate": 3.849982874757392e-05, "loss": 4.8044, "step": 10180 }, { "epoch": 1.16065835184236, "grad_norm": 7.789608955383301, "learning_rate": 3.8488411919168854e-05, "loss": 5.1098, "step": 10190 }, { "epoch": 1.1617973688706646, "grad_norm": 6.359009742736816, "learning_rate": 3.847699509076379e-05, "loss": 5.3908, "step": 10200 }, { "epoch": 1.1629363858989692, "grad_norm": 16.158702850341797, "learning_rate": 3.846557826235872e-05, "loss": 5.3882, "step": 10210 }, { "epoch": 1.1640754029272737, "grad_norm": 6.294594764709473, "learning_rate": 3.845416143395365e-05, "loss": 5.2433, "step": 10220 }, { "epoch": 1.1652144199555783, "grad_norm": 6.593204021453857, "learning_rate": 3.844274460554858e-05, "loss": 5.4319, "step": 10230 }, { "epoch": 1.1663534369838828, "grad_norm": 7.6413702964782715, "learning_rate": 3.843132777714351e-05, "loss": 5.2148, "step": 10240 }, { "epoch": 1.1674924540121876, "grad_norm": 8.203592300415039, "learning_rate": 3.841991094873844e-05, "loss": 5.307, "step": 10250 }, { "epoch": 1.1686314710404921, "grad_norm": 7.085062026977539, "learning_rate": 3.840849412033337e-05, "loss": 5.1634, "step": 10260 }, { "epoch": 1.1697704880687967, "grad_norm": 5.813399791717529, "learning_rate": 3.83970772919283e-05, "loss": 5.4435, "step": 10270 }, { "epoch": 1.1709095050971012, "grad_norm": 8.348408699035645, "learning_rate": 3.838566046352324e-05, "loss": 5.3024, "step": 10280 }, { "epoch": 1.1720485221254058, "grad_norm": 6.418082237243652, "learning_rate": 3.8374243635118165e-05, "loss": 5.2993, "step": 10290 }, { "epoch": 1.1731875391537103, "grad_norm": 9.121561050415039, "learning_rate": 3.83628268067131e-05, "loss": 5.2365, "step": 10300 }, { "epoch": 1.1743265561820149, "grad_norm": 18.363595962524414, "learning_rate": 3.835140997830803e-05, "loss": 5.4046, "step": 10310 }, { "epoch": 1.1754655732103194, "grad_norm": 32.12479782104492, "learning_rate": 3.833999314990296e-05, "loss": 5.2378, "step": 10320 }, { "epoch": 1.176604590238624, "grad_norm": 7.56660795211792, "learning_rate": 3.8328576321497886e-05, "loss": 5.2554, "step": 10330 }, { "epoch": 1.1777436072669287, "grad_norm": 8.347277641296387, "learning_rate": 3.831715949309282e-05, "loss": 5.281, "step": 10340 }, { "epoch": 1.1788826242952333, "grad_norm": 12.190389633178711, "learning_rate": 3.830574266468775e-05, "loss": 5.2473, "step": 10350 }, { "epoch": 1.1800216413235378, "grad_norm": 9.21943187713623, "learning_rate": 3.829432583628269e-05, "loss": 5.602, "step": 10360 }, { "epoch": 1.1811606583518424, "grad_norm": 9.936127662658691, "learning_rate": 3.828290900787761e-05, "loss": 5.3133, "step": 10370 }, { "epoch": 1.182299675380147, "grad_norm": 5.087235450744629, "learning_rate": 3.8271492179472545e-05, "loss": 5.425, "step": 10380 }, { "epoch": 1.1834386924084515, "grad_norm": 6.407649040222168, "learning_rate": 3.826007535106748e-05, "loss": 5.2866, "step": 10390 }, { "epoch": 1.184577709436756, "grad_norm": 11.357165336608887, "learning_rate": 3.824865852266241e-05, "loss": 5.4531, "step": 10400 }, { "epoch": 1.1857167264650608, "grad_norm": 6.227645397186279, "learning_rate": 3.8237241694257334e-05, "loss": 5.0796, "step": 10410 }, { "epoch": 1.1868557434933653, "grad_norm": 7.409129619598389, "learning_rate": 3.8225824865852266e-05, "loss": 5.5382, "step": 10420 }, { "epoch": 1.1879947605216699, "grad_norm": 7.063460350036621, "learning_rate": 3.82144080374472e-05, "loss": 5.2155, "step": 10430 }, { "epoch": 1.1891337775499744, "grad_norm": 8.425795555114746, "learning_rate": 3.820299120904213e-05, "loss": 5.4379, "step": 10440 }, { "epoch": 1.190272794578279, "grad_norm": 7.571264266967773, "learning_rate": 3.819157438063706e-05, "loss": 5.4893, "step": 10450 }, { "epoch": 1.1914118116065835, "grad_norm": 9.26445484161377, "learning_rate": 3.818015755223199e-05, "loss": 5.5383, "step": 10460 }, { "epoch": 1.192550828634888, "grad_norm": 8.3720064163208, "learning_rate": 3.8168740723826925e-05, "loss": 5.0225, "step": 10470 }, { "epoch": 1.1936898456631926, "grad_norm": 7.018798351287842, "learning_rate": 3.815732389542186e-05, "loss": 5.0177, "step": 10480 }, { "epoch": 1.1948288626914971, "grad_norm": 9.46872329711914, "learning_rate": 3.814590706701678e-05, "loss": 5.2551, "step": 10490 }, { "epoch": 1.195967879719802, "grad_norm": 9.977618217468262, "learning_rate": 3.8134490238611714e-05, "loss": 5.4843, "step": 10500 }, { "epoch": 1.1971068967481064, "grad_norm": 7.310171127319336, "learning_rate": 3.8123073410206646e-05, "loss": 5.2402, "step": 10510 }, { "epoch": 1.198245913776411, "grad_norm": 10.744500160217285, "learning_rate": 3.811165658180158e-05, "loss": 5.548, "step": 10520 }, { "epoch": 1.1993849308047155, "grad_norm": 6.208596229553223, "learning_rate": 3.810023975339651e-05, "loss": 5.5792, "step": 10530 }, { "epoch": 1.20052394783302, "grad_norm": 10.301777839660645, "learning_rate": 3.808882292499144e-05, "loss": 5.2611, "step": 10540 }, { "epoch": 1.2016629648613246, "grad_norm": 11.960914611816406, "learning_rate": 3.807740609658637e-05, "loss": 5.0305, "step": 10550 }, { "epoch": 1.2028019818896292, "grad_norm": 6.9739885330200195, "learning_rate": 3.80659892681813e-05, "loss": 5.1622, "step": 10560 }, { "epoch": 1.203940998917934, "grad_norm": 12.12211799621582, "learning_rate": 3.805457243977623e-05, "loss": 5.6689, "step": 10570 }, { "epoch": 1.2050800159462385, "grad_norm": 10.115104675292969, "learning_rate": 3.804315561137116e-05, "loss": 5.4093, "step": 10580 }, { "epoch": 1.206219032974543, "grad_norm": 9.459589004516602, "learning_rate": 3.8031738782966093e-05, "loss": 5.4789, "step": 10590 }, { "epoch": 1.2073580500028476, "grad_norm": 8.0070161819458, "learning_rate": 3.8020321954561025e-05, "loss": 5.0064, "step": 10600 }, { "epoch": 1.2084970670311521, "grad_norm": 6.625135898590088, "learning_rate": 3.800890512615596e-05, "loss": 5.3855, "step": 10610 }, { "epoch": 1.2096360840594567, "grad_norm": 6.648497104644775, "learning_rate": 3.799748829775089e-05, "loss": 5.2896, "step": 10620 }, { "epoch": 1.2107751010877612, "grad_norm": 16.452611923217773, "learning_rate": 3.798607146934582e-05, "loss": 5.346, "step": 10630 }, { "epoch": 1.2119141181160658, "grad_norm": 7.3033447265625, "learning_rate": 3.7974654640940746e-05, "loss": 5.3142, "step": 10640 }, { "epoch": 1.2130531351443703, "grad_norm": 6.744337558746338, "learning_rate": 3.796323781253568e-05, "loss": 5.138, "step": 10650 }, { "epoch": 1.214192152172675, "grad_norm": 7.216135501861572, "learning_rate": 3.795182098413061e-05, "loss": 5.2378, "step": 10660 }, { "epoch": 1.2153311692009796, "grad_norm": 5.454289436340332, "learning_rate": 3.794040415572554e-05, "loss": 5.5945, "step": 10670 }, { "epoch": 1.2164701862292842, "grad_norm": 7.187722206115723, "learning_rate": 3.792898732732047e-05, "loss": 4.9304, "step": 10680 }, { "epoch": 1.2176092032575887, "grad_norm": 9.496403694152832, "learning_rate": 3.7917570498915405e-05, "loss": 5.19, "step": 10690 }, { "epoch": 1.2187482202858932, "grad_norm": 6.7584357261657715, "learning_rate": 3.790615367051034e-05, "loss": 5.471, "step": 10700 }, { "epoch": 1.2198872373141978, "grad_norm": 10.78551959991455, "learning_rate": 3.789473684210527e-05, "loss": 5.2186, "step": 10710 }, { "epoch": 1.2210262543425023, "grad_norm": 5.996976852416992, "learning_rate": 3.7883320013700194e-05, "loss": 5.3735, "step": 10720 }, { "epoch": 1.222165271370807, "grad_norm": 5.803508281707764, "learning_rate": 3.7871903185295126e-05, "loss": 5.2404, "step": 10730 }, { "epoch": 1.2233042883991117, "grad_norm": 8.11447525024414, "learning_rate": 3.786048635689006e-05, "loss": 5.1215, "step": 10740 }, { "epoch": 1.2244433054274162, "grad_norm": 10.285445213317871, "learning_rate": 3.784906952848499e-05, "loss": 5.3977, "step": 10750 }, { "epoch": 1.2255823224557207, "grad_norm": 6.042105674743652, "learning_rate": 3.7837652700079914e-05, "loss": 5.1748, "step": 10760 }, { "epoch": 1.2267213394840253, "grad_norm": 8.941543579101562, "learning_rate": 3.782623587167485e-05, "loss": 5.5393, "step": 10770 }, { "epoch": 1.2278603565123298, "grad_norm": 6.521518230438232, "learning_rate": 3.7814819043269785e-05, "loss": 5.3153, "step": 10780 }, { "epoch": 1.2289993735406344, "grad_norm": 14.907044410705566, "learning_rate": 3.780340221486472e-05, "loss": 5.3014, "step": 10790 }, { "epoch": 1.230138390568939, "grad_norm": 19.862760543823242, "learning_rate": 3.779198538645964e-05, "loss": 5.332, "step": 10800 }, { "epoch": 1.2312774075972435, "grad_norm": 5.705162525177002, "learning_rate": 3.7780568558054574e-05, "loss": 5.2788, "step": 10810 }, { "epoch": 1.2324164246255482, "grad_norm": 6.770884037017822, "learning_rate": 3.7769151729649505e-05, "loss": 5.2975, "step": 10820 }, { "epoch": 1.2335554416538528, "grad_norm": 9.910143852233887, "learning_rate": 3.775773490124444e-05, "loss": 4.8984, "step": 10830 }, { "epoch": 1.2346944586821573, "grad_norm": 6.479272365570068, "learning_rate": 3.774631807283936e-05, "loss": 5.2465, "step": 10840 }, { "epoch": 1.2358334757104619, "grad_norm": 16.471927642822266, "learning_rate": 3.7734901244434294e-05, "loss": 5.461, "step": 10850 }, { "epoch": 1.2369724927387664, "grad_norm": 8.394795417785645, "learning_rate": 3.772348441602923e-05, "loss": 5.1475, "step": 10860 }, { "epoch": 1.238111509767071, "grad_norm": 10.2684326171875, "learning_rate": 3.7712067587624165e-05, "loss": 5.7925, "step": 10870 }, { "epoch": 1.2392505267953755, "grad_norm": 11.658308029174805, "learning_rate": 3.770065075921909e-05, "loss": 5.0855, "step": 10880 }, { "epoch": 1.2403895438236803, "grad_norm": 7.594724178314209, "learning_rate": 3.768923393081402e-05, "loss": 5.0243, "step": 10890 }, { "epoch": 1.2415285608519848, "grad_norm": 9.657943725585938, "learning_rate": 3.767781710240895e-05, "loss": 5.18, "step": 10900 }, { "epoch": 1.2426675778802894, "grad_norm": 10.352492332458496, "learning_rate": 3.7666400274003885e-05, "loss": 5.3739, "step": 10910 }, { "epoch": 1.243806594908594, "grad_norm": 5.806910991668701, "learning_rate": 3.765498344559881e-05, "loss": 5.3483, "step": 10920 }, { "epoch": 1.2449456119368985, "grad_norm": 7.94573450088501, "learning_rate": 3.764356661719374e-05, "loss": 5.545, "step": 10930 }, { "epoch": 1.246084628965203, "grad_norm": 6.481786727905273, "learning_rate": 3.763214978878868e-05, "loss": 5.0522, "step": 10940 }, { "epoch": 1.2472236459935075, "grad_norm": 7.557528972625732, "learning_rate": 3.7620732960383606e-05, "loss": 5.3473, "step": 10950 }, { "epoch": 1.248362663021812, "grad_norm": 8.600546836853027, "learning_rate": 3.760931613197854e-05, "loss": 5.1727, "step": 10960 }, { "epoch": 1.2495016800501166, "grad_norm": 13.744807243347168, "learning_rate": 3.759789930357347e-05, "loss": 5.2168, "step": 10970 }, { "epoch": 1.2506406970784214, "grad_norm": 10.714949607849121, "learning_rate": 3.75864824751684e-05, "loss": 5.4683, "step": 10980 }, { "epoch": 1.251779714106726, "grad_norm": 6.328366756439209, "learning_rate": 3.757506564676333e-05, "loss": 5.3212, "step": 10990 }, { "epoch": 1.2529187311350305, "grad_norm": 6.948147296905518, "learning_rate": 3.756364881835826e-05, "loss": 5.2868, "step": 11000 }, { "epoch": 1.254057748163335, "grad_norm": 8.713366508483887, "learning_rate": 3.755223198995319e-05, "loss": 5.1815, "step": 11010 }, { "epoch": 1.2551967651916396, "grad_norm": 9.481947898864746, "learning_rate": 3.754081516154813e-05, "loss": 5.2775, "step": 11020 }, { "epoch": 1.2563357822199441, "grad_norm": 11.682991981506348, "learning_rate": 3.7529398333143054e-05, "loss": 5.2723, "step": 11030 }, { "epoch": 1.2574747992482487, "grad_norm": 6.145923614501953, "learning_rate": 3.7517981504737986e-05, "loss": 5.4262, "step": 11040 }, { "epoch": 1.2586138162765534, "grad_norm": 7.860983848571777, "learning_rate": 3.750656467633292e-05, "loss": 5.267, "step": 11050 }, { "epoch": 1.259752833304858, "grad_norm": 16.078229904174805, "learning_rate": 3.749514784792785e-05, "loss": 5.5123, "step": 11060 }, { "epoch": 1.2608918503331625, "grad_norm": 12.896660804748535, "learning_rate": 3.7483731019522774e-05, "loss": 5.6769, "step": 11070 }, { "epoch": 1.262030867361467, "grad_norm": 6.555258274078369, "learning_rate": 3.7472314191117706e-05, "loss": 5.1755, "step": 11080 }, { "epoch": 1.2631698843897716, "grad_norm": 8.567139625549316, "learning_rate": 3.746089736271264e-05, "loss": 5.2561, "step": 11090 }, { "epoch": 1.2643089014180762, "grad_norm": 8.549598693847656, "learning_rate": 3.7449480534307577e-05, "loss": 5.5128, "step": 11100 }, { "epoch": 1.2654479184463807, "grad_norm": 13.493013381958008, "learning_rate": 3.74380637059025e-05, "loss": 5.2305, "step": 11110 }, { "epoch": 1.2665869354746855, "grad_norm": 5.492915630340576, "learning_rate": 3.7426646877497433e-05, "loss": 5.3072, "step": 11120 }, { "epoch": 1.2677259525029898, "grad_norm": 7.769402027130127, "learning_rate": 3.7415230049092365e-05, "loss": 5.3097, "step": 11130 }, { "epoch": 1.2688649695312946, "grad_norm": 6.967016696929932, "learning_rate": 3.74038132206873e-05, "loss": 5.518, "step": 11140 }, { "epoch": 1.2700039865595991, "grad_norm": 8.325007438659668, "learning_rate": 3.739239639228222e-05, "loss": 5.1475, "step": 11150 }, { "epoch": 1.2711430035879037, "grad_norm": 7.11507511138916, "learning_rate": 3.7380979563877154e-05, "loss": 5.44, "step": 11160 }, { "epoch": 1.2722820206162082, "grad_norm": 16.661008834838867, "learning_rate": 3.7369562735472086e-05, "loss": 4.9281, "step": 11170 }, { "epoch": 1.2734210376445128, "grad_norm": 6.7068400382995605, "learning_rate": 3.735814590706702e-05, "loss": 5.2555, "step": 11180 }, { "epoch": 1.2745600546728173, "grad_norm": 7.5050554275512695, "learning_rate": 3.734672907866195e-05, "loss": 5.2036, "step": 11190 }, { "epoch": 1.2756990717011218, "grad_norm": 8.88784122467041, "learning_rate": 3.733531225025688e-05, "loss": 5.419, "step": 11200 }, { "epoch": 1.2768380887294266, "grad_norm": 6.129419326782227, "learning_rate": 3.732389542185181e-05, "loss": 5.1893, "step": 11210 }, { "epoch": 1.2779771057577312, "grad_norm": 13.970202445983887, "learning_rate": 3.7312478593446745e-05, "loss": 4.972, "step": 11220 }, { "epoch": 1.2791161227860357, "grad_norm": 6.938068389892578, "learning_rate": 3.730220344788218e-05, "loss": 5.4386, "step": 11230 }, { "epoch": 1.2802551398143402, "grad_norm": 13.684118270874023, "learning_rate": 3.7290786619477106e-05, "loss": 5.0856, "step": 11240 }, { "epoch": 1.2813941568426448, "grad_norm": 5.631530284881592, "learning_rate": 3.7279369791072045e-05, "loss": 5.1861, "step": 11250 }, { "epoch": 1.2825331738709493, "grad_norm": 8.32963752746582, "learning_rate": 3.7267952962666976e-05, "loss": 5.2015, "step": 11260 }, { "epoch": 1.2836721908992539, "grad_norm": 7.235726833343506, "learning_rate": 3.72565361342619e-05, "loss": 5.2528, "step": 11270 }, { "epoch": 1.2848112079275587, "grad_norm": 6.3417887687683105, "learning_rate": 3.724511930585683e-05, "loss": 5.5352, "step": 11280 }, { "epoch": 1.285950224955863, "grad_norm": 35.59114074707031, "learning_rate": 3.7233702477451765e-05, "loss": 5.1489, "step": 11290 }, { "epoch": 1.2870892419841677, "grad_norm": 9.192126274108887, "learning_rate": 3.72222856490467e-05, "loss": 5.0911, "step": 11300 }, { "epoch": 1.2882282590124723, "grad_norm": 5.963927268981934, "learning_rate": 3.721086882064163e-05, "loss": 5.5495, "step": 11310 }, { "epoch": 1.2893672760407768, "grad_norm": 5.8875885009765625, "learning_rate": 3.7199451992236554e-05, "loss": 5.6548, "step": 11320 }, { "epoch": 1.2905062930690814, "grad_norm": 7.632376670837402, "learning_rate": 3.718803516383149e-05, "loss": 5.3078, "step": 11330 }, { "epoch": 1.291645310097386, "grad_norm": 5.833863258361816, "learning_rate": 3.7176618335426424e-05, "loss": 5.4708, "step": 11340 }, { "epoch": 1.2927843271256905, "grad_norm": 7.593703746795654, "learning_rate": 3.716520150702135e-05, "loss": 5.1828, "step": 11350 }, { "epoch": 1.293923344153995, "grad_norm": 24.085445404052734, "learning_rate": 3.715378467861628e-05, "loss": 5.4162, "step": 11360 }, { "epoch": 1.2950623611822998, "grad_norm": 10.530863761901855, "learning_rate": 3.714236785021121e-05, "loss": 5.252, "step": 11370 }, { "epoch": 1.2962013782106043, "grad_norm": 13.61907958984375, "learning_rate": 3.7130951021806145e-05, "loss": 4.842, "step": 11380 }, { "epoch": 1.2973403952389089, "grad_norm": 6.181117534637451, "learning_rate": 3.711953419340107e-05, "loss": 5.3513, "step": 11390 }, { "epoch": 1.2984794122672134, "grad_norm": 7.91089391708374, "learning_rate": 3.7108117364996e-05, "loss": 5.239, "step": 11400 }, { "epoch": 1.299618429295518, "grad_norm": 5.9612202644348145, "learning_rate": 3.709670053659094e-05, "loss": 5.3079, "step": 11410 }, { "epoch": 1.3007574463238225, "grad_norm": 9.246620178222656, "learning_rate": 3.708528370818587e-05, "loss": 5.4449, "step": 11420 }, { "epoch": 1.301896463352127, "grad_norm": 9.88797378540039, "learning_rate": 3.70738668797808e-05, "loss": 5.5098, "step": 11430 }, { "epoch": 1.3030354803804318, "grad_norm": 7.659064769744873, "learning_rate": 3.706245005137573e-05, "loss": 5.4119, "step": 11440 }, { "epoch": 1.3041744974087361, "grad_norm": 5.793743133544922, "learning_rate": 3.705103322297066e-05, "loss": 5.6571, "step": 11450 }, { "epoch": 1.305313514437041, "grad_norm": 11.31017780303955, "learning_rate": 3.703961639456559e-05, "loss": 5.6601, "step": 11460 }, { "epoch": 1.3064525314653455, "grad_norm": 6.8008012771606445, "learning_rate": 3.702819956616052e-05, "loss": 5.4059, "step": 11470 }, { "epoch": 1.30759154849365, "grad_norm": 11.843914031982422, "learning_rate": 3.701678273775545e-05, "loss": 5.3103, "step": 11480 }, { "epoch": 1.3087305655219545, "grad_norm": 8.745990753173828, "learning_rate": 3.700536590935039e-05, "loss": 5.6514, "step": 11490 }, { "epoch": 1.309869582550259, "grad_norm": 23.588220596313477, "learning_rate": 3.699394908094532e-05, "loss": 5.1553, "step": 11500 }, { "epoch": 1.3110085995785636, "grad_norm": 6.995006561279297, "learning_rate": 3.6982532252540245e-05, "loss": 5.4568, "step": 11510 }, { "epoch": 1.3121476166068682, "grad_norm": 12.877575874328613, "learning_rate": 3.697111542413518e-05, "loss": 5.3316, "step": 11520 }, { "epoch": 1.313286633635173, "grad_norm": 14.89468765258789, "learning_rate": 3.695969859573011e-05, "loss": 5.4251, "step": 11530 }, { "epoch": 1.3144256506634775, "grad_norm": 6.0038838386535645, "learning_rate": 3.694828176732504e-05, "loss": 5.6329, "step": 11540 }, { "epoch": 1.315564667691782, "grad_norm": 10.036600112915039, "learning_rate": 3.6936864938919966e-05, "loss": 5.4431, "step": 11550 }, { "epoch": 1.3167036847200866, "grad_norm": 7.397878646850586, "learning_rate": 3.69254481105149e-05, "loss": 5.7798, "step": 11560 }, { "epoch": 1.3178427017483911, "grad_norm": 7.196527481079102, "learning_rate": 3.6914031282109836e-05, "loss": 5.3741, "step": 11570 }, { "epoch": 1.3189817187766957, "grad_norm": 9.94596004486084, "learning_rate": 3.690261445370476e-05, "loss": 5.2022, "step": 11580 }, { "epoch": 1.3201207358050002, "grad_norm": 11.582137107849121, "learning_rate": 3.689119762529969e-05, "loss": 5.3073, "step": 11590 }, { "epoch": 1.321259752833305, "grad_norm": 7.553103446960449, "learning_rate": 3.6879780796894625e-05, "loss": 5.2065, "step": 11600 }, { "epoch": 1.3223987698616093, "grad_norm": 8.516357421875, "learning_rate": 3.686836396848956e-05, "loss": 5.4567, "step": 11610 }, { "epoch": 1.323537786889914, "grad_norm": 5.805466651916504, "learning_rate": 3.685694714008449e-05, "loss": 5.4244, "step": 11620 }, { "epoch": 1.3246768039182186, "grad_norm": 14.516117095947266, "learning_rate": 3.6845530311679414e-05, "loss": 5.3269, "step": 11630 }, { "epoch": 1.3258158209465232, "grad_norm": 8.495697021484375, "learning_rate": 3.6834113483274346e-05, "loss": 5.311, "step": 11640 }, { "epoch": 1.3269548379748277, "grad_norm": 6.78104305267334, "learning_rate": 3.682269665486928e-05, "loss": 5.355, "step": 11650 }, { "epoch": 1.3280938550031323, "grad_norm": 7.765377998352051, "learning_rate": 3.681127982646421e-05, "loss": 5.1988, "step": 11660 }, { "epoch": 1.3292328720314368, "grad_norm": 9.460587501525879, "learning_rate": 3.679986299805914e-05, "loss": 5.4055, "step": 11670 }, { "epoch": 1.3303718890597414, "grad_norm": 13.057355880737305, "learning_rate": 3.678844616965407e-05, "loss": 5.0494, "step": 11680 }, { "epoch": 1.3315109060880461, "grad_norm": 12.284239768981934, "learning_rate": 3.6777029341249005e-05, "loss": 5.5441, "step": 11690 }, { "epoch": 1.3326499231163507, "grad_norm": 12.175599098205566, "learning_rate": 3.676561251284393e-05, "loss": 5.1674, "step": 11700 }, { "epoch": 1.3337889401446552, "grad_norm": 6.106376647949219, "learning_rate": 3.675419568443886e-05, "loss": 5.5162, "step": 11710 }, { "epoch": 1.3349279571729598, "grad_norm": 6.725399971008301, "learning_rate": 3.6742778856033794e-05, "loss": 5.5066, "step": 11720 }, { "epoch": 1.3360669742012643, "grad_norm": 10.35840129852295, "learning_rate": 3.6731362027628725e-05, "loss": 5.4834, "step": 11730 }, { "epoch": 1.3372059912295688, "grad_norm": 6.601029872894287, "learning_rate": 3.671994519922366e-05, "loss": 5.2114, "step": 11740 }, { "epoch": 1.3383450082578734, "grad_norm": 6.504428863525391, "learning_rate": 3.670852837081859e-05, "loss": 5.3814, "step": 11750 }, { "epoch": 1.3394840252861782, "grad_norm": 8.49081802368164, "learning_rate": 3.669711154241352e-05, "loss": 5.1191, "step": 11760 }, { "epoch": 1.3406230423144825, "grad_norm": 30.99142074584961, "learning_rate": 3.668569471400845e-05, "loss": 5.2552, "step": 11770 }, { "epoch": 1.3417620593427872, "grad_norm": 16.623620986938477, "learning_rate": 3.667427788560338e-05, "loss": 5.5582, "step": 11780 }, { "epoch": 1.3429010763710918, "grad_norm": 8.451813697814941, "learning_rate": 3.666286105719831e-05, "loss": 5.3429, "step": 11790 }, { "epoch": 1.3440400933993963, "grad_norm": 7.930083274841309, "learning_rate": 3.665144422879324e-05, "loss": 5.5788, "step": 11800 }, { "epoch": 1.3451791104277009, "grad_norm": 8.974008560180664, "learning_rate": 3.664002740038817e-05, "loss": 5.0104, "step": 11810 }, { "epoch": 1.3463181274560054, "grad_norm": 11.199780464172363, "learning_rate": 3.6628610571983105e-05, "loss": 5.3469, "step": 11820 }, { "epoch": 1.34745714448431, "grad_norm": 16.39617919921875, "learning_rate": 3.661719374357804e-05, "loss": 5.0454, "step": 11830 }, { "epoch": 1.3485961615126145, "grad_norm": 15.259292602539062, "learning_rate": 3.660577691517297e-05, "loss": 5.606, "step": 11840 }, { "epoch": 1.3497351785409193, "grad_norm": 5.952733039855957, "learning_rate": 3.65943600867679e-05, "loss": 5.2637, "step": 11850 }, { "epoch": 1.3508741955692238, "grad_norm": 26.232786178588867, "learning_rate": 3.6582943258362826e-05, "loss": 5.3298, "step": 11860 }, { "epoch": 1.3520132125975284, "grad_norm": 8.518367767333984, "learning_rate": 3.657152642995776e-05, "loss": 5.1626, "step": 11870 }, { "epoch": 1.353152229625833, "grad_norm": 21.344398498535156, "learning_rate": 3.656010960155269e-05, "loss": 5.2306, "step": 11880 }, { "epoch": 1.3542912466541375, "grad_norm": 5.3159589767456055, "learning_rate": 3.654869277314762e-05, "loss": 5.3372, "step": 11890 }, { "epoch": 1.355430263682442, "grad_norm": 6.752881050109863, "learning_rate": 3.653727594474255e-05, "loss": 5.1163, "step": 11900 }, { "epoch": 1.3565692807107466, "grad_norm": 6.732866287231445, "learning_rate": 3.6525859116337485e-05, "loss": 5.3473, "step": 11910 }, { "epoch": 1.3577082977390513, "grad_norm": 7.8479132652282715, "learning_rate": 3.651444228793242e-05, "loss": 5.3538, "step": 11920 }, { "epoch": 1.3588473147673557, "grad_norm": 9.0760498046875, "learning_rate": 3.650302545952735e-05, "loss": 5.3047, "step": 11930 }, { "epoch": 1.3599863317956604, "grad_norm": 5.797522068023682, "learning_rate": 3.6491608631122274e-05, "loss": 5.45, "step": 11940 }, { "epoch": 1.361125348823965, "grad_norm": 10.67641544342041, "learning_rate": 3.6480191802717206e-05, "loss": 5.6555, "step": 11950 }, { "epoch": 1.3622643658522695, "grad_norm": 10.890436172485352, "learning_rate": 3.646877497431214e-05, "loss": 5.6038, "step": 11960 }, { "epoch": 1.363403382880574, "grad_norm": 10.217756271362305, "learning_rate": 3.645735814590707e-05, "loss": 5.2046, "step": 11970 }, { "epoch": 1.3645423999088786, "grad_norm": 7.881415367126465, "learning_rate": 3.6445941317501994e-05, "loss": 5.2958, "step": 11980 }, { "epoch": 1.3656814169371831, "grad_norm": 10.278264045715332, "learning_rate": 3.643452448909693e-05, "loss": 5.2545, "step": 11990 }, { "epoch": 1.3668204339654877, "grad_norm": 6.571601867675781, "learning_rate": 3.6423107660691865e-05, "loss": 5.3605, "step": 12000 }, { "epoch": 1.3668204339654877, "eval_loss": 5.954630374908447, "eval_runtime": 10.9746, "eval_samples_per_second": 1.367, "eval_steps_per_second": 0.182, "step": 12000 }, { "epoch": 1.3679594509937925, "grad_norm": 7.712740421295166, "learning_rate": 3.6411690832286797e-05, "loss": 5.1089, "step": 12010 }, { "epoch": 1.369098468022097, "grad_norm": 15.281449317932129, "learning_rate": 3.640027400388172e-05, "loss": 4.9024, "step": 12020 }, { "epoch": 1.3702374850504015, "grad_norm": 11.042343139648438, "learning_rate": 3.6388857175476653e-05, "loss": 5.3989, "step": 12030 }, { "epoch": 1.371376502078706, "grad_norm": 7.947473526000977, "learning_rate": 3.6377440347071585e-05, "loss": 5.5346, "step": 12040 }, { "epoch": 1.3725155191070106, "grad_norm": 13.120514869689941, "learning_rate": 3.636602351866652e-05, "loss": 5.1866, "step": 12050 }, { "epoch": 1.3736545361353152, "grad_norm": 6.44757080078125, "learning_rate": 3.635460669026144e-05, "loss": 5.406, "step": 12060 }, { "epoch": 1.3747935531636197, "grad_norm": 49.004764556884766, "learning_rate": 3.634318986185638e-05, "loss": 5.2507, "step": 12070 }, { "epoch": 1.3759325701919245, "grad_norm": 6.357001781463623, "learning_rate": 3.633177303345131e-05, "loss": 5.0962, "step": 12080 }, { "epoch": 1.3770715872202288, "grad_norm": 6.734482765197754, "learning_rate": 3.632035620504624e-05, "loss": 5.2718, "step": 12090 }, { "epoch": 1.3782106042485336, "grad_norm": 23.010496139526367, "learning_rate": 3.630893937664117e-05, "loss": 5.2241, "step": 12100 }, { "epoch": 1.3793496212768381, "grad_norm": 5.788354396820068, "learning_rate": 3.62975225482361e-05, "loss": 5.5905, "step": 12110 }, { "epoch": 1.3804886383051427, "grad_norm": 7.649424076080322, "learning_rate": 3.628610571983103e-05, "loss": 5.1133, "step": 12120 }, { "epoch": 1.3816276553334472, "grad_norm": 13.524384498596191, "learning_rate": 3.6274688891425965e-05, "loss": 5.4897, "step": 12130 }, { "epoch": 1.3827666723617518, "grad_norm": 16.350011825561523, "learning_rate": 3.626327206302089e-05, "loss": 5.3957, "step": 12140 }, { "epoch": 1.3839056893900563, "grad_norm": 5.700869560241699, "learning_rate": 3.625185523461583e-05, "loss": 5.3942, "step": 12150 }, { "epoch": 1.3850447064183609, "grad_norm": 13.569497108459473, "learning_rate": 3.624043840621076e-05, "loss": 5.009, "step": 12160 }, { "epoch": 1.3861837234466656, "grad_norm": 8.509134292602539, "learning_rate": 3.6229021577805686e-05, "loss": 5.1335, "step": 12170 }, { "epoch": 1.3873227404749702, "grad_norm": 34.72011184692383, "learning_rate": 3.621760474940062e-05, "loss": 5.4954, "step": 12180 }, { "epoch": 1.3884617575032747, "grad_norm": 9.994758605957031, "learning_rate": 3.620618792099555e-05, "loss": 5.8254, "step": 12190 }, { "epoch": 1.3896007745315793, "grad_norm": 7.645510196685791, "learning_rate": 3.619477109259048e-05, "loss": 5.7755, "step": 12200 }, { "epoch": 1.3907397915598838, "grad_norm": 16.956539154052734, "learning_rate": 3.6183354264185406e-05, "loss": 5.2381, "step": 12210 }, { "epoch": 1.3918788085881884, "grad_norm": 10.36237621307373, "learning_rate": 3.617193743578034e-05, "loss": 5.2316, "step": 12220 }, { "epoch": 1.393017825616493, "grad_norm": 9.028413772583008, "learning_rate": 3.616052060737528e-05, "loss": 5.8723, "step": 12230 }, { "epoch": 1.3941568426447977, "grad_norm": 10.029345512390137, "learning_rate": 3.614910377897021e-05, "loss": 5.2414, "step": 12240 }, { "epoch": 1.395295859673102, "grad_norm": 7.671345233917236, "learning_rate": 3.6137686950565134e-05, "loss": 5.2427, "step": 12250 }, { "epoch": 1.3964348767014068, "grad_norm": 10.010140419006348, "learning_rate": 3.6126270122160065e-05, "loss": 5.1803, "step": 12260 }, { "epoch": 1.3975738937297113, "grad_norm": 4.959784507751465, "learning_rate": 3.6114853293755e-05, "loss": 5.41, "step": 12270 }, { "epoch": 1.3987129107580158, "grad_norm": 7.631646156311035, "learning_rate": 3.610343646534993e-05, "loss": 5.36, "step": 12280 }, { "epoch": 1.3998519277863204, "grad_norm": 8.803350448608398, "learning_rate": 3.6092019636944854e-05, "loss": 5.161, "step": 12290 }, { "epoch": 1.400990944814625, "grad_norm": 6.224495887756348, "learning_rate": 3.6080602808539786e-05, "loss": 5.4243, "step": 12300 }, { "epoch": 1.4021299618429295, "grad_norm": 11.932744979858398, "learning_rate": 3.606918598013472e-05, "loss": 5.3582, "step": 12310 }, { "epoch": 1.403268978871234, "grad_norm": 7.398464679718018, "learning_rate": 3.6057769151729656e-05, "loss": 5.2339, "step": 12320 }, { "epoch": 1.4044079958995388, "grad_norm": 13.733871459960938, "learning_rate": 3.604635232332458e-05, "loss": 5.4102, "step": 12330 }, { "epoch": 1.4055470129278433, "grad_norm": 6.52939510345459, "learning_rate": 3.603493549491951e-05, "loss": 5.4492, "step": 12340 }, { "epoch": 1.4066860299561479, "grad_norm": 12.400009155273438, "learning_rate": 3.6023518666514445e-05, "loss": 5.4458, "step": 12350 }, { "epoch": 1.4078250469844524, "grad_norm": 7.083974361419678, "learning_rate": 3.601210183810938e-05, "loss": 5.4185, "step": 12360 }, { "epoch": 1.408964064012757, "grad_norm": 7.966810703277588, "learning_rate": 3.60006850097043e-05, "loss": 5.4619, "step": 12370 }, { "epoch": 1.4101030810410615, "grad_norm": 10.752161979675293, "learning_rate": 3.5989268181299234e-05, "loss": 5.3817, "step": 12380 }, { "epoch": 1.411242098069366, "grad_norm": 6.735963821411133, "learning_rate": 3.5977851352894166e-05, "loss": 5.4496, "step": 12390 }, { "epoch": 1.4123811150976708, "grad_norm": 8.230522155761719, "learning_rate": 3.5966434524489104e-05, "loss": 5.3922, "step": 12400 }, { "epoch": 1.4135201321259752, "grad_norm": 22.14419937133789, "learning_rate": 3.595501769608403e-05, "loss": 5.4713, "step": 12410 }, { "epoch": 1.41465914915428, "grad_norm": 6.678779602050781, "learning_rate": 3.594360086767896e-05, "loss": 5.3202, "step": 12420 }, { "epoch": 1.4157981661825845, "grad_norm": 6.689286231994629, "learning_rate": 3.593218403927389e-05, "loss": 5.577, "step": 12430 }, { "epoch": 1.416937183210889, "grad_norm": 9.552467346191406, "learning_rate": 3.5920767210868825e-05, "loss": 5.3421, "step": 12440 }, { "epoch": 1.4180762002391936, "grad_norm": 6.153965950012207, "learning_rate": 3.590935038246375e-05, "loss": 5.0896, "step": 12450 }, { "epoch": 1.419215217267498, "grad_norm": 7.473817825317383, "learning_rate": 3.589793355405868e-05, "loss": 5.2073, "step": 12460 }, { "epoch": 1.4203542342958027, "grad_norm": 7.036187171936035, "learning_rate": 3.5886516725653614e-05, "loss": 5.4712, "step": 12470 }, { "epoch": 1.4214932513241072, "grad_norm": 20.243093490600586, "learning_rate": 3.5875099897248545e-05, "loss": 5.4689, "step": 12480 }, { "epoch": 1.422632268352412, "grad_norm": 8.695764541625977, "learning_rate": 3.586368306884348e-05, "loss": 5.308, "step": 12490 }, { "epoch": 1.4237712853807165, "grad_norm": 6.1280837059021, "learning_rate": 3.585226624043841e-05, "loss": 5.4206, "step": 12500 }, { "epoch": 1.424910302409021, "grad_norm": 10.041711807250977, "learning_rate": 3.584084941203334e-05, "loss": 5.4938, "step": 12510 }, { "epoch": 1.4260493194373256, "grad_norm": 7.014266490936279, "learning_rate": 3.582943258362827e-05, "loss": 5.447, "step": 12520 }, { "epoch": 1.4271883364656301, "grad_norm": 7.004973411560059, "learning_rate": 3.58180157552232e-05, "loss": 5.1397, "step": 12530 }, { "epoch": 1.4283273534939347, "grad_norm": 5.695499420166016, "learning_rate": 3.580659892681813e-05, "loss": 5.3655, "step": 12540 }, { "epoch": 1.4294663705222392, "grad_norm": 6.106477737426758, "learning_rate": 3.579518209841306e-05, "loss": 5.2462, "step": 12550 }, { "epoch": 1.430605387550544, "grad_norm": 10.030777931213379, "learning_rate": 3.578376527000799e-05, "loss": 5.52, "step": 12560 }, { "epoch": 1.4317444045788483, "grad_norm": 5.281642436981201, "learning_rate": 3.5772348441602925e-05, "loss": 5.1858, "step": 12570 }, { "epoch": 1.432883421607153, "grad_norm": 5.482350826263428, "learning_rate": 3.576093161319786e-05, "loss": 5.1776, "step": 12580 }, { "epoch": 1.4340224386354576, "grad_norm": 10.585066795349121, "learning_rate": 3.574951478479279e-05, "loss": 5.1566, "step": 12590 }, { "epoch": 1.4351614556637622, "grad_norm": 5.751795768737793, "learning_rate": 3.5738097956387714e-05, "loss": 5.6618, "step": 12600 }, { "epoch": 1.4363004726920667, "grad_norm": 9.302414894104004, "learning_rate": 3.5726681127982646e-05, "loss": 5.4944, "step": 12610 }, { "epoch": 1.4374394897203713, "grad_norm": 8.983521461486816, "learning_rate": 3.571526429957758e-05, "loss": 5.3993, "step": 12620 }, { "epoch": 1.4385785067486758, "grad_norm": 8.087730407714844, "learning_rate": 3.570384747117251e-05, "loss": 5.8598, "step": 12630 }, { "epoch": 1.4397175237769804, "grad_norm": 96.82878112792969, "learning_rate": 3.569243064276744e-05, "loss": 5.3293, "step": 12640 }, { "epoch": 1.4408565408052851, "grad_norm": 30.524112701416016, "learning_rate": 3.568101381436237e-05, "loss": 5.3401, "step": 12650 }, { "epoch": 1.4419955578335897, "grad_norm": 11.92377758026123, "learning_rate": 3.5669596985957305e-05, "loss": 5.8283, "step": 12660 }, { "epoch": 1.4431345748618942, "grad_norm": 7.544785022735596, "learning_rate": 3.565818015755224e-05, "loss": 5.0848, "step": 12670 }, { "epoch": 1.4442735918901988, "grad_norm": 8.341391563415527, "learning_rate": 3.564676332914716e-05, "loss": 5.5114, "step": 12680 }, { "epoch": 1.4454126089185033, "grad_norm": 13.230775833129883, "learning_rate": 3.5635346500742094e-05, "loss": 5.5077, "step": 12690 }, { "epoch": 1.4465516259468079, "grad_norm": 10.898941993713379, "learning_rate": 3.5623929672337026e-05, "loss": 5.7832, "step": 12700 }, { "epoch": 1.4476906429751124, "grad_norm": 20.98676109313965, "learning_rate": 3.561251284393196e-05, "loss": 5.2055, "step": 12710 }, { "epoch": 1.4488296600034172, "grad_norm": 9.478466033935547, "learning_rate": 3.560109601552688e-05, "loss": 5.0588, "step": 12720 }, { "epoch": 1.4499686770317215, "grad_norm": 14.445795059204102, "learning_rate": 3.558967918712182e-05, "loss": 5.4359, "step": 12730 }, { "epoch": 1.4511076940600263, "grad_norm": 7.075606822967529, "learning_rate": 3.557826235871675e-05, "loss": 5.2279, "step": 12740 }, { "epoch": 1.4522467110883308, "grad_norm": 7.631337642669678, "learning_rate": 3.5566845530311685e-05, "loss": 5.3887, "step": 12750 }, { "epoch": 1.4533857281166354, "grad_norm": 5.999801158905029, "learning_rate": 3.555542870190661e-05, "loss": 5.6362, "step": 12760 }, { "epoch": 1.45452474514494, "grad_norm": 9.004730224609375, "learning_rate": 3.554401187350154e-05, "loss": 5.3229, "step": 12770 }, { "epoch": 1.4556637621732444, "grad_norm": 7.371973991394043, "learning_rate": 3.5532595045096473e-05, "loss": 5.3307, "step": 12780 }, { "epoch": 1.456802779201549, "grad_norm": 8.362858772277832, "learning_rate": 3.5521178216691405e-05, "loss": 5.2371, "step": 12790 }, { "epoch": 1.4579417962298535, "grad_norm": 6.346038341522217, "learning_rate": 3.550976138828633e-05, "loss": 5.3268, "step": 12800 }, { "epoch": 1.4590808132581583, "grad_norm": 7.912856578826904, "learning_rate": 3.549834455988127e-05, "loss": 5.0665, "step": 12810 }, { "epoch": 1.4602198302864628, "grad_norm": 26.517242431640625, "learning_rate": 3.54869277314762e-05, "loss": 5.4491, "step": 12820 }, { "epoch": 1.4613588473147674, "grad_norm": 41.19887924194336, "learning_rate": 3.547551090307113e-05, "loss": 5.6174, "step": 12830 }, { "epoch": 1.462497864343072, "grad_norm": 8.585516929626465, "learning_rate": 3.546409407466606e-05, "loss": 5.3851, "step": 12840 }, { "epoch": 1.4636368813713765, "grad_norm": 8.645813941955566, "learning_rate": 3.545267724626099e-05, "loss": 5.136, "step": 12850 }, { "epoch": 1.464775898399681, "grad_norm": 14.942729949951172, "learning_rate": 3.544126041785592e-05, "loss": 5.3279, "step": 12860 }, { "epoch": 1.4659149154279856, "grad_norm": 10.440014839172363, "learning_rate": 3.542984358945085e-05, "loss": 5.4267, "step": 12870 }, { "epoch": 1.4670539324562903, "grad_norm": 8.172478675842285, "learning_rate": 3.541842676104578e-05, "loss": 5.43, "step": 12880 }, { "epoch": 1.4681929494845947, "grad_norm": 24.180456161499023, "learning_rate": 3.540700993264072e-05, "loss": 5.4406, "step": 12890 }, { "epoch": 1.4693319665128994, "grad_norm": 7.078608989715576, "learning_rate": 3.539559310423565e-05, "loss": 5.1815, "step": 12900 }, { "epoch": 1.470470983541204, "grad_norm": 6.023872375488281, "learning_rate": 3.538417627583058e-05, "loss": 5.3495, "step": 12910 }, { "epoch": 1.4716100005695085, "grad_norm": 7.708781719207764, "learning_rate": 3.5372759447425506e-05, "loss": 5.1646, "step": 12920 }, { "epoch": 1.472749017597813, "grad_norm": 8.54637622833252, "learning_rate": 3.536134261902044e-05, "loss": 5.3512, "step": 12930 }, { "epoch": 1.4738880346261176, "grad_norm": 7.830763816833496, "learning_rate": 3.534992579061537e-05, "loss": 5.1291, "step": 12940 }, { "epoch": 1.4750270516544222, "grad_norm": 9.371572494506836, "learning_rate": 3.53385089622103e-05, "loss": 5.4404, "step": 12950 }, { "epoch": 1.4761660686827267, "grad_norm": 6.507216453552246, "learning_rate": 3.5327092133805226e-05, "loss": 5.5138, "step": 12960 }, { "epoch": 1.4773050857110315, "grad_norm": 8.929028511047363, "learning_rate": 3.531567530540016e-05, "loss": 5.7131, "step": 12970 }, { "epoch": 1.478444102739336, "grad_norm": 6.6559906005859375, "learning_rate": 3.53042584769951e-05, "loss": 5.3745, "step": 12980 }, { "epoch": 1.4795831197676406, "grad_norm": 5.723512649536133, "learning_rate": 3.529284164859002e-05, "loss": 5.2418, "step": 12990 }, { "epoch": 1.480722136795945, "grad_norm": 14.410907745361328, "learning_rate": 3.5281424820184954e-05, "loss": 5.1752, "step": 13000 }, { "epoch": 1.4818611538242497, "grad_norm": 6.995190620422363, "learning_rate": 3.5270007991779885e-05, "loss": 5.3642, "step": 13010 }, { "epoch": 1.4830001708525542, "grad_norm": 7.952390193939209, "learning_rate": 3.525859116337482e-05, "loss": 5.3698, "step": 13020 }, { "epoch": 1.4841391878808587, "grad_norm": 9.407118797302246, "learning_rate": 3.524717433496975e-05, "loss": 5.6339, "step": 13030 }, { "epoch": 1.4852782049091635, "grad_norm": 7.846121788024902, "learning_rate": 3.5235757506564674e-05, "loss": 5.4137, "step": 13040 }, { "epoch": 1.4864172219374678, "grad_norm": 8.195741653442383, "learning_rate": 3.5224340678159606e-05, "loss": 5.6291, "step": 13050 }, { "epoch": 1.4875562389657726, "grad_norm": 5.081000328063965, "learning_rate": 3.5212923849754545e-05, "loss": 5.3807, "step": 13060 }, { "epoch": 1.4886952559940771, "grad_norm": 8.810487747192383, "learning_rate": 3.520150702134947e-05, "loss": 5.1507, "step": 13070 }, { "epoch": 1.4898342730223817, "grad_norm": 13.622875213623047, "learning_rate": 3.51900901929444e-05, "loss": 5.2288, "step": 13080 }, { "epoch": 1.4909732900506862, "grad_norm": 6.059431552886963, "learning_rate": 3.517867336453933e-05, "loss": 5.2828, "step": 13090 }, { "epoch": 1.4921123070789908, "grad_norm": 7.449197292327881, "learning_rate": 3.5167256536134265e-05, "loss": 5.2901, "step": 13100 }, { "epoch": 1.4932513241072953, "grad_norm": 9.480165481567383, "learning_rate": 3.515583970772919e-05, "loss": 5.3054, "step": 13110 }, { "epoch": 1.4943903411355999, "grad_norm": 13.476594924926758, "learning_rate": 3.514442287932412e-05, "loss": 5.3112, "step": 13120 }, { "epoch": 1.4955293581639046, "grad_norm": 9.43685531616211, "learning_rate": 3.5133006050919054e-05, "loss": 5.501, "step": 13130 }, { "epoch": 1.4966683751922092, "grad_norm": 31.36126708984375, "learning_rate": 3.512158922251399e-05, "loss": 5.358, "step": 13140 }, { "epoch": 1.4978073922205137, "grad_norm": 6.15058708190918, "learning_rate": 3.511017239410892e-05, "loss": 5.2292, "step": 13150 }, { "epoch": 1.4989464092488183, "grad_norm": 12.908248901367188, "learning_rate": 3.509875556570385e-05, "loss": 5.2005, "step": 13160 }, { "epoch": 1.5000854262771228, "grad_norm": 7.87893533706665, "learning_rate": 3.508733873729878e-05, "loss": 5.2167, "step": 13170 }, { "epoch": 1.5012244433054274, "grad_norm": 7.283593654632568, "learning_rate": 3.507592190889371e-05, "loss": 5.0384, "step": 13180 }, { "epoch": 1.502363460333732, "grad_norm": 7.5403289794921875, "learning_rate": 3.506450508048864e-05, "loss": 5.5789, "step": 13190 }, { "epoch": 1.5035024773620367, "grad_norm": 27.158863067626953, "learning_rate": 3.505308825208357e-05, "loss": 5.1138, "step": 13200 }, { "epoch": 1.504641494390341, "grad_norm": 8.950972557067871, "learning_rate": 3.50416714236785e-05, "loss": 5.1794, "step": 13210 }, { "epoch": 1.5057805114186458, "grad_norm": 6.999053955078125, "learning_rate": 3.503025459527344e-05, "loss": 5.2138, "step": 13220 }, { "epoch": 1.5069195284469503, "grad_norm": 24.662425994873047, "learning_rate": 3.5018837766868366e-05, "loss": 5.5911, "step": 13230 }, { "epoch": 1.5080585454752549, "grad_norm": 10.630668640136719, "learning_rate": 3.50074209384633e-05, "loss": 5.2046, "step": 13240 }, { "epoch": 1.5091975625035594, "grad_norm": 7.645869255065918, "learning_rate": 3.499600411005823e-05, "loss": 5.1066, "step": 13250 }, { "epoch": 1.510336579531864, "grad_norm": 7.014522552490234, "learning_rate": 3.498458728165316e-05, "loss": 5.5894, "step": 13260 }, { "epoch": 1.5114755965601687, "grad_norm": 8.533949851989746, "learning_rate": 3.4973170453248086e-05, "loss": 4.9576, "step": 13270 }, { "epoch": 1.512614613588473, "grad_norm": 6.9750237464904785, "learning_rate": 3.496175362484302e-05, "loss": 5.1692, "step": 13280 }, { "epoch": 1.5137536306167778, "grad_norm": 5.311960220336914, "learning_rate": 3.495033679643795e-05, "loss": 5.0933, "step": 13290 }, { "epoch": 1.5148926476450821, "grad_norm": 18.698442459106445, "learning_rate": 3.493891996803288e-05, "loss": 5.3646, "step": 13300 }, { "epoch": 1.516031664673387, "grad_norm": 17.7315616607666, "learning_rate": 3.4927503139627813e-05, "loss": 5.2188, "step": 13310 }, { "epoch": 1.5171706817016914, "grad_norm": 11.710564613342285, "learning_rate": 3.4916086311222745e-05, "loss": 5.2846, "step": 13320 }, { "epoch": 1.518309698729996, "grad_norm": 5.661081314086914, "learning_rate": 3.490466948281768e-05, "loss": 5.2878, "step": 13330 }, { "epoch": 1.5194487157583005, "grad_norm": 12.068194389343262, "learning_rate": 3.489325265441261e-05, "loss": 5.2608, "step": 13340 }, { "epoch": 1.520587732786605, "grad_norm": 8.578582763671875, "learning_rate": 3.4881835826007534e-05, "loss": 4.9035, "step": 13350 }, { "epoch": 1.5217267498149099, "grad_norm": 9.248255729675293, "learning_rate": 3.4870418997602466e-05, "loss": 5.0897, "step": 13360 }, { "epoch": 1.5228657668432142, "grad_norm": 5.910317420959473, "learning_rate": 3.48590021691974e-05, "loss": 5.1177, "step": 13370 }, { "epoch": 1.524004783871519, "grad_norm": 9.493122100830078, "learning_rate": 3.484758534079233e-05, "loss": 5.5372, "step": 13380 }, { "epoch": 1.5251438008998235, "grad_norm": 14.408531188964844, "learning_rate": 3.483616851238726e-05, "loss": 5.1862, "step": 13390 }, { "epoch": 1.526282817928128, "grad_norm": 9.839924812316895, "learning_rate": 3.482475168398219e-05, "loss": 5.4061, "step": 13400 }, { "epoch": 1.5274218349564326, "grad_norm": 8.718000411987305, "learning_rate": 3.4813334855577125e-05, "loss": 5.4663, "step": 13410 }, { "epoch": 1.5285608519847371, "grad_norm": 6.590074062347412, "learning_rate": 3.480191802717206e-05, "loss": 5.3317, "step": 13420 }, { "epoch": 1.529699869013042, "grad_norm": 6.715322017669678, "learning_rate": 3.479050119876698e-05, "loss": 5.1962, "step": 13430 }, { "epoch": 1.5308388860413462, "grad_norm": 8.20048713684082, "learning_rate": 3.4779084370361914e-05, "loss": 5.5784, "step": 13440 }, { "epoch": 1.531977903069651, "grad_norm": 17.340343475341797, "learning_rate": 3.4767667541956846e-05, "loss": 5.4492, "step": 13450 }, { "epoch": 1.5331169200979553, "grad_norm": 6.961179256439209, "learning_rate": 3.475625071355178e-05, "loss": 5.4859, "step": 13460 }, { "epoch": 1.53425593712626, "grad_norm": 8.843058586120605, "learning_rate": 3.474483388514671e-05, "loss": 5.1902, "step": 13470 }, { "epoch": 1.5353949541545646, "grad_norm": 7.162143707275391, "learning_rate": 3.473341705674164e-05, "loss": 5.3569, "step": 13480 }, { "epoch": 1.5365339711828692, "grad_norm": 7.354994297027588, "learning_rate": 3.472200022833657e-05, "loss": 5.347, "step": 13490 }, { "epoch": 1.5376729882111737, "grad_norm": 14.971761703491211, "learning_rate": 3.47105833999315e-05, "loss": 5.3153, "step": 13500 }, { "epoch": 1.5388120052394783, "grad_norm": 9.347837448120117, "learning_rate": 3.469916657152643e-05, "loss": 5.3571, "step": 13510 }, { "epoch": 1.539951022267783, "grad_norm": 7.197291374206543, "learning_rate": 3.468774974312136e-05, "loss": 5.5024, "step": 13520 }, { "epoch": 1.5410900392960873, "grad_norm": 11.419710159301758, "learning_rate": 3.4676332914716294e-05, "loss": 5.2976, "step": 13530 }, { "epoch": 1.5422290563243921, "grad_norm": 7.408755302429199, "learning_rate": 3.4664916086311225e-05, "loss": 4.9251, "step": 13540 }, { "epoch": 1.5433680733526967, "grad_norm": 7.224884033203125, "learning_rate": 3.465349925790616e-05, "loss": 5.1885, "step": 13550 }, { "epoch": 1.5445070903810012, "grad_norm": 9.694978713989258, "learning_rate": 3.464208242950109e-05, "loss": 5.1941, "step": 13560 }, { "epoch": 1.5456461074093057, "grad_norm": 14.073616027832031, "learning_rate": 3.463066560109602e-05, "loss": 5.104, "step": 13570 }, { "epoch": 1.5467851244376103, "grad_norm": 5.462674617767334, "learning_rate": 3.4619248772690946e-05, "loss": 5.4142, "step": 13580 }, { "epoch": 1.547924141465915, "grad_norm": 8.231867790222168, "learning_rate": 3.460783194428588e-05, "loss": 5.1634, "step": 13590 }, { "epoch": 1.5490631584942194, "grad_norm": 6.0484619140625, "learning_rate": 3.459641511588081e-05, "loss": 5.351, "step": 13600 }, { "epoch": 1.5502021755225242, "grad_norm": 6.931220054626465, "learning_rate": 3.458499828747574e-05, "loss": 5.1483, "step": 13610 }, { "epoch": 1.5513411925508285, "grad_norm": 23.10702896118164, "learning_rate": 3.4573581459070667e-05, "loss": 5.3331, "step": 13620 }, { "epoch": 1.5524802095791332, "grad_norm": 15.228353500366211, "learning_rate": 3.4562164630665605e-05, "loss": 5.3901, "step": 13630 }, { "epoch": 1.5536192266074378, "grad_norm": 7.844686508178711, "learning_rate": 3.455074780226054e-05, "loss": 5.4937, "step": 13640 }, { "epoch": 1.5547582436357423, "grad_norm": 6.5618743896484375, "learning_rate": 3.453933097385547e-05, "loss": 5.0538, "step": 13650 }, { "epoch": 1.5558972606640469, "grad_norm": 6.531167507171631, "learning_rate": 3.4527914145450394e-05, "loss": 5.435, "step": 13660 }, { "epoch": 1.5570362776923514, "grad_norm": 13.602387428283691, "learning_rate": 3.4516497317045326e-05, "loss": 5.3281, "step": 13670 }, { "epoch": 1.5581752947206562, "grad_norm": 7.583394527435303, "learning_rate": 3.450508048864026e-05, "loss": 5.315, "step": 13680 }, { "epoch": 1.5593143117489605, "grad_norm": 8.912744522094727, "learning_rate": 3.449366366023519e-05, "loss": 5.3501, "step": 13690 }, { "epoch": 1.5604533287772653, "grad_norm": 6.5284423828125, "learning_rate": 3.4482246831830114e-05, "loss": 5.2478, "step": 13700 }, { "epoch": 1.5615923458055698, "grad_norm": 18.133039474487305, "learning_rate": 3.4470830003425046e-05, "loss": 5.1715, "step": 13710 }, { "epoch": 1.5627313628338744, "grad_norm": 7.198716640472412, "learning_rate": 3.4459413175019985e-05, "loss": 5.2315, "step": 13720 }, { "epoch": 1.563870379862179, "grad_norm": 8.62277603149414, "learning_rate": 3.444799634661492e-05, "loss": 5.1835, "step": 13730 }, { "epoch": 1.5650093968904835, "grad_norm": 8.421860694885254, "learning_rate": 3.443657951820984e-05, "loss": 5.1822, "step": 13740 }, { "epoch": 1.5661484139187882, "grad_norm": 7.427688121795654, "learning_rate": 3.4425162689804774e-05, "loss": 5.5495, "step": 13750 }, { "epoch": 1.5672874309470926, "grad_norm": 7.007988929748535, "learning_rate": 3.4413745861399705e-05, "loss": 5.1895, "step": 13760 }, { "epoch": 1.5684264479753973, "grad_norm": 9.902037620544434, "learning_rate": 3.440232903299464e-05, "loss": 5.2564, "step": 13770 }, { "epoch": 1.5695654650037016, "grad_norm": 8.029926300048828, "learning_rate": 3.439091220458956e-05, "loss": 5.3869, "step": 13780 }, { "epoch": 1.5707044820320064, "grad_norm": 11.344751358032227, "learning_rate": 3.4379495376184494e-05, "loss": 5.3192, "step": 13790 }, { "epoch": 1.571843499060311, "grad_norm": 19.97797393798828, "learning_rate": 3.436807854777943e-05, "loss": 4.9832, "step": 13800 }, { "epoch": 1.5729825160886155, "grad_norm": 9.381373405456543, "learning_rate": 3.435666171937436e-05, "loss": 5.2027, "step": 13810 }, { "epoch": 1.57412153311692, "grad_norm": 7.4374613761901855, "learning_rate": 3.434524489096929e-05, "loss": 5.2427, "step": 13820 }, { "epoch": 1.5752605501452246, "grad_norm": 8.768608093261719, "learning_rate": 3.433382806256422e-05, "loss": 5.182, "step": 13830 }, { "epoch": 1.5763995671735294, "grad_norm": 10.891498565673828, "learning_rate": 3.432241123415915e-05, "loss": 5.765, "step": 13840 }, { "epoch": 1.5775385842018337, "grad_norm": 20.340749740600586, "learning_rate": 3.4310994405754085e-05, "loss": 5.4202, "step": 13850 }, { "epoch": 1.5786776012301384, "grad_norm": 5.067477226257324, "learning_rate": 3.429957757734901e-05, "loss": 5.5461, "step": 13860 }, { "epoch": 1.579816618258443, "grad_norm": 9.956294059753418, "learning_rate": 3.428816074894394e-05, "loss": 5.2925, "step": 13870 }, { "epoch": 1.5809556352867475, "grad_norm": 8.55966854095459, "learning_rate": 3.427674392053888e-05, "loss": 5.3078, "step": 13880 }, { "epoch": 1.582094652315052, "grad_norm": 10.623746871948242, "learning_rate": 3.4265327092133806e-05, "loss": 5.0856, "step": 13890 }, { "epoch": 1.5832336693433566, "grad_norm": 7.9208526611328125, "learning_rate": 3.425391026372874e-05, "loss": 5.313, "step": 13900 }, { "epoch": 1.5843726863716614, "grad_norm": 9.109285354614258, "learning_rate": 3.424249343532367e-05, "loss": 5.4487, "step": 13910 }, { "epoch": 1.5855117033999657, "grad_norm": 15.563692092895508, "learning_rate": 3.42310766069186e-05, "loss": 5.2342, "step": 13920 }, { "epoch": 1.5866507204282705, "grad_norm": 15.028312683105469, "learning_rate": 3.421965977851353e-05, "loss": 5.0988, "step": 13930 }, { "epoch": 1.5877897374565748, "grad_norm": 10.812498092651367, "learning_rate": 3.420824295010846e-05, "loss": 4.8925, "step": 13940 }, { "epoch": 1.5889287544848796, "grad_norm": 12.279333114624023, "learning_rate": 3.419682612170339e-05, "loss": 5.1012, "step": 13950 }, { "epoch": 1.5900677715131841, "grad_norm": 18.0694637298584, "learning_rate": 3.418540929329833e-05, "loss": 5.4058, "step": 13960 }, { "epoch": 1.5912067885414887, "grad_norm": 12.514638900756836, "learning_rate": 3.4173992464893254e-05, "loss": 5.4547, "step": 13970 }, { "epoch": 1.5923458055697932, "grad_norm": 7.83701229095459, "learning_rate": 3.4162575636488186e-05, "loss": 5.3477, "step": 13980 }, { "epoch": 1.5934848225980978, "grad_norm": 7.048572063446045, "learning_rate": 3.415115880808312e-05, "loss": 5.1777, "step": 13990 }, { "epoch": 1.5946238396264025, "grad_norm": 8.231867790222168, "learning_rate": 3.413974197967805e-05, "loss": 5.3191, "step": 14000 }, { "epoch": 1.5946238396264025, "eval_loss": 5.865710258483887, "eval_runtime": 11.9559, "eval_samples_per_second": 1.255, "eval_steps_per_second": 0.167, "step": 14000 }, { "epoch": 1.5957628566547069, "grad_norm": 7.9335150718688965, "learning_rate": 3.4128325151272974e-05, "loss": 5.1679, "step": 14010 }, { "epoch": 1.5969018736830116, "grad_norm": 11.78247356414795, "learning_rate": 3.4116908322867906e-05, "loss": 5.1418, "step": 14020 }, { "epoch": 1.5980408907113162, "grad_norm": 8.302976608276367, "learning_rate": 3.410549149446284e-05, "loss": 5.2641, "step": 14030 }, { "epoch": 1.5991799077396207, "grad_norm": 6.566915988922119, "learning_rate": 3.409407466605777e-05, "loss": 5.4475, "step": 14040 }, { "epoch": 1.6003189247679253, "grad_norm": 9.077897071838379, "learning_rate": 3.40826578376527e-05, "loss": 5.3925, "step": 14050 }, { "epoch": 1.6014579417962298, "grad_norm": 7.3145880699157715, "learning_rate": 3.4071241009247633e-05, "loss": 5.4688, "step": 14060 }, { "epoch": 1.6025969588245346, "grad_norm": 9.912899017333984, "learning_rate": 3.4059824180842565e-05, "loss": 5.1155, "step": 14070 }, { "epoch": 1.603735975852839, "grad_norm": 8.687728881835938, "learning_rate": 3.40484073524375e-05, "loss": 5.4758, "step": 14080 }, { "epoch": 1.6048749928811437, "grad_norm": 16.98023796081543, "learning_rate": 3.403699052403242e-05, "loss": 5.2506, "step": 14090 }, { "epoch": 1.606014009909448, "grad_norm": 5.677768230438232, "learning_rate": 3.4025573695627354e-05, "loss": 5.4057, "step": 14100 }, { "epoch": 1.6071530269377527, "grad_norm": 7.748041152954102, "learning_rate": 3.4014156867222286e-05, "loss": 5.3705, "step": 14110 }, { "epoch": 1.6082920439660573, "grad_norm": 6.371578216552734, "learning_rate": 3.400274003881722e-05, "loss": 5.1812, "step": 14120 }, { "epoch": 1.6094310609943618, "grad_norm": 7.417301654815674, "learning_rate": 3.399132321041215e-05, "loss": 5.9261, "step": 14130 }, { "epoch": 1.6105700780226664, "grad_norm": 10.484310150146484, "learning_rate": 3.397990638200708e-05, "loss": 5.3389, "step": 14140 }, { "epoch": 1.611709095050971, "grad_norm": 22.2833251953125, "learning_rate": 3.396848955360201e-05, "loss": 5.2009, "step": 14150 }, { "epoch": 1.6128481120792757, "grad_norm": 9.973012924194336, "learning_rate": 3.3957072725196945e-05, "loss": 5.4385, "step": 14160 }, { "epoch": 1.61398712910758, "grad_norm": 7.872779369354248, "learning_rate": 3.394565589679187e-05, "loss": 5.4088, "step": 14170 }, { "epoch": 1.6151261461358848, "grad_norm": 9.12864875793457, "learning_rate": 3.39342390683868e-05, "loss": 5.3384, "step": 14180 }, { "epoch": 1.6162651631641893, "grad_norm": 6.356509685516357, "learning_rate": 3.3922822239981734e-05, "loss": 5.1973, "step": 14190 }, { "epoch": 1.6174041801924939, "grad_norm": 9.473197937011719, "learning_rate": 3.3912547094417177e-05, "loss": 5.555, "step": 14200 }, { "epoch": 1.6185431972207984, "grad_norm": 8.52722454071045, "learning_rate": 3.39011302660121e-05, "loss": 5.0841, "step": 14210 }, { "epoch": 1.619682214249103, "grad_norm": 7.906297206878662, "learning_rate": 3.3889713437607033e-05, "loss": 5.3532, "step": 14220 }, { "epoch": 1.6208212312774077, "grad_norm": 13.823141098022461, "learning_rate": 3.3878296609201965e-05, "loss": 5.3732, "step": 14230 }, { "epoch": 1.621960248305712, "grad_norm": 8.63819408416748, "learning_rate": 3.38668797807969e-05, "loss": 5.7671, "step": 14240 }, { "epoch": 1.6230992653340168, "grad_norm": 11.065068244934082, "learning_rate": 3.385546295239182e-05, "loss": 5.1148, "step": 14250 }, { "epoch": 1.6242382823623212, "grad_norm": 6.661491870880127, "learning_rate": 3.3844046123986754e-05, "loss": 5.1601, "step": 14260 }, { "epoch": 1.625377299390626, "grad_norm": 7.48806619644165, "learning_rate": 3.383262929558169e-05, "loss": 5.4801, "step": 14270 }, { "epoch": 1.6265163164189305, "grad_norm": 6.75936222076416, "learning_rate": 3.3821212467176624e-05, "loss": 4.6146, "step": 14280 }, { "epoch": 1.627655333447235, "grad_norm": 6.72158670425415, "learning_rate": 3.380979563877155e-05, "loss": 5.2395, "step": 14290 }, { "epoch": 1.6287943504755396, "grad_norm": 9.066991806030273, "learning_rate": 3.379837881036648e-05, "loss": 5.4662, "step": 14300 }, { "epoch": 1.629933367503844, "grad_norm": 9.151389122009277, "learning_rate": 3.378696198196141e-05, "loss": 5.5807, "step": 14310 }, { "epoch": 1.6310723845321489, "grad_norm": 5.07937479019165, "learning_rate": 3.3775545153556345e-05, "loss": 5.0922, "step": 14320 }, { "epoch": 1.6322114015604532, "grad_norm": 4.731130599975586, "learning_rate": 3.376412832515127e-05, "loss": 5.3785, "step": 14330 }, { "epoch": 1.633350418588758, "grad_norm": 7.364837646484375, "learning_rate": 3.37527114967462e-05, "loss": 5.0001, "step": 14340 }, { "epoch": 1.6344894356170625, "grad_norm": 9.038762092590332, "learning_rate": 3.374129466834114e-05, "loss": 5.3599, "step": 14350 }, { "epoch": 1.635628452645367, "grad_norm": 8.42864990234375, "learning_rate": 3.372987783993607e-05, "loss": 5.2821, "step": 14360 }, { "epoch": 1.6367674696736716, "grad_norm": 5.388772487640381, "learning_rate": 3.3718461011531e-05, "loss": 5.3046, "step": 14370 }, { "epoch": 1.6379064867019761, "grad_norm": 6.146626949310303, "learning_rate": 3.370704418312593e-05, "loss": 5.1354, "step": 14380 }, { "epoch": 1.639045503730281, "grad_norm": 28.261077880859375, "learning_rate": 3.369562735472086e-05, "loss": 5.1599, "step": 14390 }, { "epoch": 1.6401845207585852, "grad_norm": 16.488996505737305, "learning_rate": 3.368421052631579e-05, "loss": 5.5803, "step": 14400 }, { "epoch": 1.64132353778689, "grad_norm": 5.829269886016846, "learning_rate": 3.367279369791072e-05, "loss": 4.9764, "step": 14410 }, { "epoch": 1.6424625548151945, "grad_norm": 5.7588419914245605, "learning_rate": 3.366137686950565e-05, "loss": 5.5567, "step": 14420 }, { "epoch": 1.643601571843499, "grad_norm": 7.560709476470947, "learning_rate": 3.364996004110059e-05, "loss": 5.4832, "step": 14430 }, { "epoch": 1.6447405888718036, "grad_norm": 13.212182998657227, "learning_rate": 3.3638543212695514e-05, "loss": 5.0167, "step": 14440 }, { "epoch": 1.6458796059001082, "grad_norm": 6.811690807342529, "learning_rate": 3.3627126384290445e-05, "loss": 5.7124, "step": 14450 }, { "epoch": 1.647018622928413, "grad_norm": 8.75715446472168, "learning_rate": 3.361570955588538e-05, "loss": 5.3427, "step": 14460 }, { "epoch": 1.6481576399567173, "grad_norm": 9.054322242736816, "learning_rate": 3.360429272748031e-05, "loss": 5.1287, "step": 14470 }, { "epoch": 1.649296656985022, "grad_norm": 8.605569839477539, "learning_rate": 3.359287589907524e-05, "loss": 5.3172, "step": 14480 }, { "epoch": 1.6504356740133264, "grad_norm": 4.9488348960876465, "learning_rate": 3.3581459070670166e-05, "loss": 5.2897, "step": 14490 }, { "epoch": 1.6515746910416311, "grad_norm": 7.640530109405518, "learning_rate": 3.35700422422651e-05, "loss": 5.1434, "step": 14500 }, { "epoch": 1.6527137080699357, "grad_norm": 18.76363182067871, "learning_rate": 3.355862541386003e-05, "loss": 5.0798, "step": 14510 }, { "epoch": 1.6538527250982402, "grad_norm": 10.534653663635254, "learning_rate": 3.354720858545496e-05, "loss": 5.4461, "step": 14520 }, { "epoch": 1.6549917421265448, "grad_norm": 11.95950698852539, "learning_rate": 3.353579175704989e-05, "loss": 5.403, "step": 14530 }, { "epoch": 1.6561307591548493, "grad_norm": 19.345565795898438, "learning_rate": 3.3524374928644825e-05, "loss": 5.4681, "step": 14540 }, { "epoch": 1.657269776183154, "grad_norm": 21.54135513305664, "learning_rate": 3.351295810023976e-05, "loss": 5.1326, "step": 14550 }, { "epoch": 1.6584087932114584, "grad_norm": 10.59844970703125, "learning_rate": 3.350154127183469e-05, "loss": 5.2563, "step": 14560 }, { "epoch": 1.6595478102397632, "grad_norm": 5.983888149261475, "learning_rate": 3.3490124443429614e-05, "loss": 5.4246, "step": 14570 }, { "epoch": 1.6606868272680677, "grad_norm": 10.829444885253906, "learning_rate": 3.3478707615024546e-05, "loss": 5.3801, "step": 14580 }, { "epoch": 1.6618258442963723, "grad_norm": 7.411653995513916, "learning_rate": 3.346729078661948e-05, "loss": 5.4264, "step": 14590 }, { "epoch": 1.6629648613246768, "grad_norm": 4.255529880523682, "learning_rate": 3.345587395821441e-05, "loss": 5.114, "step": 14600 }, { "epoch": 1.6641038783529813, "grad_norm": 7.266125202178955, "learning_rate": 3.344445712980934e-05, "loss": 5.3673, "step": 14610 }, { "epoch": 1.6652428953812861, "grad_norm": 11.747828483581543, "learning_rate": 3.343304030140427e-05, "loss": 5.0465, "step": 14620 }, { "epoch": 1.6663819124095904, "grad_norm": 7.091350555419922, "learning_rate": 3.3421623472999205e-05, "loss": 5.2813, "step": 14630 }, { "epoch": 1.6675209294378952, "grad_norm": 7.287014484405518, "learning_rate": 3.341020664459413e-05, "loss": 5.3202, "step": 14640 }, { "epoch": 1.6686599464661995, "grad_norm": 8.400530815124512, "learning_rate": 3.339878981618906e-05, "loss": 4.9966, "step": 14650 }, { "epoch": 1.6697989634945043, "grad_norm": 8.203120231628418, "learning_rate": 3.3387372987783994e-05, "loss": 5.0657, "step": 14660 }, { "epoch": 1.6709379805228088, "grad_norm": 8.2782564163208, "learning_rate": 3.3375956159378925e-05, "loss": 5.4649, "step": 14670 }, { "epoch": 1.6720769975511134, "grad_norm": 8.495993614196777, "learning_rate": 3.336453933097386e-05, "loss": 4.9707, "step": 14680 }, { "epoch": 1.673216014579418, "grad_norm": 13.784844398498535, "learning_rate": 3.335312250256879e-05, "loss": 5.396, "step": 14690 }, { "epoch": 1.6743550316077225, "grad_norm": 6.243724822998047, "learning_rate": 3.334170567416372e-05, "loss": 5.1599, "step": 14700 }, { "epoch": 1.6754940486360272, "grad_norm": 6.918847560882568, "learning_rate": 3.333028884575865e-05, "loss": 5.0621, "step": 14710 }, { "epoch": 1.6766330656643316, "grad_norm": 12.930042266845703, "learning_rate": 3.331887201735358e-05, "loss": 5.3289, "step": 14720 }, { "epoch": 1.6777720826926363, "grad_norm": 8.358970642089844, "learning_rate": 3.330745518894851e-05, "loss": 5.1417, "step": 14730 }, { "epoch": 1.6789110997209409, "grad_norm": 5.784093379974365, "learning_rate": 3.329603836054344e-05, "loss": 5.0608, "step": 14740 }, { "epoch": 1.6800501167492454, "grad_norm": 9.293664932250977, "learning_rate": 3.328462153213837e-05, "loss": 5.0801, "step": 14750 }, { "epoch": 1.68118913377755, "grad_norm": 7.548481464385986, "learning_rate": 3.3273204703733305e-05, "loss": 5.3661, "step": 14760 }, { "epoch": 1.6823281508058545, "grad_norm": 6.076251983642578, "learning_rate": 3.326178787532824e-05, "loss": 5.3517, "step": 14770 }, { "epoch": 1.6834671678341593, "grad_norm": 11.553271293640137, "learning_rate": 3.325037104692317e-05, "loss": 5.1621, "step": 14780 }, { "epoch": 1.6846061848624636, "grad_norm": 5.663163185119629, "learning_rate": 3.32389542185181e-05, "loss": 5.2807, "step": 14790 }, { "epoch": 1.6857452018907684, "grad_norm": 8.15644359588623, "learning_rate": 3.3227537390113026e-05, "loss": 5.6571, "step": 14800 }, { "epoch": 1.6868842189190727, "grad_norm": 13.200905799865723, "learning_rate": 3.321612056170796e-05, "loss": 5.1325, "step": 14810 }, { "epoch": 1.6880232359473775, "grad_norm": 10.035882949829102, "learning_rate": 3.320470373330289e-05, "loss": 5.2823, "step": 14820 }, { "epoch": 1.689162252975682, "grad_norm": 6.265589237213135, "learning_rate": 3.319328690489782e-05, "loss": 5.2748, "step": 14830 }, { "epoch": 1.6903012700039866, "grad_norm": 8.532353401184082, "learning_rate": 3.3181870076492746e-05, "loss": 5.4935, "step": 14840 }, { "epoch": 1.691440287032291, "grad_norm": 7.839117050170898, "learning_rate": 3.3170453248087685e-05, "loss": 5.4935, "step": 14850 }, { "epoch": 1.6925793040605956, "grad_norm": 6.248348712921143, "learning_rate": 3.315903641968262e-05, "loss": 5.4433, "step": 14860 }, { "epoch": 1.6937183210889004, "grad_norm": 8.618048667907715, "learning_rate": 3.314761959127755e-05, "loss": 5.2831, "step": 14870 }, { "epoch": 1.6948573381172047, "grad_norm": 7.2228312492370605, "learning_rate": 3.3136202762872474e-05, "loss": 5.3584, "step": 14880 }, { "epoch": 1.6959963551455095, "grad_norm": 10.249879837036133, "learning_rate": 3.3124785934467406e-05, "loss": 5.4896, "step": 14890 }, { "epoch": 1.697135372173814, "grad_norm": 13.266846656799316, "learning_rate": 3.311336910606234e-05, "loss": 5.8706, "step": 14900 }, { "epoch": 1.6982743892021186, "grad_norm": 16.368671417236328, "learning_rate": 3.310195227765727e-05, "loss": 5.1789, "step": 14910 }, { "epoch": 1.6994134062304231, "grad_norm": 5.892331123352051, "learning_rate": 3.3090535449252194e-05, "loss": 5.8149, "step": 14920 }, { "epoch": 1.7005524232587277, "grad_norm": 7.839752674102783, "learning_rate": 3.307911862084713e-05, "loss": 5.1733, "step": 14930 }, { "epoch": 1.7016914402870325, "grad_norm": 6.328537940979004, "learning_rate": 3.3067701792442065e-05, "loss": 5.3515, "step": 14940 }, { "epoch": 1.7028304573153368, "grad_norm": 11.081585884094238, "learning_rate": 3.305628496403699e-05, "loss": 5.7846, "step": 14950 }, { "epoch": 1.7039694743436415, "grad_norm": 9.543967247009277, "learning_rate": 3.304486813563192e-05, "loss": 5.3206, "step": 14960 }, { "epoch": 1.7051084913719459, "grad_norm": 13.206875801086426, "learning_rate": 3.3033451307226853e-05, "loss": 5.2824, "step": 14970 }, { "epoch": 1.7062475084002506, "grad_norm": 9.329044342041016, "learning_rate": 3.3022034478821785e-05, "loss": 5.7017, "step": 14980 }, { "epoch": 1.7073865254285552, "grad_norm": 7.400033950805664, "learning_rate": 3.301061765041672e-05, "loss": 5.2255, "step": 14990 }, { "epoch": 1.7085255424568597, "grad_norm": 10.00680923461914, "learning_rate": 3.299920082201164e-05, "loss": 5.3857, "step": 15000 }, { "epoch": 1.7096645594851643, "grad_norm": 6.304871559143066, "learning_rate": 3.298778399360658e-05, "loss": 5.2932, "step": 15010 }, { "epoch": 1.7108035765134688, "grad_norm": 6.080683708190918, "learning_rate": 3.297636716520151e-05, "loss": 5.2007, "step": 15020 }, { "epoch": 1.7119425935417736, "grad_norm": 11.78959846496582, "learning_rate": 3.296495033679644e-05, "loss": 5.2121, "step": 15030 }, { "epoch": 1.713081610570078, "grad_norm": 12.254242897033691, "learning_rate": 3.295353350839137e-05, "loss": 5.2893, "step": 15040 }, { "epoch": 1.7142206275983827, "grad_norm": 11.91922378540039, "learning_rate": 3.29421166799863e-05, "loss": 5.2825, "step": 15050 }, { "epoch": 1.7153596446266872, "grad_norm": 6.525363922119141, "learning_rate": 3.293069985158123e-05, "loss": 5.2046, "step": 15060 }, { "epoch": 1.7164986616549918, "grad_norm": 9.117419242858887, "learning_rate": 3.2919283023176165e-05, "loss": 5.5099, "step": 15070 }, { "epoch": 1.7176376786832963, "grad_norm": 7.740299224853516, "learning_rate": 3.290786619477109e-05, "loss": 5.4564, "step": 15080 }, { "epoch": 1.7187766957116009, "grad_norm": 32.48822021484375, "learning_rate": 3.289644936636603e-05, "loss": 5.2745, "step": 15090 }, { "epoch": 1.7199157127399056, "grad_norm": 8.313048362731934, "learning_rate": 3.288503253796096e-05, "loss": 5.4062, "step": 15100 }, { "epoch": 1.72105472976821, "grad_norm": 12.474053382873535, "learning_rate": 3.2873615709555886e-05, "loss": 5.4008, "step": 15110 }, { "epoch": 1.7221937467965147, "grad_norm": 7.4052958488464355, "learning_rate": 3.286219888115082e-05, "loss": 5.1607, "step": 15120 }, { "epoch": 1.723332763824819, "grad_norm": 8.364946365356445, "learning_rate": 3.285078205274575e-05, "loss": 5.3591, "step": 15130 }, { "epoch": 1.7244717808531238, "grad_norm": 11.5457763671875, "learning_rate": 3.283936522434068e-05, "loss": 5.294, "step": 15140 }, { "epoch": 1.7256107978814283, "grad_norm": 5.80129337310791, "learning_rate": 3.2827948395935606e-05, "loss": 5.1861, "step": 15150 }, { "epoch": 1.726749814909733, "grad_norm": 12.946269989013672, "learning_rate": 3.281653156753054e-05, "loss": 5.5425, "step": 15160 }, { "epoch": 1.7278888319380374, "grad_norm": 11.868324279785156, "learning_rate": 3.280511473912547e-05, "loss": 5.1072, "step": 15170 }, { "epoch": 1.729027848966342, "grad_norm": 20.0992374420166, "learning_rate": 3.279369791072041e-05, "loss": 5.1057, "step": 15180 }, { "epoch": 1.7301668659946468, "grad_norm": 7.570152759552002, "learning_rate": 3.2782281082315334e-05, "loss": 5.4605, "step": 15190 }, { "epoch": 1.731305883022951, "grad_norm": 8.00123119354248, "learning_rate": 3.2770864253910265e-05, "loss": 5.9011, "step": 15200 }, { "epoch": 1.7324449000512558, "grad_norm": 6.339069366455078, "learning_rate": 3.27594474255052e-05, "loss": 5.5464, "step": 15210 }, { "epoch": 1.7335839170795604, "grad_norm": 7.332450866699219, "learning_rate": 3.274803059710013e-05, "loss": 5.2134, "step": 15220 }, { "epoch": 1.734722934107865, "grad_norm": 7.234862327575684, "learning_rate": 3.2736613768695054e-05, "loss": 5.4308, "step": 15230 }, { "epoch": 1.7358619511361695, "grad_norm": 8.013717651367188, "learning_rate": 3.2725196940289986e-05, "loss": 5.2408, "step": 15240 }, { "epoch": 1.737000968164474, "grad_norm": 5.108926296234131, "learning_rate": 3.271378011188492e-05, "loss": 5.0276, "step": 15250 }, { "epoch": 1.7381399851927788, "grad_norm": 9.906007766723633, "learning_rate": 3.2702363283479856e-05, "loss": 5.1256, "step": 15260 }, { "epoch": 1.7392790022210831, "grad_norm": 5.640520095825195, "learning_rate": 3.269094645507478e-05, "loss": 5.5672, "step": 15270 }, { "epoch": 1.7404180192493879, "grad_norm": 6.319045066833496, "learning_rate": 3.267952962666971e-05, "loss": 4.8686, "step": 15280 }, { "epoch": 1.7415570362776922, "grad_norm": 29.221023559570312, "learning_rate": 3.2668112798264645e-05, "loss": 5.4805, "step": 15290 }, { "epoch": 1.742696053305997, "grad_norm": 5.332036972045898, "learning_rate": 3.265669596985958e-05, "loss": 5.1916, "step": 15300 }, { "epoch": 1.7438350703343015, "grad_norm": 11.026018142700195, "learning_rate": 3.26452791414545e-05, "loss": 5.64, "step": 15310 }, { "epoch": 1.744974087362606, "grad_norm": 12.852426528930664, "learning_rate": 3.2633862313049434e-05, "loss": 5.72, "step": 15320 }, { "epoch": 1.7461131043909106, "grad_norm": 15.072467803955078, "learning_rate": 3.2622445484644366e-05, "loss": 5.2566, "step": 15330 }, { "epoch": 1.7472521214192152, "grad_norm": 12.933263778686523, "learning_rate": 3.26110286562393e-05, "loss": 5.4592, "step": 15340 }, { "epoch": 1.74839113844752, "grad_norm": 8.705323219299316, "learning_rate": 3.259961182783423e-05, "loss": 5.1086, "step": 15350 }, { "epoch": 1.7495301554758242, "grad_norm": 21.354663848876953, "learning_rate": 3.258819499942916e-05, "loss": 5.2691, "step": 15360 }, { "epoch": 1.750669172504129, "grad_norm": 18.818647384643555, "learning_rate": 3.257677817102409e-05, "loss": 5.1616, "step": 15370 }, { "epoch": 1.7518081895324336, "grad_norm": 8.70785140991211, "learning_rate": 3.2565361342619025e-05, "loss": 5.4155, "step": 15380 }, { "epoch": 1.752947206560738, "grad_norm": 10.73972225189209, "learning_rate": 3.255394451421395e-05, "loss": 5.5494, "step": 15390 }, { "epoch": 1.7540862235890426, "grad_norm": 14.080718994140625, "learning_rate": 3.254252768580888e-05, "loss": 5.2188, "step": 15400 }, { "epoch": 1.7552252406173472, "grad_norm": 15.136094093322754, "learning_rate": 3.2531110857403814e-05, "loss": 5.5364, "step": 15410 }, { "epoch": 1.756364257645652, "grad_norm": 8.25161075592041, "learning_rate": 3.2519694028998745e-05, "loss": 5.0991, "step": 15420 }, { "epoch": 1.7575032746739563, "grad_norm": 23.618043899536133, "learning_rate": 3.250827720059368e-05, "loss": 5.0023, "step": 15430 }, { "epoch": 1.758642291702261, "grad_norm": 12.272988319396973, "learning_rate": 3.249686037218861e-05, "loss": 5.2807, "step": 15440 }, { "epoch": 1.7597813087305654, "grad_norm": 10.583504676818848, "learning_rate": 3.248544354378354e-05, "loss": 5.1016, "step": 15450 }, { "epoch": 1.7609203257588701, "grad_norm": 10.107946395874023, "learning_rate": 3.2474026715378466e-05, "loss": 5.402, "step": 15460 }, { "epoch": 1.7620593427871747, "grad_norm": 7.89888334274292, "learning_rate": 3.24626098869734e-05, "loss": 5.1111, "step": 15470 }, { "epoch": 1.7631983598154792, "grad_norm": 8.583559036254883, "learning_rate": 3.245119305856833e-05, "loss": 4.8444, "step": 15480 }, { "epoch": 1.7643373768437838, "grad_norm": 6.2021918296813965, "learning_rate": 3.243977623016326e-05, "loss": 5.0835, "step": 15490 }, { "epoch": 1.7654763938720883, "grad_norm": 7.720624923706055, "learning_rate": 3.2428359401758193e-05, "loss": 5.19, "step": 15500 }, { "epoch": 1.766615410900393, "grad_norm": 10.650630950927734, "learning_rate": 3.2416942573353125e-05, "loss": 5.3394, "step": 15510 }, { "epoch": 1.7677544279286974, "grad_norm": 8.247625350952148, "learning_rate": 3.240552574494806e-05, "loss": 4.9535, "step": 15520 }, { "epoch": 1.7688934449570022, "grad_norm": 35.305152893066406, "learning_rate": 3.239410891654299e-05, "loss": 5.6442, "step": 15530 }, { "epoch": 1.7700324619853067, "grad_norm": 9.185782432556152, "learning_rate": 3.2382692088137914e-05, "loss": 5.2315, "step": 15540 }, { "epoch": 1.7711714790136113, "grad_norm": 8.288222312927246, "learning_rate": 3.2371275259732846e-05, "loss": 5.0178, "step": 15550 }, { "epoch": 1.7723104960419158, "grad_norm": 13.48383903503418, "learning_rate": 3.235985843132778e-05, "loss": 5.2955, "step": 15560 }, { "epoch": 1.7734495130702204, "grad_norm": 7.542227745056152, "learning_rate": 3.234844160292271e-05, "loss": 5.2314, "step": 15570 }, { "epoch": 1.7745885300985251, "grad_norm": 8.83359146118164, "learning_rate": 3.233702477451764e-05, "loss": 5.1114, "step": 15580 }, { "epoch": 1.7757275471268295, "grad_norm": 8.529522895812988, "learning_rate": 3.232560794611257e-05, "loss": 5.0844, "step": 15590 }, { "epoch": 1.7768665641551342, "grad_norm": 8.735173225402832, "learning_rate": 3.2314191117707505e-05, "loss": 4.9814, "step": 15600 }, { "epoch": 1.7780055811834385, "grad_norm": 10.227621078491211, "learning_rate": 3.230277428930244e-05, "loss": 5.6574, "step": 15610 }, { "epoch": 1.7791445982117433, "grad_norm": 6.311840534210205, "learning_rate": 3.229135746089736e-05, "loss": 5.2249, "step": 15620 }, { "epoch": 1.7802836152400479, "grad_norm": 10.498848915100098, "learning_rate": 3.2279940632492294e-05, "loss": 5.1075, "step": 15630 }, { "epoch": 1.7814226322683524, "grad_norm": 12.254350662231445, "learning_rate": 3.2268523804087226e-05, "loss": 5.3643, "step": 15640 }, { "epoch": 1.782561649296657, "grad_norm": 11.579363822937012, "learning_rate": 3.225710697568216e-05, "loss": 5.1973, "step": 15650 }, { "epoch": 1.7837006663249615, "grad_norm": 21.702810287475586, "learning_rate": 3.224569014727708e-05, "loss": 5.1932, "step": 15660 }, { "epoch": 1.7848396833532663, "grad_norm": 6.838681697845459, "learning_rate": 3.223427331887202e-05, "loss": 5.1427, "step": 15670 }, { "epoch": 1.7859787003815706, "grad_norm": 6.818762302398682, "learning_rate": 3.222285649046695e-05, "loss": 5.1011, "step": 15680 }, { "epoch": 1.7871177174098754, "grad_norm": 24.59422492980957, "learning_rate": 3.2211439662061885e-05, "loss": 5.0368, "step": 15690 }, { "epoch": 1.78825673443818, "grad_norm": 7.4565510749816895, "learning_rate": 3.220002283365681e-05, "loss": 5.2689, "step": 15700 }, { "epoch": 1.7893957514664844, "grad_norm": 6.894626140594482, "learning_rate": 3.218860600525174e-05, "loss": 5.3947, "step": 15710 }, { "epoch": 1.790534768494789, "grad_norm": 7.664463520050049, "learning_rate": 3.2177189176846674e-05, "loss": 5.6121, "step": 15720 }, { "epoch": 1.7916737855230935, "grad_norm": 8.247864723205566, "learning_rate": 3.2165772348441605e-05, "loss": 5.5169, "step": 15730 }, { "epoch": 1.7928128025513983, "grad_norm": 7.653250694274902, "learning_rate": 3.215435552003653e-05, "loss": 5.2645, "step": 15740 }, { "epoch": 1.7939518195797026, "grad_norm": 6.541507720947266, "learning_rate": 3.214293869163147e-05, "loss": 5.7815, "step": 15750 }, { "epoch": 1.7950908366080074, "grad_norm": 5.535106658935547, "learning_rate": 3.21315218632264e-05, "loss": 5.5604, "step": 15760 }, { "epoch": 1.7962298536363117, "grad_norm": 5.451852321624756, "learning_rate": 3.212010503482133e-05, "loss": 5.2591, "step": 15770 }, { "epoch": 1.7973688706646165, "grad_norm": 5.946657180786133, "learning_rate": 3.210868820641626e-05, "loss": 5.259, "step": 15780 }, { "epoch": 1.798507887692921, "grad_norm": 7.167417526245117, "learning_rate": 3.209727137801119e-05, "loss": 5.2543, "step": 15790 }, { "epoch": 1.7996469047212256, "grad_norm": 9.464091300964355, "learning_rate": 3.208585454960612e-05, "loss": 5.4242, "step": 15800 }, { "epoch": 1.8007859217495301, "grad_norm": 7.651808261871338, "learning_rate": 3.207443772120105e-05, "loss": 5.557, "step": 15810 }, { "epoch": 1.8019249387778347, "grad_norm": 7.241791725158691, "learning_rate": 3.206302089279598e-05, "loss": 5.3039, "step": 15820 }, { "epoch": 1.8030639558061394, "grad_norm": 35.77559280395508, "learning_rate": 3.205160406439091e-05, "loss": 5.1588, "step": 15830 }, { "epoch": 1.8042029728344438, "grad_norm": 6.354983329772949, "learning_rate": 3.204018723598585e-05, "loss": 5.0738, "step": 15840 }, { "epoch": 1.8053419898627485, "grad_norm": 11.231024742126465, "learning_rate": 3.2028770407580774e-05, "loss": 5.1951, "step": 15850 }, { "epoch": 1.806481006891053, "grad_norm": 4.689850807189941, "learning_rate": 3.2017353579175706e-05, "loss": 5.5869, "step": 15860 }, { "epoch": 1.8076200239193576, "grad_norm": 10.245040893554688, "learning_rate": 3.200593675077064e-05, "loss": 5.2374, "step": 15870 }, { "epoch": 1.8087590409476622, "grad_norm": 14.079906463623047, "learning_rate": 3.199451992236557e-05, "loss": 5.1183, "step": 15880 }, { "epoch": 1.8098980579759667, "grad_norm": 9.724322319030762, "learning_rate": 3.19831030939605e-05, "loss": 5.1199, "step": 15890 }, { "epoch": 1.8110370750042715, "grad_norm": 7.243402004241943, "learning_rate": 3.1971686265555426e-05, "loss": 5.3126, "step": 15900 }, { "epoch": 1.8121760920325758, "grad_norm": 10.498720169067383, "learning_rate": 3.196026943715036e-05, "loss": 5.0482, "step": 15910 }, { "epoch": 1.8133151090608806, "grad_norm": 8.319934844970703, "learning_rate": 3.19488526087453e-05, "loss": 5.0945, "step": 15920 }, { "epoch": 1.8144541260891849, "grad_norm": 8.889106750488281, "learning_rate": 3.193743578034022e-05, "loss": 5.1267, "step": 15930 }, { "epoch": 1.8155931431174896, "grad_norm": 10.439918518066406, "learning_rate": 3.1926018951935154e-05, "loss": 4.9774, "step": 15940 }, { "epoch": 1.8167321601457942, "grad_norm": 9.230948448181152, "learning_rate": 3.1914602123530085e-05, "loss": 5.2242, "step": 15950 }, { "epoch": 1.8178711771740987, "grad_norm": 5.9906535148620605, "learning_rate": 3.190318529512502e-05, "loss": 5.7379, "step": 15960 }, { "epoch": 1.8190101942024033, "grad_norm": 12.100125312805176, "learning_rate": 3.189176846671994e-05, "loss": 5.1342, "step": 15970 }, { "epoch": 1.8201492112307078, "grad_norm": 8.128509521484375, "learning_rate": 3.1880351638314874e-05, "loss": 5.0393, "step": 15980 }, { "epoch": 1.8212882282590126, "grad_norm": 11.983037948608398, "learning_rate": 3.1868934809909806e-05, "loss": 5.4135, "step": 15990 }, { "epoch": 1.822427245287317, "grad_norm": 12.307679176330566, "learning_rate": 3.1857517981504745e-05, "loss": 5.2544, "step": 16000 }, { "epoch": 1.822427245287317, "eval_loss": 5.832084655761719, "eval_runtime": 11.4932, "eval_samples_per_second": 1.305, "eval_steps_per_second": 0.174, "step": 16000 }, { "epoch": 1.8235662623156217, "grad_norm": 9.702421188354492, "learning_rate": 3.184610115309967e-05, "loss": 5.2784, "step": 16010 }, { "epoch": 1.8247052793439262, "grad_norm": 8.732382774353027, "learning_rate": 3.18346843246946e-05, "loss": 5.1524, "step": 16020 }, { "epoch": 1.8258442963722308, "grad_norm": 10.099857330322266, "learning_rate": 3.182326749628953e-05, "loss": 5.0381, "step": 16030 }, { "epoch": 1.8269833134005353, "grad_norm": 7.625748157501221, "learning_rate": 3.1811850667884465e-05, "loss": 5.3359, "step": 16040 }, { "epoch": 1.8281223304288399, "grad_norm": 12.038113594055176, "learning_rate": 3.180043383947939e-05, "loss": 5.247, "step": 16050 }, { "epoch": 1.8292613474571446, "grad_norm": 6.477821350097656, "learning_rate": 3.178901701107432e-05, "loss": 5.4063, "step": 16060 }, { "epoch": 1.830400364485449, "grad_norm": 10.657258033752441, "learning_rate": 3.1777600182669254e-05, "loss": 5.4068, "step": 16070 }, { "epoch": 1.8315393815137537, "grad_norm": 10.139802932739258, "learning_rate": 3.176618335426419e-05, "loss": 5.3456, "step": 16080 }, { "epoch": 1.832678398542058, "grad_norm": 8.613080978393555, "learning_rate": 3.175476652585912e-05, "loss": 5.5659, "step": 16090 }, { "epoch": 1.8338174155703628, "grad_norm": 7.020716190338135, "learning_rate": 3.174334969745405e-05, "loss": 5.31, "step": 16100 }, { "epoch": 1.8349564325986674, "grad_norm": 11.441906929016113, "learning_rate": 3.173193286904898e-05, "loss": 5.436, "step": 16110 }, { "epoch": 1.836095449626972, "grad_norm": 7.583320140838623, "learning_rate": 3.172051604064391e-05, "loss": 5.405, "step": 16120 }, { "epoch": 1.8372344666552765, "grad_norm": 6.088997840881348, "learning_rate": 3.170909921223884e-05, "loss": 5.1049, "step": 16130 }, { "epoch": 1.838373483683581, "grad_norm": 7.740018367767334, "learning_rate": 3.169768238383377e-05, "loss": 5.158, "step": 16140 }, { "epoch": 1.8395125007118858, "grad_norm": 7.6566972732543945, "learning_rate": 3.16862655554287e-05, "loss": 5.357, "step": 16150 }, { "epoch": 1.84065151774019, "grad_norm": 6.360335826873779, "learning_rate": 3.1674848727023634e-05, "loss": 5.1247, "step": 16160 }, { "epoch": 1.8417905347684949, "grad_norm": 9.910289764404297, "learning_rate": 3.1663431898618566e-05, "loss": 5.3082, "step": 16170 }, { "epoch": 1.8429295517967994, "grad_norm": 8.577327728271484, "learning_rate": 3.16520150702135e-05, "loss": 5.2903, "step": 16180 }, { "epoch": 1.844068568825104, "grad_norm": 13.031941413879395, "learning_rate": 3.164059824180843e-05, "loss": 5.0962, "step": 16190 }, { "epoch": 1.8452075858534085, "grad_norm": 10.391229629516602, "learning_rate": 3.162918141340336e-05, "loss": 5.0891, "step": 16200 }, { "epoch": 1.846346602881713, "grad_norm": 7.560512065887451, "learning_rate": 3.1617764584998286e-05, "loss": 5.1244, "step": 16210 }, { "epoch": 1.8474856199100178, "grad_norm": 11.551033020019531, "learning_rate": 3.160634775659322e-05, "loss": 5.262, "step": 16220 }, { "epoch": 1.8486246369383221, "grad_norm": 7.193599224090576, "learning_rate": 3.159493092818815e-05, "loss": 5.0939, "step": 16230 }, { "epoch": 1.849763653966627, "grad_norm": 18.756372451782227, "learning_rate": 3.158351409978308e-05, "loss": 5.2957, "step": 16240 }, { "epoch": 1.8509026709949312, "grad_norm": 13.374124526977539, "learning_rate": 3.1572097271378013e-05, "loss": 5.2796, "step": 16250 }, { "epoch": 1.852041688023236, "grad_norm": 10.634527206420898, "learning_rate": 3.1560680442972945e-05, "loss": 5.3388, "step": 16260 }, { "epoch": 1.8531807050515405, "grad_norm": 7.483227729797363, "learning_rate": 3.154926361456788e-05, "loss": 5.4911, "step": 16270 }, { "epoch": 1.854319722079845, "grad_norm": 12.502581596374512, "learning_rate": 3.153784678616281e-05, "loss": 5.0431, "step": 16280 }, { "epoch": 1.8554587391081496, "grad_norm": 12.948872566223145, "learning_rate": 3.1526429957757734e-05, "loss": 5.5631, "step": 16290 }, { "epoch": 1.8565977561364542, "grad_norm": 7.303791046142578, "learning_rate": 3.1515013129352666e-05, "loss": 5.5589, "step": 16300 }, { "epoch": 1.857736773164759, "grad_norm": 15.113411903381348, "learning_rate": 3.15035963009476e-05, "loss": 5.2462, "step": 16310 }, { "epoch": 1.8588757901930633, "grad_norm": 11.068880081176758, "learning_rate": 3.149217947254253e-05, "loss": 5.4357, "step": 16320 }, { "epoch": 1.860014807221368, "grad_norm": 6.116614818572998, "learning_rate": 3.148076264413746e-05, "loss": 5.4016, "step": 16330 }, { "epoch": 1.8611538242496726, "grad_norm": 7.408304214477539, "learning_rate": 3.146934581573239e-05, "loss": 5.3268, "step": 16340 }, { "epoch": 1.8622928412779771, "grad_norm": 7.863326072692871, "learning_rate": 3.1457928987327325e-05, "loss": 5.178, "step": 16350 }, { "epoch": 1.8634318583062817, "grad_norm": 10.515961647033691, "learning_rate": 3.144651215892225e-05, "loss": 5.3988, "step": 16360 }, { "epoch": 1.8645708753345862, "grad_norm": 6.62656831741333, "learning_rate": 3.143509533051718e-05, "loss": 5.3556, "step": 16370 }, { "epoch": 1.865709892362891, "grad_norm": 10.1450834274292, "learning_rate": 3.1423678502112114e-05, "loss": 5.249, "step": 16380 }, { "epoch": 1.8668489093911953, "grad_norm": 7.126070022583008, "learning_rate": 3.1412261673707046e-05, "loss": 5.3152, "step": 16390 }, { "epoch": 1.8679879264195, "grad_norm": 10.022063255310059, "learning_rate": 3.140084484530198e-05, "loss": 5.1321, "step": 16400 }, { "epoch": 1.8691269434478044, "grad_norm": 7.533326625823975, "learning_rate": 3.138942801689691e-05, "loss": 5.2327, "step": 16410 }, { "epoch": 1.8702659604761092, "grad_norm": 8.48379898071289, "learning_rate": 3.137801118849184e-05, "loss": 4.9058, "step": 16420 }, { "epoch": 1.8714049775044137, "grad_norm": 9.660683631896973, "learning_rate": 3.136659436008677e-05, "loss": 5.2705, "step": 16430 }, { "epoch": 1.8725439945327182, "grad_norm": 5.680925369262695, "learning_rate": 3.13551775316817e-05, "loss": 5.5768, "step": 16440 }, { "epoch": 1.8736830115610228, "grad_norm": 6.803483963012695, "learning_rate": 3.134376070327663e-05, "loss": 5.3157, "step": 16450 }, { "epoch": 1.8748220285893273, "grad_norm": 11.37948989868164, "learning_rate": 3.133234387487156e-05, "loss": 5.2156, "step": 16460 }, { "epoch": 1.875961045617632, "grad_norm": 9.787893295288086, "learning_rate": 3.1320927046466494e-05, "loss": 5.084, "step": 16470 }, { "epoch": 1.8771000626459364, "grad_norm": 22.197452545166016, "learning_rate": 3.130951021806142e-05, "loss": 5.2033, "step": 16480 }, { "epoch": 1.8782390796742412, "grad_norm": 7.01490592956543, "learning_rate": 3.129809338965636e-05, "loss": 5.0844, "step": 16490 }, { "epoch": 1.8793780967025457, "grad_norm": 6.314261436462402, "learning_rate": 3.128667656125129e-05, "loss": 5.2189, "step": 16500 }, { "epoch": 1.8805171137308503, "grad_norm": 6.676644802093506, "learning_rate": 3.127525973284622e-05, "loss": 5.1678, "step": 16510 }, { "epoch": 1.8816561307591548, "grad_norm": 7.425068378448486, "learning_rate": 3.1263842904441146e-05, "loss": 5.2719, "step": 16520 }, { "epoch": 1.8827951477874594, "grad_norm": 10.819995880126953, "learning_rate": 3.125242607603608e-05, "loss": 5.1188, "step": 16530 }, { "epoch": 1.8839341648157641, "grad_norm": 8.873900413513184, "learning_rate": 3.124100924763101e-05, "loss": 5.2426, "step": 16540 }, { "epoch": 1.8850731818440685, "grad_norm": 16.555330276489258, "learning_rate": 3.122959241922594e-05, "loss": 5.3134, "step": 16550 }, { "epoch": 1.8862121988723732, "grad_norm": 8.569807052612305, "learning_rate": 3.1218175590820867e-05, "loss": 5.363, "step": 16560 }, { "epoch": 1.8873512159006776, "grad_norm": 9.172271728515625, "learning_rate": 3.12067587624158e-05, "loss": 4.8459, "step": 16570 }, { "epoch": 1.8884902329289823, "grad_norm": 14.09700870513916, "learning_rate": 3.119534193401074e-05, "loss": 5.2349, "step": 16580 }, { "epoch": 1.8896292499572869, "grad_norm": 12.434288024902344, "learning_rate": 3.118392510560567e-05, "loss": 5.2453, "step": 16590 }, { "epoch": 1.8907682669855914, "grad_norm": 6.912929058074951, "learning_rate": 3.1172508277200594e-05, "loss": 5.0789, "step": 16600 }, { "epoch": 1.891907284013896, "grad_norm": 7.448489665985107, "learning_rate": 3.1161091448795526e-05, "loss": 5.465, "step": 16610 }, { "epoch": 1.8930463010422005, "grad_norm": 6.203219890594482, "learning_rate": 3.114967462039046e-05, "loss": 5.2743, "step": 16620 }, { "epoch": 1.8941853180705053, "grad_norm": 32.255104064941406, "learning_rate": 3.113825779198539e-05, "loss": 5.4738, "step": 16630 }, { "epoch": 1.8953243350988096, "grad_norm": 11.738665580749512, "learning_rate": 3.1126840963580314e-05, "loss": 5.0485, "step": 16640 }, { "epoch": 1.8964633521271144, "grad_norm": 12.012779235839844, "learning_rate": 3.1115424135175246e-05, "loss": 5.4544, "step": 16650 }, { "epoch": 1.897602369155419, "grad_norm": 9.032222747802734, "learning_rate": 3.1104007306770185e-05, "loss": 5.3856, "step": 16660 }, { "epoch": 1.8987413861837235, "grad_norm": 10.080476760864258, "learning_rate": 3.109259047836512e-05, "loss": 5.7127, "step": 16670 }, { "epoch": 1.899880403212028, "grad_norm": 8.859971046447754, "learning_rate": 3.108117364996004e-05, "loss": 5.102, "step": 16680 }, { "epoch": 1.9010194202403325, "grad_norm": 6.79989767074585, "learning_rate": 3.1069756821554974e-05, "loss": 5.2618, "step": 16690 }, { "epoch": 1.9021584372686373, "grad_norm": 14.299168586730957, "learning_rate": 3.1058339993149905e-05, "loss": 5.1497, "step": 16700 }, { "epoch": 1.9032974542969416, "grad_norm": 7.730276107788086, "learning_rate": 3.104692316474484e-05, "loss": 5.4198, "step": 16710 }, { "epoch": 1.9044364713252464, "grad_norm": 6.5476226806640625, "learning_rate": 3.103550633633976e-05, "loss": 5.2444, "step": 16720 }, { "epoch": 1.9055754883535507, "grad_norm": 7.926487445831299, "learning_rate": 3.1024089507934694e-05, "loss": 5.1846, "step": 16730 }, { "epoch": 1.9067145053818555, "grad_norm": 9.054329872131348, "learning_rate": 3.101267267952963e-05, "loss": 5.5013, "step": 16740 }, { "epoch": 1.90785352241016, "grad_norm": 8.103936195373535, "learning_rate": 3.100125585112456e-05, "loss": 5.6069, "step": 16750 }, { "epoch": 1.9089925394384646, "grad_norm": 11.002752304077148, "learning_rate": 3.098983902271949e-05, "loss": 5.0676, "step": 16760 }, { "epoch": 1.9101315564667691, "grad_norm": 7.584782600402832, "learning_rate": 3.097842219431442e-05, "loss": 5.6041, "step": 16770 }, { "epoch": 1.9112705734950737, "grad_norm": 6.534191608428955, "learning_rate": 3.0967005365909353e-05, "loss": 5.2058, "step": 16780 }, { "epoch": 1.9124095905233784, "grad_norm": 7.534450531005859, "learning_rate": 3.0955588537504285e-05, "loss": 5.1247, "step": 16790 }, { "epoch": 1.9135486075516828, "grad_norm": 11.882919311523438, "learning_rate": 3.094417170909921e-05, "loss": 5.3567, "step": 16800 }, { "epoch": 1.9146876245799875, "grad_norm": 7.071094036102295, "learning_rate": 3.093275488069414e-05, "loss": 5.8045, "step": 16810 }, { "epoch": 1.915826641608292, "grad_norm": 39.18484878540039, "learning_rate": 3.092133805228908e-05, "loss": 5.2192, "step": 16820 }, { "epoch": 1.9169656586365966, "grad_norm": 8.408158302307129, "learning_rate": 3.0909921223884006e-05, "loss": 5.7599, "step": 16830 }, { "epoch": 1.9181046756649012, "grad_norm": 30.49919891357422, "learning_rate": 3.089850439547894e-05, "loss": 5.4905, "step": 16840 }, { "epoch": 1.9192436926932057, "grad_norm": 9.431031227111816, "learning_rate": 3.088708756707387e-05, "loss": 5.402, "step": 16850 }, { "epoch": 1.9203827097215105, "grad_norm": 16.699037551879883, "learning_rate": 3.08756707386688e-05, "loss": 5.1874, "step": 16860 }, { "epoch": 1.9215217267498148, "grad_norm": 7.762238025665283, "learning_rate": 3.0864253910263726e-05, "loss": 5.2737, "step": 16870 }, { "epoch": 1.9226607437781196, "grad_norm": 12.07967758178711, "learning_rate": 3.085283708185866e-05, "loss": 5.4094, "step": 16880 }, { "epoch": 1.923799760806424, "grad_norm": 9.500292778015137, "learning_rate": 3.084142025345359e-05, "loss": 5.3492, "step": 16890 }, { "epoch": 1.9249387778347287, "grad_norm": 18.876487731933594, "learning_rate": 3.083000342504852e-05, "loss": 5.1222, "step": 16900 }, { "epoch": 1.9260777948630332, "grad_norm": 11.098732948303223, "learning_rate": 3.0818586596643454e-05, "loss": 4.4881, "step": 16910 }, { "epoch": 1.9272168118913378, "grad_norm": 10.376971244812012, "learning_rate": 3.0807169768238386e-05, "loss": 4.6965, "step": 16920 }, { "epoch": 1.9283558289196423, "grad_norm": 12.247511863708496, "learning_rate": 3.079575293983332e-05, "loss": 5.2123, "step": 16930 }, { "epoch": 1.9294948459479468, "grad_norm": 8.139228820800781, "learning_rate": 3.078433611142825e-05, "loss": 5.3492, "step": 16940 }, { "epoch": 1.9306338629762516, "grad_norm": 9.468058586120605, "learning_rate": 3.0772919283023174e-05, "loss": 5.3156, "step": 16950 }, { "epoch": 1.931772880004556, "grad_norm": 6.062070369720459, "learning_rate": 3.0761502454618106e-05, "loss": 5.3921, "step": 16960 }, { "epoch": 1.9329118970328607, "grad_norm": 6.7293314933776855, "learning_rate": 3.075008562621304e-05, "loss": 5.1339, "step": 16970 }, { "epoch": 1.9340509140611652, "grad_norm": 6.088140487670898, "learning_rate": 3.073866879780797e-05, "loss": 4.9764, "step": 16980 }, { "epoch": 1.9351899310894698, "grad_norm": 6.771167278289795, "learning_rate": 3.07272519694029e-05, "loss": 5.2947, "step": 16990 }, { "epoch": 1.9363289481177743, "grad_norm": 17.922042846679688, "learning_rate": 3.0715835140997834e-05, "loss": 4.9927, "step": 17000 }, { "epoch": 1.9374679651460789, "grad_norm": 7.334212303161621, "learning_rate": 3.0704418312592765e-05, "loss": 5.1482, "step": 17010 }, { "epoch": 1.9386069821743837, "grad_norm": 6.488142490386963, "learning_rate": 3.06930014841877e-05, "loss": 5.1893, "step": 17020 }, { "epoch": 1.939745999202688, "grad_norm": 9.199004173278809, "learning_rate": 3.068158465578262e-05, "loss": 5.6558, "step": 17030 }, { "epoch": 1.9408850162309927, "grad_norm": 7.08030366897583, "learning_rate": 3.0670167827377554e-05, "loss": 5.2228, "step": 17040 }, { "epoch": 1.942024033259297, "grad_norm": 16.6004695892334, "learning_rate": 3.0658750998972486e-05, "loss": 5.1263, "step": 17050 }, { "epoch": 1.9431630502876018, "grad_norm": 5.704008102416992, "learning_rate": 3.064733417056742e-05, "loss": 5.6937, "step": 17060 }, { "epoch": 1.9443020673159064, "grad_norm": 8.700068473815918, "learning_rate": 3.063591734216235e-05, "loss": 5.8973, "step": 17070 }, { "epoch": 1.945441084344211, "grad_norm": 20.755605697631836, "learning_rate": 3.062450051375728e-05, "loss": 5.4337, "step": 17080 }, { "epoch": 1.9465801013725155, "grad_norm": 29.836252212524414, "learning_rate": 3.061308368535221e-05, "loss": 5.333, "step": 17090 }, { "epoch": 1.94771911840082, "grad_norm": 8.315689086914062, "learning_rate": 3.0601666856947145e-05, "loss": 5.1461, "step": 17100 }, { "epoch": 1.9488581354291248, "grad_norm": 22.537151336669922, "learning_rate": 3.059025002854207e-05, "loss": 4.9269, "step": 17110 }, { "epoch": 1.949997152457429, "grad_norm": 8.498539924621582, "learning_rate": 3.0578833200137e-05, "loss": 4.9942, "step": 17120 }, { "epoch": 1.9511361694857339, "grad_norm": 7.357529163360596, "learning_rate": 3.0567416371731934e-05, "loss": 5.1522, "step": 17130 }, { "epoch": 1.9522751865140384, "grad_norm": 7.00098180770874, "learning_rate": 3.0555999543326866e-05, "loss": 5.3363, "step": 17140 }, { "epoch": 1.953414203542343, "grad_norm": 7.778203010559082, "learning_rate": 3.05445827149218e-05, "loss": 5.0716, "step": 17150 }, { "epoch": 1.9545532205706475, "grad_norm": 13.908317565917969, "learning_rate": 3.053316588651673e-05, "loss": 5.5496, "step": 17160 }, { "epoch": 1.955692237598952, "grad_norm": 12.319000244140625, "learning_rate": 3.052174905811166e-05, "loss": 5.1008, "step": 17170 }, { "epoch": 1.9568312546272568, "grad_norm": 6.238765716552734, "learning_rate": 3.051033222970659e-05, "loss": 5.3298, "step": 17180 }, { "epoch": 1.9579702716555611, "grad_norm": 34.22773742675781, "learning_rate": 3.049891540130152e-05, "loss": 5.2814, "step": 17190 }, { "epoch": 1.959109288683866, "grad_norm": 8.195430755615234, "learning_rate": 3.048749857289645e-05, "loss": 5.232, "step": 17200 }, { "epoch": 1.9602483057121702, "grad_norm": 7.08268928527832, "learning_rate": 3.0476081744491382e-05, "loss": 4.9855, "step": 17210 }, { "epoch": 1.961387322740475, "grad_norm": 11.708693504333496, "learning_rate": 3.046466491608631e-05, "loss": 5.2377, "step": 17220 }, { "epoch": 1.9625263397687795, "grad_norm": 12.489038467407227, "learning_rate": 3.0453248087681242e-05, "loss": 5.1972, "step": 17230 }, { "epoch": 1.963665356797084, "grad_norm": 8.34151554107666, "learning_rate": 3.0441831259276177e-05, "loss": 5.1917, "step": 17240 }, { "epoch": 1.9648043738253886, "grad_norm": 6.540013790130615, "learning_rate": 3.0430414430871106e-05, "loss": 5.2306, "step": 17250 }, { "epoch": 1.9659433908536932, "grad_norm": 10.032349586486816, "learning_rate": 3.0418997602466038e-05, "loss": 5.107, "step": 17260 }, { "epoch": 1.967082407881998, "grad_norm": 6.8237128257751465, "learning_rate": 3.0407580774060966e-05, "loss": 5.3935, "step": 17270 }, { "epoch": 1.9682214249103023, "grad_norm": 16.39888572692871, "learning_rate": 3.0396163945655898e-05, "loss": 5.3169, "step": 17280 }, { "epoch": 1.969360441938607, "grad_norm": 7.8522748947143555, "learning_rate": 3.038474711725083e-05, "loss": 5.4692, "step": 17290 }, { "epoch": 1.9704994589669116, "grad_norm": 6.113000869750977, "learning_rate": 3.0373330288845758e-05, "loss": 5.3042, "step": 17300 }, { "epoch": 1.9716384759952161, "grad_norm": 8.276409149169922, "learning_rate": 3.036191346044069e-05, "loss": 5.3894, "step": 17310 }, { "epoch": 1.9727774930235207, "grad_norm": 8.825841903686523, "learning_rate": 3.0350496632035625e-05, "loss": 5.4161, "step": 17320 }, { "epoch": 1.9739165100518252, "grad_norm": 12.91003704071045, "learning_rate": 3.0339079803630554e-05, "loss": 4.9437, "step": 17330 }, { "epoch": 1.97505552708013, "grad_norm": 7.1281418800354, "learning_rate": 3.0327662975225485e-05, "loss": 5.4292, "step": 17340 }, { "epoch": 1.9761945441084343, "grad_norm": 8.980794906616211, "learning_rate": 3.0316246146820414e-05, "loss": 5.6428, "step": 17350 }, { "epoch": 1.977333561136739, "grad_norm": 6.971176624298096, "learning_rate": 3.0304829318415346e-05, "loss": 5.0774, "step": 17360 }, { "epoch": 1.9784725781650434, "grad_norm": 10.74820613861084, "learning_rate": 3.0293412490010274e-05, "loss": 5.1101, "step": 17370 }, { "epoch": 1.9796115951933482, "grad_norm": 4.974326133728027, "learning_rate": 3.0281995661605206e-05, "loss": 5.4859, "step": 17380 }, { "epoch": 1.9807506122216527, "grad_norm": 10.470224380493164, "learning_rate": 3.0270578833200135e-05, "loss": 5.2855, "step": 17390 }, { "epoch": 1.9818896292499573, "grad_norm": 12.764081954956055, "learning_rate": 3.0259162004795073e-05, "loss": 5.5172, "step": 17400 }, { "epoch": 1.9830286462782618, "grad_norm": 6.461774826049805, "learning_rate": 3.024774517639e-05, "loss": 5.252, "step": 17410 }, { "epoch": 1.9841676633065664, "grad_norm": 9.746689796447754, "learning_rate": 3.0236328347984933e-05, "loss": 5.2766, "step": 17420 }, { "epoch": 1.9853066803348711, "grad_norm": 9.114880561828613, "learning_rate": 3.0224911519579862e-05, "loss": 5.182, "step": 17430 }, { "epoch": 1.9864456973631754, "grad_norm": 6.150300979614258, "learning_rate": 3.0213494691174794e-05, "loss": 5.1747, "step": 17440 }, { "epoch": 1.9875847143914802, "grad_norm": 8.727187156677246, "learning_rate": 3.0202077862769722e-05, "loss": 5.3149, "step": 17450 }, { "epoch": 1.9887237314197848, "grad_norm": 8.779123306274414, "learning_rate": 3.0190661034364654e-05, "loss": 5.4248, "step": 17460 }, { "epoch": 1.9898627484480893, "grad_norm": 20.234495162963867, "learning_rate": 3.0179244205959582e-05, "loss": 5.2006, "step": 17470 }, { "epoch": 1.9910017654763938, "grad_norm": 6.3343424797058105, "learning_rate": 3.016782737755452e-05, "loss": 5.7433, "step": 17480 }, { "epoch": 1.9921407825046984, "grad_norm": 6.843255519866943, "learning_rate": 3.015641054914945e-05, "loss": 6.1519, "step": 17490 }, { "epoch": 1.9932797995330032, "grad_norm": 6.034940242767334, "learning_rate": 3.014499372074438e-05, "loss": 5.5021, "step": 17500 }, { "epoch": 1.9944188165613075, "grad_norm": 13.856889724731445, "learning_rate": 3.013357689233931e-05, "loss": 5.3194, "step": 17510 }, { "epoch": 1.9955578335896123, "grad_norm": 8.151268005371094, "learning_rate": 3.012216006393424e-05, "loss": 5.645, "step": 17520 }, { "epoch": 1.9966968506179166, "grad_norm": 7.212371826171875, "learning_rate": 3.011074323552917e-05, "loss": 5.2082, "step": 17530 }, { "epoch": 1.9978358676462213, "grad_norm": 5.859493732452393, "learning_rate": 3.0099326407124102e-05, "loss": 5.1702, "step": 17540 }, { "epoch": 1.9989748846745259, "grad_norm": 12.061100006103516, "learning_rate": 3.008790957871903e-05, "loss": 5.4642, "step": 17550 }, { "epoch": 2.0001139017028304, "grad_norm": 10.034649848937988, "learning_rate": 3.0076492750313962e-05, "loss": 5.104, "step": 17560 }, { "epoch": 2.001252918731135, "grad_norm": 9.353052139282227, "learning_rate": 3.0065075921908897e-05, "loss": 4.5714, "step": 17570 }, { "epoch": 2.0023919357594395, "grad_norm": 10.620824813842773, "learning_rate": 3.005365909350383e-05, "loss": 4.7921, "step": 17580 }, { "epoch": 2.0035309527877443, "grad_norm": 10.396838188171387, "learning_rate": 3.0042242265098758e-05, "loss": 4.3342, "step": 17590 }, { "epoch": 2.0046699698160486, "grad_norm": 27.1042423248291, "learning_rate": 3.003082543669369e-05, "loss": 4.4587, "step": 17600 }, { "epoch": 2.0058089868443534, "grad_norm": 10.320976257324219, "learning_rate": 3.0019408608288618e-05, "loss": 4.2594, "step": 17610 }, { "epoch": 2.0069480038726577, "grad_norm": 8.026143074035645, "learning_rate": 3.000799177988355e-05, "loss": 4.7785, "step": 17620 }, { "epoch": 2.0080870209009625, "grad_norm": 10.650627136230469, "learning_rate": 2.9996574951478478e-05, "loss": 4.5333, "step": 17630 }, { "epoch": 2.0092260379292672, "grad_norm": 18.135908126831055, "learning_rate": 2.998515812307341e-05, "loss": 4.6306, "step": 17640 }, { "epoch": 2.0103650549575716, "grad_norm": 9.425822257995605, "learning_rate": 2.9973741294668345e-05, "loss": 4.4276, "step": 17650 }, { "epoch": 2.0115040719858763, "grad_norm": 7.995151519775391, "learning_rate": 2.9962324466263274e-05, "loss": 4.1761, "step": 17660 }, { "epoch": 2.0126430890141807, "grad_norm": 6.981862545013428, "learning_rate": 2.9950907637858206e-05, "loss": 4.3842, "step": 17670 }, { "epoch": 2.0137821060424854, "grad_norm": 8.841754913330078, "learning_rate": 2.9939490809453134e-05, "loss": 4.599, "step": 17680 }, { "epoch": 2.0149211230707897, "grad_norm": 23.87337875366211, "learning_rate": 2.9928073981048066e-05, "loss": 4.4764, "step": 17690 }, { "epoch": 2.0160601400990945, "grad_norm": 10.238081932067871, "learning_rate": 2.9916657152642998e-05, "loss": 4.6655, "step": 17700 }, { "epoch": 2.017199157127399, "grad_norm": 8.939787864685059, "learning_rate": 2.9905240324237926e-05, "loss": 4.5585, "step": 17710 }, { "epoch": 2.0183381741557036, "grad_norm": 13.194611549377441, "learning_rate": 2.9893823495832858e-05, "loss": 4.4062, "step": 17720 }, { "epoch": 2.0194771911840084, "grad_norm": 7.173688888549805, "learning_rate": 2.9882406667427793e-05, "loss": 4.4404, "step": 17730 }, { "epoch": 2.0206162082123127, "grad_norm": 26.673742294311523, "learning_rate": 2.9870989839022722e-05, "loss": 4.2668, "step": 17740 }, { "epoch": 2.0217552252406175, "grad_norm": 10.922196388244629, "learning_rate": 2.9859573010617654e-05, "loss": 4.1519, "step": 17750 }, { "epoch": 2.022894242268922, "grad_norm": 34.457366943359375, "learning_rate": 2.9848156182212582e-05, "loss": 4.0883, "step": 17760 }, { "epoch": 2.0240332592972266, "grad_norm": 24.143638610839844, "learning_rate": 2.9836739353807514e-05, "loss": 4.4006, "step": 17770 }, { "epoch": 2.025172276325531, "grad_norm": 8.693916320800781, "learning_rate": 2.9825322525402442e-05, "loss": 4.5388, "step": 17780 }, { "epoch": 2.0263112933538356, "grad_norm": 11.42645263671875, "learning_rate": 2.9813905696997374e-05, "loss": 4.6226, "step": 17790 }, { "epoch": 2.0274503103821404, "grad_norm": 10.090991020202637, "learning_rate": 2.9802488868592303e-05, "loss": 4.3568, "step": 17800 }, { "epoch": 2.0285893274104447, "grad_norm": 8.924347877502441, "learning_rate": 2.979107204018724e-05, "loss": 4.4656, "step": 17810 }, { "epoch": 2.0297283444387495, "grad_norm": 8.989141464233398, "learning_rate": 2.977965521178217e-05, "loss": 4.6345, "step": 17820 }, { "epoch": 2.030867361467054, "grad_norm": 12.552188873291016, "learning_rate": 2.97682383833771e-05, "loss": 4.3445, "step": 17830 }, { "epoch": 2.0320063784953586, "grad_norm": 28.57890510559082, "learning_rate": 2.975682155497203e-05, "loss": 4.5935, "step": 17840 }, { "epoch": 2.033145395523663, "grad_norm": 16.600643157958984, "learning_rate": 2.9745404726566962e-05, "loss": 4.5515, "step": 17850 }, { "epoch": 2.0342844125519677, "grad_norm": 10.375631332397461, "learning_rate": 2.973398789816189e-05, "loss": 4.4477, "step": 17860 }, { "epoch": 2.0354234295802724, "grad_norm": 11.461358070373535, "learning_rate": 2.9722571069756822e-05, "loss": 4.3483, "step": 17870 }, { "epoch": 2.0365624466085768, "grad_norm": 9.288942337036133, "learning_rate": 2.971115424135175e-05, "loss": 4.5621, "step": 17880 }, { "epoch": 2.0377014636368815, "grad_norm": 9.822104454040527, "learning_rate": 2.9699737412946682e-05, "loss": 4.3366, "step": 17890 }, { "epoch": 2.038840480665186, "grad_norm": 8.849560737609863, "learning_rate": 2.9688320584541618e-05, "loss": 4.2618, "step": 17900 }, { "epoch": 2.0399794976934906, "grad_norm": 11.033799171447754, "learning_rate": 2.967690375613655e-05, "loss": 4.519, "step": 17910 }, { "epoch": 2.041118514721795, "grad_norm": 10.335617065429688, "learning_rate": 2.9665486927731478e-05, "loss": 4.77, "step": 17920 }, { "epoch": 2.0422575317500997, "grad_norm": 10.127538681030273, "learning_rate": 2.965407009932641e-05, "loss": 4.6134, "step": 17930 }, { "epoch": 2.043396548778404, "grad_norm": 13.892292976379395, "learning_rate": 2.9642653270921338e-05, "loss": 4.7826, "step": 17940 }, { "epoch": 2.044535565806709, "grad_norm": 22.508115768432617, "learning_rate": 2.963123644251627e-05, "loss": 4.9485, "step": 17950 }, { "epoch": 2.0456745828350136, "grad_norm": 9.200506210327148, "learning_rate": 2.96198196141112e-05, "loss": 4.2296, "step": 17960 }, { "epoch": 2.046813599863318, "grad_norm": 20.13701057434082, "learning_rate": 2.960840278570613e-05, "loss": 4.496, "step": 17970 }, { "epoch": 2.0479526168916227, "grad_norm": 17.2512149810791, "learning_rate": 2.9596985957301065e-05, "loss": 4.316, "step": 17980 }, { "epoch": 2.049091633919927, "grad_norm": 11.529026985168457, "learning_rate": 2.9585569128895997e-05, "loss": 4.4849, "step": 17990 }, { "epoch": 2.0502306509482318, "grad_norm": 13.813348770141602, "learning_rate": 2.9574152300490926e-05, "loss": 4.5282, "step": 18000 }, { "epoch": 2.0502306509482318, "eval_loss": 6.060417175292969, "eval_runtime": 10.2589, "eval_samples_per_second": 1.462, "eval_steps_per_second": 0.195, "step": 18000 }, { "epoch": 2.051369667976536, "grad_norm": 10.74505615234375, "learning_rate": 2.9562735472085858e-05, "loss": 4.1056, "step": 18010 }, { "epoch": 2.052508685004841, "grad_norm": 10.346863746643066, "learning_rate": 2.9551318643680786e-05, "loss": 4.3538, "step": 18020 }, { "epoch": 2.053647702033145, "grad_norm": 8.979138374328613, "learning_rate": 2.9539901815275718e-05, "loss": 4.642, "step": 18030 }, { "epoch": 2.05478671906145, "grad_norm": 20.48455810546875, "learning_rate": 2.9528484986870646e-05, "loss": 4.1862, "step": 18040 }, { "epoch": 2.0559257360897547, "grad_norm": 24.90452003479004, "learning_rate": 2.9517068158465578e-05, "loss": 4.1786, "step": 18050 }, { "epoch": 2.057064753118059, "grad_norm": 8.899307250976562, "learning_rate": 2.9505651330060513e-05, "loss": 4.3966, "step": 18060 }, { "epoch": 2.058203770146364, "grad_norm": 40.624732971191406, "learning_rate": 2.9494234501655442e-05, "loss": 4.4136, "step": 18070 }, { "epoch": 2.059342787174668, "grad_norm": 12.927209854125977, "learning_rate": 2.9482817673250374e-05, "loss": 4.4222, "step": 18080 }, { "epoch": 2.060481804202973, "grad_norm": 12.320836067199707, "learning_rate": 2.9471400844845306e-05, "loss": 4.4919, "step": 18090 }, { "epoch": 2.061620821231277, "grad_norm": 8.779129981994629, "learning_rate": 2.9459984016440234e-05, "loss": 4.658, "step": 18100 }, { "epoch": 2.062759838259582, "grad_norm": 10.036825180053711, "learning_rate": 2.9448567188035166e-05, "loss": 4.8078, "step": 18110 }, { "epoch": 2.0638988552878867, "grad_norm": 11.447505950927734, "learning_rate": 2.9437150359630094e-05, "loss": 4.3399, "step": 18120 }, { "epoch": 2.065037872316191, "grad_norm": 8.548605918884277, "learning_rate": 2.9425733531225026e-05, "loss": 4.4198, "step": 18130 }, { "epoch": 2.066176889344496, "grad_norm": 10.151397705078125, "learning_rate": 2.941431670281996e-05, "loss": 4.1909, "step": 18140 }, { "epoch": 2.0673159063728, "grad_norm": 7.891269683837891, "learning_rate": 2.940289987441489e-05, "loss": 4.5161, "step": 18150 }, { "epoch": 2.068454923401105, "grad_norm": 21.275150299072266, "learning_rate": 2.939148304600982e-05, "loss": 4.4974, "step": 18160 }, { "epoch": 2.0695939404294093, "grad_norm": 10.612809181213379, "learning_rate": 2.938006621760475e-05, "loss": 4.5919, "step": 18170 }, { "epoch": 2.070732957457714, "grad_norm": 10.864121437072754, "learning_rate": 2.9368649389199682e-05, "loss": 4.5019, "step": 18180 }, { "epoch": 2.071871974486019, "grad_norm": 15.571329116821289, "learning_rate": 2.935723256079461e-05, "loss": 4.2866, "step": 18190 }, { "epoch": 2.073010991514323, "grad_norm": 11.20418930053711, "learning_rate": 2.9345815732389542e-05, "loss": 4.4531, "step": 18200 }, { "epoch": 2.074150008542628, "grad_norm": 13.021940231323242, "learning_rate": 2.9334398903984474e-05, "loss": 4.5566, "step": 18210 }, { "epoch": 2.075289025570932, "grad_norm": 35.43565368652344, "learning_rate": 2.9322982075579402e-05, "loss": 4.2142, "step": 18220 }, { "epoch": 2.076428042599237, "grad_norm": 16.484302520751953, "learning_rate": 2.9311565247174338e-05, "loss": 4.2342, "step": 18230 }, { "epoch": 2.0775670596275413, "grad_norm": 11.920555114746094, "learning_rate": 2.930014841876927e-05, "loss": 4.574, "step": 18240 }, { "epoch": 2.078706076655846, "grad_norm": 10.326672554016113, "learning_rate": 2.9288731590364198e-05, "loss": 4.4499, "step": 18250 }, { "epoch": 2.0798450936841504, "grad_norm": 10.69715690612793, "learning_rate": 2.927731476195913e-05, "loss": 4.2972, "step": 18260 }, { "epoch": 2.080984110712455, "grad_norm": 13.71670150756836, "learning_rate": 2.9265897933554058e-05, "loss": 4.7965, "step": 18270 }, { "epoch": 2.08212312774076, "grad_norm": 12.21806526184082, "learning_rate": 2.925448110514899e-05, "loss": 4.4754, "step": 18280 }, { "epoch": 2.0832621447690642, "grad_norm": 8.209394454956055, "learning_rate": 2.924306427674392e-05, "loss": 4.3476, "step": 18290 }, { "epoch": 2.084401161797369, "grad_norm": 14.19764518737793, "learning_rate": 2.923164744833885e-05, "loss": 4.2957, "step": 18300 }, { "epoch": 2.0855401788256733, "grad_norm": 15.750473022460938, "learning_rate": 2.9220230619933786e-05, "loss": 4.3946, "step": 18310 }, { "epoch": 2.086679195853978, "grad_norm": 12.75074577331543, "learning_rate": 2.9208813791528717e-05, "loss": 4.7376, "step": 18320 }, { "epoch": 2.0878182128822824, "grad_norm": 10.48817253112793, "learning_rate": 2.9197396963123646e-05, "loss": 4.0535, "step": 18330 }, { "epoch": 2.088957229910587, "grad_norm": 9.252484321594238, "learning_rate": 2.9185980134718578e-05, "loss": 4.4967, "step": 18340 }, { "epoch": 2.0900962469388915, "grad_norm": 12.832139015197754, "learning_rate": 2.9174563306313506e-05, "loss": 4.5884, "step": 18350 }, { "epoch": 2.0912352639671963, "grad_norm": 9.247235298156738, "learning_rate": 2.9163146477908438e-05, "loss": 4.2827, "step": 18360 }, { "epoch": 2.092374280995501, "grad_norm": 13.298909187316895, "learning_rate": 2.9151729649503366e-05, "loss": 4.4996, "step": 18370 }, { "epoch": 2.0935132980238054, "grad_norm": 10.92052173614502, "learning_rate": 2.91403128210983e-05, "loss": 4.5275, "step": 18380 }, { "epoch": 2.09465231505211, "grad_norm": 9.866982460021973, "learning_rate": 2.9128895992693234e-05, "loss": 4.321, "step": 18390 }, { "epoch": 2.0957913320804145, "grad_norm": 11.814825057983398, "learning_rate": 2.9117479164288165e-05, "loss": 4.1376, "step": 18400 }, { "epoch": 2.0969303491087192, "grad_norm": 13.49726676940918, "learning_rate": 2.9106062335883094e-05, "loss": 4.6261, "step": 18410 }, { "epoch": 2.0980693661370236, "grad_norm": 13.917762756347656, "learning_rate": 2.9094645507478026e-05, "loss": 4.2507, "step": 18420 }, { "epoch": 2.0992083831653283, "grad_norm": 10.437445640563965, "learning_rate": 2.9083228679072954e-05, "loss": 4.3688, "step": 18430 }, { "epoch": 2.100347400193633, "grad_norm": 10.63725471496582, "learning_rate": 2.9071811850667886e-05, "loss": 4.1495, "step": 18440 }, { "epoch": 2.1014864172219374, "grad_norm": 12.689176559448242, "learning_rate": 2.9060395022262814e-05, "loss": 4.4013, "step": 18450 }, { "epoch": 2.102625434250242, "grad_norm": 10.703514099121094, "learning_rate": 2.9048978193857746e-05, "loss": 4.0065, "step": 18460 }, { "epoch": 2.1037644512785465, "grad_norm": 12.065997123718262, "learning_rate": 2.903756136545268e-05, "loss": 4.6333, "step": 18470 }, { "epoch": 2.1049034683068513, "grad_norm": 9.926651000976562, "learning_rate": 2.902614453704761e-05, "loss": 4.4492, "step": 18480 }, { "epoch": 2.1060424853351556, "grad_norm": 24.569591522216797, "learning_rate": 2.9014727708642542e-05, "loss": 4.4071, "step": 18490 }, { "epoch": 2.1071815023634604, "grad_norm": 15.291176795959473, "learning_rate": 2.9003310880237474e-05, "loss": 4.3494, "step": 18500 }, { "epoch": 2.108320519391765, "grad_norm": 10.331123352050781, "learning_rate": 2.8991894051832402e-05, "loss": 4.2621, "step": 18510 }, { "epoch": 2.1094595364200694, "grad_norm": 13.224411010742188, "learning_rate": 2.8980477223427334e-05, "loss": 4.5029, "step": 18520 }, { "epoch": 2.110598553448374, "grad_norm": 16.534788131713867, "learning_rate": 2.8969060395022262e-05, "loss": 4.4198, "step": 18530 }, { "epoch": 2.1117375704766785, "grad_norm": 37.1566276550293, "learning_rate": 2.8957643566617194e-05, "loss": 4.6362, "step": 18540 }, { "epoch": 2.1128765875049833, "grad_norm": 10.607845306396484, "learning_rate": 2.894622673821213e-05, "loss": 4.0439, "step": 18550 }, { "epoch": 2.1140156045332876, "grad_norm": 12.826197624206543, "learning_rate": 2.8934809909807058e-05, "loss": 4.4564, "step": 18560 }, { "epoch": 2.1151546215615924, "grad_norm": 13.908170700073242, "learning_rate": 2.892339308140199e-05, "loss": 4.3056, "step": 18570 }, { "epoch": 2.1162936385898967, "grad_norm": 8.121379852294922, "learning_rate": 2.8911976252996918e-05, "loss": 4.328, "step": 18580 }, { "epoch": 2.1174326556182015, "grad_norm": 10.788400650024414, "learning_rate": 2.890055942459185e-05, "loss": 4.4259, "step": 18590 }, { "epoch": 2.1185716726465063, "grad_norm": 15.44107437133789, "learning_rate": 2.888914259618678e-05, "loss": 4.2949, "step": 18600 }, { "epoch": 2.1197106896748106, "grad_norm": 29.66758155822754, "learning_rate": 2.887772576778171e-05, "loss": 4.4404, "step": 18610 }, { "epoch": 2.1208497067031153, "grad_norm": 34.616451263427734, "learning_rate": 2.8866308939376642e-05, "loss": 4.4044, "step": 18620 }, { "epoch": 2.1219887237314197, "grad_norm": 11.029510498046875, "learning_rate": 2.885489211097157e-05, "loss": 4.4095, "step": 18630 }, { "epoch": 2.1231277407597244, "grad_norm": 10.427742958068848, "learning_rate": 2.8843475282566506e-05, "loss": 4.5589, "step": 18640 }, { "epoch": 2.1242667577880288, "grad_norm": 23.671968460083008, "learning_rate": 2.8832058454161438e-05, "loss": 4.3538, "step": 18650 }, { "epoch": 2.1254057748163335, "grad_norm": 12.53283405303955, "learning_rate": 2.8820641625756366e-05, "loss": 4.2531, "step": 18660 }, { "epoch": 2.126544791844638, "grad_norm": 14.044245719909668, "learning_rate": 2.8809224797351298e-05, "loss": 4.6159, "step": 18670 }, { "epoch": 2.1276838088729426, "grad_norm": 13.84389877319336, "learning_rate": 2.8797807968946226e-05, "loss": 4.1674, "step": 18680 }, { "epoch": 2.1288228259012474, "grad_norm": 34.24262237548828, "learning_rate": 2.8786391140541158e-05, "loss": 4.4919, "step": 18690 }, { "epoch": 2.1299618429295517, "grad_norm": 11.81908130645752, "learning_rate": 2.8774974312136087e-05, "loss": 4.454, "step": 18700 }, { "epoch": 2.1311008599578565, "grad_norm": 25.702617645263672, "learning_rate": 2.876355748373102e-05, "loss": 4.3196, "step": 18710 }, { "epoch": 2.132239876986161, "grad_norm": 31.084396362304688, "learning_rate": 2.8752140655325954e-05, "loss": 4.1424, "step": 18720 }, { "epoch": 2.1333788940144656, "grad_norm": 10.866751670837402, "learning_rate": 2.8740723826920886e-05, "loss": 4.7732, "step": 18730 }, { "epoch": 2.13451791104277, "grad_norm": 14.681591033935547, "learning_rate": 2.8729306998515814e-05, "loss": 4.1544, "step": 18740 }, { "epoch": 2.1356569280710747, "grad_norm": 14.033744812011719, "learning_rate": 2.8717890170110746e-05, "loss": 4.477, "step": 18750 }, { "epoch": 2.1367959450993794, "grad_norm": 21.682268142700195, "learning_rate": 2.8706473341705674e-05, "loss": 4.3857, "step": 18760 }, { "epoch": 2.1379349621276837, "grad_norm": 17.77375602722168, "learning_rate": 2.8695056513300606e-05, "loss": 4.6613, "step": 18770 }, { "epoch": 2.1390739791559885, "grad_norm": 10.164037704467773, "learning_rate": 2.8683639684895535e-05, "loss": 4.8563, "step": 18780 }, { "epoch": 2.140212996184293, "grad_norm": 14.451604843139648, "learning_rate": 2.8672222856490466e-05, "loss": 4.3895, "step": 18790 }, { "epoch": 2.1413520132125976, "grad_norm": 8.25815200805664, "learning_rate": 2.86608060280854e-05, "loss": 4.5051, "step": 18800 }, { "epoch": 2.142491030240902, "grad_norm": 19.291881561279297, "learning_rate": 2.8649389199680333e-05, "loss": 4.3857, "step": 18810 }, { "epoch": 2.1436300472692067, "grad_norm": 11.34103775024414, "learning_rate": 2.8637972371275262e-05, "loss": 4.2127, "step": 18820 }, { "epoch": 2.1447690642975115, "grad_norm": 11.805054664611816, "learning_rate": 2.8626555542870194e-05, "loss": 4.2074, "step": 18830 }, { "epoch": 2.145908081325816, "grad_norm": 10.304930686950684, "learning_rate": 2.8615138714465122e-05, "loss": 4.2003, "step": 18840 }, { "epoch": 2.1470470983541206, "grad_norm": 11.176360130310059, "learning_rate": 2.8603721886060054e-05, "loss": 4.3713, "step": 18850 }, { "epoch": 2.148186115382425, "grad_norm": 10.51423454284668, "learning_rate": 2.8592305057654982e-05, "loss": 4.5065, "step": 18860 }, { "epoch": 2.1493251324107296, "grad_norm": 20.841686248779297, "learning_rate": 2.8580888229249914e-05, "loss": 4.0744, "step": 18870 }, { "epoch": 2.150464149439034, "grad_norm": 13.774003028869629, "learning_rate": 2.856947140084485e-05, "loss": 4.5333, "step": 18880 }, { "epoch": 2.1516031664673387, "grad_norm": 13.642642974853516, "learning_rate": 2.8558054572439778e-05, "loss": 4.2307, "step": 18890 }, { "epoch": 2.152742183495643, "grad_norm": 17.952116012573242, "learning_rate": 2.854663774403471e-05, "loss": 4.5442, "step": 18900 }, { "epoch": 2.153881200523948, "grad_norm": 10.366288185119629, "learning_rate": 2.853522091562964e-05, "loss": 4.4885, "step": 18910 }, { "epoch": 2.1550202175522526, "grad_norm": 13.921804428100586, "learning_rate": 2.852380408722457e-05, "loss": 4.2344, "step": 18920 }, { "epoch": 2.156159234580557, "grad_norm": 9.826135635375977, "learning_rate": 2.8512387258819502e-05, "loss": 4.5081, "step": 18930 }, { "epoch": 2.1572982516088617, "grad_norm": 10.746516227722168, "learning_rate": 2.850097043041443e-05, "loss": 4.361, "step": 18940 }, { "epoch": 2.158437268637166, "grad_norm": 16.652273178100586, "learning_rate": 2.8489553602009362e-05, "loss": 4.3208, "step": 18950 }, { "epoch": 2.1595762856654708, "grad_norm": 11.606681823730469, "learning_rate": 2.847813677360429e-05, "loss": 4.4263, "step": 18960 }, { "epoch": 2.160715302693775, "grad_norm": 11.555379867553711, "learning_rate": 2.8466719945199226e-05, "loss": 4.1051, "step": 18970 }, { "epoch": 2.16185431972208, "grad_norm": 14.31595516204834, "learning_rate": 2.8455303116794158e-05, "loss": 4.5425, "step": 18980 }, { "epoch": 2.162993336750384, "grad_norm": 16.75832176208496, "learning_rate": 2.8443886288389086e-05, "loss": 4.2823, "step": 18990 }, { "epoch": 2.164132353778689, "grad_norm": 25.401622772216797, "learning_rate": 2.8432469459984018e-05, "loss": 4.4266, "step": 19000 }, { "epoch": 2.1652713708069937, "grad_norm": 10.595063209533691, "learning_rate": 2.842105263157895e-05, "loss": 4.5758, "step": 19010 }, { "epoch": 2.166410387835298, "grad_norm": 12.421451568603516, "learning_rate": 2.840963580317388e-05, "loss": 4.3854, "step": 19020 }, { "epoch": 2.167549404863603, "grad_norm": 11.803699493408203, "learning_rate": 2.839821897476881e-05, "loss": 4.1064, "step": 19030 }, { "epoch": 2.168688421891907, "grad_norm": 14.315633773803711, "learning_rate": 2.838680214636374e-05, "loss": 4.447, "step": 19040 }, { "epoch": 2.169827438920212, "grad_norm": 9.619046211242676, "learning_rate": 2.8375385317958674e-05, "loss": 4.3029, "step": 19050 }, { "epoch": 2.1709664559485162, "grad_norm": 9.873806953430176, "learning_rate": 2.8363968489553606e-05, "loss": 4.4561, "step": 19060 }, { "epoch": 2.172105472976821, "grad_norm": 11.56844425201416, "learning_rate": 2.8352551661148534e-05, "loss": 4.5356, "step": 19070 }, { "epoch": 2.1732444900051258, "grad_norm": 19.031890869140625, "learning_rate": 2.8341134832743466e-05, "loss": 4.3294, "step": 19080 }, { "epoch": 2.17438350703343, "grad_norm": 11.496087074279785, "learning_rate": 2.8329718004338394e-05, "loss": 4.6025, "step": 19090 }, { "epoch": 2.175522524061735, "grad_norm": 23.368568420410156, "learning_rate": 2.8318301175933326e-05, "loss": 4.2988, "step": 19100 }, { "epoch": 2.176661541090039, "grad_norm": 8.582066535949707, "learning_rate": 2.8306884347528255e-05, "loss": 4.523, "step": 19110 }, { "epoch": 2.177800558118344, "grad_norm": 10.590781211853027, "learning_rate": 2.8295467519123187e-05, "loss": 4.3902, "step": 19120 }, { "epoch": 2.1789395751466483, "grad_norm": 11.430822372436523, "learning_rate": 2.8284050690718122e-05, "loss": 4.6761, "step": 19130 }, { "epoch": 2.180078592174953, "grad_norm": 22.227521896362305, "learning_rate": 2.8272633862313054e-05, "loss": 4.287, "step": 19140 }, { "epoch": 2.181217609203258, "grad_norm": 7.667318820953369, "learning_rate": 2.8261217033907982e-05, "loss": 4.2209, "step": 19150 }, { "epoch": 2.182356626231562, "grad_norm": 33.098731994628906, "learning_rate": 2.8249800205502914e-05, "loss": 4.2548, "step": 19160 }, { "epoch": 2.183495643259867, "grad_norm": 15.808575630187988, "learning_rate": 2.8238383377097842e-05, "loss": 4.3751, "step": 19170 }, { "epoch": 2.184634660288171, "grad_norm": 33.99452209472656, "learning_rate": 2.8226966548692774e-05, "loss": 4.537, "step": 19180 }, { "epoch": 2.185773677316476, "grad_norm": 17.135643005371094, "learning_rate": 2.8215549720287703e-05, "loss": 4.2375, "step": 19190 }, { "epoch": 2.1869126943447803, "grad_norm": 11.814332962036133, "learning_rate": 2.8204132891882634e-05, "loss": 4.3262, "step": 19200 }, { "epoch": 2.188051711373085, "grad_norm": 9.625064849853516, "learning_rate": 2.819271606347757e-05, "loss": 4.3138, "step": 19210 }, { "epoch": 2.1891907284013894, "grad_norm": 11.318060874938965, "learning_rate": 2.81812992350725e-05, "loss": 4.3632, "step": 19220 }, { "epoch": 2.190329745429694, "grad_norm": 11.130118370056152, "learning_rate": 2.816988240666743e-05, "loss": 4.4463, "step": 19230 }, { "epoch": 2.191468762457999, "grad_norm": 40.44393539428711, "learning_rate": 2.8158465578262362e-05, "loss": 4.3744, "step": 19240 }, { "epoch": 2.1926077794863033, "grad_norm": 11.919827461242676, "learning_rate": 2.814704874985729e-05, "loss": 4.7022, "step": 19250 }, { "epoch": 2.193746796514608, "grad_norm": 7.852390289306641, "learning_rate": 2.8135631921452222e-05, "loss": 4.5084, "step": 19260 }, { "epoch": 2.1948858135429123, "grad_norm": 12.34241008758545, "learning_rate": 2.812421509304715e-05, "loss": 4.5451, "step": 19270 }, { "epoch": 2.196024830571217, "grad_norm": 10.834270477294922, "learning_rate": 2.8112798264642082e-05, "loss": 4.67, "step": 19280 }, { "epoch": 2.1971638475995214, "grad_norm": 12.868874549865723, "learning_rate": 2.810138143623701e-05, "loss": 4.1292, "step": 19290 }, { "epoch": 2.198302864627826, "grad_norm": 18.698772430419922, "learning_rate": 2.808996460783195e-05, "loss": 4.7825, "step": 19300 }, { "epoch": 2.1994418816561305, "grad_norm": 10.663459777832031, "learning_rate": 2.8078547779426878e-05, "loss": 4.1828, "step": 19310 }, { "epoch": 2.2005808986844353, "grad_norm": 11.447162628173828, "learning_rate": 2.806713095102181e-05, "loss": 4.3824, "step": 19320 }, { "epoch": 2.20171991571274, "grad_norm": 11.612629890441895, "learning_rate": 2.8055714122616738e-05, "loss": 4.4154, "step": 19330 }, { "epoch": 2.2028589327410444, "grad_norm": 22.35382080078125, "learning_rate": 2.804429729421167e-05, "loss": 4.2377, "step": 19340 }, { "epoch": 2.203997949769349, "grad_norm": 14.205232620239258, "learning_rate": 2.80328804658066e-05, "loss": 4.1946, "step": 19350 }, { "epoch": 2.2051369667976535, "grad_norm": 8.173402786254883, "learning_rate": 2.802146363740153e-05, "loss": 4.3419, "step": 19360 }, { "epoch": 2.2062759838259582, "grad_norm": 13.870865821838379, "learning_rate": 2.801004680899646e-05, "loss": 4.2369, "step": 19370 }, { "epoch": 2.2074150008542626, "grad_norm": 13.200169563293457, "learning_rate": 2.7998629980591394e-05, "loss": 4.2176, "step": 19380 }, { "epoch": 2.2085540178825673, "grad_norm": 11.639530181884766, "learning_rate": 2.7987213152186326e-05, "loss": 4.5517, "step": 19390 }, { "epoch": 2.209693034910872, "grad_norm": 8.542173385620117, "learning_rate": 2.7975796323781254e-05, "loss": 4.5128, "step": 19400 }, { "epoch": 2.2108320519391764, "grad_norm": 11.450385093688965, "learning_rate": 2.7964379495376186e-05, "loss": 4.4245, "step": 19410 }, { "epoch": 2.211971068967481, "grad_norm": 10.551682472229004, "learning_rate": 2.7952962666971118e-05, "loss": 4.4777, "step": 19420 }, { "epoch": 2.2131100859957855, "grad_norm": 20.971851348876953, "learning_rate": 2.7941545838566046e-05, "loss": 4.4916, "step": 19430 }, { "epoch": 2.2142491030240903, "grad_norm": 22.77348518371582, "learning_rate": 2.7930129010160978e-05, "loss": 4.2798, "step": 19440 }, { "epoch": 2.2153881200523946, "grad_norm": 8.859170913696289, "learning_rate": 2.7918712181755907e-05, "loss": 4.567, "step": 19450 }, { "epoch": 2.2165271370806994, "grad_norm": 14.664798736572266, "learning_rate": 2.7907295353350842e-05, "loss": 4.2937, "step": 19460 }, { "epoch": 2.217666154109004, "grad_norm": 14.15429973602295, "learning_rate": 2.7895878524945774e-05, "loss": 4.3192, "step": 19470 }, { "epoch": 2.2188051711373085, "grad_norm": 12.038146018981934, "learning_rate": 2.7884461696540702e-05, "loss": 4.6759, "step": 19480 }, { "epoch": 2.2199441881656132, "grad_norm": 9.109766006469727, "learning_rate": 2.7873044868135634e-05, "loss": 4.2986, "step": 19490 }, { "epoch": 2.2210832051939176, "grad_norm": 14.091863632202148, "learning_rate": 2.7861628039730562e-05, "loss": 4.3147, "step": 19500 }, { "epoch": 2.2222222222222223, "grad_norm": 32.21546173095703, "learning_rate": 2.7850211211325494e-05, "loss": 4.6065, "step": 19510 }, { "epoch": 2.2233612392505266, "grad_norm": 31.96328353881836, "learning_rate": 2.7838794382920426e-05, "loss": 4.5492, "step": 19520 }, { "epoch": 2.2245002562788314, "grad_norm": 15.795702934265137, "learning_rate": 2.7827377554515355e-05, "loss": 4.3908, "step": 19530 }, { "epoch": 2.2256392733071357, "grad_norm": 20.312828063964844, "learning_rate": 2.781596072611029e-05, "loss": 4.3908, "step": 19540 }, { "epoch": 2.2267782903354405, "grad_norm": 11.913114547729492, "learning_rate": 2.780454389770522e-05, "loss": 4.0156, "step": 19550 }, { "epoch": 2.2279173073637453, "grad_norm": 67.34058380126953, "learning_rate": 2.779312706930015e-05, "loss": 4.1589, "step": 19560 }, { "epoch": 2.2290563243920496, "grad_norm": 13.381096839904785, "learning_rate": 2.7781710240895082e-05, "loss": 4.3149, "step": 19570 }, { "epoch": 2.2301953414203544, "grad_norm": 24.699398040771484, "learning_rate": 2.777029341249001e-05, "loss": 4.8308, "step": 19580 }, { "epoch": 2.2313343584486587, "grad_norm": 14.611007690429688, "learning_rate": 2.7758876584084942e-05, "loss": 4.355, "step": 19590 }, { "epoch": 2.2324733754769635, "grad_norm": 9.360657691955566, "learning_rate": 2.774745975567987e-05, "loss": 4.3769, "step": 19600 }, { "epoch": 2.2336123925052678, "grad_norm": 12.764294624328613, "learning_rate": 2.7736042927274803e-05, "loss": 3.9479, "step": 19610 }, { "epoch": 2.2347514095335725, "grad_norm": 16.35887336730957, "learning_rate": 2.772462609886973e-05, "loss": 4.4006, "step": 19620 }, { "epoch": 2.235890426561877, "grad_norm": 10.12515926361084, "learning_rate": 2.771320927046467e-05, "loss": 4.412, "step": 19630 }, { "epoch": 2.2370294435901816, "grad_norm": 11.307372093200684, "learning_rate": 2.7701792442059598e-05, "loss": 4.7658, "step": 19640 }, { "epoch": 2.2381684606184864, "grad_norm": 9.900909423828125, "learning_rate": 2.769037561365453e-05, "loss": 4.399, "step": 19650 }, { "epoch": 2.2393074776467907, "grad_norm": 14.501945495605469, "learning_rate": 2.767895878524946e-05, "loss": 4.2879, "step": 19660 }, { "epoch": 2.2404464946750955, "grad_norm": 10.298788070678711, "learning_rate": 2.766754195684439e-05, "loss": 4.5807, "step": 19670 }, { "epoch": 2.2415855117034, "grad_norm": 17.297077178955078, "learning_rate": 2.765612512843932e-05, "loss": 4.3349, "step": 19680 }, { "epoch": 2.2427245287317046, "grad_norm": 8.442501068115234, "learning_rate": 2.764470830003425e-05, "loss": 4.2717, "step": 19690 }, { "epoch": 2.243863545760009, "grad_norm": 14.779812812805176, "learning_rate": 2.763329147162918e-05, "loss": 4.5482, "step": 19700 }, { "epoch": 2.2450025627883137, "grad_norm": 10.94418716430664, "learning_rate": 2.7621874643224118e-05, "loss": 4.1592, "step": 19710 }, { "epoch": 2.2461415798166184, "grad_norm": 9.443648338317871, "learning_rate": 2.7610457814819046e-05, "loss": 4.485, "step": 19720 }, { "epoch": 2.2472805968449228, "grad_norm": 14.997633934020996, "learning_rate": 2.7599040986413978e-05, "loss": 4.4784, "step": 19730 }, { "epoch": 2.2484196138732275, "grad_norm": 7.396342754364014, "learning_rate": 2.7587624158008906e-05, "loss": 4.903, "step": 19740 }, { "epoch": 2.249558630901532, "grad_norm": 9.457368850708008, "learning_rate": 2.7576207329603838e-05, "loss": 4.5902, "step": 19750 }, { "epoch": 2.2506976479298366, "grad_norm": 12.0418701171875, "learning_rate": 2.7564790501198767e-05, "loss": 4.4375, "step": 19760 }, { "epoch": 2.251836664958141, "grad_norm": 8.690922737121582, "learning_rate": 2.75533736727937e-05, "loss": 4.3008, "step": 19770 }, { "epoch": 2.2529756819864457, "grad_norm": 17.454952239990234, "learning_rate": 2.7541956844388627e-05, "loss": 4.369, "step": 19780 }, { "epoch": 2.2541146990147505, "grad_norm": 9.971555709838867, "learning_rate": 2.7530540015983562e-05, "loss": 4.3026, "step": 19790 }, { "epoch": 2.255253716043055, "grad_norm": 15.098048210144043, "learning_rate": 2.7519123187578494e-05, "loss": 4.1767, "step": 19800 }, { "epoch": 2.2563927330713596, "grad_norm": 15.469975471496582, "learning_rate": 2.7507706359173426e-05, "loss": 4.2714, "step": 19810 }, { "epoch": 2.257531750099664, "grad_norm": 8.48707389831543, "learning_rate": 2.7496289530768354e-05, "loss": 4.3728, "step": 19820 }, { "epoch": 2.2586707671279687, "grad_norm": 11.636101722717285, "learning_rate": 2.7484872702363286e-05, "loss": 4.6468, "step": 19830 }, { "epoch": 2.259809784156273, "grad_norm": 13.015317916870117, "learning_rate": 2.7473455873958214e-05, "loss": 4.2907, "step": 19840 }, { "epoch": 2.2609488011845777, "grad_norm": 26.91872215270996, "learning_rate": 2.7462039045553146e-05, "loss": 4.3164, "step": 19850 }, { "epoch": 2.2620878182128825, "grad_norm": 17.38819122314453, "learning_rate": 2.7450622217148075e-05, "loss": 4.5248, "step": 19860 }, { "epoch": 2.263226835241187, "grad_norm": 10.703744888305664, "learning_rate": 2.743920538874301e-05, "loss": 4.5878, "step": 19870 }, { "epoch": 2.2643658522694916, "grad_norm": 23.601512908935547, "learning_rate": 2.7427788560337942e-05, "loss": 4.291, "step": 19880 }, { "epoch": 2.265504869297796, "grad_norm": 27.674367904663086, "learning_rate": 2.741637173193287e-05, "loss": 4.6971, "step": 19890 }, { "epoch": 2.2666438863261007, "grad_norm": 11.249340057373047, "learning_rate": 2.7404954903527802e-05, "loss": 4.4546, "step": 19900 }, { "epoch": 2.267782903354405, "grad_norm": 18.939645767211914, "learning_rate": 2.739353807512273e-05, "loss": 4.4617, "step": 19910 }, { "epoch": 2.26892192038271, "grad_norm": 8.411871910095215, "learning_rate": 2.7382121246717662e-05, "loss": 4.4564, "step": 19920 }, { "epoch": 2.270060937411014, "grad_norm": 19.94964027404785, "learning_rate": 2.7370704418312594e-05, "loss": 4.4576, "step": 19930 }, { "epoch": 2.271199954439319, "grad_norm": 16.94525146484375, "learning_rate": 2.7359287589907523e-05, "loss": 4.4211, "step": 19940 }, { "epoch": 2.272338971467623, "grad_norm": 12.117650985717773, "learning_rate": 2.7347870761502454e-05, "loss": 4.3448, "step": 19950 }, { "epoch": 2.273477988495928, "grad_norm": 17.60885238647461, "learning_rate": 2.733645393309739e-05, "loss": 4.0793, "step": 19960 }, { "epoch": 2.2746170055242327, "grad_norm": 22.5484561920166, "learning_rate": 2.7325037104692318e-05, "loss": 4.504, "step": 19970 }, { "epoch": 2.275756022552537, "grad_norm": 12.337329864501953, "learning_rate": 2.731362027628725e-05, "loss": 4.4758, "step": 19980 }, { "epoch": 2.276895039580842, "grad_norm": 10.084304809570312, "learning_rate": 2.730220344788218e-05, "loss": 4.5065, "step": 19990 }, { "epoch": 2.278034056609146, "grad_norm": 28.399568557739258, "learning_rate": 2.729078661947711e-05, "loss": 4.3735, "step": 20000 }, { "epoch": 2.278034056609146, "eval_loss": 6.095102310180664, "eval_runtime": 12.419, "eval_samples_per_second": 1.208, "eval_steps_per_second": 0.161, "step": 20000 }, { "epoch": 2.279173073637451, "grad_norm": 9.15103530883789, "learning_rate": 2.727936979107204e-05, "loss": 4.2103, "step": 20010 }, { "epoch": 2.2803120906657552, "grad_norm": 15.227364540100098, "learning_rate": 2.726795296266697e-05, "loss": 4.3143, "step": 20020 }, { "epoch": 2.28145110769406, "grad_norm": 18.669891357421875, "learning_rate": 2.7256536134261902e-05, "loss": 4.2012, "step": 20030 }, { "epoch": 2.2825901247223648, "grad_norm": 11.993313789367676, "learning_rate": 2.7245119305856838e-05, "loss": 4.2507, "step": 20040 }, { "epoch": 2.283729141750669, "grad_norm": 12.1185941696167, "learning_rate": 2.7233702477451766e-05, "loss": 4.5801, "step": 20050 }, { "epoch": 2.284868158778974, "grad_norm": 9.721781730651855, "learning_rate": 2.7222285649046698e-05, "loss": 4.8564, "step": 20060 }, { "epoch": 2.286007175807278, "grad_norm": 10.034998893737793, "learning_rate": 2.7210868820641626e-05, "loss": 4.3931, "step": 20070 }, { "epoch": 2.287146192835583, "grad_norm": 15.754368782043457, "learning_rate": 2.7199451992236558e-05, "loss": 4.4728, "step": 20080 }, { "epoch": 2.2882852098638873, "grad_norm": 15.889591217041016, "learning_rate": 2.7188035163831487e-05, "loss": 4.456, "step": 20090 }, { "epoch": 2.289424226892192, "grad_norm": 8.562204360961914, "learning_rate": 2.717661833542642e-05, "loss": 4.7066, "step": 20100 }, { "epoch": 2.290563243920497, "grad_norm": 38.456298828125, "learning_rate": 2.7165201507021347e-05, "loss": 4.5442, "step": 20110 }, { "epoch": 2.291702260948801, "grad_norm": 9.698710441589355, "learning_rate": 2.7153784678616286e-05, "loss": 4.5983, "step": 20120 }, { "epoch": 2.292841277977106, "grad_norm": 21.27276039123535, "learning_rate": 2.7142367850211214e-05, "loss": 4.5653, "step": 20130 }, { "epoch": 2.2939802950054102, "grad_norm": 14.186841011047363, "learning_rate": 2.7130951021806146e-05, "loss": 4.2409, "step": 20140 }, { "epoch": 2.295119312033715, "grad_norm": 19.440866470336914, "learning_rate": 2.7119534193401074e-05, "loss": 4.2499, "step": 20150 }, { "epoch": 2.2962583290620193, "grad_norm": 24.299516677856445, "learning_rate": 2.7108117364996006e-05, "loss": 4.4182, "step": 20160 }, { "epoch": 2.297397346090324, "grad_norm": 13.558712005615234, "learning_rate": 2.7096700536590935e-05, "loss": 4.4855, "step": 20170 }, { "epoch": 2.298536363118629, "grad_norm": 9.241644859313965, "learning_rate": 2.7085283708185866e-05, "loss": 4.3026, "step": 20180 }, { "epoch": 2.299675380146933, "grad_norm": 13.817281723022461, "learning_rate": 2.7073866879780795e-05, "loss": 4.1999, "step": 20190 }, { "epoch": 2.300814397175238, "grad_norm": 15.542625427246094, "learning_rate": 2.706245005137573e-05, "loss": 4.4185, "step": 20200 }, { "epoch": 2.3019534142035423, "grad_norm": 9.056170463562012, "learning_rate": 2.7051033222970662e-05, "loss": 4.4653, "step": 20210 }, { "epoch": 2.303092431231847, "grad_norm": 29.517549514770508, "learning_rate": 2.7039616394565594e-05, "loss": 4.2953, "step": 20220 }, { "epoch": 2.3042314482601514, "grad_norm": 22.298954010009766, "learning_rate": 2.7028199566160522e-05, "loss": 4.1913, "step": 20230 }, { "epoch": 2.305370465288456, "grad_norm": 28.369159698486328, "learning_rate": 2.7016782737755454e-05, "loss": 4.4942, "step": 20240 }, { "epoch": 2.3065094823167605, "grad_norm": 18.9779109954834, "learning_rate": 2.700650759219089e-05, "loss": 4.053, "step": 20250 }, { "epoch": 2.307648499345065, "grad_norm": 11.17407512664795, "learning_rate": 2.6995090763785825e-05, "loss": 4.5072, "step": 20260 }, { "epoch": 2.3087875163733695, "grad_norm": 15.445562362670898, "learning_rate": 2.6983673935380754e-05, "loss": 4.3479, "step": 20270 }, { "epoch": 2.3099265334016743, "grad_norm": 11.916383743286133, "learning_rate": 2.6972257106975686e-05, "loss": 4.2965, "step": 20280 }, { "epoch": 2.311065550429979, "grad_norm": 13.146663665771484, "learning_rate": 2.6960840278570614e-05, "loss": 4.6778, "step": 20290 }, { "epoch": 2.3122045674582834, "grad_norm": 11.7923002243042, "learning_rate": 2.6949423450165546e-05, "loss": 4.3451, "step": 20300 }, { "epoch": 2.313343584486588, "grad_norm": 13.446571350097656, "learning_rate": 2.6938006621760474e-05, "loss": 4.2385, "step": 20310 }, { "epoch": 2.3144826015148925, "grad_norm": 9.76078987121582, "learning_rate": 2.6926589793355406e-05, "loss": 3.9775, "step": 20320 }, { "epoch": 2.3156216185431973, "grad_norm": 8.01722526550293, "learning_rate": 2.6915172964950335e-05, "loss": 4.1899, "step": 20330 }, { "epoch": 2.3167606355715016, "grad_norm": 15.921638488769531, "learning_rate": 2.6903756136545273e-05, "loss": 4.4662, "step": 20340 }, { "epoch": 2.3178996525998063, "grad_norm": 20.366044998168945, "learning_rate": 2.68923393081402e-05, "loss": 4.007, "step": 20350 }, { "epoch": 2.319038669628111, "grad_norm": 13.331708908081055, "learning_rate": 2.6880922479735133e-05, "loss": 4.3506, "step": 20360 }, { "epoch": 2.3201776866564154, "grad_norm": 14.432374000549316, "learning_rate": 2.6869505651330062e-05, "loss": 4.3741, "step": 20370 }, { "epoch": 2.32131670368472, "grad_norm": 17.888835906982422, "learning_rate": 2.6858088822924994e-05, "loss": 4.4641, "step": 20380 }, { "epoch": 2.3224557207130245, "grad_norm": 12.111369132995605, "learning_rate": 2.6846671994519922e-05, "loss": 4.609, "step": 20390 }, { "epoch": 2.3235947377413293, "grad_norm": 18.915489196777344, "learning_rate": 2.6835255166114854e-05, "loss": 4.3178, "step": 20400 }, { "epoch": 2.3247337547696336, "grad_norm": 13.839448928833008, "learning_rate": 2.6823838337709782e-05, "loss": 4.4563, "step": 20410 }, { "epoch": 2.3258727717979384, "grad_norm": 22.157007217407227, "learning_rate": 2.6812421509304714e-05, "loss": 4.3481, "step": 20420 }, { "epoch": 2.327011788826243, "grad_norm": 11.417917251586914, "learning_rate": 2.680100468089965e-05, "loss": 4.5919, "step": 20430 }, { "epoch": 2.3281508058545475, "grad_norm": 10.939807891845703, "learning_rate": 2.678958785249458e-05, "loss": 4.1816, "step": 20440 }, { "epoch": 2.3292898228828522, "grad_norm": 19.246337890625, "learning_rate": 2.677817102408951e-05, "loss": 4.3146, "step": 20450 }, { "epoch": 2.3304288399111566, "grad_norm": 25.213848114013672, "learning_rate": 2.676675419568444e-05, "loss": 4.372, "step": 20460 }, { "epoch": 2.3315678569394613, "grad_norm": 10.462803840637207, "learning_rate": 2.675533736727937e-05, "loss": 4.5615, "step": 20470 }, { "epoch": 2.3327068739677657, "grad_norm": 25.962284088134766, "learning_rate": 2.6743920538874302e-05, "loss": 4.5635, "step": 20480 }, { "epoch": 2.3338458909960704, "grad_norm": 15.802549362182617, "learning_rate": 2.673250371046923e-05, "loss": 4.3188, "step": 20490 }, { "epoch": 2.334984908024375, "grad_norm": 9.226036071777344, "learning_rate": 2.6721086882064162e-05, "loss": 4.2887, "step": 20500 }, { "epoch": 2.3361239250526795, "grad_norm": 23.429420471191406, "learning_rate": 2.6709670053659097e-05, "loss": 4.31, "step": 20510 }, { "epoch": 2.3372629420809843, "grad_norm": 11.576409339904785, "learning_rate": 2.6698253225254026e-05, "loss": 4.6043, "step": 20520 }, { "epoch": 2.3384019591092886, "grad_norm": 13.73523235321045, "learning_rate": 2.6686836396848958e-05, "loss": 4.4029, "step": 20530 }, { "epoch": 2.3395409761375934, "grad_norm": 9.554508209228516, "learning_rate": 2.667541956844389e-05, "loss": 4.5881, "step": 20540 }, { "epoch": 2.3406799931658977, "grad_norm": 15.159540176391602, "learning_rate": 2.6664002740038818e-05, "loss": 4.4314, "step": 20550 }, { "epoch": 2.3418190101942025, "grad_norm": 9.74484634399414, "learning_rate": 2.665258591163375e-05, "loss": 4.3374, "step": 20560 }, { "epoch": 2.342958027222507, "grad_norm": 14.005760192871094, "learning_rate": 2.664116908322868e-05, "loss": 4.3698, "step": 20570 }, { "epoch": 2.3440970442508116, "grad_norm": 18.01361846923828, "learning_rate": 2.662975225482361e-05, "loss": 4.5266, "step": 20580 }, { "epoch": 2.345236061279116, "grad_norm": 9.59418773651123, "learning_rate": 2.6618335426418545e-05, "loss": 4.4088, "step": 20590 }, { "epoch": 2.3463750783074206, "grad_norm": 11.596879005432129, "learning_rate": 2.6606918598013474e-05, "loss": 4.4025, "step": 20600 }, { "epoch": 2.3475140953357254, "grad_norm": 19.087871551513672, "learning_rate": 2.6595501769608406e-05, "loss": 4.2458, "step": 20610 }, { "epoch": 2.3486531123640297, "grad_norm": 14.203076362609863, "learning_rate": 2.6584084941203334e-05, "loss": 4.7268, "step": 20620 }, { "epoch": 2.3497921293923345, "grad_norm": 25.49370002746582, "learning_rate": 2.6572668112798266e-05, "loss": 4.303, "step": 20630 }, { "epoch": 2.350931146420639, "grad_norm": 23.890228271484375, "learning_rate": 2.6561251284393194e-05, "loss": 4.7874, "step": 20640 }, { "epoch": 2.3520701634489436, "grad_norm": 11.49235725402832, "learning_rate": 2.6549834455988126e-05, "loss": 4.4918, "step": 20650 }, { "epoch": 2.353209180477248, "grad_norm": 15.059396743774414, "learning_rate": 2.6538417627583058e-05, "loss": 4.3722, "step": 20660 }, { "epoch": 2.3543481975055527, "grad_norm": 9.131157875061035, "learning_rate": 2.6527000799177993e-05, "loss": 4.2016, "step": 20670 }, { "epoch": 2.3554872145338575, "grad_norm": 9.978961944580078, "learning_rate": 2.6515583970772922e-05, "loss": 4.5545, "step": 20680 }, { "epoch": 2.3566262315621618, "grad_norm": 10.577940940856934, "learning_rate": 2.6504167142367854e-05, "loss": 4.2959, "step": 20690 }, { "epoch": 2.3577652485904665, "grad_norm": 13.292899131774902, "learning_rate": 2.6492750313962782e-05, "loss": 4.4357, "step": 20700 }, { "epoch": 2.358904265618771, "grad_norm": 12.609500885009766, "learning_rate": 2.6481333485557714e-05, "loss": 4.3743, "step": 20710 }, { "epoch": 2.3600432826470756, "grad_norm": 12.299931526184082, "learning_rate": 2.6469916657152642e-05, "loss": 4.5427, "step": 20720 }, { "epoch": 2.36118229967538, "grad_norm": 15.316471099853516, "learning_rate": 2.6458499828747574e-05, "loss": 4.3887, "step": 20730 }, { "epoch": 2.3623213167036847, "grad_norm": 15.284895896911621, "learning_rate": 2.6447083000342503e-05, "loss": 4.2833, "step": 20740 }, { "epoch": 2.3634603337319895, "grad_norm": 7.876543045043945, "learning_rate": 2.6435666171937434e-05, "loss": 4.3327, "step": 20750 }, { "epoch": 2.364599350760294, "grad_norm": 10.778501510620117, "learning_rate": 2.642424934353237e-05, "loss": 4.2446, "step": 20760 }, { "epoch": 2.3657383677885986, "grad_norm": 15.023706436157227, "learning_rate": 2.64128325151273e-05, "loss": 4.6691, "step": 20770 }, { "epoch": 2.366877384816903, "grad_norm": 14.946088790893555, "learning_rate": 2.640141568672223e-05, "loss": 4.1224, "step": 20780 }, { "epoch": 2.3680164018452077, "grad_norm": 13.244171142578125, "learning_rate": 2.6389998858317162e-05, "loss": 5.0236, "step": 20790 }, { "epoch": 2.369155418873512, "grad_norm": 15.137276649475098, "learning_rate": 2.6379723712752598e-05, "loss": 4.3492, "step": 20800 }, { "epoch": 2.3702944359018168, "grad_norm": 12.232105255126953, "learning_rate": 2.6368306884347533e-05, "loss": 4.4269, "step": 20810 }, { "epoch": 2.3714334529301215, "grad_norm": 14.004165649414062, "learning_rate": 2.635689005594246e-05, "loss": 4.4677, "step": 20820 }, { "epoch": 2.372572469958426, "grad_norm": 16.884231567382812, "learning_rate": 2.6345473227537393e-05, "loss": 4.4402, "step": 20830 }, { "epoch": 2.3737114869867306, "grad_norm": 12.156578063964844, "learning_rate": 2.6334056399132322e-05, "loss": 4.7016, "step": 20840 }, { "epoch": 2.374850504015035, "grad_norm": 12.994196891784668, "learning_rate": 2.6322639570727254e-05, "loss": 4.3935, "step": 20850 }, { "epoch": 2.3759895210433397, "grad_norm": 23.809234619140625, "learning_rate": 2.6311222742322182e-05, "loss": 4.7228, "step": 20860 }, { "epoch": 2.377128538071644, "grad_norm": 16.818578720092773, "learning_rate": 2.6299805913917114e-05, "loss": 4.3697, "step": 20870 }, { "epoch": 2.378267555099949, "grad_norm": 10.739876747131348, "learning_rate": 2.6288389085512046e-05, "loss": 4.2883, "step": 20880 }, { "epoch": 2.379406572128253, "grad_norm": 11.96601676940918, "learning_rate": 2.6276972257106974e-05, "loss": 4.2737, "step": 20890 }, { "epoch": 2.380545589156558, "grad_norm": 12.246533393859863, "learning_rate": 2.626555542870191e-05, "loss": 4.4658, "step": 20900 }, { "epoch": 2.381684606184862, "grad_norm": 12.089892387390137, "learning_rate": 2.625413860029684e-05, "loss": 4.3181, "step": 20910 }, { "epoch": 2.382823623213167, "grad_norm": 11.611150741577148, "learning_rate": 2.624272177189177e-05, "loss": 4.7861, "step": 20920 }, { "epoch": 2.3839626402414718, "grad_norm": 15.781271934509277, "learning_rate": 2.62313049434867e-05, "loss": 4.2517, "step": 20930 }, { "epoch": 2.385101657269776, "grad_norm": 7.367682456970215, "learning_rate": 2.621988811508163e-05, "loss": 4.3842, "step": 20940 }, { "epoch": 2.386240674298081, "grad_norm": 16.008838653564453, "learning_rate": 2.6208471286676562e-05, "loss": 4.3608, "step": 20950 }, { "epoch": 2.387379691326385, "grad_norm": 7.852641582489014, "learning_rate": 2.619705445827149e-05, "loss": 4.8198, "step": 20960 }, { "epoch": 2.38851870835469, "grad_norm": 9.072783470153809, "learning_rate": 2.6185637629866422e-05, "loss": 4.6759, "step": 20970 }, { "epoch": 2.3896577253829943, "grad_norm": 8.352493286132812, "learning_rate": 2.6174220801461357e-05, "loss": 4.4424, "step": 20980 }, { "epoch": 2.390796742411299, "grad_norm": 12.035454750061035, "learning_rate": 2.616280397305629e-05, "loss": 4.6051, "step": 20990 }, { "epoch": 2.391935759439604, "grad_norm": 15.381999969482422, "learning_rate": 2.6151387144651218e-05, "loss": 4.5452, "step": 21000 }, { "epoch": 2.393074776467908, "grad_norm": 8.592596054077148, "learning_rate": 2.613997031624615e-05, "loss": 4.6688, "step": 21010 }, { "epoch": 2.394213793496213, "grad_norm": 12.32676887512207, "learning_rate": 2.6128553487841078e-05, "loss": 4.3724, "step": 21020 }, { "epoch": 2.395352810524517, "grad_norm": 10.219091415405273, "learning_rate": 2.611713665943601e-05, "loss": 4.5526, "step": 21030 }, { "epoch": 2.396491827552822, "grad_norm": 23.337568283081055, "learning_rate": 2.6105719831030938e-05, "loss": 4.4877, "step": 21040 }, { "epoch": 2.3976308445811263, "grad_norm": 23.432353973388672, "learning_rate": 2.609430300262587e-05, "loss": 4.2944, "step": 21050 }, { "epoch": 2.398769861609431, "grad_norm": 21.79475212097168, "learning_rate": 2.6082886174220805e-05, "loss": 4.1989, "step": 21060 }, { "epoch": 2.399908878637736, "grad_norm": 12.396292686462402, "learning_rate": 2.6071469345815737e-05, "loss": 4.45, "step": 21070 }, { "epoch": 2.40104789566604, "grad_norm": 18.911386489868164, "learning_rate": 2.6060052517410665e-05, "loss": 4.5491, "step": 21080 }, { "epoch": 2.402186912694345, "grad_norm": 10.625134468078613, "learning_rate": 2.6048635689005597e-05, "loss": 4.3925, "step": 21090 }, { "epoch": 2.4033259297226492, "grad_norm": 11.378150939941406, "learning_rate": 2.6037218860600526e-05, "loss": 4.4926, "step": 21100 }, { "epoch": 2.404464946750954, "grad_norm": 14.170574188232422, "learning_rate": 2.6025802032195458e-05, "loss": 4.7183, "step": 21110 }, { "epoch": 2.4056039637792583, "grad_norm": 21.01306915283203, "learning_rate": 2.6014385203790386e-05, "loss": 4.1274, "step": 21120 }, { "epoch": 2.406742980807563, "grad_norm": 12.135096549987793, "learning_rate": 2.6002968375385318e-05, "loss": 4.8377, "step": 21130 }, { "epoch": 2.407881997835868, "grad_norm": 10.106658935546875, "learning_rate": 2.5991551546980253e-05, "loss": 4.4585, "step": 21140 }, { "epoch": 2.409021014864172, "grad_norm": 22.058813095092773, "learning_rate": 2.598013471857518e-05, "loss": 4.1022, "step": 21150 }, { "epoch": 2.410160031892477, "grad_norm": 9.689461708068848, "learning_rate": 2.5968717890170113e-05, "loss": 4.2402, "step": 21160 }, { "epoch": 2.4112990489207813, "grad_norm": 22.227386474609375, "learning_rate": 2.5957301061765045e-05, "loss": 4.6234, "step": 21170 }, { "epoch": 2.412438065949086, "grad_norm": 14.180185317993164, "learning_rate": 2.5945884233359974e-05, "loss": 4.1885, "step": 21180 }, { "epoch": 2.4135770829773904, "grad_norm": 11.204601287841797, "learning_rate": 2.5934467404954906e-05, "loss": 4.6157, "step": 21190 }, { "epoch": 2.414716100005695, "grad_norm": 13.349756240844727, "learning_rate": 2.5923050576549834e-05, "loss": 4.4678, "step": 21200 }, { "epoch": 2.4158551170339995, "grad_norm": 8.626503944396973, "learning_rate": 2.5911633748144766e-05, "loss": 4.1648, "step": 21210 }, { "epoch": 2.4169941340623042, "grad_norm": 21.226703643798828, "learning_rate": 2.5900216919739694e-05, "loss": 4.4587, "step": 21220 }, { "epoch": 2.4181331510906086, "grad_norm": 11.00561237335205, "learning_rate": 2.588880009133463e-05, "loss": 4.484, "step": 21230 }, { "epoch": 2.4192721681189133, "grad_norm": 21.933300018310547, "learning_rate": 2.587738326292956e-05, "loss": 4.5352, "step": 21240 }, { "epoch": 2.420411185147218, "grad_norm": 8.876955032348633, "learning_rate": 2.586596643452449e-05, "loss": 4.0314, "step": 21250 }, { "epoch": 2.4215502021755224, "grad_norm": 16.549108505249023, "learning_rate": 2.585454960611942e-05, "loss": 4.4442, "step": 21260 }, { "epoch": 2.422689219203827, "grad_norm": 9.397076606750488, "learning_rate": 2.584313277771435e-05, "loss": 4.3439, "step": 21270 }, { "epoch": 2.4238282362321315, "grad_norm": 13.82287311553955, "learning_rate": 2.5831715949309282e-05, "loss": 4.4222, "step": 21280 }, { "epoch": 2.4249672532604363, "grad_norm": 13.481350898742676, "learning_rate": 2.5820299120904214e-05, "loss": 4.558, "step": 21290 }, { "epoch": 2.4261062702887406, "grad_norm": 17.883499145507812, "learning_rate": 2.5808882292499142e-05, "loss": 4.8666, "step": 21300 }, { "epoch": 2.4272452873170454, "grad_norm": 12.39294719696045, "learning_rate": 2.5797465464094077e-05, "loss": 4.5578, "step": 21310 }, { "epoch": 2.42838430434535, "grad_norm": 16.293148040771484, "learning_rate": 2.578604863568901e-05, "loss": 4.5579, "step": 21320 }, { "epoch": 2.4295233213736545, "grad_norm": 8.16700267791748, "learning_rate": 2.5774631807283938e-05, "loss": 4.5132, "step": 21330 }, { "epoch": 2.430662338401959, "grad_norm": 12.83481216430664, "learning_rate": 2.576321497887887e-05, "loss": 4.6121, "step": 21340 }, { "epoch": 2.4318013554302635, "grad_norm": 16.078487396240234, "learning_rate": 2.5751798150473798e-05, "loss": 4.3257, "step": 21350 }, { "epoch": 2.4329403724585683, "grad_norm": 10.014603614807129, "learning_rate": 2.574038132206873e-05, "loss": 4.3713, "step": 21360 }, { "epoch": 2.4340793894868726, "grad_norm": 10.41618824005127, "learning_rate": 2.5728964493663658e-05, "loss": 4.6494, "step": 21370 }, { "epoch": 2.4352184065151774, "grad_norm": 12.7681245803833, "learning_rate": 2.571754766525859e-05, "loss": 4.4948, "step": 21380 }, { "epoch": 2.436357423543482, "grad_norm": 14.007975578308105, "learning_rate": 2.5706130836853525e-05, "loss": 4.2392, "step": 21390 }, { "epoch": 2.4374964405717865, "grad_norm": 22.04856300354004, "learning_rate": 2.5694714008448457e-05, "loss": 4.0833, "step": 21400 }, { "epoch": 2.4386354576000913, "grad_norm": 10.801676750183105, "learning_rate": 2.5683297180043386e-05, "loss": 4.4641, "step": 21410 }, { "epoch": 2.4397744746283956, "grad_norm": 15.246986389160156, "learning_rate": 2.5671880351638317e-05, "loss": 4.2553, "step": 21420 }, { "epoch": 2.4409134916567004, "grad_norm": 8.889477729797363, "learning_rate": 2.5660463523233246e-05, "loss": 4.4263, "step": 21430 }, { "epoch": 2.4420525086850047, "grad_norm": 12.95500373840332, "learning_rate": 2.5649046694828178e-05, "loss": 4.6147, "step": 21440 }, { "epoch": 2.4431915257133094, "grad_norm": 13.485177993774414, "learning_rate": 2.5637629866423106e-05, "loss": 4.5826, "step": 21450 }, { "epoch": 2.444330542741614, "grad_norm": 11.917974472045898, "learning_rate": 2.5626213038018038e-05, "loss": 4.1817, "step": 21460 }, { "epoch": 2.4454695597699185, "grad_norm": 17.149272918701172, "learning_rate": 2.5614796209612973e-05, "loss": 4.4273, "step": 21470 }, { "epoch": 2.4466085767982233, "grad_norm": 10.053730964660645, "learning_rate": 2.5603379381207905e-05, "loss": 4.4531, "step": 21480 }, { "epoch": 2.4477475938265276, "grad_norm": 26.15191078186035, "learning_rate": 2.5591962552802834e-05, "loss": 4.2981, "step": 21490 }, { "epoch": 2.4488866108548324, "grad_norm": 24.09326171875, "learning_rate": 2.5580545724397765e-05, "loss": 4.4, "step": 21500 }, { "epoch": 2.4500256278831367, "grad_norm": 10.37264633178711, "learning_rate": 2.5569128895992694e-05, "loss": 4.7429, "step": 21510 }, { "epoch": 2.4511646449114415, "grad_norm": 10.826128959655762, "learning_rate": 2.5557712067587626e-05, "loss": 3.9984, "step": 21520 }, { "epoch": 2.4523036619397462, "grad_norm": 12.696362495422363, "learning_rate": 2.5546295239182554e-05, "loss": 4.599, "step": 21530 }, { "epoch": 2.4534426789680506, "grad_norm": 10.249017715454102, "learning_rate": 2.5534878410777486e-05, "loss": 4.6064, "step": 21540 }, { "epoch": 2.4545816959963553, "grad_norm": 17.17274284362793, "learning_rate": 2.5523461582372414e-05, "loss": 4.3368, "step": 21550 }, { "epoch": 2.4557207130246597, "grad_norm": 14.794090270996094, "learning_rate": 2.551204475396735e-05, "loss": 4.4248, "step": 21560 }, { "epoch": 2.4568597300529644, "grad_norm": 18.64275550842285, "learning_rate": 2.550062792556228e-05, "loss": 4.4132, "step": 21570 }, { "epoch": 2.4579987470812688, "grad_norm": 52.16679763793945, "learning_rate": 2.5489211097157213e-05, "loss": 4.7241, "step": 21580 }, { "epoch": 2.4591377641095735, "grad_norm": 10.204142570495605, "learning_rate": 2.5477794268752142e-05, "loss": 4.2251, "step": 21590 }, { "epoch": 2.460276781137878, "grad_norm": 13.011360168457031, "learning_rate": 2.5466377440347074e-05, "loss": 4.2051, "step": 21600 }, { "epoch": 2.4614157981661826, "grad_norm": 13.616783142089844, "learning_rate": 2.5454960611942002e-05, "loss": 4.3595, "step": 21610 }, { "epoch": 2.462554815194487, "grad_norm": 13.076457977294922, "learning_rate": 2.5443543783536934e-05, "loss": 4.4622, "step": 21620 }, { "epoch": 2.4636938322227917, "grad_norm": 16.63337516784668, "learning_rate": 2.5432126955131862e-05, "loss": 4.4244, "step": 21630 }, { "epoch": 2.4648328492510965, "grad_norm": 18.748550415039062, "learning_rate": 2.5420710126726798e-05, "loss": 4.375, "step": 21640 }, { "epoch": 2.465971866279401, "grad_norm": 9.516826629638672, "learning_rate": 2.540929329832173e-05, "loss": 4.3369, "step": 21650 }, { "epoch": 2.4671108833077056, "grad_norm": 24.19636344909668, "learning_rate": 2.5397876469916658e-05, "loss": 4.1031, "step": 21660 }, { "epoch": 2.46824990033601, "grad_norm": 16.849750518798828, "learning_rate": 2.538645964151159e-05, "loss": 4.4579, "step": 21670 }, { "epoch": 2.4693889173643147, "grad_norm": 13.501879692077637, "learning_rate": 2.537504281310652e-05, "loss": 4.7379, "step": 21680 }, { "epoch": 2.470527934392619, "grad_norm": 22.789653778076172, "learning_rate": 2.536362598470145e-05, "loss": 4.3932, "step": 21690 }, { "epoch": 2.4716669514209237, "grad_norm": 12.54429817199707, "learning_rate": 2.5352209156296382e-05, "loss": 4.3613, "step": 21700 }, { "epoch": 2.4728059684492285, "grad_norm": 26.30118751525879, "learning_rate": 2.534079232789131e-05, "loss": 4.3107, "step": 21710 }, { "epoch": 2.473944985477533, "grad_norm": 10.114954948425293, "learning_rate": 2.5329375499486245e-05, "loss": 4.2892, "step": 21720 }, { "epoch": 2.4750840025058376, "grad_norm": 21.673948287963867, "learning_rate": 2.5317958671081177e-05, "loss": 4.5669, "step": 21730 }, { "epoch": 2.476223019534142, "grad_norm": 14.892142295837402, "learning_rate": 2.5306541842676106e-05, "loss": 4.2368, "step": 21740 }, { "epoch": 2.4773620365624467, "grad_norm": 17.35381507873535, "learning_rate": 2.5295125014271038e-05, "loss": 4.3717, "step": 21750 }, { "epoch": 2.478501053590751, "grad_norm": 18.587623596191406, "learning_rate": 2.5283708185865966e-05, "loss": 4.4697, "step": 21760 }, { "epoch": 2.479640070619056, "grad_norm": 32.892051696777344, "learning_rate": 2.5272291357460898e-05, "loss": 4.3698, "step": 21770 }, { "epoch": 2.4807790876473605, "grad_norm": 26.555744171142578, "learning_rate": 2.5260874529055826e-05, "loss": 4.2927, "step": 21780 }, { "epoch": 2.481918104675665, "grad_norm": 8.519964218139648, "learning_rate": 2.5249457700650758e-05, "loss": 4.534, "step": 21790 }, { "epoch": 2.4830571217039696, "grad_norm": 13.621071815490723, "learning_rate": 2.5238040872245693e-05, "loss": 4.5586, "step": 21800 }, { "epoch": 2.484196138732274, "grad_norm": 15.400689125061035, "learning_rate": 2.5226624043840625e-05, "loss": 4.2739, "step": 21810 }, { "epoch": 2.4853351557605787, "grad_norm": 14.989195823669434, "learning_rate": 2.5215207215435554e-05, "loss": 4.5925, "step": 21820 }, { "epoch": 2.486474172788883, "grad_norm": 13.892701148986816, "learning_rate": 2.5203790387030486e-05, "loss": 4.274, "step": 21830 }, { "epoch": 2.487613189817188, "grad_norm": 11.06151294708252, "learning_rate": 2.5192373558625414e-05, "loss": 4.2846, "step": 21840 }, { "epoch": 2.4887522068454926, "grad_norm": 15.464399337768555, "learning_rate": 2.5180956730220346e-05, "loss": 4.3184, "step": 21850 }, { "epoch": 2.489891223873797, "grad_norm": 16.73919105529785, "learning_rate": 2.5169539901815274e-05, "loss": 4.4767, "step": 21860 }, { "epoch": 2.4910302409021017, "grad_norm": 10.512547492980957, "learning_rate": 2.5158123073410206e-05, "loss": 4.4806, "step": 21870 }, { "epoch": 2.492169257930406, "grad_norm": 12.304697036743164, "learning_rate": 2.5146706245005135e-05, "loss": 4.5144, "step": 21880 }, { "epoch": 2.4933082749587108, "grad_norm": 12.59383487701416, "learning_rate": 2.5135289416600073e-05, "loss": 4.4376, "step": 21890 }, { "epoch": 2.494447291987015, "grad_norm": 28.72754669189453, "learning_rate": 2.5123872588195e-05, "loss": 4.2984, "step": 21900 }, { "epoch": 2.49558630901532, "grad_norm": 15.549976348876953, "learning_rate": 2.5112455759789933e-05, "loss": 4.3363, "step": 21910 }, { "epoch": 2.496725326043624, "grad_norm": 13.449384689331055, "learning_rate": 2.5101038931384862e-05, "loss": 4.5456, "step": 21920 }, { "epoch": 2.497864343071929, "grad_norm": 15.404632568359375, "learning_rate": 2.5089622102979794e-05, "loss": 4.5399, "step": 21930 }, { "epoch": 2.4990033601002333, "grad_norm": 36.435874938964844, "learning_rate": 2.5078205274574722e-05, "loss": 4.4067, "step": 21940 }, { "epoch": 2.500142377128538, "grad_norm": 13.879344940185547, "learning_rate": 2.5066788446169654e-05, "loss": 4.2819, "step": 21950 }, { "epoch": 2.501281394156843, "grad_norm": 12.27030086517334, "learning_rate": 2.5055371617764582e-05, "loss": 4.5013, "step": 21960 }, { "epoch": 2.502420411185147, "grad_norm": 9.679683685302734, "learning_rate": 2.504395478935952e-05, "loss": 4.4269, "step": 21970 }, { "epoch": 2.503559428213452, "grad_norm": 21.375797271728516, "learning_rate": 2.503253796095445e-05, "loss": 4.6372, "step": 21980 }, { "epoch": 2.504698445241756, "grad_norm": 13.330611228942871, "learning_rate": 2.502112113254938e-05, "loss": 4.267, "step": 21990 }, { "epoch": 2.505837462270061, "grad_norm": 31.16680908203125, "learning_rate": 2.500970430414431e-05, "loss": 4.541, "step": 22000 }, { "epoch": 2.505837462270061, "eval_loss": 6.085844039916992, "eval_runtime": 11.247, "eval_samples_per_second": 1.334, "eval_steps_per_second": 0.178, "step": 22000 }, { "epoch": 2.5069764792983653, "grad_norm": 15.743983268737793, "learning_rate": 2.499828747573924e-05, "loss": 4.5104, "step": 22010 }, { "epoch": 2.50811549632667, "grad_norm": 22.402015686035156, "learning_rate": 2.498687064733417e-05, "loss": 4.4935, "step": 22020 }, { "epoch": 2.509254513354975, "grad_norm": 12.458966255187988, "learning_rate": 2.4975453818929105e-05, "loss": 4.166, "step": 22030 }, { "epoch": 2.510393530383279, "grad_norm": 11.811493873596191, "learning_rate": 2.4964036990524034e-05, "loss": 4.2387, "step": 22040 }, { "epoch": 2.511532547411584, "grad_norm": 10.788464546203613, "learning_rate": 2.4952620162118966e-05, "loss": 4.4599, "step": 22050 }, { "epoch": 2.5126715644398883, "grad_norm": 19.103111267089844, "learning_rate": 2.4941203333713894e-05, "loss": 4.2352, "step": 22060 }, { "epoch": 2.513810581468193, "grad_norm": 11.614288330078125, "learning_rate": 2.4929786505308826e-05, "loss": 4.3215, "step": 22070 }, { "epoch": 2.5149495984964974, "grad_norm": 11.717473030090332, "learning_rate": 2.4918369676903758e-05, "loss": 4.5536, "step": 22080 }, { "epoch": 2.516088615524802, "grad_norm": 10.13782024383545, "learning_rate": 2.490695284849869e-05, "loss": 5.0282, "step": 22090 }, { "epoch": 2.517227632553107, "grad_norm": 19.63833999633789, "learning_rate": 2.4895536020093618e-05, "loss": 4.5342, "step": 22100 }, { "epoch": 2.518366649581411, "grad_norm": 21.288978576660156, "learning_rate": 2.488411919168855e-05, "loss": 4.2393, "step": 22110 }, { "epoch": 2.519505666609716, "grad_norm": 14.36782455444336, "learning_rate": 2.4872702363283482e-05, "loss": 4.5864, "step": 22120 }, { "epoch": 2.5206446836380203, "grad_norm": 15.093581199645996, "learning_rate": 2.486128553487841e-05, "loss": 4.7658, "step": 22130 }, { "epoch": 2.521783700666325, "grad_norm": 13.700653076171875, "learning_rate": 2.4849868706473342e-05, "loss": 4.2336, "step": 22140 }, { "epoch": 2.5229227176946294, "grad_norm": 7.751123428344727, "learning_rate": 2.4838451878068274e-05, "loss": 4.8393, "step": 22150 }, { "epoch": 2.524061734722934, "grad_norm": 13.696725845336914, "learning_rate": 2.4827035049663206e-05, "loss": 4.5059, "step": 22160 }, { "epoch": 2.525200751751239, "grad_norm": 8.605315208435059, "learning_rate": 2.4815618221258134e-05, "loss": 4.4345, "step": 22170 }, { "epoch": 2.5263397687795432, "grad_norm": 22.05066680908203, "learning_rate": 2.4804201392853066e-05, "loss": 4.2108, "step": 22180 }, { "epoch": 2.5274787858078476, "grad_norm": 21.944713592529297, "learning_rate": 2.4792784564447998e-05, "loss": 4.3732, "step": 22190 }, { "epoch": 2.5286178028361523, "grad_norm": 19.73705291748047, "learning_rate": 2.478136773604293e-05, "loss": 4.8541, "step": 22200 }, { "epoch": 2.529756819864457, "grad_norm": 10.252677917480469, "learning_rate": 2.4769950907637858e-05, "loss": 4.4317, "step": 22210 }, { "epoch": 2.5308958368927614, "grad_norm": 10.081154823303223, "learning_rate": 2.475853407923279e-05, "loss": 4.5045, "step": 22220 }, { "epoch": 2.532034853921066, "grad_norm": 12.180397987365723, "learning_rate": 2.474711725082772e-05, "loss": 4.654, "step": 22230 }, { "epoch": 2.533173870949371, "grad_norm": 11.417940139770508, "learning_rate": 2.4735700422422654e-05, "loss": 4.6833, "step": 22240 }, { "epoch": 2.5343128879776753, "grad_norm": 14.055529594421387, "learning_rate": 2.4724283594017582e-05, "loss": 4.5018, "step": 22250 }, { "epoch": 2.5354519050059796, "grad_norm": 27.471111297607422, "learning_rate": 2.4712866765612514e-05, "loss": 4.3245, "step": 22260 }, { "epoch": 2.5365909220342844, "grad_norm": 10.814193725585938, "learning_rate": 2.4701449937207442e-05, "loss": 4.4528, "step": 22270 }, { "epoch": 2.537729939062589, "grad_norm": 22.31861686706543, "learning_rate": 2.4690033108802378e-05, "loss": 4.5534, "step": 22280 }, { "epoch": 2.5388689560908935, "grad_norm": 8.343460083007812, "learning_rate": 2.4678616280397306e-05, "loss": 4.6753, "step": 22290 }, { "epoch": 2.5400079731191982, "grad_norm": 92.80477142333984, "learning_rate": 2.4667199451992238e-05, "loss": 5.0062, "step": 22300 }, { "epoch": 2.5411469901475026, "grad_norm": 18.66383934020996, "learning_rate": 2.4655782623587166e-05, "loss": 4.2345, "step": 22310 }, { "epoch": 2.5422860071758073, "grad_norm": 17.985797882080078, "learning_rate": 2.46443657951821e-05, "loss": 4.4421, "step": 22320 }, { "epoch": 2.5434250242041117, "grad_norm": 11.158591270446777, "learning_rate": 2.463294896677703e-05, "loss": 4.4239, "step": 22330 }, { "epoch": 2.5445640412324164, "grad_norm": 11.361953735351562, "learning_rate": 2.4621532138371962e-05, "loss": 4.4585, "step": 22340 }, { "epoch": 2.545703058260721, "grad_norm": 11.874592781066895, "learning_rate": 2.461011530996689e-05, "loss": 4.3894, "step": 22350 }, { "epoch": 2.5468420752890255, "grad_norm": 10.548654556274414, "learning_rate": 2.4598698481561825e-05, "loss": 4.2967, "step": 22360 }, { "epoch": 2.5479810923173303, "grad_norm": 9.199511528015137, "learning_rate": 2.4587281653156754e-05, "loss": 4.4433, "step": 22370 }, { "epoch": 2.5491201093456346, "grad_norm": 11.830724716186523, "learning_rate": 2.4575864824751686e-05, "loss": 4.2873, "step": 22380 }, { "epoch": 2.5502591263739394, "grad_norm": 13.431696891784668, "learning_rate": 2.4564447996346614e-05, "loss": 4.6081, "step": 22390 }, { "epoch": 2.5513981434022437, "grad_norm": 9.607420921325684, "learning_rate": 2.455303116794155e-05, "loss": 4.528, "step": 22400 }, { "epoch": 2.5525371604305485, "grad_norm": 9.920609474182129, "learning_rate": 2.4541614339536478e-05, "loss": 4.3601, "step": 22410 }, { "epoch": 2.5536761774588532, "grad_norm": 17.428359985351562, "learning_rate": 2.453019751113141e-05, "loss": 4.6, "step": 22420 }, { "epoch": 2.5548151944871575, "grad_norm": 15.154529571533203, "learning_rate": 2.4518780682726338e-05, "loss": 4.621, "step": 22430 }, { "epoch": 2.5559542115154623, "grad_norm": 8.525446891784668, "learning_rate": 2.4507363854321273e-05, "loss": 4.2658, "step": 22440 }, { "epoch": 2.5570932285437666, "grad_norm": 13.262287139892578, "learning_rate": 2.4495947025916202e-05, "loss": 4.4597, "step": 22450 }, { "epoch": 2.5582322455720714, "grad_norm": 16.254911422729492, "learning_rate": 2.4484530197511134e-05, "loss": 4.3065, "step": 22460 }, { "epoch": 2.5593712626003757, "grad_norm": 9.883187294006348, "learning_rate": 2.4473113369106062e-05, "loss": 4.7282, "step": 22470 }, { "epoch": 2.5605102796286805, "grad_norm": 12.782791137695312, "learning_rate": 2.4461696540700997e-05, "loss": 4.4512, "step": 22480 }, { "epoch": 2.5616492966569853, "grad_norm": 16.31939697265625, "learning_rate": 2.4450279712295926e-05, "loss": 4.5547, "step": 22490 }, { "epoch": 2.5627883136852896, "grad_norm": 9.065730094909668, "learning_rate": 2.4438862883890858e-05, "loss": 4.404, "step": 22500 }, { "epoch": 2.563927330713594, "grad_norm": 12.488701820373535, "learning_rate": 2.4427446055485786e-05, "loss": 4.5696, "step": 22510 }, { "epoch": 2.5650663477418987, "grad_norm": 9.828531265258789, "learning_rate": 2.4416029227080718e-05, "loss": 4.3566, "step": 22520 }, { "epoch": 2.5662053647702034, "grad_norm": 14.15413761138916, "learning_rate": 2.440461239867565e-05, "loss": 4.5018, "step": 22530 }, { "epoch": 2.5673443817985078, "grad_norm": 14.06329345703125, "learning_rate": 2.439319557027058e-05, "loss": 4.4367, "step": 22540 }, { "epoch": 2.5684833988268125, "grad_norm": 13.392003059387207, "learning_rate": 2.438177874186551e-05, "loss": 4.9594, "step": 22550 }, { "epoch": 2.5696224158551173, "grad_norm": 9.897027969360352, "learning_rate": 2.4370361913460442e-05, "loss": 4.4857, "step": 22560 }, { "epoch": 2.5707614328834216, "grad_norm": 16.1251163482666, "learning_rate": 2.4358945085055374e-05, "loss": 4.2417, "step": 22570 }, { "epoch": 2.571900449911726, "grad_norm": 17.117238998413086, "learning_rate": 2.4347528256650302e-05, "loss": 4.3507, "step": 22580 }, { "epoch": 2.5730394669400307, "grad_norm": 15.291457176208496, "learning_rate": 2.4336111428245234e-05, "loss": 4.5056, "step": 22590 }, { "epoch": 2.5741784839683355, "grad_norm": 12.484232902526855, "learning_rate": 2.4324694599840166e-05, "loss": 4.3101, "step": 22600 }, { "epoch": 2.57531750099664, "grad_norm": 13.828505516052246, "learning_rate": 2.4313277771435098e-05, "loss": 4.3826, "step": 22610 }, { "epoch": 2.5764565180249446, "grad_norm": 22.372825622558594, "learning_rate": 2.4301860943030026e-05, "loss": 4.4154, "step": 22620 }, { "epoch": 2.577595535053249, "grad_norm": 27.027711868286133, "learning_rate": 2.4290444114624958e-05, "loss": 4.0432, "step": 22630 }, { "epoch": 2.5787345520815537, "grad_norm": 12.284584045410156, "learning_rate": 2.4279027286219886e-05, "loss": 4.6365, "step": 22640 }, { "epoch": 2.579873569109858, "grad_norm": 42.92761993408203, "learning_rate": 2.426761045781482e-05, "loss": 4.3148, "step": 22650 }, { "epoch": 2.5810125861381628, "grad_norm": 13.668088912963867, "learning_rate": 2.425619362940975e-05, "loss": 4.459, "step": 22660 }, { "epoch": 2.5821516031664675, "grad_norm": 30.51060676574707, "learning_rate": 2.4244776801004682e-05, "loss": 4.2199, "step": 22670 }, { "epoch": 2.583290620194772, "grad_norm": 7.775396823883057, "learning_rate": 2.423335997259961e-05, "loss": 4.3111, "step": 22680 }, { "epoch": 2.5844296372230766, "grad_norm": 17.842409133911133, "learning_rate": 2.4221943144194546e-05, "loss": 4.2721, "step": 22690 }, { "epoch": 2.585568654251381, "grad_norm": 16.127065658569336, "learning_rate": 2.4210526315789474e-05, "loss": 4.544, "step": 22700 }, { "epoch": 2.5867076712796857, "grad_norm": 10.717621803283691, "learning_rate": 2.4199109487384406e-05, "loss": 4.2006, "step": 22710 }, { "epoch": 2.58784668830799, "grad_norm": 78.16552734375, "learning_rate": 2.4187692658979334e-05, "loss": 4.6037, "step": 22720 }, { "epoch": 2.588985705336295, "grad_norm": 10.776037216186523, "learning_rate": 2.417627583057427e-05, "loss": 4.1843, "step": 22730 }, { "epoch": 2.5901247223645996, "grad_norm": 12.085357666015625, "learning_rate": 2.4164859002169198e-05, "loss": 4.2983, "step": 22740 }, { "epoch": 2.591263739392904, "grad_norm": 14.134846687316895, "learning_rate": 2.415344217376413e-05, "loss": 4.4293, "step": 22750 }, { "epoch": 2.5924027564212087, "grad_norm": 13.935742378234863, "learning_rate": 2.414202534535906e-05, "loss": 4.2963, "step": 22760 }, { "epoch": 2.593541773449513, "grad_norm": 11.380349159240723, "learning_rate": 2.4130608516953994e-05, "loss": 4.6185, "step": 22770 }, { "epoch": 2.5946807904778177, "grad_norm": 9.011507987976074, "learning_rate": 2.4119191688548922e-05, "loss": 4.3858, "step": 22780 }, { "epoch": 2.595819807506122, "grad_norm": 13.143577575683594, "learning_rate": 2.4107774860143854e-05, "loss": 4.801, "step": 22790 }, { "epoch": 2.596958824534427, "grad_norm": 10.079553604125977, "learning_rate": 2.4096358031738782e-05, "loss": 4.4776, "step": 22800 }, { "epoch": 2.5980978415627316, "grad_norm": 15.193427085876465, "learning_rate": 2.4084941203333717e-05, "loss": 4.1893, "step": 22810 }, { "epoch": 2.599236858591036, "grad_norm": 18.3494815826416, "learning_rate": 2.4073524374928646e-05, "loss": 4.2356, "step": 22820 }, { "epoch": 2.6003758756193402, "grad_norm": 9.987313270568848, "learning_rate": 2.4062107546523578e-05, "loss": 4.1031, "step": 22830 }, { "epoch": 2.601514892647645, "grad_norm": 10.688200950622559, "learning_rate": 2.4050690718118506e-05, "loss": 4.5624, "step": 22840 }, { "epoch": 2.60265390967595, "grad_norm": 17.7220401763916, "learning_rate": 2.403927388971344e-05, "loss": 4.2588, "step": 22850 }, { "epoch": 2.603792926704254, "grad_norm": 15.036736488342285, "learning_rate": 2.402785706130837e-05, "loss": 4.3631, "step": 22860 }, { "epoch": 2.604931943732559, "grad_norm": 13.211268424987793, "learning_rate": 2.4016440232903302e-05, "loss": 4.1452, "step": 22870 }, { "epoch": 2.6060709607608636, "grad_norm": 11.788058280944824, "learning_rate": 2.400502340449823e-05, "loss": 4.6403, "step": 22880 }, { "epoch": 2.607209977789168, "grad_norm": 11.017247200012207, "learning_rate": 2.3993606576093162e-05, "loss": 4.6076, "step": 22890 }, { "epoch": 2.6083489948174723, "grad_norm": 32.86994171142578, "learning_rate": 2.3982189747688094e-05, "loss": 4.045, "step": 22900 }, { "epoch": 2.609488011845777, "grad_norm": 11.733973503112793, "learning_rate": 2.3970772919283026e-05, "loss": 4.4082, "step": 22910 }, { "epoch": 2.610627028874082, "grad_norm": 12.394948959350586, "learning_rate": 2.3959356090877954e-05, "loss": 4.4233, "step": 22920 }, { "epoch": 2.611766045902386, "grad_norm": 8.273716926574707, "learning_rate": 2.3947939262472886e-05, "loss": 4.3974, "step": 22930 }, { "epoch": 2.612905062930691, "grad_norm": 12.322443962097168, "learning_rate": 2.3936522434067818e-05, "loss": 4.5955, "step": 22940 }, { "epoch": 2.6140440799589952, "grad_norm": 15.869829177856445, "learning_rate": 2.392510560566275e-05, "loss": 4.3106, "step": 22950 }, { "epoch": 2.6151830969873, "grad_norm": 9.611001968383789, "learning_rate": 2.3913688777257678e-05, "loss": 4.4168, "step": 22960 }, { "epoch": 2.6163221140156043, "grad_norm": 10.356759071350098, "learning_rate": 2.390227194885261e-05, "loss": 4.4961, "step": 22970 }, { "epoch": 2.617461131043909, "grad_norm": 34.95732879638672, "learning_rate": 2.3890855120447542e-05, "loss": 4.0985, "step": 22980 }, { "epoch": 2.618600148072214, "grad_norm": 39.855934143066406, "learning_rate": 2.3879438292042474e-05, "loss": 4.1007, "step": 22990 }, { "epoch": 2.619739165100518, "grad_norm": 9.185993194580078, "learning_rate": 2.3868021463637402e-05, "loss": 4.4709, "step": 23000 }, { "epoch": 2.620878182128823, "grad_norm": 14.936957359313965, "learning_rate": 2.3856604635232334e-05, "loss": 4.6402, "step": 23010 }, { "epoch": 2.6220171991571273, "grad_norm": 11.28511905670166, "learning_rate": 2.3845187806827266e-05, "loss": 4.8301, "step": 23020 }, { "epoch": 2.623156216185432, "grad_norm": 14.267093658447266, "learning_rate": 2.3833770978422194e-05, "loss": 4.439, "step": 23030 }, { "epoch": 2.6242952332137364, "grad_norm": 9.55140209197998, "learning_rate": 2.3822354150017126e-05, "loss": 4.4499, "step": 23040 }, { "epoch": 2.625434250242041, "grad_norm": 12.693892478942871, "learning_rate": 2.3810937321612058e-05, "loss": 4.3799, "step": 23050 }, { "epoch": 2.626573267270346, "grad_norm": 19.29254150390625, "learning_rate": 2.379952049320699e-05, "loss": 4.4028, "step": 23060 }, { "epoch": 2.6277122842986502, "grad_norm": 18.440366744995117, "learning_rate": 2.3788103664801918e-05, "loss": 4.2386, "step": 23070 }, { "epoch": 2.628851301326955, "grad_norm": 11.92179012298584, "learning_rate": 2.377668683639685e-05, "loss": 4.1973, "step": 23080 }, { "epoch": 2.6299903183552593, "grad_norm": 10.336345672607422, "learning_rate": 2.376527000799178e-05, "loss": 4.4222, "step": 23090 }, { "epoch": 2.631129335383564, "grad_norm": 20.774919509887695, "learning_rate": 2.3753853179586714e-05, "loss": 4.2102, "step": 23100 }, { "epoch": 2.6322683524118684, "grad_norm": 11.118274688720703, "learning_rate": 2.3742436351181642e-05, "loss": 4.7242, "step": 23110 }, { "epoch": 2.633407369440173, "grad_norm": 9.938228607177734, "learning_rate": 2.3731019522776574e-05, "loss": 4.5976, "step": 23120 }, { "epoch": 2.634546386468478, "grad_norm": 13.257858276367188, "learning_rate": 2.3720744377212013e-05, "loss": 4.2183, "step": 23130 }, { "epoch": 2.6356854034967823, "grad_norm": 12.497907638549805, "learning_rate": 2.3709327548806942e-05, "loss": 4.5292, "step": 23140 }, { "epoch": 2.6368244205250866, "grad_norm": 11.595098495483398, "learning_rate": 2.3697910720401874e-05, "loss": 4.2823, "step": 23150 }, { "epoch": 2.6379634375533914, "grad_norm": 10.914962768554688, "learning_rate": 2.3686493891996805e-05, "loss": 4.4755, "step": 23160 }, { "epoch": 2.639102454581696, "grad_norm": 8.901376724243164, "learning_rate": 2.3675077063591737e-05, "loss": 4.5236, "step": 23170 }, { "epoch": 2.6402414716100004, "grad_norm": 9.814475059509277, "learning_rate": 2.3663660235186666e-05, "loss": 4.5523, "step": 23180 }, { "epoch": 2.641380488638305, "grad_norm": 26.18296241760254, "learning_rate": 2.3652243406781598e-05, "loss": 4.3089, "step": 23190 }, { "epoch": 2.64251950566661, "grad_norm": 16.817184448242188, "learning_rate": 2.364082657837653e-05, "loss": 4.3481, "step": 23200 }, { "epoch": 2.6436585226949143, "grad_norm": 11.565564155578613, "learning_rate": 2.3629409749971458e-05, "loss": 4.4284, "step": 23210 }, { "epoch": 2.6447975397232186, "grad_norm": 18.328445434570312, "learning_rate": 2.361799292156639e-05, "loss": 4.5163, "step": 23220 }, { "epoch": 2.6459365567515234, "grad_norm": 17.54689598083496, "learning_rate": 2.360657609316132e-05, "loss": 4.1599, "step": 23230 }, { "epoch": 2.647075573779828, "grad_norm": 11.191732406616211, "learning_rate": 2.3595159264756253e-05, "loss": 4.4104, "step": 23240 }, { "epoch": 2.6482145908081325, "grad_norm": 9.781057357788086, "learning_rate": 2.3583742436351182e-05, "loss": 4.2188, "step": 23250 }, { "epoch": 2.6493536078364373, "grad_norm": 14.09641170501709, "learning_rate": 2.3572325607946114e-05, "loss": 4.1248, "step": 23260 }, { "epoch": 2.6504926248647416, "grad_norm": 21.603864669799805, "learning_rate": 2.3560908779541045e-05, "loss": 4.4381, "step": 23270 }, { "epoch": 2.6516316418930463, "grad_norm": 34.63275909423828, "learning_rate": 2.3549491951135977e-05, "loss": 4.3809, "step": 23280 }, { "epoch": 2.6527706589213507, "grad_norm": 15.982152938842773, "learning_rate": 2.3538075122730906e-05, "loss": 4.2162, "step": 23290 }, { "epoch": 2.6539096759496554, "grad_norm": 11.666319847106934, "learning_rate": 2.3526658294325838e-05, "loss": 4.7708, "step": 23300 }, { "epoch": 2.65504869297796, "grad_norm": 13.447303771972656, "learning_rate": 2.3515241465920766e-05, "loss": 4.6003, "step": 23310 }, { "epoch": 2.6561877100062645, "grad_norm": 30.06685447692871, "learning_rate": 2.35038246375157e-05, "loss": 4.3196, "step": 23320 }, { "epoch": 2.6573267270345693, "grad_norm": 18.421146392822266, "learning_rate": 2.349240780911063e-05, "loss": 4.5392, "step": 23330 }, { "epoch": 2.6584657440628736, "grad_norm": 7.0810227394104, "learning_rate": 2.348099098070556e-05, "loss": 4.5955, "step": 23340 }, { "epoch": 2.6596047610911784, "grad_norm": 18.516857147216797, "learning_rate": 2.346957415230049e-05, "loss": 4.3521, "step": 23350 }, { "epoch": 2.6607437781194827, "grad_norm": 10.7908935546875, "learning_rate": 2.3458157323895422e-05, "loss": 4.2535, "step": 23360 }, { "epoch": 2.6618827951477875, "grad_norm": 46.05020523071289, "learning_rate": 2.3446740495490354e-05, "loss": 4.271, "step": 23370 }, { "epoch": 2.6630218121760922, "grad_norm": 16.857059478759766, "learning_rate": 2.3435323667085286e-05, "loss": 4.7449, "step": 23380 }, { "epoch": 2.6641608292043966, "grad_norm": 15.106719017028809, "learning_rate": 2.3423906838680214e-05, "loss": 4.6435, "step": 23390 }, { "epoch": 2.6652998462327013, "grad_norm": 19.65445899963379, "learning_rate": 2.3412490010275146e-05, "loss": 4.3389, "step": 23400 }, { "epoch": 2.6664388632610057, "grad_norm": 10.110185623168945, "learning_rate": 2.3401073181870078e-05, "loss": 4.544, "step": 23410 }, { "epoch": 2.6675778802893104, "grad_norm": 24.37737274169922, "learning_rate": 2.338965635346501e-05, "loss": 4.5911, "step": 23420 }, { "epoch": 2.6687168973176147, "grad_norm": 26.02604103088379, "learning_rate": 2.3378239525059938e-05, "loss": 4.1215, "step": 23430 }, { "epoch": 2.6698559143459195, "grad_norm": 10.337889671325684, "learning_rate": 2.336682269665487e-05, "loss": 4.4812, "step": 23440 }, { "epoch": 2.6709949313742243, "grad_norm": 12.305615425109863, "learning_rate": 2.33554058682498e-05, "loss": 4.3729, "step": 23450 }, { "epoch": 2.6721339484025286, "grad_norm": 23.679931640625, "learning_rate": 2.3343989039844733e-05, "loss": 4.5077, "step": 23460 }, { "epoch": 2.673272965430833, "grad_norm": 19.276321411132812, "learning_rate": 2.3332572211439662e-05, "loss": 4.7323, "step": 23470 }, { "epoch": 2.6744119824591377, "grad_norm": 18.629981994628906, "learning_rate": 2.3321155383034594e-05, "loss": 4.6628, "step": 23480 }, { "epoch": 2.6755509994874425, "grad_norm": 57.32600021362305, "learning_rate": 2.3309738554629526e-05, "loss": 4.3868, "step": 23490 }, { "epoch": 2.676690016515747, "grad_norm": 10.035811424255371, "learning_rate": 2.3298321726224457e-05, "loss": 4.4898, "step": 23500 }, { "epoch": 2.6778290335440516, "grad_norm": 11.750822067260742, "learning_rate": 2.3286904897819386e-05, "loss": 4.542, "step": 23510 }, { "epoch": 2.6789680505723563, "grad_norm": 11.74675178527832, "learning_rate": 2.3275488069414318e-05, "loss": 4.7508, "step": 23520 }, { "epoch": 2.6801070676006606, "grad_norm": 13.379685401916504, "learning_rate": 2.326407124100925e-05, "loss": 4.4905, "step": 23530 }, { "epoch": 2.681246084628965, "grad_norm": 17.032699584960938, "learning_rate": 2.325265441260418e-05, "loss": 4.4351, "step": 23540 }, { "epoch": 2.6823851016572697, "grad_norm": 11.6006441116333, "learning_rate": 2.324123758419911e-05, "loss": 4.0651, "step": 23550 }, { "epoch": 2.6835241186855745, "grad_norm": 10.766636848449707, "learning_rate": 2.322982075579404e-05, "loss": 4.6497, "step": 23560 }, { "epoch": 2.684663135713879, "grad_norm": 17.304126739501953, "learning_rate": 2.3218403927388973e-05, "loss": 4.3875, "step": 23570 }, { "epoch": 2.6858021527421836, "grad_norm": 8.679882049560547, "learning_rate": 2.3206987098983905e-05, "loss": 4.3275, "step": 23580 }, { "epoch": 2.686941169770488, "grad_norm": 12.993999481201172, "learning_rate": 2.3195570270578834e-05, "loss": 4.5775, "step": 23590 }, { "epoch": 2.6880801867987927, "grad_norm": 19.391279220581055, "learning_rate": 2.3184153442173766e-05, "loss": 4.3392, "step": 23600 }, { "epoch": 2.689219203827097, "grad_norm": 9.784872055053711, "learning_rate": 2.3172736613768697e-05, "loss": 4.4187, "step": 23610 }, { "epoch": 2.6903582208554018, "grad_norm": 17.43511199951172, "learning_rate": 2.316131978536363e-05, "loss": 4.6063, "step": 23620 }, { "epoch": 2.6914972378837065, "grad_norm": 47.65671157836914, "learning_rate": 2.3149902956958558e-05, "loss": 4.1528, "step": 23630 }, { "epoch": 2.692636254912011, "grad_norm": 11.7083740234375, "learning_rate": 2.313848612855349e-05, "loss": 4.5552, "step": 23640 }, { "epoch": 2.6937752719403156, "grad_norm": 42.59748840332031, "learning_rate": 2.312706930014842e-05, "loss": 4.3025, "step": 23650 }, { "epoch": 2.69491428896862, "grad_norm": 23.80282974243164, "learning_rate": 2.311565247174335e-05, "loss": 4.593, "step": 23660 }, { "epoch": 2.6960533059969247, "grad_norm": 12.07687759399414, "learning_rate": 2.310423564333828e-05, "loss": 4.6442, "step": 23670 }, { "epoch": 2.697192323025229, "grad_norm": 9.943156242370605, "learning_rate": 2.3092818814933214e-05, "loss": 4.3141, "step": 23680 }, { "epoch": 2.698331340053534, "grad_norm": 45.91529846191406, "learning_rate": 2.3081401986528142e-05, "loss": 4.6511, "step": 23690 }, { "epoch": 2.6994703570818386, "grad_norm": 14.183732032775879, "learning_rate": 2.3069985158123074e-05, "loss": 4.3711, "step": 23700 }, { "epoch": 2.700609374110143, "grad_norm": 10.457621574401855, "learning_rate": 2.3058568329718006e-05, "loss": 4.1473, "step": 23710 }, { "epoch": 2.7017483911384477, "grad_norm": 15.812500953674316, "learning_rate": 2.3047151501312934e-05, "loss": 4.4821, "step": 23720 }, { "epoch": 2.702887408166752, "grad_norm": 10.635324478149414, "learning_rate": 2.3035734672907866e-05, "loss": 4.6118, "step": 23730 }, { "epoch": 2.7040264251950568, "grad_norm": 23.687637329101562, "learning_rate": 2.3024317844502798e-05, "loss": 4.4089, "step": 23740 }, { "epoch": 2.705165442223361, "grad_norm": 11.307161331176758, "learning_rate": 2.301290101609773e-05, "loss": 4.4876, "step": 23750 }, { "epoch": 2.706304459251666, "grad_norm": 9.728133201599121, "learning_rate": 2.3001484187692658e-05, "loss": 4.4325, "step": 23760 }, { "epoch": 2.7074434762799706, "grad_norm": 17.820701599121094, "learning_rate": 2.299006735928759e-05, "loss": 4.4467, "step": 23770 }, { "epoch": 2.708582493308275, "grad_norm": 14.636841773986816, "learning_rate": 2.2978650530882522e-05, "loss": 4.4374, "step": 23780 }, { "epoch": 2.7097215103365793, "grad_norm": 14.008092880249023, "learning_rate": 2.2967233702477454e-05, "loss": 3.9471, "step": 23790 }, { "epoch": 2.710860527364884, "grad_norm": 11.054430961608887, "learning_rate": 2.2955816874072382e-05, "loss": 4.1937, "step": 23800 }, { "epoch": 2.711999544393189, "grad_norm": 10.359509468078613, "learning_rate": 2.2944400045667314e-05, "loss": 4.6554, "step": 23810 }, { "epoch": 2.713138561421493, "grad_norm": 11.266721725463867, "learning_rate": 2.2932983217262246e-05, "loss": 4.2294, "step": 23820 }, { "epoch": 2.714277578449798, "grad_norm": 13.704965591430664, "learning_rate": 2.2921566388857178e-05, "loss": 4.446, "step": 23830 }, { "epoch": 2.7154165954781027, "grad_norm": 10.502473831176758, "learning_rate": 2.2910149560452106e-05, "loss": 4.669, "step": 23840 }, { "epoch": 2.716555612506407, "grad_norm": 11.850542068481445, "learning_rate": 2.2898732732047038e-05, "loss": 4.5317, "step": 23850 }, { "epoch": 2.7176946295347113, "grad_norm": 28.340858459472656, "learning_rate": 2.288731590364197e-05, "loss": 4.4106, "step": 23860 }, { "epoch": 2.718833646563016, "grad_norm": 13.200830459594727, "learning_rate": 2.28758990752369e-05, "loss": 4.4204, "step": 23870 }, { "epoch": 2.719972663591321, "grad_norm": 22.98053741455078, "learning_rate": 2.286448224683183e-05, "loss": 4.1871, "step": 23880 }, { "epoch": 2.721111680619625, "grad_norm": 20.95444107055664, "learning_rate": 2.2853065418426762e-05, "loss": 4.3372, "step": 23890 }, { "epoch": 2.72225069764793, "grad_norm": 11.450345993041992, "learning_rate": 2.2841648590021694e-05, "loss": 4.4479, "step": 23900 }, { "epoch": 2.7233897146762343, "grad_norm": 48.11774826049805, "learning_rate": 2.2830231761616625e-05, "loss": 4.208, "step": 23910 }, { "epoch": 2.724528731704539, "grad_norm": 17.685771942138672, "learning_rate": 2.2818814933211554e-05, "loss": 4.3589, "step": 23920 }, { "epoch": 2.7256677487328433, "grad_norm": 13.555407524108887, "learning_rate": 2.2807398104806486e-05, "loss": 4.4688, "step": 23930 }, { "epoch": 2.726806765761148, "grad_norm": 12.690849304199219, "learning_rate": 2.2795981276401418e-05, "loss": 4.1132, "step": 23940 }, { "epoch": 2.727945782789453, "grad_norm": 12.816424369812012, "learning_rate": 2.278456444799635e-05, "loss": 4.5699, "step": 23950 }, { "epoch": 2.729084799817757, "grad_norm": 10.102202415466309, "learning_rate": 2.2773147619591278e-05, "loss": 4.3418, "step": 23960 }, { "epoch": 2.730223816846062, "grad_norm": 12.096992492675781, "learning_rate": 2.276173079118621e-05, "loss": 4.3054, "step": 23970 }, { "epoch": 2.7313628338743663, "grad_norm": 11.40463638305664, "learning_rate": 2.275031396278114e-05, "loss": 4.4429, "step": 23980 }, { "epoch": 2.732501850902671, "grad_norm": 28.24003028869629, "learning_rate": 2.2738897134376073e-05, "loss": 4.2202, "step": 23990 }, { "epoch": 2.7336408679309754, "grad_norm": 12.26915168762207, "learning_rate": 2.2727480305971002e-05, "loss": 4.4253, "step": 24000 }, { "epoch": 2.7336408679309754, "eval_loss": 6.015596389770508, "eval_runtime": 11.1696, "eval_samples_per_second": 1.343, "eval_steps_per_second": 0.179, "step": 24000 }, { "epoch": 2.73477988495928, "grad_norm": 11.82247543334961, "learning_rate": 2.2716063477565934e-05, "loss": 4.3248, "step": 24010 }, { "epoch": 2.735918901987585, "grad_norm": 19.194496154785156, "learning_rate": 2.2704646649160862e-05, "loss": 4.542, "step": 24020 }, { "epoch": 2.7370579190158892, "grad_norm": 21.586811065673828, "learning_rate": 2.2693229820755797e-05, "loss": 4.1183, "step": 24030 }, { "epoch": 2.738196936044194, "grad_norm": 10.829854011535645, "learning_rate": 2.2681812992350726e-05, "loss": 4.367, "step": 24040 }, { "epoch": 2.7393359530724983, "grad_norm": 9.324262619018555, "learning_rate": 2.2670396163945658e-05, "loss": 4.5154, "step": 24050 }, { "epoch": 2.740474970100803, "grad_norm": 13.50275993347168, "learning_rate": 2.2658979335540586e-05, "loss": 4.6754, "step": 24060 }, { "epoch": 2.7416139871291074, "grad_norm": 20.0113468170166, "learning_rate": 2.264756250713552e-05, "loss": 4.7198, "step": 24070 }, { "epoch": 2.742753004157412, "grad_norm": 10.483927726745605, "learning_rate": 2.263614567873045e-05, "loss": 4.4205, "step": 24080 }, { "epoch": 2.743892021185717, "grad_norm": 16.790781021118164, "learning_rate": 2.262472885032538e-05, "loss": 4.5606, "step": 24090 }, { "epoch": 2.7450310382140213, "grad_norm": 11.87458610534668, "learning_rate": 2.261331202192031e-05, "loss": 4.3453, "step": 24100 }, { "epoch": 2.7461700552423256, "grad_norm": 11.971658706665039, "learning_rate": 2.2601895193515242e-05, "loss": 4.116, "step": 24110 }, { "epoch": 2.7473090722706304, "grad_norm": 9.94765853881836, "learning_rate": 2.2590478365110174e-05, "loss": 4.4187, "step": 24120 }, { "epoch": 2.748448089298935, "grad_norm": 16.397294998168945, "learning_rate": 2.2579061536705106e-05, "loss": 4.5822, "step": 24130 }, { "epoch": 2.7495871063272395, "grad_norm": 18.915102005004883, "learning_rate": 2.2567644708300034e-05, "loss": 4.5509, "step": 24140 }, { "epoch": 2.7507261233555442, "grad_norm": 19.87112808227539, "learning_rate": 2.2556227879894966e-05, "loss": 4.7238, "step": 24150 }, { "epoch": 2.751865140383849, "grad_norm": 39.02969741821289, "learning_rate": 2.2544811051489898e-05, "loss": 4.2455, "step": 24160 }, { "epoch": 2.7530041574121533, "grad_norm": 13.967534065246582, "learning_rate": 2.2533394223084826e-05, "loss": 4.7613, "step": 24170 }, { "epoch": 2.7541431744404576, "grad_norm": 21.92757797241211, "learning_rate": 2.2521977394679758e-05, "loss": 4.2869, "step": 24180 }, { "epoch": 2.7552821914687624, "grad_norm": 12.102161407470703, "learning_rate": 2.251056056627469e-05, "loss": 4.2129, "step": 24190 }, { "epoch": 2.756421208497067, "grad_norm": 8.07431411743164, "learning_rate": 2.249914373786962e-05, "loss": 4.4516, "step": 24200 }, { "epoch": 2.7575602255253715, "grad_norm": 8.32588005065918, "learning_rate": 2.248772690946455e-05, "loss": 4.4829, "step": 24210 }, { "epoch": 2.7586992425536763, "grad_norm": 7.788595676422119, "learning_rate": 2.2476310081059482e-05, "loss": 4.4237, "step": 24220 }, { "epoch": 2.7598382595819806, "grad_norm": 9.158100128173828, "learning_rate": 2.2464893252654414e-05, "loss": 4.1744, "step": 24230 }, { "epoch": 2.7609772766102854, "grad_norm": 12.381442070007324, "learning_rate": 2.2453476424249346e-05, "loss": 4.3906, "step": 24240 }, { "epoch": 2.7621162936385897, "grad_norm": 11.133394241333008, "learning_rate": 2.2442059595844274e-05, "loss": 4.1881, "step": 24250 }, { "epoch": 2.7632553106668944, "grad_norm": 14.3223295211792, "learning_rate": 2.2430642767439206e-05, "loss": 4.4534, "step": 24260 }, { "epoch": 2.764394327695199, "grad_norm": 11.660000801086426, "learning_rate": 2.2419225939034138e-05, "loss": 4.399, "step": 24270 }, { "epoch": 2.7655333447235035, "grad_norm": 9.384432792663574, "learning_rate": 2.240780911062907e-05, "loss": 4.447, "step": 24280 }, { "epoch": 2.7666723617518083, "grad_norm": 10.920495986938477, "learning_rate": 2.2396392282223998e-05, "loss": 4.4813, "step": 24290 }, { "epoch": 2.7678113787801126, "grad_norm": 10.877670288085938, "learning_rate": 2.238497545381893e-05, "loss": 4.5073, "step": 24300 }, { "epoch": 2.7689503958084174, "grad_norm": 11.857516288757324, "learning_rate": 2.237355862541386e-05, "loss": 4.4858, "step": 24310 }, { "epoch": 2.7700894128367217, "grad_norm": 19.363431930541992, "learning_rate": 2.2362141797008794e-05, "loss": 4.4434, "step": 24320 }, { "epoch": 2.7712284298650265, "grad_norm": 13.742807388305664, "learning_rate": 2.2350724968603722e-05, "loss": 3.8316, "step": 24330 }, { "epoch": 2.7723674468933313, "grad_norm": 7.8158135414123535, "learning_rate": 2.2339308140198654e-05, "loss": 4.4209, "step": 24340 }, { "epoch": 2.7735064639216356, "grad_norm": 19.696626663208008, "learning_rate": 2.2327891311793586e-05, "loss": 4.4275, "step": 24350 }, { "epoch": 2.7746454809499403, "grad_norm": 13.6576509475708, "learning_rate": 2.2316474483388517e-05, "loss": 4.53, "step": 24360 }, { "epoch": 2.7757844979782447, "grad_norm": 12.400209426879883, "learning_rate": 2.2305057654983446e-05, "loss": 4.3855, "step": 24370 }, { "epoch": 2.7769235150065494, "grad_norm": 24.432838439941406, "learning_rate": 2.2293640826578378e-05, "loss": 4.2563, "step": 24380 }, { "epoch": 2.7780625320348538, "grad_norm": 9.757552146911621, "learning_rate": 2.2282223998173306e-05, "loss": 4.2891, "step": 24390 }, { "epoch": 2.7792015490631585, "grad_norm": 50.52961349487305, "learning_rate": 2.227080716976824e-05, "loss": 4.4685, "step": 24400 }, { "epoch": 2.7803405660914633, "grad_norm": 28.591514587402344, "learning_rate": 2.225939034136317e-05, "loss": 4.6151, "step": 24410 }, { "epoch": 2.7814795831197676, "grad_norm": 13.835251808166504, "learning_rate": 2.2247973512958102e-05, "loss": 4.3123, "step": 24420 }, { "epoch": 2.782618600148072, "grad_norm": 9.26570987701416, "learning_rate": 2.223655668455303e-05, "loss": 4.4072, "step": 24430 }, { "epoch": 2.7837576171763767, "grad_norm": 13.097249031066895, "learning_rate": 2.2225139856147965e-05, "loss": 4.4835, "step": 24440 }, { "epoch": 2.7848966342046815, "grad_norm": 8.850903511047363, "learning_rate": 2.2213723027742894e-05, "loss": 4.8888, "step": 24450 }, { "epoch": 2.786035651232986, "grad_norm": 14.328838348388672, "learning_rate": 2.2202306199337826e-05, "loss": 4.1116, "step": 24460 }, { "epoch": 2.7871746682612906, "grad_norm": 12.785542488098145, "learning_rate": 2.2190889370932754e-05, "loss": 4.6598, "step": 24470 }, { "epoch": 2.7883136852895953, "grad_norm": 11.574085235595703, "learning_rate": 2.217947254252769e-05, "loss": 4.7404, "step": 24480 }, { "epoch": 2.7894527023178997, "grad_norm": 13.864222526550293, "learning_rate": 2.2168055714122618e-05, "loss": 4.5075, "step": 24490 }, { "epoch": 2.790591719346204, "grad_norm": 9.302299499511719, "learning_rate": 2.215663888571755e-05, "loss": 4.4674, "step": 24500 }, { "epoch": 2.7917307363745087, "grad_norm": 11.267061233520508, "learning_rate": 2.2145222057312478e-05, "loss": 4.5262, "step": 24510 }, { "epoch": 2.7928697534028135, "grad_norm": 15.274855613708496, "learning_rate": 2.213380522890741e-05, "loss": 4.1499, "step": 24520 }, { "epoch": 2.794008770431118, "grad_norm": 12.671558380126953, "learning_rate": 2.2122388400502342e-05, "loss": 4.6206, "step": 24530 }, { "epoch": 2.7951477874594226, "grad_norm": 12.626591682434082, "learning_rate": 2.2110971572097274e-05, "loss": 4.5262, "step": 24540 }, { "epoch": 2.796286804487727, "grad_norm": 33.061283111572266, "learning_rate": 2.2099554743692202e-05, "loss": 3.9884, "step": 24550 }, { "epoch": 2.7974258215160317, "grad_norm": 14.592642784118652, "learning_rate": 2.2088137915287134e-05, "loss": 4.2963, "step": 24560 }, { "epoch": 2.798564838544336, "grad_norm": 20.097068786621094, "learning_rate": 2.2076721086882066e-05, "loss": 4.4814, "step": 24570 }, { "epoch": 2.799703855572641, "grad_norm": 11.807546615600586, "learning_rate": 2.2065304258476998e-05, "loss": 4.2837, "step": 24580 }, { "epoch": 2.8008428726009456, "grad_norm": 13.322919845581055, "learning_rate": 2.2053887430071926e-05, "loss": 4.328, "step": 24590 }, { "epoch": 2.80198188962925, "grad_norm": 10.876242637634277, "learning_rate": 2.2042470601666858e-05, "loss": 4.4833, "step": 24600 }, { "epoch": 2.8031209066575546, "grad_norm": 8.46240234375, "learning_rate": 2.203105377326179e-05, "loss": 4.0987, "step": 24610 }, { "epoch": 2.804259923685859, "grad_norm": 8.846264839172363, "learning_rate": 2.2019636944856718e-05, "loss": 4.2597, "step": 24620 }, { "epoch": 2.8053989407141637, "grad_norm": 11.06393814086914, "learning_rate": 2.200822011645165e-05, "loss": 4.4988, "step": 24630 }, { "epoch": 2.806537957742468, "grad_norm": 16.189380645751953, "learning_rate": 2.1996803288046582e-05, "loss": 4.245, "step": 24640 }, { "epoch": 2.807676974770773, "grad_norm": 9.137038230895996, "learning_rate": 2.1985386459641514e-05, "loss": 4.3353, "step": 24650 }, { "epoch": 2.8088159917990776, "grad_norm": 20.32309913635254, "learning_rate": 2.1973969631236442e-05, "loss": 4.1225, "step": 24660 }, { "epoch": 2.809955008827382, "grad_norm": 9.596814155578613, "learning_rate": 2.1962552802831374e-05, "loss": 4.2842, "step": 24670 }, { "epoch": 2.8110940258556867, "grad_norm": 42.25757598876953, "learning_rate": 2.1951135974426306e-05, "loss": 4.3953, "step": 24680 }, { "epoch": 2.812233042883991, "grad_norm": 10.706561088562012, "learning_rate": 2.1939719146021238e-05, "loss": 4.3581, "step": 24690 }, { "epoch": 2.8133720599122958, "grad_norm": 8.807883262634277, "learning_rate": 2.1928302317616166e-05, "loss": 4.4784, "step": 24700 }, { "epoch": 2.8145110769406, "grad_norm": 12.849080085754395, "learning_rate": 2.1916885489211098e-05, "loss": 4.1881, "step": 24710 }, { "epoch": 2.815650093968905, "grad_norm": 31.404457092285156, "learning_rate": 2.1905468660806026e-05, "loss": 3.8575, "step": 24720 }, { "epoch": 2.8167891109972096, "grad_norm": 10.283102989196777, "learning_rate": 2.189405183240096e-05, "loss": 4.3447, "step": 24730 }, { "epoch": 2.817928128025514, "grad_norm": 14.568113327026367, "learning_rate": 2.188263500399589e-05, "loss": 4.3281, "step": 24740 }, { "epoch": 2.8190671450538183, "grad_norm": 19.217689514160156, "learning_rate": 2.1871218175590822e-05, "loss": 4.1652, "step": 24750 }, { "epoch": 2.820206162082123, "grad_norm": 13.5247802734375, "learning_rate": 2.185980134718575e-05, "loss": 4.4339, "step": 24760 }, { "epoch": 2.821345179110428, "grad_norm": 31.91325569152832, "learning_rate": 2.1848384518780686e-05, "loss": 4.5414, "step": 24770 }, { "epoch": 2.822484196138732, "grad_norm": 10.52747631072998, "learning_rate": 2.1836967690375614e-05, "loss": 4.1582, "step": 24780 }, { "epoch": 2.823623213167037, "grad_norm": 10.953377723693848, "learning_rate": 2.1825550861970546e-05, "loss": 4.7382, "step": 24790 }, { "epoch": 2.8247622301953417, "grad_norm": 16.29425048828125, "learning_rate": 2.1814134033565474e-05, "loss": 4.3219, "step": 24800 }, { "epoch": 2.825901247223646, "grad_norm": 11.031655311584473, "learning_rate": 2.180271720516041e-05, "loss": 4.5626, "step": 24810 }, { "epoch": 2.8270402642519503, "grad_norm": 10.524728775024414, "learning_rate": 2.1791300376755338e-05, "loss": 4.4956, "step": 24820 }, { "epoch": 2.828179281280255, "grad_norm": 18.397052764892578, "learning_rate": 2.177988354835027e-05, "loss": 4.6385, "step": 24830 }, { "epoch": 2.82931829830856, "grad_norm": 9.142768859863281, "learning_rate": 2.1768466719945198e-05, "loss": 4.0946, "step": 24840 }, { "epoch": 2.830457315336864, "grad_norm": 21.6534366607666, "learning_rate": 2.1757049891540133e-05, "loss": 4.0786, "step": 24850 }, { "epoch": 2.831596332365169, "grad_norm": 15.395153045654297, "learning_rate": 2.1745633063135062e-05, "loss": 4.2351, "step": 24860 }, { "epoch": 2.8327353493934733, "grad_norm": 10.324874877929688, "learning_rate": 2.1734216234729994e-05, "loss": 4.5813, "step": 24870 }, { "epoch": 2.833874366421778, "grad_norm": 8.68514347076416, "learning_rate": 2.1722799406324922e-05, "loss": 4.1804, "step": 24880 }, { "epoch": 2.8350133834500824, "grad_norm": 24.952619552612305, "learning_rate": 2.1711382577919857e-05, "loss": 4.4424, "step": 24890 }, { "epoch": 2.836152400478387, "grad_norm": 10.112604141235352, "learning_rate": 2.1699965749514786e-05, "loss": 4.3136, "step": 24900 }, { "epoch": 2.837291417506692, "grad_norm": 9.500236511230469, "learning_rate": 2.1688548921109718e-05, "loss": 4.2868, "step": 24910 }, { "epoch": 2.838430434534996, "grad_norm": 11.29405403137207, "learning_rate": 2.1677132092704646e-05, "loss": 4.8407, "step": 24920 }, { "epoch": 2.839569451563301, "grad_norm": 14.321564674377441, "learning_rate": 2.166571526429958e-05, "loss": 4.587, "step": 24930 }, { "epoch": 2.8407084685916053, "grad_norm": 14.331136703491211, "learning_rate": 2.165429843589451e-05, "loss": 4.5715, "step": 24940 }, { "epoch": 2.84184748561991, "grad_norm": 10.820293426513672, "learning_rate": 2.164288160748944e-05, "loss": 4.4391, "step": 24950 }, { "epoch": 2.8429865026482144, "grad_norm": 17.32324981689453, "learning_rate": 2.163146477908437e-05, "loss": 4.2561, "step": 24960 }, { "epoch": 2.844125519676519, "grad_norm": 31.806528091430664, "learning_rate": 2.1620047950679302e-05, "loss": 4.1464, "step": 24970 }, { "epoch": 2.845264536704824, "grad_norm": 37.59844970703125, "learning_rate": 2.1608631122274234e-05, "loss": 4.1737, "step": 24980 }, { "epoch": 2.8464035537331283, "grad_norm": 11.175416946411133, "learning_rate": 2.1597214293869166e-05, "loss": 4.13, "step": 24990 }, { "epoch": 2.847542570761433, "grad_norm": 12.340738296508789, "learning_rate": 2.1585797465464094e-05, "loss": 4.8189, "step": 25000 }, { "epoch": 2.8486815877897373, "grad_norm": 13.959515571594238, "learning_rate": 2.1574380637059026e-05, "loss": 4.2027, "step": 25010 }, { "epoch": 2.849820604818042, "grad_norm": 10.458575248718262, "learning_rate": 2.1562963808653958e-05, "loss": 4.1396, "step": 25020 }, { "epoch": 2.8509596218463464, "grad_norm": 21.41703224182129, "learning_rate": 2.1551546980248886e-05, "loss": 4.3719, "step": 25030 }, { "epoch": 2.852098638874651, "grad_norm": 11.951301574707031, "learning_rate": 2.1540130151843818e-05, "loss": 4.4561, "step": 25040 }, { "epoch": 2.853237655902956, "grad_norm": 20.143756866455078, "learning_rate": 2.152871332343875e-05, "loss": 4.3852, "step": 25050 }, { "epoch": 2.8543766729312603, "grad_norm": 13.41882038116455, "learning_rate": 2.1517296495033682e-05, "loss": 4.3855, "step": 25060 }, { "epoch": 2.8555156899595646, "grad_norm": 21.393659591674805, "learning_rate": 2.150587966662861e-05, "loss": 4.4363, "step": 25070 }, { "epoch": 2.8566547069878694, "grad_norm": 10.970490455627441, "learning_rate": 2.1494462838223542e-05, "loss": 4.3659, "step": 25080 }, { "epoch": 2.857793724016174, "grad_norm": 14.912433624267578, "learning_rate": 2.148304600981847e-05, "loss": 4.567, "step": 25090 }, { "epoch": 2.8589327410444785, "grad_norm": 23.709640502929688, "learning_rate": 2.1471629181413406e-05, "loss": 4.416, "step": 25100 }, { "epoch": 2.8600717580727832, "grad_norm": 10.713531494140625, "learning_rate": 2.1460212353008334e-05, "loss": 4.2958, "step": 25110 }, { "epoch": 2.861210775101088, "grad_norm": 20.735071182250977, "learning_rate": 2.1448795524603266e-05, "loss": 4.6377, "step": 25120 }, { "epoch": 2.8623497921293923, "grad_norm": 14.454633712768555, "learning_rate": 2.1437378696198194e-05, "loss": 4.7845, "step": 25130 }, { "epoch": 2.8634888091576967, "grad_norm": 11.987555503845215, "learning_rate": 2.142596186779313e-05, "loss": 4.3835, "step": 25140 }, { "epoch": 2.8646278261860014, "grad_norm": 28.97137451171875, "learning_rate": 2.1414545039388058e-05, "loss": 4.3128, "step": 25150 }, { "epoch": 2.865766843214306, "grad_norm": 12.924393653869629, "learning_rate": 2.140312821098299e-05, "loss": 4.5555, "step": 25160 }, { "epoch": 2.8669058602426105, "grad_norm": 8.018871307373047, "learning_rate": 2.139171138257792e-05, "loss": 4.6651, "step": 25170 }, { "epoch": 2.8680448772709153, "grad_norm": 8.445608139038086, "learning_rate": 2.1380294554172854e-05, "loss": 4.9271, "step": 25180 }, { "epoch": 2.86918389429922, "grad_norm": 11.492569923400879, "learning_rate": 2.1368877725767782e-05, "loss": 4.4513, "step": 25190 }, { "epoch": 2.8703229113275244, "grad_norm": 10.3760347366333, "learning_rate": 2.1357460897362714e-05, "loss": 4.4288, "step": 25200 }, { "epoch": 2.8714619283558287, "grad_norm": 10.512873649597168, "learning_rate": 2.1346044068957642e-05, "loss": 4.4764, "step": 25210 }, { "epoch": 2.8726009453841335, "grad_norm": 15.735414505004883, "learning_rate": 2.1334627240552578e-05, "loss": 4.4197, "step": 25220 }, { "epoch": 2.8737399624124382, "grad_norm": 11.466535568237305, "learning_rate": 2.1323210412147506e-05, "loss": 4.5131, "step": 25230 }, { "epoch": 2.8748789794407426, "grad_norm": 12.852303504943848, "learning_rate": 2.1311793583742438e-05, "loss": 4.3155, "step": 25240 }, { "epoch": 2.8760179964690473, "grad_norm": 10.068410873413086, "learning_rate": 2.1300376755337366e-05, "loss": 4.4798, "step": 25250 }, { "epoch": 2.8771570134973516, "grad_norm": 13.780503273010254, "learning_rate": 2.12889599269323e-05, "loss": 4.4898, "step": 25260 }, { "epoch": 2.8782960305256564, "grad_norm": 15.617043495178223, "learning_rate": 2.127754309852723e-05, "loss": 4.4464, "step": 25270 }, { "epoch": 2.8794350475539607, "grad_norm": 11.50779914855957, "learning_rate": 2.1266126270122162e-05, "loss": 4.3775, "step": 25280 }, { "epoch": 2.8805740645822655, "grad_norm": 9.568475723266602, "learning_rate": 2.125470944171709e-05, "loss": 4.6033, "step": 25290 }, { "epoch": 2.8817130816105703, "grad_norm": 12.430456161499023, "learning_rate": 2.1243292613312026e-05, "loss": 4.3264, "step": 25300 }, { "epoch": 2.8828520986388746, "grad_norm": 18.374462127685547, "learning_rate": 2.1231875784906954e-05, "loss": 4.4276, "step": 25310 }, { "epoch": 2.8839911156671794, "grad_norm": 12.369324684143066, "learning_rate": 2.1220458956501886e-05, "loss": 4.5652, "step": 25320 }, { "epoch": 2.8851301326954837, "grad_norm": 10.463457107543945, "learning_rate": 2.1209042128096814e-05, "loss": 4.6378, "step": 25330 }, { "epoch": 2.8862691497237885, "grad_norm": 15.187032699584961, "learning_rate": 2.119762529969175e-05, "loss": 4.2882, "step": 25340 }, { "epoch": 2.8874081667520928, "grad_norm": 14.741857528686523, "learning_rate": 2.1186208471286678e-05, "loss": 4.8769, "step": 25350 }, { "epoch": 2.8885471837803975, "grad_norm": 12.209632873535156, "learning_rate": 2.117479164288161e-05, "loss": 4.6269, "step": 25360 }, { "epoch": 2.8896862008087023, "grad_norm": 10.20417594909668, "learning_rate": 2.1163374814476538e-05, "loss": 4.4955, "step": 25370 }, { "epoch": 2.8908252178370066, "grad_norm": 51.44119644165039, "learning_rate": 2.115195798607147e-05, "loss": 4.6589, "step": 25380 }, { "epoch": 2.8919642348653114, "grad_norm": 23.65181541442871, "learning_rate": 2.1140541157666402e-05, "loss": 4.0472, "step": 25390 }, { "epoch": 2.8931032518936157, "grad_norm": 9.626806259155273, "learning_rate": 2.1129124329261334e-05, "loss": 4.2082, "step": 25400 }, { "epoch": 2.8942422689219205, "grad_norm": 18.213037490844727, "learning_rate": 2.1117707500856262e-05, "loss": 4.5842, "step": 25410 }, { "epoch": 2.895381285950225, "grad_norm": 12.3046293258667, "learning_rate": 2.1106290672451194e-05, "loss": 4.6834, "step": 25420 }, { "epoch": 2.8965203029785296, "grad_norm": 9.713820457458496, "learning_rate": 2.1094873844046126e-05, "loss": 4.3921, "step": 25430 }, { "epoch": 2.8976593200068343, "grad_norm": 12.635478019714355, "learning_rate": 2.1083457015641058e-05, "loss": 4.2344, "step": 25440 }, { "epoch": 2.8987983370351387, "grad_norm": 8.912229537963867, "learning_rate": 2.1072040187235986e-05, "loss": 4.4152, "step": 25450 }, { "epoch": 2.899937354063443, "grad_norm": 10.33163070678711, "learning_rate": 2.1060623358830918e-05, "loss": 4.408, "step": 25460 }, { "epoch": 2.9010763710917478, "grad_norm": 14.34139347076416, "learning_rate": 2.104920653042585e-05, "loss": 4.4, "step": 25470 }, { "epoch": 2.9022153881200525, "grad_norm": 13.725435256958008, "learning_rate": 2.1037789702020778e-05, "loss": 5.1318, "step": 25480 }, { "epoch": 2.903354405148357, "grad_norm": 27.68000602722168, "learning_rate": 2.102637287361571e-05, "loss": 4.7994, "step": 25490 }, { "epoch": 2.9044934221766616, "grad_norm": 13.529292106628418, "learning_rate": 2.1014956045210642e-05, "loss": 4.4018, "step": 25500 }, { "epoch": 2.9056324392049664, "grad_norm": 11.97656536102295, "learning_rate": 2.1003539216805574e-05, "loss": 4.4313, "step": 25510 }, { "epoch": 2.9067714562332707, "grad_norm": 11.98920726776123, "learning_rate": 2.0992122388400502e-05, "loss": 4.5136, "step": 25520 }, { "epoch": 2.907910473261575, "grad_norm": 17.4196834564209, "learning_rate": 2.0980705559995434e-05, "loss": 4.1763, "step": 25530 }, { "epoch": 2.90904949028988, "grad_norm": 9.960898399353027, "learning_rate": 2.0969288731590362e-05, "loss": 4.4402, "step": 25540 }, { "epoch": 2.9101885073181846, "grad_norm": 10.668099403381348, "learning_rate": 2.0957871903185298e-05, "loss": 4.3085, "step": 25550 }, { "epoch": 2.911327524346489, "grad_norm": 10.457843780517578, "learning_rate": 2.0946455074780226e-05, "loss": 4.3377, "step": 25560 }, { "epoch": 2.9124665413747937, "grad_norm": 28.04571533203125, "learning_rate": 2.0935038246375158e-05, "loss": 4.4546, "step": 25570 }, { "epoch": 2.913605558403098, "grad_norm": 65.01653289794922, "learning_rate": 2.0923621417970086e-05, "loss": 4.3514, "step": 25580 }, { "epoch": 2.9147445754314028, "grad_norm": 23.05303955078125, "learning_rate": 2.091220458956502e-05, "loss": 4.3337, "step": 25590 }, { "epoch": 2.915883592459707, "grad_norm": 17.922874450683594, "learning_rate": 2.090078776115995e-05, "loss": 4.2132, "step": 25600 }, { "epoch": 2.917022609488012, "grad_norm": 14.76237964630127, "learning_rate": 2.0889370932754882e-05, "loss": 4.3352, "step": 25610 }, { "epoch": 2.9181616265163166, "grad_norm": 9.139527320861816, "learning_rate": 2.087795410434981e-05, "loss": 4.4611, "step": 25620 }, { "epoch": 2.919300643544621, "grad_norm": 28.567909240722656, "learning_rate": 2.0866537275944746e-05, "loss": 4.8546, "step": 25630 }, { "epoch": 2.9204396605729257, "grad_norm": 9.053948402404785, "learning_rate": 2.0855120447539674e-05, "loss": 4.2187, "step": 25640 }, { "epoch": 2.92157867760123, "grad_norm": 10.97348403930664, "learning_rate": 2.0843703619134606e-05, "loss": 4.0639, "step": 25650 }, { "epoch": 2.922717694629535, "grad_norm": 12.089532852172852, "learning_rate": 2.0832286790729534e-05, "loss": 4.6351, "step": 25660 }, { "epoch": 2.923856711657839, "grad_norm": 58.029869079589844, "learning_rate": 2.082086996232447e-05, "loss": 4.2459, "step": 25670 }, { "epoch": 2.924995728686144, "grad_norm": 19.57094955444336, "learning_rate": 2.0809453133919398e-05, "loss": 4.583, "step": 25680 }, { "epoch": 2.9261347457144486, "grad_norm": 22.886457443237305, "learning_rate": 2.0799177988354837e-05, "loss": 4.3523, "step": 25690 }, { "epoch": 2.927273762742753, "grad_norm": 17.789207458496094, "learning_rate": 2.0787761159949766e-05, "loss": 4.2271, "step": 25700 }, { "epoch": 2.9284127797710577, "grad_norm": 14.033312797546387, "learning_rate": 2.0776344331544698e-05, "loss": 4.4841, "step": 25710 }, { "epoch": 2.929551796799362, "grad_norm": 9.000493049621582, "learning_rate": 2.076492750313963e-05, "loss": 4.4725, "step": 25720 }, { "epoch": 2.930690813827667, "grad_norm": 12.288798332214355, "learning_rate": 2.075351067473456e-05, "loss": 4.4778, "step": 25730 }, { "epoch": 2.931829830855971, "grad_norm": 15.534489631652832, "learning_rate": 2.074209384632949e-05, "loss": 4.3485, "step": 25740 }, { "epoch": 2.932968847884276, "grad_norm": 11.241649627685547, "learning_rate": 2.073067701792442e-05, "loss": 4.4533, "step": 25750 }, { "epoch": 2.9341078649125807, "grad_norm": 12.891498565673828, "learning_rate": 2.071926018951935e-05, "loss": 4.2297, "step": 25760 }, { "epoch": 2.935246881940885, "grad_norm": 10.07817268371582, "learning_rate": 2.0707843361114285e-05, "loss": 4.5676, "step": 25770 }, { "epoch": 2.9363858989691893, "grad_norm": 10.336227416992188, "learning_rate": 2.0696426532709214e-05, "loss": 4.3517, "step": 25780 }, { "epoch": 2.937524915997494, "grad_norm": 25.487110137939453, "learning_rate": 2.0685009704304146e-05, "loss": 3.9777, "step": 25790 }, { "epoch": 2.938663933025799, "grad_norm": 10.53114128112793, "learning_rate": 2.0673592875899074e-05, "loss": 4.2856, "step": 25800 }, { "epoch": 2.939802950054103, "grad_norm": 38.20022964477539, "learning_rate": 2.066217604749401e-05, "loss": 4.1268, "step": 25810 }, { "epoch": 2.940941967082408, "grad_norm": 11.7878999710083, "learning_rate": 2.0650759219088938e-05, "loss": 4.5761, "step": 25820 }, { "epoch": 2.9420809841107127, "grad_norm": 9.836922645568848, "learning_rate": 2.063934239068387e-05, "loss": 4.415, "step": 25830 }, { "epoch": 2.943220001139017, "grad_norm": 29.2979736328125, "learning_rate": 2.0627925562278798e-05, "loss": 4.4441, "step": 25840 }, { "epoch": 2.9443590181673214, "grad_norm": 9.555939674377441, "learning_rate": 2.061650873387373e-05, "loss": 4.7018, "step": 25850 }, { "epoch": 2.945498035195626, "grad_norm": 9.398253440856934, "learning_rate": 2.060509190546866e-05, "loss": 4.3748, "step": 25860 }, { "epoch": 2.946637052223931, "grad_norm": 15.86281681060791, "learning_rate": 2.0593675077063594e-05, "loss": 4.3274, "step": 25870 }, { "epoch": 2.9477760692522352, "grad_norm": 19.11648941040039, "learning_rate": 2.0582258248658522e-05, "loss": 4.8558, "step": 25880 }, { "epoch": 2.94891508628054, "grad_norm": 12.06795883178711, "learning_rate": 2.0570841420253454e-05, "loss": 4.8912, "step": 25890 }, { "epoch": 2.9500541033088443, "grad_norm": 14.896605491638184, "learning_rate": 2.0559424591848386e-05, "loss": 4.6473, "step": 25900 }, { "epoch": 2.951193120337149, "grad_norm": 14.027856826782227, "learning_rate": 2.0548007763443317e-05, "loss": 4.4415, "step": 25910 }, { "epoch": 2.9523321373654534, "grad_norm": 10.832018852233887, "learning_rate": 2.0536590935038246e-05, "loss": 4.125, "step": 25920 }, { "epoch": 2.953471154393758, "grad_norm": 18.01789093017578, "learning_rate": 2.0525174106633178e-05, "loss": 4.1041, "step": 25930 }, { "epoch": 2.954610171422063, "grad_norm": 19.601741790771484, "learning_rate": 2.051375727822811e-05, "loss": 4.1179, "step": 25940 }, { "epoch": 2.9557491884503673, "grad_norm": 11.704947471618652, "learning_rate": 2.050234044982304e-05, "loss": 4.5004, "step": 25950 }, { "epoch": 2.956888205478672, "grad_norm": 11.281723976135254, "learning_rate": 2.049092362141797e-05, "loss": 4.6813, "step": 25960 }, { "epoch": 2.9580272225069764, "grad_norm": 36.630462646484375, "learning_rate": 2.0479506793012902e-05, "loss": 4.3721, "step": 25970 }, { "epoch": 2.959166239535281, "grad_norm": 10.66319751739502, "learning_rate": 2.0468089964607834e-05, "loss": 4.341, "step": 25980 }, { "epoch": 2.9603052565635855, "grad_norm": 24.53282928466797, "learning_rate": 2.0456673136202765e-05, "loss": 4.1195, "step": 25990 }, { "epoch": 2.96144427359189, "grad_norm": 10.011075019836426, "learning_rate": 2.0445256307797694e-05, "loss": 4.4799, "step": 26000 }, { "epoch": 2.96144427359189, "eval_loss": 6.0944600105285645, "eval_runtime": 12.0712, "eval_samples_per_second": 1.243, "eval_steps_per_second": 0.166, "step": 26000 }, { "epoch": 2.962583290620195, "grad_norm": 9.768843650817871, "learning_rate": 2.0433839479392626e-05, "loss": 4.3493, "step": 26010 }, { "epoch": 2.9637223076484993, "grad_norm": 8.531599998474121, "learning_rate": 2.0422422650987558e-05, "loss": 4.5169, "step": 26020 }, { "epoch": 2.964861324676804, "grad_norm": 13.053476333618164, "learning_rate": 2.041100582258249e-05, "loss": 4.4832, "step": 26030 }, { "epoch": 2.9660003417051084, "grad_norm": 17.686269760131836, "learning_rate": 2.0399588994177418e-05, "loss": 4.5952, "step": 26040 }, { "epoch": 2.967139358733413, "grad_norm": 11.529505729675293, "learning_rate": 2.038817216577235e-05, "loss": 4.3991, "step": 26050 }, { "epoch": 2.9682783757617175, "grad_norm": 41.76747512817383, "learning_rate": 2.037675533736728e-05, "loss": 4.6966, "step": 26060 }, { "epoch": 2.9694173927900223, "grad_norm": 16.205829620361328, "learning_rate": 2.0365338508962213e-05, "loss": 4.4837, "step": 26070 }, { "epoch": 2.970556409818327, "grad_norm": 12.326767921447754, "learning_rate": 2.0353921680557142e-05, "loss": 4.306, "step": 26080 }, { "epoch": 2.9716954268466313, "grad_norm": 16.592323303222656, "learning_rate": 2.0342504852152074e-05, "loss": 4.2738, "step": 26090 }, { "epoch": 2.9728344438749357, "grad_norm": 14.362883567810059, "learning_rate": 2.0331088023747005e-05, "loss": 4.4225, "step": 26100 }, { "epoch": 2.9739734609032404, "grad_norm": 17.716650009155273, "learning_rate": 2.0319671195341934e-05, "loss": 4.447, "step": 26110 }, { "epoch": 2.975112477931545, "grad_norm": 15.68718147277832, "learning_rate": 2.0308254366936866e-05, "loss": 4.4186, "step": 26120 }, { "epoch": 2.9762514949598495, "grad_norm": 8.298096656799316, "learning_rate": 2.0296837538531798e-05, "loss": 4.3944, "step": 26130 }, { "epoch": 2.9773905119881543, "grad_norm": 14.487730979919434, "learning_rate": 2.028542071012673e-05, "loss": 4.4883, "step": 26140 }, { "epoch": 2.978529529016459, "grad_norm": 45.469085693359375, "learning_rate": 2.0274003881721658e-05, "loss": 4.3144, "step": 26150 }, { "epoch": 2.9796685460447634, "grad_norm": 11.404308319091797, "learning_rate": 2.026258705331659e-05, "loss": 4.1847, "step": 26160 }, { "epoch": 2.9808075630730677, "grad_norm": 14.415459632873535, "learning_rate": 2.0251170224911518e-05, "loss": 4.5272, "step": 26170 }, { "epoch": 2.9819465801013725, "grad_norm": 22.10926055908203, "learning_rate": 2.0239753396506453e-05, "loss": 4.6152, "step": 26180 }, { "epoch": 2.9830855971296772, "grad_norm": 9.495277404785156, "learning_rate": 2.0228336568101382e-05, "loss": 4.6948, "step": 26190 }, { "epoch": 2.9842246141579816, "grad_norm": 8.213211059570312, "learning_rate": 2.0216919739696314e-05, "loss": 4.3108, "step": 26200 }, { "epoch": 2.9853636311862863, "grad_norm": 15.879585266113281, "learning_rate": 2.0205502911291242e-05, "loss": 4.4845, "step": 26210 }, { "epoch": 2.9865026482145907, "grad_norm": 7.781096935272217, "learning_rate": 2.0194086082886174e-05, "loss": 4.3855, "step": 26220 }, { "epoch": 2.9876416652428954, "grad_norm": 17.566858291625977, "learning_rate": 2.0182669254481106e-05, "loss": 4.2371, "step": 26230 }, { "epoch": 2.9887806822711998, "grad_norm": 10.631742477416992, "learning_rate": 2.0171252426076038e-05, "loss": 4.5712, "step": 26240 }, { "epoch": 2.9899196992995045, "grad_norm": 11.58210277557373, "learning_rate": 2.0159835597670966e-05, "loss": 4.3123, "step": 26250 }, { "epoch": 2.9910587163278093, "grad_norm": 12.677685737609863, "learning_rate": 2.0148418769265898e-05, "loss": 4.1608, "step": 26260 }, { "epoch": 2.9921977333561136, "grad_norm": 9.210295677185059, "learning_rate": 2.013700194086083e-05, "loss": 4.4583, "step": 26270 }, { "epoch": 2.9933367503844184, "grad_norm": 43.07964324951172, "learning_rate": 2.012558511245576e-05, "loss": 4.2155, "step": 26280 }, { "epoch": 2.9944757674127227, "grad_norm": 12.508554458618164, "learning_rate": 2.011416828405069e-05, "loss": 4.1294, "step": 26290 }, { "epoch": 2.9956147844410275, "grad_norm": 9.088356018066406, "learning_rate": 2.0102751455645622e-05, "loss": 4.3443, "step": 26300 }, { "epoch": 2.996753801469332, "grad_norm": 10.917645454406738, "learning_rate": 2.0091334627240554e-05, "loss": 4.356, "step": 26310 }, { "epoch": 2.9978928184976366, "grad_norm": 12.575218200683594, "learning_rate": 2.0079917798835486e-05, "loss": 4.1655, "step": 26320 }, { "epoch": 2.9990318355259413, "grad_norm": 12.658269882202148, "learning_rate": 2.0068500970430414e-05, "loss": 4.5894, "step": 26330 }, { "epoch": 3.0001708525542456, "grad_norm": 8.49130630493164, "learning_rate": 2.0057084142025346e-05, "loss": 4.3359, "step": 26340 }, { "epoch": 3.0013098695825504, "grad_norm": 18.979001998901367, "learning_rate": 2.0045667313620278e-05, "loss": 3.4778, "step": 26350 }, { "epoch": 3.0024488866108547, "grad_norm": 20.550743103027344, "learning_rate": 2.003425048521521e-05, "loss": 3.1705, "step": 26360 }, { "epoch": 3.0035879036391595, "grad_norm": 23.528396606445312, "learning_rate": 2.0022833656810138e-05, "loss": 3.3643, "step": 26370 }, { "epoch": 3.004726920667464, "grad_norm": 13.984031677246094, "learning_rate": 2.001141682840507e-05, "loss": 3.1264, "step": 26380 }, { "epoch": 3.0058659376957686, "grad_norm": 8.928504943847656, "learning_rate": 2e-05, "loss": 3.389, "step": 26390 }, { "epoch": 3.007004954724073, "grad_norm": 26.28722381591797, "learning_rate": 1.9988583171594933e-05, "loss": 3.4283, "step": 26400 }, { "epoch": 3.0081439717523777, "grad_norm": 19.788816452026367, "learning_rate": 1.9977166343189862e-05, "loss": 3.0866, "step": 26410 }, { "epoch": 3.0092829887806825, "grad_norm": 12.7147216796875, "learning_rate": 1.9965749514784794e-05, "loss": 3.226, "step": 26420 }, { "epoch": 3.010422005808987, "grad_norm": 12.794295310974121, "learning_rate": 1.9954332686379726e-05, "loss": 3.5337, "step": 26430 }, { "epoch": 3.0115610228372915, "grad_norm": 25.851694107055664, "learning_rate": 1.9942915857974657e-05, "loss": 3.2126, "step": 26440 }, { "epoch": 3.012700039865596, "grad_norm": 13.408803939819336, "learning_rate": 1.9931499029569586e-05, "loss": 3.0875, "step": 26450 }, { "epoch": 3.0138390568939006, "grad_norm": 16.897953033447266, "learning_rate": 1.9920082201164518e-05, "loss": 3.1841, "step": 26460 }, { "epoch": 3.014978073922205, "grad_norm": 37.791839599609375, "learning_rate": 1.990866537275945e-05, "loss": 3.3605, "step": 26470 }, { "epoch": 3.0161170909505097, "grad_norm": 19.288755416870117, "learning_rate": 1.989724854435438e-05, "loss": 3.4918, "step": 26480 }, { "epoch": 3.0172561079788145, "grad_norm": 20.846179962158203, "learning_rate": 1.988583171594931e-05, "loss": 3.2042, "step": 26490 }, { "epoch": 3.018395125007119, "grad_norm": 19.75118064880371, "learning_rate": 1.987441488754424e-05, "loss": 3.3452, "step": 26500 }, { "epoch": 3.0195341420354236, "grad_norm": 13.249592781066895, "learning_rate": 1.9862998059139174e-05, "loss": 3.2647, "step": 26510 }, { "epoch": 3.020673159063728, "grad_norm": 16.227333068847656, "learning_rate": 1.9851581230734105e-05, "loss": 3.28, "step": 26520 }, { "epoch": 3.0218121760920327, "grad_norm": 11.824593544006348, "learning_rate": 1.9840164402329034e-05, "loss": 3.2526, "step": 26530 }, { "epoch": 3.022951193120337, "grad_norm": 12.387273788452148, "learning_rate": 1.9828747573923966e-05, "loss": 2.9822, "step": 26540 }, { "epoch": 3.0240902101486418, "grad_norm": 25.661296844482422, "learning_rate": 1.9817330745518894e-05, "loss": 3.5264, "step": 26550 }, { "epoch": 3.025229227176946, "grad_norm": 11.836889266967773, "learning_rate": 1.9805913917113826e-05, "loss": 3.0826, "step": 26560 }, { "epoch": 3.026368244205251, "grad_norm": 11.189791679382324, "learning_rate": 1.9794497088708758e-05, "loss": 3.3436, "step": 26570 }, { "epoch": 3.0275072612335556, "grad_norm": 13.328079223632812, "learning_rate": 1.978308026030369e-05, "loss": 3.4042, "step": 26580 }, { "epoch": 3.02864627826186, "grad_norm": 10.731392860412598, "learning_rate": 1.9771663431898618e-05, "loss": 3.1904, "step": 26590 }, { "epoch": 3.0297852952901647, "grad_norm": 15.390854835510254, "learning_rate": 1.976024660349355e-05, "loss": 3.2487, "step": 26600 }, { "epoch": 3.030924312318469, "grad_norm": 13.056636810302734, "learning_rate": 1.9748829775088482e-05, "loss": 3.2201, "step": 26610 }, { "epoch": 3.032063329346774, "grad_norm": 16.926904678344727, "learning_rate": 1.973741294668341e-05, "loss": 3.5044, "step": 26620 }, { "epoch": 3.033202346375078, "grad_norm": 23.026594161987305, "learning_rate": 1.9725996118278342e-05, "loss": 3.0956, "step": 26630 }, { "epoch": 3.034341363403383, "grad_norm": 15.527753829956055, "learning_rate": 1.9714579289873274e-05, "loss": 3.3468, "step": 26640 }, { "epoch": 3.0354803804316877, "grad_norm": 31.828067779541016, "learning_rate": 1.9703162461468206e-05, "loss": 2.9932, "step": 26650 }, { "epoch": 3.036619397459992, "grad_norm": 16.210556030273438, "learning_rate": 1.9691745633063134e-05, "loss": 3.3843, "step": 26660 }, { "epoch": 3.0377584144882968, "grad_norm": 18.695133209228516, "learning_rate": 1.9680328804658066e-05, "loss": 3.4115, "step": 26670 }, { "epoch": 3.038897431516601, "grad_norm": 10.234685897827148, "learning_rate": 1.9668911976252998e-05, "loss": 3.4362, "step": 26680 }, { "epoch": 3.040036448544906, "grad_norm": 13.889457702636719, "learning_rate": 1.965749514784793e-05, "loss": 3.2165, "step": 26690 }, { "epoch": 3.04117546557321, "grad_norm": 13.6388521194458, "learning_rate": 1.9646078319442858e-05, "loss": 3.0997, "step": 26700 }, { "epoch": 3.042314482601515, "grad_norm": 17.444013595581055, "learning_rate": 1.963466149103779e-05, "loss": 3.2621, "step": 26710 }, { "epoch": 3.0434534996298193, "grad_norm": 12.85341739654541, "learning_rate": 1.9623244662632722e-05, "loss": 3.0798, "step": 26720 }, { "epoch": 3.044592516658124, "grad_norm": 23.041946411132812, "learning_rate": 1.9611827834227654e-05, "loss": 3.4639, "step": 26730 }, { "epoch": 3.045731533686429, "grad_norm": 14.276169776916504, "learning_rate": 1.9600411005822582e-05, "loss": 3.2518, "step": 26740 }, { "epoch": 3.046870550714733, "grad_norm": 11.501389503479004, "learning_rate": 1.9588994177417514e-05, "loss": 2.7835, "step": 26750 }, { "epoch": 3.048009567743038, "grad_norm": 18.737586975097656, "learning_rate": 1.9577577349012446e-05, "loss": 3.1787, "step": 26760 }, { "epoch": 3.049148584771342, "grad_norm": 12.630542755126953, "learning_rate": 1.9566160520607378e-05, "loss": 3.1533, "step": 26770 }, { "epoch": 3.050287601799647, "grad_norm": 18.454057693481445, "learning_rate": 1.9554743692202306e-05, "loss": 2.8906, "step": 26780 }, { "epoch": 3.0514266188279513, "grad_norm": 14.291041374206543, "learning_rate": 1.9543326863797238e-05, "loss": 3.347, "step": 26790 }, { "epoch": 3.052565635856256, "grad_norm": 29.922286987304688, "learning_rate": 1.953191003539217e-05, "loss": 3.139, "step": 26800 }, { "epoch": 3.053704652884561, "grad_norm": 11.581583023071289, "learning_rate": 1.95204932069871e-05, "loss": 2.96, "step": 26810 }, { "epoch": 3.054843669912865, "grad_norm": 12.896805763244629, "learning_rate": 1.950907637858203e-05, "loss": 3.3296, "step": 26820 }, { "epoch": 3.05598268694117, "grad_norm": 29.834461212158203, "learning_rate": 1.9497659550176962e-05, "loss": 3.2579, "step": 26830 }, { "epoch": 3.0571217039694742, "grad_norm": 12.87248420715332, "learning_rate": 1.9486242721771894e-05, "loss": 2.9772, "step": 26840 }, { "epoch": 3.058260720997779, "grad_norm": 31.38313865661621, "learning_rate": 1.9474825893366826e-05, "loss": 3.2572, "step": 26850 }, { "epoch": 3.0593997380260833, "grad_norm": 15.08276653289795, "learning_rate": 1.9463409064961754e-05, "loss": 3.1538, "step": 26860 }, { "epoch": 3.060538755054388, "grad_norm": 11.733377456665039, "learning_rate": 1.9451992236556686e-05, "loss": 3.2799, "step": 26870 }, { "epoch": 3.0616777720826924, "grad_norm": 15.852864265441895, "learning_rate": 1.9440575408151614e-05, "loss": 3.327, "step": 26880 }, { "epoch": 3.062816789110997, "grad_norm": 12.521268844604492, "learning_rate": 1.942915857974655e-05, "loss": 3.2815, "step": 26890 }, { "epoch": 3.063955806139302, "grad_norm": 32.56338882446289, "learning_rate": 1.9417741751341478e-05, "loss": 3.2574, "step": 26900 }, { "epoch": 3.0650948231676063, "grad_norm": 13.80693244934082, "learning_rate": 1.940632492293641e-05, "loss": 2.9822, "step": 26910 }, { "epoch": 3.066233840195911, "grad_norm": 16.31395149230957, "learning_rate": 1.9394908094531338e-05, "loss": 3.1047, "step": 26920 }, { "epoch": 3.0673728572242154, "grad_norm": 14.875893592834473, "learning_rate": 1.9383491266126273e-05, "loss": 3.2336, "step": 26930 }, { "epoch": 3.06851187425252, "grad_norm": 23.317651748657227, "learning_rate": 1.9372074437721202e-05, "loss": 3.2547, "step": 26940 }, { "epoch": 3.0696508912808245, "grad_norm": 16.346988677978516, "learning_rate": 1.9360657609316134e-05, "loss": 3.1503, "step": 26950 }, { "epoch": 3.0707899083091292, "grad_norm": 17.115734100341797, "learning_rate": 1.9349240780911062e-05, "loss": 3.3662, "step": 26960 }, { "epoch": 3.071928925337434, "grad_norm": 34.3673095703125, "learning_rate": 1.9337823952505994e-05, "loss": 3.2764, "step": 26970 }, { "epoch": 3.0730679423657383, "grad_norm": 20.209665298461914, "learning_rate": 1.9326407124100926e-05, "loss": 3.6596, "step": 26980 }, { "epoch": 3.074206959394043, "grad_norm": 16.05097770690918, "learning_rate": 1.9314990295695858e-05, "loss": 3.2271, "step": 26990 }, { "epoch": 3.0753459764223474, "grad_norm": 13.981256484985352, "learning_rate": 1.9303573467290786e-05, "loss": 3.4841, "step": 27000 }, { "epoch": 3.076484993450652, "grad_norm": 17.1622314453125, "learning_rate": 1.9292156638885718e-05, "loss": 2.9933, "step": 27010 }, { "epoch": 3.0776240104789565, "grad_norm": 43.3394889831543, "learning_rate": 1.928073981048065e-05, "loss": 2.9414, "step": 27020 }, { "epoch": 3.0787630275072613, "grad_norm": 14.771462440490723, "learning_rate": 1.926932298207558e-05, "loss": 3.0738, "step": 27030 }, { "epoch": 3.0799020445355656, "grad_norm": 13.060588836669922, "learning_rate": 1.925790615367051e-05, "loss": 3.1623, "step": 27040 }, { "epoch": 3.0810410615638704, "grad_norm": 30.9134521484375, "learning_rate": 1.9246489325265442e-05, "loss": 3.2988, "step": 27050 }, { "epoch": 3.082180078592175, "grad_norm": 11.042315483093262, "learning_rate": 1.9235072496860374e-05, "loss": 3.1594, "step": 27060 }, { "epoch": 3.0833190956204795, "grad_norm": 18.68390655517578, "learning_rate": 1.9223655668455302e-05, "loss": 3.2803, "step": 27070 }, { "epoch": 3.0844581126487842, "grad_norm": 13.154905319213867, "learning_rate": 1.9212238840050234e-05, "loss": 3.1246, "step": 27080 }, { "epoch": 3.0855971296770885, "grad_norm": 29.026409149169922, "learning_rate": 1.9200822011645166e-05, "loss": 3.3788, "step": 27090 }, { "epoch": 3.0867361467053933, "grad_norm": 15.003780364990234, "learning_rate": 1.9189405183240098e-05, "loss": 3.289, "step": 27100 }, { "epoch": 3.0878751637336976, "grad_norm": 14.786931037902832, "learning_rate": 1.9177988354835026e-05, "loss": 3.0621, "step": 27110 }, { "epoch": 3.0890141807620024, "grad_norm": 18.21460723876953, "learning_rate": 1.9166571526429958e-05, "loss": 3.5111, "step": 27120 }, { "epoch": 3.090153197790307, "grad_norm": 12.12824821472168, "learning_rate": 1.915515469802489e-05, "loss": 3.2396, "step": 27130 }, { "epoch": 3.0912922148186115, "grad_norm": 10.3527250289917, "learning_rate": 1.914373786961982e-05, "loss": 3.3413, "step": 27140 }, { "epoch": 3.0924312318469163, "grad_norm": 11.547093391418457, "learning_rate": 1.913232104121475e-05, "loss": 3.2743, "step": 27150 }, { "epoch": 3.0935702488752206, "grad_norm": 31.078540802001953, "learning_rate": 1.9120904212809682e-05, "loss": 2.6648, "step": 27160 }, { "epoch": 3.0947092659035254, "grad_norm": 15.62102222442627, "learning_rate": 1.9109487384404614e-05, "loss": 2.994, "step": 27170 }, { "epoch": 3.0958482829318297, "grad_norm": 10.508420944213867, "learning_rate": 1.9098070555999546e-05, "loss": 3.4438, "step": 27180 }, { "epoch": 3.0969872999601344, "grad_norm": 23.045381546020508, "learning_rate": 1.9086653727594474e-05, "loss": 3.171, "step": 27190 }, { "epoch": 3.0981263169884388, "grad_norm": 15.51526927947998, "learning_rate": 1.9075236899189406e-05, "loss": 3.0396, "step": 27200 }, { "epoch": 3.0992653340167435, "grad_norm": 11.499743461608887, "learning_rate": 1.9063820070784338e-05, "loss": 3.1568, "step": 27210 }, { "epoch": 3.1004043510450483, "grad_norm": 15.384307861328125, "learning_rate": 1.905240324237927e-05, "loss": 3.1907, "step": 27220 }, { "epoch": 3.1015433680733526, "grad_norm": 16.944202423095703, "learning_rate": 1.9040986413974198e-05, "loss": 3.3019, "step": 27230 }, { "epoch": 3.1026823851016574, "grad_norm": 19.369962692260742, "learning_rate": 1.902956958556913e-05, "loss": 3.1569, "step": 27240 }, { "epoch": 3.1038214021299617, "grad_norm": 24.748016357421875, "learning_rate": 1.901815275716406e-05, "loss": 3.323, "step": 27250 }, { "epoch": 3.1049604191582665, "grad_norm": 17.034259796142578, "learning_rate": 1.9006735928758994e-05, "loss": 2.9709, "step": 27260 }, { "epoch": 3.106099436186571, "grad_norm": 14.272811889648438, "learning_rate": 1.8995319100353922e-05, "loss": 3.0763, "step": 27270 }, { "epoch": 3.1072384532148756, "grad_norm": 16.28206443786621, "learning_rate": 1.8983902271948854e-05, "loss": 3.297, "step": 27280 }, { "epoch": 3.1083774702431803, "grad_norm": 16.134042739868164, "learning_rate": 1.8972485443543782e-05, "loss": 3.4735, "step": 27290 }, { "epoch": 3.1095164872714847, "grad_norm": 44.60962677001953, "learning_rate": 1.8961068615138718e-05, "loss": 3.4407, "step": 27300 }, { "epoch": 3.1106555042997894, "grad_norm": 14.325160026550293, "learning_rate": 1.8949651786733646e-05, "loss": 3.8483, "step": 27310 }, { "epoch": 3.1117945213280938, "grad_norm": 14.658393859863281, "learning_rate": 1.8938234958328578e-05, "loss": 2.8621, "step": 27320 }, { "epoch": 3.1129335383563985, "grad_norm": 32.288665771484375, "learning_rate": 1.8926818129923506e-05, "loss": 3.3189, "step": 27330 }, { "epoch": 3.114072555384703, "grad_norm": 15.71682357788086, "learning_rate": 1.891540130151844e-05, "loss": 3.2937, "step": 27340 }, { "epoch": 3.1152115724130076, "grad_norm": 22.568588256835938, "learning_rate": 1.890398447311337e-05, "loss": 3.1074, "step": 27350 }, { "epoch": 3.116350589441312, "grad_norm": 12.612749099731445, "learning_rate": 1.8892567644708302e-05, "loss": 3.345, "step": 27360 }, { "epoch": 3.1174896064696167, "grad_norm": 15.754531860351562, "learning_rate": 1.888115081630323e-05, "loss": 3.3005, "step": 27370 }, { "epoch": 3.1186286234979215, "grad_norm": 12.605548858642578, "learning_rate": 1.8869733987898165e-05, "loss": 3.2184, "step": 27380 }, { "epoch": 3.119767640526226, "grad_norm": 12.064836502075195, "learning_rate": 1.8858317159493094e-05, "loss": 3.2672, "step": 27390 }, { "epoch": 3.1209066575545306, "grad_norm": 16.371063232421875, "learning_rate": 1.8846900331088026e-05, "loss": 3.0152, "step": 27400 }, { "epoch": 3.122045674582835, "grad_norm": 15.042854309082031, "learning_rate": 1.8835483502682954e-05, "loss": 2.9543, "step": 27410 }, { "epoch": 3.1231846916111397, "grad_norm": 23.035600662231445, "learning_rate": 1.8824066674277886e-05, "loss": 3.3771, "step": 27420 }, { "epoch": 3.124323708639444, "grad_norm": 34.90754699707031, "learning_rate": 1.8812649845872818e-05, "loss": 3.411, "step": 27430 }, { "epoch": 3.1254627256677487, "grad_norm": 17.134292602539062, "learning_rate": 1.880123301746775e-05, "loss": 3.3458, "step": 27440 }, { "epoch": 3.1266017426960535, "grad_norm": 41.59492874145508, "learning_rate": 1.8789816189062678e-05, "loss": 2.8683, "step": 27450 }, { "epoch": 3.127740759724358, "grad_norm": 11.885684967041016, "learning_rate": 1.877839936065761e-05, "loss": 3.0729, "step": 27460 }, { "epoch": 3.1288797767526626, "grad_norm": 18.801475524902344, "learning_rate": 1.8766982532252542e-05, "loss": 3.579, "step": 27470 }, { "epoch": 3.130018793780967, "grad_norm": 18.656755447387695, "learning_rate": 1.875556570384747e-05, "loss": 3.1842, "step": 27480 }, { "epoch": 3.1311578108092717, "grad_norm": 16.117191314697266, "learning_rate": 1.8744148875442402e-05, "loss": 3.2142, "step": 27490 }, { "epoch": 3.132296827837576, "grad_norm": 38.7790641784668, "learning_rate": 1.8732732047037334e-05, "loss": 2.9492, "step": 27500 }, { "epoch": 3.133435844865881, "grad_norm": 24.200565338134766, "learning_rate": 1.8721315218632266e-05, "loss": 3.2178, "step": 27510 }, { "epoch": 3.134574861894185, "grad_norm": 17.899675369262695, "learning_rate": 1.8709898390227194e-05, "loss": 3.0469, "step": 27520 }, { "epoch": 3.13571387892249, "grad_norm": 17.653701782226562, "learning_rate": 1.8698481561822126e-05, "loss": 3.2397, "step": 27530 }, { "epoch": 3.1368528959507946, "grad_norm": 15.90965461730957, "learning_rate": 1.8687064733417058e-05, "loss": 3.3233, "step": 27540 }, { "epoch": 3.137991912979099, "grad_norm": 30.597681045532227, "learning_rate": 1.867564790501199e-05, "loss": 3.1581, "step": 27550 }, { "epoch": 3.1391309300074037, "grad_norm": 18.15818214416504, "learning_rate": 1.8664231076606918e-05, "loss": 3.2221, "step": 27560 }, { "epoch": 3.140269947035708, "grad_norm": 13.555785179138184, "learning_rate": 1.865281424820185e-05, "loss": 3.2178, "step": 27570 }, { "epoch": 3.141408964064013, "grad_norm": 15.18497371673584, "learning_rate": 1.864139741979678e-05, "loss": 2.8833, "step": 27580 }, { "epoch": 3.142547981092317, "grad_norm": 20.986042022705078, "learning_rate": 1.8629980591391714e-05, "loss": 3.0598, "step": 27590 }, { "epoch": 3.143686998120622, "grad_norm": 15.958553314208984, "learning_rate": 1.8618563762986642e-05, "loss": 3.6522, "step": 27600 }, { "epoch": 3.1448260151489267, "grad_norm": 50.318267822265625, "learning_rate": 1.8607146934581574e-05, "loss": 3.1646, "step": 27610 }, { "epoch": 3.145965032177231, "grad_norm": 15.260852813720703, "learning_rate": 1.8595730106176502e-05, "loss": 3.2088, "step": 27620 }, { "epoch": 3.1471040492055358, "grad_norm": 14.715402603149414, "learning_rate": 1.8584313277771438e-05, "loss": 3.2356, "step": 27630 }, { "epoch": 3.14824306623384, "grad_norm": 13.372052192687988, "learning_rate": 1.8572896449366366e-05, "loss": 3.4894, "step": 27640 }, { "epoch": 3.149382083262145, "grad_norm": 41.42268371582031, "learning_rate": 1.8561479620961298e-05, "loss": 3.0889, "step": 27650 }, { "epoch": 3.150521100290449, "grad_norm": 16.11638069152832, "learning_rate": 1.8550062792556226e-05, "loss": 3.325, "step": 27660 }, { "epoch": 3.151660117318754, "grad_norm": 17.42380142211914, "learning_rate": 1.853864596415116e-05, "loss": 3.2423, "step": 27670 }, { "epoch": 3.1527991343470587, "grad_norm": 18.778663635253906, "learning_rate": 1.852722913574609e-05, "loss": 3.4032, "step": 27680 }, { "epoch": 3.153938151375363, "grad_norm": 16.600717544555664, "learning_rate": 1.8515812307341022e-05, "loss": 3.2928, "step": 27690 }, { "epoch": 3.155077168403668, "grad_norm": 20.402591705322266, "learning_rate": 1.850439547893595e-05, "loss": 3.6319, "step": 27700 }, { "epoch": 3.156216185431972, "grad_norm": 33.90158462524414, "learning_rate": 1.8492978650530886e-05, "loss": 3.2214, "step": 27710 }, { "epoch": 3.157355202460277, "grad_norm": 12.001391410827637, "learning_rate": 1.8481561822125814e-05, "loss": 3.3975, "step": 27720 }, { "epoch": 3.1584942194885812, "grad_norm": 16.599849700927734, "learning_rate": 1.8470144993720746e-05, "loss": 3.271, "step": 27730 }, { "epoch": 3.159633236516886, "grad_norm": 17.87288475036621, "learning_rate": 1.8458728165315674e-05, "loss": 3.1615, "step": 27740 }, { "epoch": 3.1607722535451903, "grad_norm": 17.997711181640625, "learning_rate": 1.844731133691061e-05, "loss": 3.2489, "step": 27750 }, { "epoch": 3.161911270573495, "grad_norm": 35.24727249145508, "learning_rate": 1.8435894508505538e-05, "loss": 3.1077, "step": 27760 }, { "epoch": 3.1630502876018, "grad_norm": 16.674970626831055, "learning_rate": 1.842447768010047e-05, "loss": 3.0481, "step": 27770 }, { "epoch": 3.164189304630104, "grad_norm": 12.34125804901123, "learning_rate": 1.8413060851695398e-05, "loss": 3.324, "step": 27780 }, { "epoch": 3.165328321658409, "grad_norm": 23.691303253173828, "learning_rate": 1.8401644023290334e-05, "loss": 3.254, "step": 27790 }, { "epoch": 3.1664673386867133, "grad_norm": 33.48701477050781, "learning_rate": 1.8390227194885262e-05, "loss": 3.2557, "step": 27800 }, { "epoch": 3.167606355715018, "grad_norm": 15.587418556213379, "learning_rate": 1.8378810366480194e-05, "loss": 3.4168, "step": 27810 }, { "epoch": 3.1687453727433224, "grad_norm": 17.93613624572754, "learning_rate": 1.8367393538075122e-05, "loss": 3.1407, "step": 27820 }, { "epoch": 3.169884389771627, "grad_norm": 23.921279907226562, "learning_rate": 1.8355976709670054e-05, "loss": 2.6977, "step": 27830 }, { "epoch": 3.1710234067999314, "grad_norm": 13.405139923095703, "learning_rate": 1.8344559881264986e-05, "loss": 3.1785, "step": 27840 }, { "epoch": 3.172162423828236, "grad_norm": 11.815970420837402, "learning_rate": 1.8333143052859918e-05, "loss": 3.1895, "step": 27850 }, { "epoch": 3.173301440856541, "grad_norm": 23.689008712768555, "learning_rate": 1.8321726224454846e-05, "loss": 3.2437, "step": 27860 }, { "epoch": 3.1744404578848453, "grad_norm": 16.902727127075195, "learning_rate": 1.8310309396049778e-05, "loss": 3.4533, "step": 27870 }, { "epoch": 3.17557947491315, "grad_norm": 42.89224624633789, "learning_rate": 1.829889256764471e-05, "loss": 2.8198, "step": 27880 }, { "epoch": 3.1767184919414544, "grad_norm": 18.913209915161133, "learning_rate": 1.8287475739239642e-05, "loss": 2.9572, "step": 27890 }, { "epoch": 3.177857508969759, "grad_norm": 27.72986602783203, "learning_rate": 1.827605891083457e-05, "loss": 3.2509, "step": 27900 }, { "epoch": 3.1789965259980635, "grad_norm": 42.71379852294922, "learning_rate": 1.8264642082429502e-05, "loss": 3.1344, "step": 27910 }, { "epoch": 3.1801355430263682, "grad_norm": 17.06867218017578, "learning_rate": 1.8253225254024434e-05, "loss": 2.9378, "step": 27920 }, { "epoch": 3.181274560054673, "grad_norm": 24.895004272460938, "learning_rate": 1.8241808425619362e-05, "loss": 2.9792, "step": 27930 }, { "epoch": 3.1824135770829773, "grad_norm": 41.76930618286133, "learning_rate": 1.8230391597214294e-05, "loss": 3.4044, "step": 27940 }, { "epoch": 3.183552594111282, "grad_norm": 12.345658302307129, "learning_rate": 1.8218974768809226e-05, "loss": 3.3096, "step": 27950 }, { "epoch": 3.1846916111395864, "grad_norm": 24.934452056884766, "learning_rate": 1.8207557940404158e-05, "loss": 2.6984, "step": 27960 }, { "epoch": 3.185830628167891, "grad_norm": 18.349428176879883, "learning_rate": 1.8196141111999086e-05, "loss": 3.1897, "step": 27970 }, { "epoch": 3.1869696451961955, "grad_norm": 18.513832092285156, "learning_rate": 1.8184724283594018e-05, "loss": 3.4669, "step": 27980 }, { "epoch": 3.1881086622245003, "grad_norm": 83.0946273803711, "learning_rate": 1.8173307455188947e-05, "loss": 3.1882, "step": 27990 }, { "epoch": 3.189247679252805, "grad_norm": 32.92939758300781, "learning_rate": 1.8161890626783882e-05, "loss": 3.4756, "step": 28000 }, { "epoch": 3.189247679252805, "eval_loss": 6.487438678741455, "eval_runtime": 11.7208, "eval_samples_per_second": 1.28, "eval_steps_per_second": 0.171, "step": 28000 }, { "epoch": 3.1903866962811094, "grad_norm": 13.909664154052734, "learning_rate": 1.815047379837881e-05, "loss": 3.2001, "step": 28010 }, { "epoch": 3.191525713309414, "grad_norm": 18.252351760864258, "learning_rate": 1.8139056969973742e-05, "loss": 3.3558, "step": 28020 }, { "epoch": 3.1926647303377185, "grad_norm": 23.161178588867188, "learning_rate": 1.812764014156867e-05, "loss": 3.0862, "step": 28030 }, { "epoch": 3.1938037473660232, "grad_norm": 11.257837295532227, "learning_rate": 1.8116223313163606e-05, "loss": 3.3629, "step": 28040 }, { "epoch": 3.1949427643943276, "grad_norm": 53.57904815673828, "learning_rate": 1.8104806484758534e-05, "loss": 3.2856, "step": 28050 }, { "epoch": 3.1960817814226323, "grad_norm": 17.169322967529297, "learning_rate": 1.8093389656353466e-05, "loss": 3.1074, "step": 28060 }, { "epoch": 3.1972207984509367, "grad_norm": 12.976736068725586, "learning_rate": 1.8081972827948394e-05, "loss": 3.2706, "step": 28070 }, { "epoch": 3.1983598154792414, "grad_norm": 15.092696189880371, "learning_rate": 1.807055599954333e-05, "loss": 3.1404, "step": 28080 }, { "epoch": 3.199498832507546, "grad_norm": 58.73832321166992, "learning_rate": 1.8059139171138258e-05, "loss": 3.3886, "step": 28090 }, { "epoch": 3.2006378495358505, "grad_norm": 19.246742248535156, "learning_rate": 1.804772234273319e-05, "loss": 3.0287, "step": 28100 }, { "epoch": 3.2017768665641553, "grad_norm": 21.087007522583008, "learning_rate": 1.803630551432812e-05, "loss": 3.088, "step": 28110 }, { "epoch": 3.2029158835924596, "grad_norm": 40.340824127197266, "learning_rate": 1.8024888685923054e-05, "loss": 3.0932, "step": 28120 }, { "epoch": 3.2040549006207644, "grad_norm": 15.739103317260742, "learning_rate": 1.8013471857517982e-05, "loss": 3.0367, "step": 28130 }, { "epoch": 3.2051939176490687, "grad_norm": 26.22688102722168, "learning_rate": 1.8002055029112914e-05, "loss": 3.0504, "step": 28140 }, { "epoch": 3.2063329346773735, "grad_norm": 16.44894027709961, "learning_rate": 1.7990638200707842e-05, "loss": 3.2259, "step": 28150 }, { "epoch": 3.207471951705678, "grad_norm": 14.065156936645508, "learning_rate": 1.7979221372302778e-05, "loss": 3.5373, "step": 28160 }, { "epoch": 3.2086109687339825, "grad_norm": 16.340435028076172, "learning_rate": 1.7967804543897706e-05, "loss": 3.4307, "step": 28170 }, { "epoch": 3.2097499857622873, "grad_norm": 13.92990493774414, "learning_rate": 1.7956387715492638e-05, "loss": 3.2959, "step": 28180 }, { "epoch": 3.2108890027905916, "grad_norm": 13.67980670928955, "learning_rate": 1.7944970887087566e-05, "loss": 3.1901, "step": 28190 }, { "epoch": 3.2120280198188964, "grad_norm": 58.88777542114258, "learning_rate": 1.79335540586825e-05, "loss": 3.167, "step": 28200 }, { "epoch": 3.2131670368472007, "grad_norm": 17.12492561340332, "learning_rate": 1.792213723027743e-05, "loss": 3.7324, "step": 28210 }, { "epoch": 3.2143060538755055, "grad_norm": 19.977720260620117, "learning_rate": 1.7910720401872362e-05, "loss": 3.1822, "step": 28220 }, { "epoch": 3.21544507090381, "grad_norm": 21.183141708374023, "learning_rate": 1.789930357346729e-05, "loss": 3.1884, "step": 28230 }, { "epoch": 3.2165840879321146, "grad_norm": 20.963783264160156, "learning_rate": 1.7887886745062226e-05, "loss": 3.544, "step": 28240 }, { "epoch": 3.2177231049604194, "grad_norm": 34.167781829833984, "learning_rate": 1.7876469916657154e-05, "loss": 3.2661, "step": 28250 }, { "epoch": 3.2188621219887237, "grad_norm": 17.497081756591797, "learning_rate": 1.7865053088252086e-05, "loss": 2.9077, "step": 28260 }, { "epoch": 3.2200011390170284, "grad_norm": 31.264101028442383, "learning_rate": 1.7853636259847014e-05, "loss": 3.0755, "step": 28270 }, { "epoch": 3.2211401560453328, "grad_norm": 30.14015769958496, "learning_rate": 1.7842219431441946e-05, "loss": 3.9268, "step": 28280 }, { "epoch": 3.2222791730736375, "grad_norm": 15.659712791442871, "learning_rate": 1.7830802603036878e-05, "loss": 3.5625, "step": 28290 }, { "epoch": 3.223418190101942, "grad_norm": 18.148754119873047, "learning_rate": 1.781938577463181e-05, "loss": 2.8605, "step": 28300 }, { "epoch": 3.2245572071302466, "grad_norm": 44.391685485839844, "learning_rate": 1.7807968946226738e-05, "loss": 3.3872, "step": 28310 }, { "epoch": 3.2256962241585514, "grad_norm": 16.15254020690918, "learning_rate": 1.779655211782167e-05, "loss": 3.4028, "step": 28320 }, { "epoch": 3.2268352411868557, "grad_norm": 14.51396369934082, "learning_rate": 1.7785135289416602e-05, "loss": 3.3886, "step": 28330 }, { "epoch": 3.2279742582151605, "grad_norm": 28.465871810913086, "learning_rate": 1.777371846101153e-05, "loss": 3.0538, "step": 28340 }, { "epoch": 3.229113275243465, "grad_norm": 15.167338371276855, "learning_rate": 1.7762301632606462e-05, "loss": 3.4863, "step": 28350 }, { "epoch": 3.2302522922717696, "grad_norm": 15.840221405029297, "learning_rate": 1.7750884804201394e-05, "loss": 2.9079, "step": 28360 }, { "epoch": 3.231391309300074, "grad_norm": 17.186038970947266, "learning_rate": 1.7739467975796326e-05, "loss": 3.5992, "step": 28370 }, { "epoch": 3.2325303263283787, "grad_norm": 14.156516075134277, "learning_rate": 1.7728051147391254e-05, "loss": 3.6097, "step": 28380 }, { "epoch": 3.233669343356683, "grad_norm": 25.355093002319336, "learning_rate": 1.7716634318986186e-05, "loss": 3.3119, "step": 28390 }, { "epoch": 3.2348083603849878, "grad_norm": 26.929956436157227, "learning_rate": 1.7705217490581118e-05, "loss": 3.3645, "step": 28400 }, { "epoch": 3.2359473774132925, "grad_norm": 18.3270263671875, "learning_rate": 1.769380066217605e-05, "loss": 3.2608, "step": 28410 }, { "epoch": 3.237086394441597, "grad_norm": 15.915236473083496, "learning_rate": 1.7682383833770978e-05, "loss": 3.1598, "step": 28420 }, { "epoch": 3.2382254114699016, "grad_norm": 13.311419486999512, "learning_rate": 1.767096700536591e-05, "loss": 3.2362, "step": 28430 }, { "epoch": 3.239364428498206, "grad_norm": 18.324195861816406, "learning_rate": 1.765955017696084e-05, "loss": 3.3171, "step": 28440 }, { "epoch": 3.2405034455265107, "grad_norm": 17.85279083251953, "learning_rate": 1.7648133348555774e-05, "loss": 3.3206, "step": 28450 }, { "epoch": 3.241642462554815, "grad_norm": 15.024346351623535, "learning_rate": 1.763785820299121e-05, "loss": 3.3791, "step": 28460 }, { "epoch": 3.24278147958312, "grad_norm": 24.47608184814453, "learning_rate": 1.762644137458614e-05, "loss": 3.3351, "step": 28470 }, { "epoch": 3.243920496611424, "grad_norm": 16.118467330932617, "learning_rate": 1.7615024546181073e-05, "loss": 3.241, "step": 28480 }, { "epoch": 3.245059513639729, "grad_norm": 14.81120777130127, "learning_rate": 1.7603607717776002e-05, "loss": 3.3641, "step": 28490 }, { "epoch": 3.2461985306680337, "grad_norm": 15.765020370483398, "learning_rate": 1.7592190889370934e-05, "loss": 3.4755, "step": 28500 }, { "epoch": 3.247337547696338, "grad_norm": 22.138263702392578, "learning_rate": 1.7580774060965866e-05, "loss": 3.1655, "step": 28510 }, { "epoch": 3.2484765647246427, "grad_norm": 21.298864364624023, "learning_rate": 1.7569357232560797e-05, "loss": 3.1699, "step": 28520 }, { "epoch": 3.249615581752947, "grad_norm": 13.472810745239258, "learning_rate": 1.7557940404155726e-05, "loss": 2.773, "step": 28530 }, { "epoch": 3.250754598781252, "grad_norm": 17.579721450805664, "learning_rate": 1.7546523575750658e-05, "loss": 3.2553, "step": 28540 }, { "epoch": 3.251893615809556, "grad_norm": 18.842206954956055, "learning_rate": 1.753510674734559e-05, "loss": 3.2994, "step": 28550 }, { "epoch": 3.253032632837861, "grad_norm": 22.36919593811035, "learning_rate": 1.7523689918940518e-05, "loss": 2.9392, "step": 28560 }, { "epoch": 3.2541716498661657, "grad_norm": 21.19603157043457, "learning_rate": 1.751227309053545e-05, "loss": 3.3386, "step": 28570 }, { "epoch": 3.25531066689447, "grad_norm": 19.441781997680664, "learning_rate": 1.750085626213038e-05, "loss": 3.1832, "step": 28580 }, { "epoch": 3.256449683922775, "grad_norm": 22.879182815551758, "learning_rate": 1.7489439433725313e-05, "loss": 3.3685, "step": 28590 }, { "epoch": 3.257588700951079, "grad_norm": 12.05748176574707, "learning_rate": 1.7478022605320242e-05, "loss": 3.0341, "step": 28600 }, { "epoch": 3.258727717979384, "grad_norm": 32.73973083496094, "learning_rate": 1.7466605776915174e-05, "loss": 3.0821, "step": 28610 }, { "epoch": 3.259866735007688, "grad_norm": 18.262226104736328, "learning_rate": 1.7455188948510102e-05, "loss": 3.092, "step": 28620 }, { "epoch": 3.261005752035993, "grad_norm": 19.36309242248535, "learning_rate": 1.7443772120105037e-05, "loss": 3.3871, "step": 28630 }, { "epoch": 3.2621447690642977, "grad_norm": 13.321374893188477, "learning_rate": 1.7432355291699966e-05, "loss": 3.2189, "step": 28640 }, { "epoch": 3.263283786092602, "grad_norm": 16.028152465820312, "learning_rate": 1.7420938463294898e-05, "loss": 3.091, "step": 28650 }, { "epoch": 3.264422803120907, "grad_norm": 15.734786987304688, "learning_rate": 1.7409521634889826e-05, "loss": 3.4497, "step": 28660 }, { "epoch": 3.265561820149211, "grad_norm": 19.153057098388672, "learning_rate": 1.739810480648476e-05, "loss": 3.1372, "step": 28670 }, { "epoch": 3.266700837177516, "grad_norm": 33.36358642578125, "learning_rate": 1.738668797807969e-05, "loss": 3.2049, "step": 28680 }, { "epoch": 3.2678398542058202, "grad_norm": 19.029390335083008, "learning_rate": 1.737527114967462e-05, "loss": 3.0428, "step": 28690 }, { "epoch": 3.268978871234125, "grad_norm": 15.155694007873535, "learning_rate": 1.736385432126955e-05, "loss": 3.1559, "step": 28700 }, { "epoch": 3.2701178882624298, "grad_norm": 18.706727981567383, "learning_rate": 1.7352437492864482e-05, "loss": 3.0554, "step": 28710 }, { "epoch": 3.271256905290734, "grad_norm": 17.440990447998047, "learning_rate": 1.7341020664459414e-05, "loss": 3.55, "step": 28720 }, { "epoch": 3.272395922319039, "grad_norm": 23.9125919342041, "learning_rate": 1.7329603836054346e-05, "loss": 3.3082, "step": 28730 }, { "epoch": 3.273534939347343, "grad_norm": 32.674766540527344, "learning_rate": 1.7318187007649274e-05, "loss": 3.3228, "step": 28740 }, { "epoch": 3.274673956375648, "grad_norm": 14.836384773254395, "learning_rate": 1.7306770179244206e-05, "loss": 3.3278, "step": 28750 }, { "epoch": 3.2758129734039523, "grad_norm": 18.351093292236328, "learning_rate": 1.7295353350839138e-05, "loss": 3.328, "step": 28760 }, { "epoch": 3.276951990432257, "grad_norm": 17.573938369750977, "learning_rate": 1.728393652243407e-05, "loss": 3.2982, "step": 28770 }, { "epoch": 3.2780910074605614, "grad_norm": 19.292461395263672, "learning_rate": 1.7272519694028998e-05, "loss": 3.1923, "step": 28780 }, { "epoch": 3.279230024488866, "grad_norm": 22.580219268798828, "learning_rate": 1.726110286562393e-05, "loss": 3.1601, "step": 28790 }, { "epoch": 3.2803690415171705, "grad_norm": 24.21674346923828, "learning_rate": 1.7249686037218862e-05, "loss": 3.3531, "step": 28800 }, { "epoch": 3.2815080585454752, "grad_norm": 19.79977798461914, "learning_rate": 1.7238269208813794e-05, "loss": 3.2324, "step": 28810 }, { "epoch": 3.28264707557378, "grad_norm": 32.75783920288086, "learning_rate": 1.7226852380408722e-05, "loss": 3.0395, "step": 28820 }, { "epoch": 3.2837860926020843, "grad_norm": 14.975643157958984, "learning_rate": 1.7215435552003654e-05, "loss": 3.1527, "step": 28830 }, { "epoch": 3.284925109630389, "grad_norm": 23.69989585876465, "learning_rate": 1.7204018723598586e-05, "loss": 3.5149, "step": 28840 }, { "epoch": 3.2860641266586934, "grad_norm": 15.182013511657715, "learning_rate": 1.7192601895193518e-05, "loss": 3.5798, "step": 28850 }, { "epoch": 3.287203143686998, "grad_norm": 22.140403747558594, "learning_rate": 1.7181185066788446e-05, "loss": 3.0379, "step": 28860 }, { "epoch": 3.2883421607153025, "grad_norm": 15.012598991394043, "learning_rate": 1.7169768238383378e-05, "loss": 3.3189, "step": 28870 }, { "epoch": 3.2894811777436073, "grad_norm": 23.979196548461914, "learning_rate": 1.715835140997831e-05, "loss": 3.1195, "step": 28880 }, { "epoch": 3.290620194771912, "grad_norm": 19.301538467407227, "learning_rate": 1.714693458157324e-05, "loss": 3.2205, "step": 28890 }, { "epoch": 3.2917592118002164, "grad_norm": 24.4025936126709, "learning_rate": 1.713551775316817e-05, "loss": 3.2443, "step": 28900 }, { "epoch": 3.292898228828521, "grad_norm": 12.656296730041504, "learning_rate": 1.7124100924763102e-05, "loss": 3.0962, "step": 28910 }, { "epoch": 3.2940372458568254, "grad_norm": 84.212890625, "learning_rate": 1.7112684096358034e-05, "loss": 3.1838, "step": 28920 }, { "epoch": 3.29517626288513, "grad_norm": 19.40008544921875, "learning_rate": 1.7101267267952965e-05, "loss": 3.4919, "step": 28930 }, { "epoch": 3.2963152799134345, "grad_norm": 14.742105484008789, "learning_rate": 1.7089850439547894e-05, "loss": 3.4597, "step": 28940 }, { "epoch": 3.2974542969417393, "grad_norm": 27.001304626464844, "learning_rate": 1.7078433611142826e-05, "loss": 3.2382, "step": 28950 }, { "epoch": 3.298593313970044, "grad_norm": 53.63911437988281, "learning_rate": 1.7067016782737758e-05, "loss": 3.3239, "step": 28960 }, { "epoch": 3.2997323309983484, "grad_norm": 86.16913604736328, "learning_rate": 1.705559995433269e-05, "loss": 2.9274, "step": 28970 }, { "epoch": 3.300871348026653, "grad_norm": 15.573724746704102, "learning_rate": 1.7044183125927618e-05, "loss": 3.355, "step": 28980 }, { "epoch": 3.3020103650549575, "grad_norm": 17.94513702392578, "learning_rate": 1.703276629752255e-05, "loss": 3.3487, "step": 28990 }, { "epoch": 3.3031493820832623, "grad_norm": 32.797203063964844, "learning_rate": 1.702134946911748e-05, "loss": 3.099, "step": 29000 }, { "epoch": 3.3042883991115666, "grad_norm": 18.942058563232422, "learning_rate": 1.700993264071241e-05, "loss": 3.6015, "step": 29010 }, { "epoch": 3.3054274161398713, "grad_norm": 36.88163375854492, "learning_rate": 1.6998515812307342e-05, "loss": 3.4283, "step": 29020 }, { "epoch": 3.306566433168176, "grad_norm": 14.28677749633789, "learning_rate": 1.6987098983902274e-05, "loss": 3.3481, "step": 29030 }, { "epoch": 3.3077054501964804, "grad_norm": 13.87667179107666, "learning_rate": 1.6975682155497202e-05, "loss": 3.1402, "step": 29040 }, { "epoch": 3.308844467224785, "grad_norm": 22.34500503540039, "learning_rate": 1.6964265327092134e-05, "loss": 3.1876, "step": 29050 }, { "epoch": 3.3099834842530895, "grad_norm": 13.625529289245605, "learning_rate": 1.6952848498687066e-05, "loss": 3.1295, "step": 29060 }, { "epoch": 3.3111225012813943, "grad_norm": 20.435510635375977, "learning_rate": 1.6941431670281994e-05, "loss": 2.9283, "step": 29070 }, { "epoch": 3.3122615183096986, "grad_norm": 25.591533660888672, "learning_rate": 1.6930014841876926e-05, "loss": 3.3265, "step": 29080 }, { "epoch": 3.3134005353380034, "grad_norm": 17.3295841217041, "learning_rate": 1.6918598013471858e-05, "loss": 3.1512, "step": 29090 }, { "epoch": 3.3145395523663077, "grad_norm": 43.755069732666016, "learning_rate": 1.690718118506679e-05, "loss": 2.9848, "step": 29100 }, { "epoch": 3.3156785693946125, "grad_norm": 16.596729278564453, "learning_rate": 1.6895764356661718e-05, "loss": 3.2103, "step": 29110 }, { "epoch": 3.316817586422917, "grad_norm": 25.127235412597656, "learning_rate": 1.688434752825665e-05, "loss": 3.0511, "step": 29120 }, { "epoch": 3.3179566034512216, "grad_norm": 24.36378288269043, "learning_rate": 1.6872930699851582e-05, "loss": 3.2856, "step": 29130 }, { "epoch": 3.3190956204795263, "grad_norm": 21.73314094543457, "learning_rate": 1.6861513871446514e-05, "loss": 3.1421, "step": 29140 }, { "epoch": 3.3202346375078307, "grad_norm": 17.820711135864258, "learning_rate": 1.6850097043041442e-05, "loss": 3.0204, "step": 29150 }, { "epoch": 3.3213736545361354, "grad_norm": 21.556001663208008, "learning_rate": 1.6838680214636374e-05, "loss": 2.9305, "step": 29160 }, { "epoch": 3.3225126715644397, "grad_norm": 14.502067565917969, "learning_rate": 1.6827263386231306e-05, "loss": 3.0541, "step": 29170 }, { "epoch": 3.3236516885927445, "grad_norm": 26.800825119018555, "learning_rate": 1.6815846557826238e-05, "loss": 3.1339, "step": 29180 }, { "epoch": 3.324790705621049, "grad_norm": 17.516202926635742, "learning_rate": 1.6804429729421166e-05, "loss": 3.11, "step": 29190 }, { "epoch": 3.3259297226493536, "grad_norm": 15.0630521774292, "learning_rate": 1.6793012901016098e-05, "loss": 3.012, "step": 29200 }, { "epoch": 3.3270687396776584, "grad_norm": 30.267250061035156, "learning_rate": 1.678159607261103e-05, "loss": 3.4193, "step": 29210 }, { "epoch": 3.3282077567059627, "grad_norm": 38.177757263183594, "learning_rate": 1.677017924420596e-05, "loss": 3.302, "step": 29220 }, { "epoch": 3.3293467737342675, "grad_norm": 15.641914367675781, "learning_rate": 1.675876241580089e-05, "loss": 3.3494, "step": 29230 }, { "epoch": 3.330485790762572, "grad_norm": 16.158750534057617, "learning_rate": 1.6747345587395822e-05, "loss": 3.3635, "step": 29240 }, { "epoch": 3.3316248077908766, "grad_norm": 20.805484771728516, "learning_rate": 1.6735928758990754e-05, "loss": 3.1683, "step": 29250 }, { "epoch": 3.332763824819181, "grad_norm": 32.77595901489258, "learning_rate": 1.6724511930585686e-05, "loss": 3.324, "step": 29260 }, { "epoch": 3.3339028418474856, "grad_norm": 35.97859573364258, "learning_rate": 1.6713095102180614e-05, "loss": 2.9465, "step": 29270 }, { "epoch": 3.3350418588757904, "grad_norm": 23.729339599609375, "learning_rate": 1.6701678273775546e-05, "loss": 3.0817, "step": 29280 }, { "epoch": 3.3361808759040947, "grad_norm": 15.407066345214844, "learning_rate": 1.6690261445370478e-05, "loss": 3.0837, "step": 29290 }, { "epoch": 3.3373198929323995, "grad_norm": 13.66457462310791, "learning_rate": 1.667884461696541e-05, "loss": 3.2846, "step": 29300 }, { "epoch": 3.338458909960704, "grad_norm": 22.89889907836914, "learning_rate": 1.6667427788560338e-05, "loss": 3.2511, "step": 29310 }, { "epoch": 3.3395979269890086, "grad_norm": 131.8885498046875, "learning_rate": 1.665601096015527e-05, "loss": 3.0335, "step": 29320 }, { "epoch": 3.340736944017313, "grad_norm": 20.44186019897461, "learning_rate": 1.66445941317502e-05, "loss": 3.2246, "step": 29330 }, { "epoch": 3.3418759610456177, "grad_norm": 14.773825645446777, "learning_rate": 1.6633177303345134e-05, "loss": 3.481, "step": 29340 }, { "epoch": 3.3430149780739224, "grad_norm": 46.8711051940918, "learning_rate": 1.6621760474940062e-05, "loss": 2.9988, "step": 29350 }, { "epoch": 3.3441539951022268, "grad_norm": 22.26974868774414, "learning_rate": 1.6610343646534994e-05, "loss": 3.2196, "step": 29360 }, { "epoch": 3.3452930121305315, "grad_norm": 24.883068084716797, "learning_rate": 1.6598926818129926e-05, "loss": 3.1028, "step": 29370 }, { "epoch": 3.346432029158836, "grad_norm": 17.86937141418457, "learning_rate": 1.6587509989724857e-05, "loss": 3.3909, "step": 29380 }, { "epoch": 3.3475710461871406, "grad_norm": 14.344905853271484, "learning_rate": 1.6576093161319786e-05, "loss": 3.6976, "step": 29390 }, { "epoch": 3.348710063215445, "grad_norm": 13.908252716064453, "learning_rate": 1.6564676332914718e-05, "loss": 3.252, "step": 29400 }, { "epoch": 3.3498490802437497, "grad_norm": 42.503692626953125, "learning_rate": 1.6553259504509646e-05, "loss": 3.5162, "step": 29410 }, { "epoch": 3.350988097272054, "grad_norm": 23.021549224853516, "learning_rate": 1.6541842676104578e-05, "loss": 3.2051, "step": 29420 }, { "epoch": 3.352127114300359, "grad_norm": 13.986491203308105, "learning_rate": 1.653042584769951e-05, "loss": 3.0717, "step": 29430 }, { "epoch": 3.353266131328663, "grad_norm": 23.040395736694336, "learning_rate": 1.6519009019294442e-05, "loss": 2.8798, "step": 29440 }, { "epoch": 3.354405148356968, "grad_norm": 31.770923614501953, "learning_rate": 1.650759219088937e-05, "loss": 3.4936, "step": 29450 }, { "epoch": 3.3555441653852727, "grad_norm": 21.78679847717285, "learning_rate": 1.6496175362484302e-05, "loss": 3.3232, "step": 29460 }, { "epoch": 3.356683182413577, "grad_norm": 18.76166534423828, "learning_rate": 1.6484758534079234e-05, "loss": 3.3764, "step": 29470 }, { "epoch": 3.3578221994418818, "grad_norm": 14.343433380126953, "learning_rate": 1.6473341705674166e-05, "loss": 3.4346, "step": 29480 }, { "epoch": 3.358961216470186, "grad_norm": 13.465190887451172, "learning_rate": 1.6461924877269094e-05, "loss": 3.1158, "step": 29490 }, { "epoch": 3.360100233498491, "grad_norm": 20.464893341064453, "learning_rate": 1.6450508048864026e-05, "loss": 3.3114, "step": 29500 }, { "epoch": 3.361239250526795, "grad_norm": 14.541702270507812, "learning_rate": 1.6439091220458958e-05, "loss": 3.0706, "step": 29510 }, { "epoch": 3.3623782675551, "grad_norm": 16.677627563476562, "learning_rate": 1.6427674392053886e-05, "loss": 3.1693, "step": 29520 }, { "epoch": 3.3635172845834047, "grad_norm": 23.162879943847656, "learning_rate": 1.6416257563648818e-05, "loss": 3.3536, "step": 29530 }, { "epoch": 3.364656301611709, "grad_norm": 17.129453659057617, "learning_rate": 1.640484073524375e-05, "loss": 3.1853, "step": 29540 }, { "epoch": 3.365795318640014, "grad_norm": 11.490333557128906, "learning_rate": 1.6393423906838682e-05, "loss": 3.3844, "step": 29550 }, { "epoch": 3.366934335668318, "grad_norm": 16.73991584777832, "learning_rate": 1.638200707843361e-05, "loss": 3.4305, "step": 29560 }, { "epoch": 3.368073352696623, "grad_norm": 22.227336883544922, "learning_rate": 1.6370590250028542e-05, "loss": 3.1781, "step": 29570 }, { "epoch": 3.369212369724927, "grad_norm": 11.860732078552246, "learning_rate": 1.6359173421623474e-05, "loss": 3.0631, "step": 29580 }, { "epoch": 3.370351386753232, "grad_norm": 24.782207489013672, "learning_rate": 1.6347756593218406e-05, "loss": 3.3757, "step": 29590 }, { "epoch": 3.3714904037815367, "grad_norm": 13.964835166931152, "learning_rate": 1.6336339764813334e-05, "loss": 3.3611, "step": 29600 }, { "epoch": 3.372629420809841, "grad_norm": 16.538766860961914, "learning_rate": 1.6324922936408266e-05, "loss": 3.3345, "step": 29610 }, { "epoch": 3.373768437838146, "grad_norm": 13.755891799926758, "learning_rate": 1.6313506108003198e-05, "loss": 3.2177, "step": 29620 }, { "epoch": 3.37490745486645, "grad_norm": 15.727559089660645, "learning_rate": 1.630208927959813e-05, "loss": 3.5218, "step": 29630 }, { "epoch": 3.376046471894755, "grad_norm": 14.589932441711426, "learning_rate": 1.6290672451193058e-05, "loss": 3.0755, "step": 29640 }, { "epoch": 3.3771854889230593, "grad_norm": 14.680362701416016, "learning_rate": 1.627925562278799e-05, "loss": 3.5294, "step": 29650 }, { "epoch": 3.378324505951364, "grad_norm": 20.469858169555664, "learning_rate": 1.6267838794382922e-05, "loss": 3.106, "step": 29660 }, { "epoch": 3.379463522979669, "grad_norm": 16.367835998535156, "learning_rate": 1.6256421965977854e-05, "loss": 3.355, "step": 29670 }, { "epoch": 3.380602540007973, "grad_norm": 19.499868392944336, "learning_rate": 1.6245005137572782e-05, "loss": 3.2907, "step": 29680 }, { "epoch": 3.381741557036278, "grad_norm": 25.876239776611328, "learning_rate": 1.6233588309167714e-05, "loss": 2.9573, "step": 29690 }, { "epoch": 3.382880574064582, "grad_norm": 28.356945037841797, "learning_rate": 1.6222171480762646e-05, "loss": 3.0147, "step": 29700 }, { "epoch": 3.384019591092887, "grad_norm": 20.488754272460938, "learning_rate": 1.6210754652357578e-05, "loss": 2.8789, "step": 29710 }, { "epoch": 3.3851586081211913, "grad_norm": 19.562116622924805, "learning_rate": 1.6199337823952506e-05, "loss": 3.412, "step": 29720 }, { "epoch": 3.386297625149496, "grad_norm": 15.406684875488281, "learning_rate": 1.6187920995547438e-05, "loss": 3.0184, "step": 29730 }, { "epoch": 3.3874366421778004, "grad_norm": 16.705585479736328, "learning_rate": 1.6176504167142366e-05, "loss": 3.0089, "step": 29740 }, { "epoch": 3.388575659206105, "grad_norm": 17.685901641845703, "learning_rate": 1.61650873387373e-05, "loss": 3.8159, "step": 29750 }, { "epoch": 3.3897146762344095, "grad_norm": 14.58100414276123, "learning_rate": 1.615367051033223e-05, "loss": 3.2699, "step": 29760 }, { "epoch": 3.3908536932627142, "grad_norm": 23.760499954223633, "learning_rate": 1.6142253681927162e-05, "loss": 3.0052, "step": 29770 }, { "epoch": 3.391992710291019, "grad_norm": 19.12250328063965, "learning_rate": 1.613083685352209e-05, "loss": 3.2497, "step": 29780 }, { "epoch": 3.3931317273193233, "grad_norm": 13.280375480651855, "learning_rate": 1.6119420025117026e-05, "loss": 3.107, "step": 29790 }, { "epoch": 3.394270744347628, "grad_norm": 17.527719497680664, "learning_rate": 1.6108003196711954e-05, "loss": 2.9071, "step": 29800 }, { "epoch": 3.3954097613759324, "grad_norm": 17.869985580444336, "learning_rate": 1.6096586368306886e-05, "loss": 3.4961, "step": 29810 }, { "epoch": 3.396548778404237, "grad_norm": 21.941728591918945, "learning_rate": 1.6085169539901814e-05, "loss": 3.0918, "step": 29820 }, { "epoch": 3.3976877954325415, "grad_norm": 14.9606351852417, "learning_rate": 1.607375271149675e-05, "loss": 3.3902, "step": 29830 }, { "epoch": 3.3988268124608463, "grad_norm": 11.171098709106445, "learning_rate": 1.6062335883091678e-05, "loss": 3.2473, "step": 29840 }, { "epoch": 3.399965829489151, "grad_norm": 11.946869850158691, "learning_rate": 1.605091905468661e-05, "loss": 3.0678, "step": 29850 }, { "epoch": 3.4011048465174554, "grad_norm": 22.47144889831543, "learning_rate": 1.6039502226281538e-05, "loss": 3.3255, "step": 29860 }, { "epoch": 3.40224386354576, "grad_norm": 14.231310844421387, "learning_rate": 1.602808539787647e-05, "loss": 3.7008, "step": 29870 }, { "epoch": 3.4033828805740645, "grad_norm": 31.942094802856445, "learning_rate": 1.6016668569471402e-05, "loss": 3.1198, "step": 29880 }, { "epoch": 3.4045218976023692, "grad_norm": 85.79116821289062, "learning_rate": 1.6005251741066334e-05, "loss": 3.1192, "step": 29890 }, { "epoch": 3.4056609146306736, "grad_norm": 20.57061004638672, "learning_rate": 1.5993834912661262e-05, "loss": 3.2926, "step": 29900 }, { "epoch": 3.4067999316589783, "grad_norm": 16.892438888549805, "learning_rate": 1.5982418084256194e-05, "loss": 3.2882, "step": 29910 }, { "epoch": 3.407938948687283, "grad_norm": 25.468454360961914, "learning_rate": 1.5971001255851126e-05, "loss": 3.1861, "step": 29920 }, { "epoch": 3.4090779657155874, "grad_norm": 18.283166885375977, "learning_rate": 1.5959584427446054e-05, "loss": 3.0752, "step": 29930 }, { "epoch": 3.410216982743892, "grad_norm": 18.180814743041992, "learning_rate": 1.5948167599040986e-05, "loss": 3.4524, "step": 29940 }, { "epoch": 3.4113559997721965, "grad_norm": 26.990262985229492, "learning_rate": 1.5936750770635918e-05, "loss": 3.2659, "step": 29950 }, { "epoch": 3.4124950168005013, "grad_norm": 16.079538345336914, "learning_rate": 1.592533394223085e-05, "loss": 3.9032, "step": 29960 }, { "epoch": 3.4136340338288056, "grad_norm": 22.733596801757812, "learning_rate": 1.5913917113825778e-05, "loss": 3.2087, "step": 29970 }, { "epoch": 3.4147730508571104, "grad_norm": 23.974925994873047, "learning_rate": 1.590250028542071e-05, "loss": 3.2805, "step": 29980 }, { "epoch": 3.415912067885415, "grad_norm": 18.082353591918945, "learning_rate": 1.5891083457015642e-05, "loss": 3.1268, "step": 29990 }, { "epoch": 3.4170510849137194, "grad_norm": 12.27398681640625, "learning_rate": 1.5879666628610574e-05, "loss": 3.3404, "step": 30000 }, { "epoch": 3.4170510849137194, "eval_loss": 6.625226974487305, "eval_runtime": 10.8277, "eval_samples_per_second": 1.385, "eval_steps_per_second": 0.185, "step": 30000 }, { "epoch": 3.418190101942024, "grad_norm": 15.191031455993652, "learning_rate": 1.5868249800205502e-05, "loss": 3.0973, "step": 30010 }, { "epoch": 3.4193291189703285, "grad_norm": 14.566495895385742, "learning_rate": 1.5856832971800434e-05, "loss": 3.2779, "step": 30020 }, { "epoch": 3.4204681359986333, "grad_norm": 19.6086483001709, "learning_rate": 1.5845416143395366e-05, "loss": 2.934, "step": 30030 }, { "epoch": 3.4216071530269376, "grad_norm": 23.241235733032227, "learning_rate": 1.5833999314990298e-05, "loss": 3.334, "step": 30040 }, { "epoch": 3.4227461700552424, "grad_norm": 14.927157402038574, "learning_rate": 1.5822582486585226e-05, "loss": 3.4852, "step": 30050 }, { "epoch": 3.4238851870835467, "grad_norm": 26.401268005371094, "learning_rate": 1.5811165658180158e-05, "loss": 3.1974, "step": 30060 }, { "epoch": 3.4250242041118515, "grad_norm": 18.7486515045166, "learning_rate": 1.5799748829775086e-05, "loss": 3.0255, "step": 30070 }, { "epoch": 3.426163221140156, "grad_norm": 16.352279663085938, "learning_rate": 1.5788332001370022e-05, "loss": 3.306, "step": 30080 }, { "epoch": 3.4273022381684606, "grad_norm": 16.137226104736328, "learning_rate": 1.577691517296495e-05, "loss": 3.5011, "step": 30090 }, { "epoch": 3.4284412551967653, "grad_norm": 11.79498291015625, "learning_rate": 1.5765498344559882e-05, "loss": 2.9683, "step": 30100 }, { "epoch": 3.4295802722250697, "grad_norm": 55.726356506347656, "learning_rate": 1.575408151615481e-05, "loss": 2.8664, "step": 30110 }, { "epoch": 3.4307192892533744, "grad_norm": 13.45319652557373, "learning_rate": 1.5742664687749746e-05, "loss": 3.3007, "step": 30120 }, { "epoch": 3.4318583062816788, "grad_norm": 34.02750015258789, "learning_rate": 1.5731247859344674e-05, "loss": 3.0453, "step": 30130 }, { "epoch": 3.4329973233099835, "grad_norm": 36.55954360961914, "learning_rate": 1.5719831030939606e-05, "loss": 3.1496, "step": 30140 }, { "epoch": 3.434136340338288, "grad_norm": 27.306396484375, "learning_rate": 1.5708414202534534e-05, "loss": 3.196, "step": 30150 }, { "epoch": 3.4352753573665926, "grad_norm": 12.680956840515137, "learning_rate": 1.569699737412947e-05, "loss": 3.3181, "step": 30160 }, { "epoch": 3.4364143743948974, "grad_norm": 18.007972717285156, "learning_rate": 1.5685580545724398e-05, "loss": 2.9668, "step": 30170 }, { "epoch": 3.4375533914232017, "grad_norm": 19.509634017944336, "learning_rate": 1.567416371731933e-05, "loss": 3.5464, "step": 30180 }, { "epoch": 3.4386924084515065, "grad_norm": 28.20089340209961, "learning_rate": 1.566274688891426e-05, "loss": 3.1637, "step": 30190 }, { "epoch": 3.439831425479811, "grad_norm": 55.81390380859375, "learning_rate": 1.5651330060509194e-05, "loss": 3.2181, "step": 30200 }, { "epoch": 3.4409704425081156, "grad_norm": 17.022470474243164, "learning_rate": 1.5639913232104122e-05, "loss": 3.2408, "step": 30210 }, { "epoch": 3.44210945953642, "grad_norm": 19.492403030395508, "learning_rate": 1.5628496403699054e-05, "loss": 3.2508, "step": 30220 }, { "epoch": 3.4432484765647247, "grad_norm": 22.010845184326172, "learning_rate": 1.5617079575293982e-05, "loss": 3.2779, "step": 30230 }, { "epoch": 3.4443874935930294, "grad_norm": 20.154726028442383, "learning_rate": 1.5605662746888918e-05, "loss": 3.3812, "step": 30240 }, { "epoch": 3.4455265106213337, "grad_norm": 17.139026641845703, "learning_rate": 1.5594245918483846e-05, "loss": 3.1647, "step": 30250 }, { "epoch": 3.4466655276496385, "grad_norm": 17.56887435913086, "learning_rate": 1.5582829090078778e-05, "loss": 3.4022, "step": 30260 }, { "epoch": 3.447804544677943, "grad_norm": 27.469118118286133, "learning_rate": 1.5571412261673706e-05, "loss": 2.9381, "step": 30270 }, { "epoch": 3.4489435617062476, "grad_norm": 26.647045135498047, "learning_rate": 1.555999543326864e-05, "loss": 3.7534, "step": 30280 }, { "epoch": 3.450082578734552, "grad_norm": 31.436521530151367, "learning_rate": 1.554857860486357e-05, "loss": 3.2533, "step": 30290 }, { "epoch": 3.4512215957628567, "grad_norm": 16.83370018005371, "learning_rate": 1.5537161776458502e-05, "loss": 3.0931, "step": 30300 }, { "epoch": 3.4523606127911615, "grad_norm": 16.535404205322266, "learning_rate": 1.552574494805343e-05, "loss": 3.1092, "step": 30310 }, { "epoch": 3.453499629819466, "grad_norm": 30.36872673034668, "learning_rate": 1.5514328119648362e-05, "loss": 2.9677, "step": 30320 }, { "epoch": 3.4546386468477706, "grad_norm": 12.996785163879395, "learning_rate": 1.5502911291243294e-05, "loss": 2.9729, "step": 30330 }, { "epoch": 3.455777663876075, "grad_norm": 45.42363357543945, "learning_rate": 1.5491494462838226e-05, "loss": 2.915, "step": 30340 }, { "epoch": 3.4569166809043796, "grad_norm": 16.04938316345215, "learning_rate": 1.5480077634433154e-05, "loss": 3.538, "step": 30350 }, { "epoch": 3.458055697932684, "grad_norm": 17.409317016601562, "learning_rate": 1.5468660806028086e-05, "loss": 3.0676, "step": 30360 }, { "epoch": 3.4591947149609887, "grad_norm": 26.960065841674805, "learning_rate": 1.5457243977623018e-05, "loss": 3.8267, "step": 30370 }, { "epoch": 3.460333731989293, "grad_norm": 21.414134979248047, "learning_rate": 1.5445827149217946e-05, "loss": 3.2971, "step": 30380 }, { "epoch": 3.461472749017598, "grad_norm": 15.664322853088379, "learning_rate": 1.5434410320812878e-05, "loss": 3.0403, "step": 30390 }, { "epoch": 3.462611766045902, "grad_norm": 39.66256332397461, "learning_rate": 1.542299349240781e-05, "loss": 3.1763, "step": 30400 }, { "epoch": 3.463750783074207, "grad_norm": 18.446809768676758, "learning_rate": 1.5411576664002742e-05, "loss": 3.4536, "step": 30410 }, { "epoch": 3.4648898001025117, "grad_norm": 19.519460678100586, "learning_rate": 1.540015983559767e-05, "loss": 3.5506, "step": 30420 }, { "epoch": 3.466028817130816, "grad_norm": 17.096614837646484, "learning_rate": 1.5388743007192602e-05, "loss": 3.2732, "step": 30430 }, { "epoch": 3.4671678341591208, "grad_norm": 20.875993728637695, "learning_rate": 1.537732617878753e-05, "loss": 2.9171, "step": 30440 }, { "epoch": 3.468306851187425, "grad_norm": 23.673830032348633, "learning_rate": 1.5365909350382466e-05, "loss": 3.0686, "step": 30450 }, { "epoch": 3.46944586821573, "grad_norm": 25.037826538085938, "learning_rate": 1.5354492521977394e-05, "loss": 3.1601, "step": 30460 }, { "epoch": 3.470584885244034, "grad_norm": 18.874675750732422, "learning_rate": 1.5343075693572326e-05, "loss": 3.2137, "step": 30470 }, { "epoch": 3.471723902272339, "grad_norm": 20.864459991455078, "learning_rate": 1.5331658865167255e-05, "loss": 3.2251, "step": 30480 }, { "epoch": 3.4728629193006437, "grad_norm": 12.102560997009277, "learning_rate": 1.532024203676219e-05, "loss": 3.3722, "step": 30490 }, { "epoch": 3.474001936328948, "grad_norm": 19.17115020751953, "learning_rate": 1.5308825208357118e-05, "loss": 3.0327, "step": 30500 }, { "epoch": 3.475140953357253, "grad_norm": 14.776339530944824, "learning_rate": 1.529740837995205e-05, "loss": 3.2387, "step": 30510 }, { "epoch": 3.476279970385557, "grad_norm": 21.124536514282227, "learning_rate": 1.528599155154698e-05, "loss": 3.3011, "step": 30520 }, { "epoch": 3.477418987413862, "grad_norm": 18.17337417602539, "learning_rate": 1.5274574723141914e-05, "loss": 3.2292, "step": 30530 }, { "epoch": 3.4785580044421662, "grad_norm": 14.608512878417969, "learning_rate": 1.5263157894736842e-05, "loss": 3.1605, "step": 30540 }, { "epoch": 3.479697021470471, "grad_norm": 15.62062931060791, "learning_rate": 1.5251741066331774e-05, "loss": 3.2276, "step": 30550 }, { "epoch": 3.4808360384987758, "grad_norm": 39.68000411987305, "learning_rate": 1.5240324237926704e-05, "loss": 3.7446, "step": 30560 }, { "epoch": 3.48197505552708, "grad_norm": 28.0665340423584, "learning_rate": 1.5228907409521636e-05, "loss": 2.9518, "step": 30570 }, { "epoch": 3.483114072555385, "grad_norm": 14.938121795654297, "learning_rate": 1.5217490581116566e-05, "loss": 3.2497, "step": 30580 }, { "epoch": 3.484253089583689, "grad_norm": 15.683838844299316, "learning_rate": 1.5206073752711496e-05, "loss": 2.9266, "step": 30590 }, { "epoch": 3.485392106611994, "grad_norm": 36.78779602050781, "learning_rate": 1.5194656924306428e-05, "loss": 3.3181, "step": 30600 }, { "epoch": 3.4865311236402983, "grad_norm": 21.261545181274414, "learning_rate": 1.518324009590136e-05, "loss": 3.148, "step": 30610 }, { "epoch": 3.487670140668603, "grad_norm": 15.538030624389648, "learning_rate": 1.517182326749629e-05, "loss": 3.21, "step": 30620 }, { "epoch": 3.488809157696908, "grad_norm": 16.829191207885742, "learning_rate": 1.516040643909122e-05, "loss": 3.0878, "step": 30630 }, { "epoch": 3.489948174725212, "grad_norm": 18.444971084594727, "learning_rate": 1.514898961068615e-05, "loss": 3.414, "step": 30640 }, { "epoch": 3.491087191753517, "grad_norm": 20.05096435546875, "learning_rate": 1.5137572782281084e-05, "loss": 3.4709, "step": 30650 }, { "epoch": 3.492226208781821, "grad_norm": 18.415931701660156, "learning_rate": 1.5126155953876014e-05, "loss": 3.0899, "step": 30660 }, { "epoch": 3.493365225810126, "grad_norm": 17.503332138061523, "learning_rate": 1.5114739125470944e-05, "loss": 3.2454, "step": 30670 }, { "epoch": 3.4945042428384303, "grad_norm": 12.981982231140137, "learning_rate": 1.5103322297065874e-05, "loss": 3.6228, "step": 30680 }, { "epoch": 3.495643259866735, "grad_norm": 13.89370346069336, "learning_rate": 1.5091905468660808e-05, "loss": 2.9744, "step": 30690 }, { "epoch": 3.4967822768950394, "grad_norm": 17.46722984313965, "learning_rate": 1.5080488640255738e-05, "loss": 2.9983, "step": 30700 }, { "epoch": 3.497921293923344, "grad_norm": 20.388866424560547, "learning_rate": 1.5069071811850668e-05, "loss": 3.3692, "step": 30710 }, { "epoch": 3.4990603109516485, "grad_norm": 18.890443801879883, "learning_rate": 1.5057654983445598e-05, "loss": 3.1463, "step": 30720 }, { "epoch": 3.5001993279799533, "grad_norm": 20.678749084472656, "learning_rate": 1.5046238155040532e-05, "loss": 3.3334, "step": 30730 }, { "epoch": 3.501338345008258, "grad_norm": 35.57350158691406, "learning_rate": 1.5034821326635462e-05, "loss": 3.0429, "step": 30740 }, { "epoch": 3.5024773620365623, "grad_norm": 16.265722274780273, "learning_rate": 1.5023404498230392e-05, "loss": 3.4358, "step": 30750 }, { "epoch": 3.503616379064867, "grad_norm": 21.30552864074707, "learning_rate": 1.5011987669825322e-05, "loss": 3.4364, "step": 30760 }, { "epoch": 3.5047553960931714, "grad_norm": 14.750267028808594, "learning_rate": 1.5000570841420252e-05, "loss": 3.2435, "step": 30770 }, { "epoch": 3.505894413121476, "grad_norm": 22.640623092651367, "learning_rate": 1.4989154013015186e-05, "loss": 2.9251, "step": 30780 }, { "epoch": 3.5070334301497805, "grad_norm": 15.266586303710938, "learning_rate": 1.4977737184610116e-05, "loss": 3.1723, "step": 30790 }, { "epoch": 3.5081724471780853, "grad_norm": 17.580577850341797, "learning_rate": 1.4966320356205046e-05, "loss": 3.2163, "step": 30800 }, { "epoch": 3.50931146420639, "grad_norm": 25.055023193359375, "learning_rate": 1.4954903527799976e-05, "loss": 3.2747, "step": 30810 }, { "epoch": 3.5104504812346944, "grad_norm": 15.265680313110352, "learning_rate": 1.494348669939491e-05, "loss": 2.945, "step": 30820 }, { "epoch": 3.511589498262999, "grad_norm": 43.274559020996094, "learning_rate": 1.493206987098984e-05, "loss": 3.0168, "step": 30830 }, { "epoch": 3.5127285152913035, "grad_norm": 17.391170501708984, "learning_rate": 1.492065304258477e-05, "loss": 3.5466, "step": 30840 }, { "epoch": 3.5138675323196082, "grad_norm": 34.3690071105957, "learning_rate": 1.49092362141797e-05, "loss": 3.611, "step": 30850 }, { "epoch": 3.5150065493479126, "grad_norm": 18.427181243896484, "learning_rate": 1.4897819385774634e-05, "loss": 3.0353, "step": 30860 }, { "epoch": 3.5161455663762173, "grad_norm": 22.536880493164062, "learning_rate": 1.4886402557369564e-05, "loss": 3.4574, "step": 30870 }, { "epoch": 3.517284583404522, "grad_norm": 17.170167922973633, "learning_rate": 1.4874985728964494e-05, "loss": 3.3474, "step": 30880 }, { "epoch": 3.5184236004328264, "grad_norm": 22.633150100708008, "learning_rate": 1.4863568900559424e-05, "loss": 3.0808, "step": 30890 }, { "epoch": 3.519562617461131, "grad_norm": 24.105703353881836, "learning_rate": 1.4852152072154358e-05, "loss": 3.5359, "step": 30900 }, { "epoch": 3.5207016344894355, "grad_norm": 16.735557556152344, "learning_rate": 1.4840735243749288e-05, "loss": 3.5234, "step": 30910 }, { "epoch": 3.5218406515177403, "grad_norm": 17.64985466003418, "learning_rate": 1.4829318415344218e-05, "loss": 3.2298, "step": 30920 }, { "epoch": 3.5229796685460446, "grad_norm": 33.44835662841797, "learning_rate": 1.4817901586939148e-05, "loss": 3.3543, "step": 30930 }, { "epoch": 3.5241186855743494, "grad_norm": 28.6575984954834, "learning_rate": 1.4806484758534082e-05, "loss": 3.1647, "step": 30940 }, { "epoch": 3.525257702602654, "grad_norm": 16.440593719482422, "learning_rate": 1.4795067930129012e-05, "loss": 3.3117, "step": 30950 }, { "epoch": 3.5263967196309585, "grad_norm": 26.83173370361328, "learning_rate": 1.4783651101723942e-05, "loss": 3.2006, "step": 30960 }, { "epoch": 3.527535736659263, "grad_norm": 17.046058654785156, "learning_rate": 1.4772234273318872e-05, "loss": 3.1932, "step": 30970 }, { "epoch": 3.5286747536875676, "grad_norm": 32.76432418823242, "learning_rate": 1.4760817444913804e-05, "loss": 3.6004, "step": 30980 }, { "epoch": 3.5298137707158723, "grad_norm": 15.658669471740723, "learning_rate": 1.4749400616508734e-05, "loss": 3.0127, "step": 30990 }, { "epoch": 3.5309527877441766, "grad_norm": 24.679824829101562, "learning_rate": 1.4737983788103666e-05, "loss": 3.3527, "step": 31000 }, { "epoch": 3.5320918047724814, "grad_norm": 11.274155616760254, "learning_rate": 1.4726566959698596e-05, "loss": 2.9699, "step": 31010 }, { "epoch": 3.533230821800786, "grad_norm": 10.21949291229248, "learning_rate": 1.4715150131293528e-05, "loss": 3.4614, "step": 31020 }, { "epoch": 3.5343698388290905, "grad_norm": 13.77395248413086, "learning_rate": 1.4703733302888458e-05, "loss": 3.1646, "step": 31030 }, { "epoch": 3.535508855857395, "grad_norm": 21.251293182373047, "learning_rate": 1.4692316474483388e-05, "loss": 3.286, "step": 31040 }, { "epoch": 3.5366478728856996, "grad_norm": 13.558059692382812, "learning_rate": 1.468089964607832e-05, "loss": 3.0235, "step": 31050 }, { "epoch": 3.5377868899140044, "grad_norm": 46.95809555053711, "learning_rate": 1.4669482817673252e-05, "loss": 3.0121, "step": 31060 }, { "epoch": 3.5389259069423087, "grad_norm": 17.31770896911621, "learning_rate": 1.4658065989268182e-05, "loss": 3.4192, "step": 31070 }, { "epoch": 3.5400649239706135, "grad_norm": 17.153045654296875, "learning_rate": 1.4646649160863112e-05, "loss": 3.0657, "step": 31080 }, { "epoch": 3.5412039409989178, "grad_norm": 19.619625091552734, "learning_rate": 1.4635232332458042e-05, "loss": 2.9207, "step": 31090 }, { "epoch": 3.5423429580272225, "grad_norm": 23.412649154663086, "learning_rate": 1.4623815504052973e-05, "loss": 3.4202, "step": 31100 }, { "epoch": 3.543481975055527, "grad_norm": 24.09189224243164, "learning_rate": 1.4612398675647906e-05, "loss": 3.1347, "step": 31110 }, { "epoch": 3.5446209920838316, "grad_norm": 28.67925453186035, "learning_rate": 1.4600981847242836e-05, "loss": 3.0364, "step": 31120 }, { "epoch": 3.5457600091121364, "grad_norm": 25.847000122070312, "learning_rate": 1.4589565018837766e-05, "loss": 3.2325, "step": 31130 }, { "epoch": 3.5468990261404407, "grad_norm": 12.999897956848145, "learning_rate": 1.4578148190432697e-05, "loss": 3.3743, "step": 31140 }, { "epoch": 3.5480380431687455, "grad_norm": 32.32277297973633, "learning_rate": 1.456673136202763e-05, "loss": 3.0741, "step": 31150 }, { "epoch": 3.54917706019705, "grad_norm": 22.80971908569336, "learning_rate": 1.455531453362256e-05, "loss": 3.1639, "step": 31160 }, { "epoch": 3.5503160772253546, "grad_norm": 28.205598831176758, "learning_rate": 1.454389770521749e-05, "loss": 3.3927, "step": 31170 }, { "epoch": 3.551455094253659, "grad_norm": 26.13220977783203, "learning_rate": 1.453248087681242e-05, "loss": 3.2931, "step": 31180 }, { "epoch": 3.5525941112819637, "grad_norm": 17.164941787719727, "learning_rate": 1.4521064048407354e-05, "loss": 3.5586, "step": 31190 }, { "epoch": 3.5537331283102684, "grad_norm": 21.556045532226562, "learning_rate": 1.4509647220002284e-05, "loss": 3.3343, "step": 31200 }, { "epoch": 3.5548721453385728, "grad_norm": 33.504573822021484, "learning_rate": 1.4498230391597214e-05, "loss": 3.0983, "step": 31210 }, { "epoch": 3.5560111623668775, "grad_norm": 14.091229438781738, "learning_rate": 1.4486813563192144e-05, "loss": 2.8769, "step": 31220 }, { "epoch": 3.557150179395182, "grad_norm": 21.432533264160156, "learning_rate": 1.4475396734787078e-05, "loss": 3.091, "step": 31230 }, { "epoch": 3.5582891964234866, "grad_norm": 18.884870529174805, "learning_rate": 1.4463979906382008e-05, "loss": 3.3367, "step": 31240 }, { "epoch": 3.559428213451791, "grad_norm": 16.285051345825195, "learning_rate": 1.4452563077976938e-05, "loss": 3.1689, "step": 31250 }, { "epoch": 3.5605672304800957, "grad_norm": 23.484182357788086, "learning_rate": 1.4441146249571868e-05, "loss": 3.1922, "step": 31260 }, { "epoch": 3.5617062475084005, "grad_norm": 15.000371932983398, "learning_rate": 1.4429729421166802e-05, "loss": 3.5585, "step": 31270 }, { "epoch": 3.562845264536705, "grad_norm": 33.48192596435547, "learning_rate": 1.4418312592761732e-05, "loss": 3.3127, "step": 31280 }, { "epoch": 3.563984281565009, "grad_norm": 27.877798080444336, "learning_rate": 1.4406895764356662e-05, "loss": 3.4143, "step": 31290 }, { "epoch": 3.565123298593314, "grad_norm": 22.70536994934082, "learning_rate": 1.4395478935951592e-05, "loss": 3.316, "step": 31300 }, { "epoch": 3.5662623156216187, "grad_norm": 20.055557250976562, "learning_rate": 1.4384062107546526e-05, "loss": 3.3235, "step": 31310 }, { "epoch": 3.567401332649923, "grad_norm": 15.23216724395752, "learning_rate": 1.4372645279141456e-05, "loss": 3.4314, "step": 31320 }, { "epoch": 3.5685403496782278, "grad_norm": 22.534870147705078, "learning_rate": 1.4361228450736386e-05, "loss": 3.1412, "step": 31330 }, { "epoch": 3.5696793667065325, "grad_norm": 17.03142738342285, "learning_rate": 1.4349811622331316e-05, "loss": 3.5653, "step": 31340 }, { "epoch": 3.570818383734837, "grad_norm": 17.72332000732422, "learning_rate": 1.433839479392625e-05, "loss": 3.2854, "step": 31350 }, { "epoch": 3.571957400763141, "grad_norm": 18.860029220581055, "learning_rate": 1.432697796552118e-05, "loss": 3.3297, "step": 31360 }, { "epoch": 3.573096417791446, "grad_norm": 15.948371887207031, "learning_rate": 1.431556113711611e-05, "loss": 3.2231, "step": 31370 }, { "epoch": 3.5742354348197507, "grad_norm": 14.283364295959473, "learning_rate": 1.430414430871104e-05, "loss": 3.0603, "step": 31380 }, { "epoch": 3.575374451848055, "grad_norm": 23.466094970703125, "learning_rate": 1.4292727480305972e-05, "loss": 3.3291, "step": 31390 }, { "epoch": 3.57651346887636, "grad_norm": 14.758607864379883, "learning_rate": 1.4281310651900904e-05, "loss": 3.2652, "step": 31400 }, { "epoch": 3.577652485904664, "grad_norm": 29.233253479003906, "learning_rate": 1.4269893823495834e-05, "loss": 2.9194, "step": 31410 }, { "epoch": 3.578791502932969, "grad_norm": 93.158935546875, "learning_rate": 1.4258476995090764e-05, "loss": 3.2382, "step": 31420 }, { "epoch": 3.579930519961273, "grad_norm": 15.761682510375977, "learning_rate": 1.4247060166685696e-05, "loss": 3.5805, "step": 31430 }, { "epoch": 3.581069536989578, "grad_norm": 20.996610641479492, "learning_rate": 1.4235643338280626e-05, "loss": 3.0408, "step": 31440 }, { "epoch": 3.5822085540178827, "grad_norm": 38.895912170410156, "learning_rate": 1.4224226509875558e-05, "loss": 3.1143, "step": 31450 }, { "epoch": 3.583347571046187, "grad_norm": 18.48572540283203, "learning_rate": 1.4212809681470488e-05, "loss": 3.1559, "step": 31460 }, { "epoch": 3.584486588074492, "grad_norm": 19.490568161010742, "learning_rate": 1.4202534535905926e-05, "loss": 3.606, "step": 31470 }, { "epoch": 3.585625605102796, "grad_norm": 17.260679244995117, "learning_rate": 1.4191117707500856e-05, "loss": 3.3253, "step": 31480 }, { "epoch": 3.586764622131101, "grad_norm": 23.358652114868164, "learning_rate": 1.417970087909579e-05, "loss": 3.1187, "step": 31490 }, { "epoch": 3.5879036391594052, "grad_norm": 23.210037231445312, "learning_rate": 1.416828405069072e-05, "loss": 3.2333, "step": 31500 }, { "epoch": 3.58904265618771, "grad_norm": 18.232166290283203, "learning_rate": 1.415686722228565e-05, "loss": 3.274, "step": 31510 }, { "epoch": 3.590181673216015, "grad_norm": 35.253353118896484, "learning_rate": 1.414545039388058e-05, "loss": 2.9219, "step": 31520 }, { "epoch": 3.591320690244319, "grad_norm": 14.418645858764648, "learning_rate": 1.4134033565475514e-05, "loss": 3.354, "step": 31530 }, { "epoch": 3.592459707272624, "grad_norm": 13.531665802001953, "learning_rate": 1.4122616737070444e-05, "loss": 3.4503, "step": 31540 }, { "epoch": 3.593598724300928, "grad_norm": 16.598588943481445, "learning_rate": 1.4111199908665374e-05, "loss": 3.4679, "step": 31550 }, { "epoch": 3.594737741329233, "grad_norm": 16.628454208374023, "learning_rate": 1.4099783080260304e-05, "loss": 3.4018, "step": 31560 }, { "epoch": 3.5958767583575373, "grad_norm": 16.88093376159668, "learning_rate": 1.4088366251855234e-05, "loss": 3.5366, "step": 31570 }, { "epoch": 3.597015775385842, "grad_norm": 23.86626625061035, "learning_rate": 1.4076949423450168e-05, "loss": 3.271, "step": 31580 }, { "epoch": 3.598154792414147, "grad_norm": 15.128466606140137, "learning_rate": 1.4065532595045098e-05, "loss": 3.163, "step": 31590 }, { "epoch": 3.599293809442451, "grad_norm": 18.111583709716797, "learning_rate": 1.4054115766640028e-05, "loss": 3.3405, "step": 31600 }, { "epoch": 3.6004328264707555, "grad_norm": 19.99451446533203, "learning_rate": 1.4042698938234958e-05, "loss": 3.4593, "step": 31610 }, { "epoch": 3.6015718434990602, "grad_norm": 14.503628730773926, "learning_rate": 1.4031282109829892e-05, "loss": 3.412, "step": 31620 }, { "epoch": 3.602710860527365, "grad_norm": 21.69760513305664, "learning_rate": 1.4019865281424822e-05, "loss": 3.3879, "step": 31630 }, { "epoch": 3.6038498775556693, "grad_norm": 15.388642311096191, "learning_rate": 1.4008448453019752e-05, "loss": 2.9837, "step": 31640 }, { "epoch": 3.604988894583974, "grad_norm": 20.263608932495117, "learning_rate": 1.3997031624614682e-05, "loss": 3.4145, "step": 31650 }, { "epoch": 3.606127911612279, "grad_norm": 18.486692428588867, "learning_rate": 1.3985614796209614e-05, "loss": 3.4104, "step": 31660 }, { "epoch": 3.607266928640583, "grad_norm": 18.97177505493164, "learning_rate": 1.3974197967804544e-05, "loss": 3.341, "step": 31670 }, { "epoch": 3.6084059456688875, "grad_norm": 24.558284759521484, "learning_rate": 1.3962781139399476e-05, "loss": 3.3603, "step": 31680 }, { "epoch": 3.6095449626971923, "grad_norm": 28.811580657958984, "learning_rate": 1.3951364310994406e-05, "loss": 2.9241, "step": 31690 }, { "epoch": 3.610683979725497, "grad_norm": 20.004981994628906, "learning_rate": 1.3939947482589338e-05, "loss": 3.2403, "step": 31700 }, { "epoch": 3.6118229967538014, "grad_norm": 23.830997467041016, "learning_rate": 1.3928530654184268e-05, "loss": 3.1013, "step": 31710 }, { "epoch": 3.612962013782106, "grad_norm": 18.13580894470215, "learning_rate": 1.3917113825779198e-05, "loss": 3.2296, "step": 31720 }, { "epoch": 3.6141010308104105, "grad_norm": 38.78871536254883, "learning_rate": 1.390569699737413e-05, "loss": 2.7775, "step": 31730 }, { "epoch": 3.615240047838715, "grad_norm": 19.123943328857422, "learning_rate": 1.3894280168969062e-05, "loss": 3.4268, "step": 31740 }, { "epoch": 3.6163790648670195, "grad_norm": 78.86042785644531, "learning_rate": 1.3882863340563992e-05, "loss": 3.2857, "step": 31750 }, { "epoch": 3.6175180818953243, "grad_norm": 15.203715324401855, "learning_rate": 1.3871446512158922e-05, "loss": 3.6243, "step": 31760 }, { "epoch": 3.618657098923629, "grad_norm": 12.27120304107666, "learning_rate": 1.3860029683753852e-05, "loss": 3.2453, "step": 31770 }, { "epoch": 3.6197961159519334, "grad_norm": 13.538084983825684, "learning_rate": 1.3848612855348786e-05, "loss": 3.5178, "step": 31780 }, { "epoch": 3.620935132980238, "grad_norm": 14.385091781616211, "learning_rate": 1.3837196026943716e-05, "loss": 3.0514, "step": 31790 }, { "epoch": 3.6220741500085425, "grad_norm": 16.23875617980957, "learning_rate": 1.3825779198538646e-05, "loss": 3.1698, "step": 31800 }, { "epoch": 3.6232131670368473, "grad_norm": 20.820890426635742, "learning_rate": 1.3814362370133576e-05, "loss": 3.1269, "step": 31810 }, { "epoch": 3.6243521840651516, "grad_norm": 32.737640380859375, "learning_rate": 1.380294554172851e-05, "loss": 3.2673, "step": 31820 }, { "epoch": 3.6254912010934564, "grad_norm": 40.11422348022461, "learning_rate": 1.379152871332344e-05, "loss": 3.286, "step": 31830 }, { "epoch": 3.626630218121761, "grad_norm": 17.645458221435547, "learning_rate": 1.378011188491837e-05, "loss": 3.0182, "step": 31840 }, { "epoch": 3.6277692351500654, "grad_norm": 26.229267120361328, "learning_rate": 1.37686950565133e-05, "loss": 3.103, "step": 31850 }, { "epoch": 3.62890825217837, "grad_norm": 16.306425094604492, "learning_rate": 1.3757278228108234e-05, "loss": 3.0238, "step": 31860 }, { "epoch": 3.6300472692066745, "grad_norm": 17.451791763305664, "learning_rate": 1.3745861399703164e-05, "loss": 3.1266, "step": 31870 }, { "epoch": 3.6311862862349793, "grad_norm": 9.61365795135498, "learning_rate": 1.3734444571298094e-05, "loss": 3.0935, "step": 31880 }, { "epoch": 3.6323253032632836, "grad_norm": 27.982622146606445, "learning_rate": 1.3723027742893024e-05, "loss": 3.1049, "step": 31890 }, { "epoch": 3.6334643202915884, "grad_norm": 18.57550048828125, "learning_rate": 1.3711610914487954e-05, "loss": 3.1093, "step": 31900 }, { "epoch": 3.634603337319893, "grad_norm": 13.69741153717041, "learning_rate": 1.3700194086082888e-05, "loss": 3.0639, "step": 31910 }, { "epoch": 3.6357423543481975, "grad_norm": 15.091955184936523, "learning_rate": 1.3688777257677818e-05, "loss": 3.1179, "step": 31920 }, { "epoch": 3.636881371376502, "grad_norm": 15.062325477600098, "learning_rate": 1.3677360429272748e-05, "loss": 3.6283, "step": 31930 }, { "epoch": 3.6380203884048066, "grad_norm": 26.963247299194336, "learning_rate": 1.3665943600867678e-05, "loss": 3.4437, "step": 31940 }, { "epoch": 3.6391594054331113, "grad_norm": 14.558040618896484, "learning_rate": 1.3654526772462612e-05, "loss": 3.0342, "step": 31950 }, { "epoch": 3.6402984224614157, "grad_norm": 15.125991821289062, "learning_rate": 1.3643109944057542e-05, "loss": 3.0543, "step": 31960 }, { "epoch": 3.6414374394897204, "grad_norm": 73.07426452636719, "learning_rate": 1.3631693115652472e-05, "loss": 2.9307, "step": 31970 }, { "epoch": 3.642576456518025, "grad_norm": 19.4658145904541, "learning_rate": 1.3620276287247402e-05, "loss": 3.3872, "step": 31980 }, { "epoch": 3.6437154735463295, "grad_norm": 54.43883514404297, "learning_rate": 1.3608859458842336e-05, "loss": 3.3176, "step": 31990 }, { "epoch": 3.644854490574634, "grad_norm": 11.834235191345215, "learning_rate": 1.3597442630437266e-05, "loss": 3.077, "step": 32000 }, { "epoch": 3.644854490574634, "eval_loss": 6.616151809692383, "eval_runtime": 12.3895, "eval_samples_per_second": 1.211, "eval_steps_per_second": 0.161, "step": 32000 } ], "logging_steps": 10, "max_steps": 43895, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }