{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 7000, "global_step": 37494, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 9.306169509887695, "eval_runtime": 10.9126, "eval_samples_per_second": 3.665, "eval_steps_per_second": 0.458, "step": 0 }, { "epoch": 0.0008001280204832773, "grad_norm": 8.51533031463623, "learning_rate": 3.5000000000000004e-06, "loss": 8.786, "step": 10 }, { "epoch": 0.0016002560409665546, "grad_norm": 10.90935230255127, "learning_rate": 8.500000000000002e-06, "loss": 8.3433, "step": 20 }, { "epoch": 0.002400384061449832, "grad_norm": 7.269016265869141, "learning_rate": 1.3500000000000001e-05, "loss": 7.549, "step": 30 }, { "epoch": 0.003200512081933109, "grad_norm": 8.790578842163086, "learning_rate": 1.85e-05, "loss": 7.2574, "step": 40 }, { "epoch": 0.004000640102416387, "grad_norm": 6.52068567276001, "learning_rate": 2.35e-05, "loss": 7.0024, "step": 50 }, { "epoch": 0.004800768122899664, "grad_norm": 6.902959823608398, "learning_rate": 2.8499999999999998e-05, "loss": 6.9074, "step": 60 }, { "epoch": 0.005600896143382941, "grad_norm": 5.350945949554443, "learning_rate": 3.35e-05, "loss": 6.8765, "step": 70 }, { "epoch": 0.006401024163866218, "grad_norm": 5.928489685058594, "learning_rate": 3.85e-05, "loss": 6.5663, "step": 80 }, { "epoch": 0.007201152184349496, "grad_norm": 9.222543716430664, "learning_rate": 4.35e-05, "loss": 6.6131, "step": 90 }, { "epoch": 0.008001280204832774, "grad_norm": 6.57027006149292, "learning_rate": 4.85e-05, "loss": 6.5829, "step": 100 }, { "epoch": 0.00880140822531605, "grad_norm": 5.280848503112793, "learning_rate": 4.999064020965931e-05, "loss": 6.5996, "step": 110 }, { "epoch": 0.009601536245799328, "grad_norm": 5.950971603393555, "learning_rate": 4.997726908060117e-05, "loss": 6.6075, "step": 120 }, { "epoch": 0.010401664266282605, "grad_norm": 4.300549507141113, "learning_rate": 
4.996389795154303e-05, "loss": 6.5074, "step": 130 }, { "epoch": 0.011201792286765882, "grad_norm": 4.824333190917969, "learning_rate": 4.9950526822484896e-05, "loss": 6.6072, "step": 140 }, { "epoch": 0.01200192030724916, "grad_norm": 5.4324116706848145, "learning_rate": 4.993715569342676e-05, "loss": 6.6183, "step": 150 }, { "epoch": 0.012802048327732437, "grad_norm": 4.087579250335693, "learning_rate": 4.992378456436862e-05, "loss": 6.4806, "step": 160 }, { "epoch": 0.013602176348215714, "grad_norm": 7.260207653045654, "learning_rate": 4.9910413435310484e-05, "loss": 6.3709, "step": 170 }, { "epoch": 0.014402304368698993, "grad_norm": 4.145061016082764, "learning_rate": 4.9897042306252346e-05, "loss": 6.2951, "step": 180 }, { "epoch": 0.01520243238918227, "grad_norm": 3.2026450634002686, "learning_rate": 4.98836711771942e-05, "loss": 6.3255, "step": 190 }, { "epoch": 0.016002560409665547, "grad_norm": 3.443145751953125, "learning_rate": 4.9870300048136065e-05, "loss": 6.4894, "step": 200 }, { "epoch": 0.016802688430148822, "grad_norm": 5.324231147766113, "learning_rate": 4.985692891907793e-05, "loss": 6.4312, "step": 210 }, { "epoch": 0.0176028164506321, "grad_norm": 3.2833452224731445, "learning_rate": 4.984355779001979e-05, "loss": 6.513, "step": 220 }, { "epoch": 0.018402944471115377, "grad_norm": 3.8984358310699463, "learning_rate": 4.983018666096165e-05, "loss": 6.1683, "step": 230 }, { "epoch": 0.019203072491598656, "grad_norm": 4.183676719665527, "learning_rate": 4.9816815531903516e-05, "loss": 6.329, "step": 240 }, { "epoch": 0.020003200512081935, "grad_norm": 3.136693239212036, "learning_rate": 4.980344440284538e-05, "loss": 6.466, "step": 250 }, { "epoch": 0.02080332853256521, "grad_norm": 4.185967445373535, "learning_rate": 4.979007327378724e-05, "loss": 6.4613, "step": 260 }, { "epoch": 0.02160345655304849, "grad_norm": 3.105653762817383, "learning_rate": 4.9776702144729104e-05, "loss": 6.3596, "step": 270 }, { "epoch": 0.022403584573531764, 
"grad_norm": 3.927561044692993, "learning_rate": 4.9763331015670967e-05, "loss": 6.2604, "step": 280 }, { "epoch": 0.023203712594015043, "grad_norm": 3.513439178466797, "learning_rate": 4.974995988661283e-05, "loss": 6.2747, "step": 290 }, { "epoch": 0.02400384061449832, "grad_norm": 3.07377290725708, "learning_rate": 4.973658875755469e-05, "loss": 6.202, "step": 300 }, { "epoch": 0.024803968634981598, "grad_norm": 3.045619249343872, "learning_rate": 4.9723217628496555e-05, "loss": 6.1022, "step": 310 }, { "epoch": 0.025604096655464873, "grad_norm": 3.330648183822632, "learning_rate": 4.970984649943842e-05, "loss": 6.1544, "step": 320 }, { "epoch": 0.026404224675948152, "grad_norm": 3.0299668312072754, "learning_rate": 4.969647537038028e-05, "loss": 6.3119, "step": 330 }, { "epoch": 0.027204352696431428, "grad_norm": 3.687938928604126, "learning_rate": 4.9683104241322136e-05, "loss": 6.333, "step": 340 }, { "epoch": 0.028004480716914706, "grad_norm": 4.0919413566589355, "learning_rate": 4.9669733112264e-05, "loss": 6.1711, "step": 350 }, { "epoch": 0.028804608737397985, "grad_norm": 3.1327242851257324, "learning_rate": 4.965636198320586e-05, "loss": 6.3365, "step": 360 }, { "epoch": 0.02960473675788126, "grad_norm": 4.531859874725342, "learning_rate": 4.9642990854147724e-05, "loss": 6.2121, "step": 370 }, { "epoch": 0.03040486477836454, "grad_norm": 2.522672414779663, "learning_rate": 4.962961972508959e-05, "loss": 6.2388, "step": 380 }, { "epoch": 0.031204992798847815, "grad_norm": 5.62153959274292, "learning_rate": 4.961624859603145e-05, "loss": 6.168, "step": 390 }, { "epoch": 0.032005120819331094, "grad_norm": 3.522804021835327, "learning_rate": 4.960287746697331e-05, "loss": 6.1207, "step": 400 }, { "epoch": 0.03280524883981437, "grad_norm": 7.260324478149414, "learning_rate": 4.9589506337915175e-05, "loss": 6.31, "step": 410 }, { "epoch": 0.033605376860297645, "grad_norm": 4.309441566467285, "learning_rate": 4.957613520885704e-05, "loss": 6.1107, "step": 420 
}, { "epoch": 0.034405504880780924, "grad_norm": 3.2409913539886475, "learning_rate": 4.95627640797989e-05, "loss": 6.2082, "step": 430 }, { "epoch": 0.0352056329012642, "grad_norm": 3.9414610862731934, "learning_rate": 4.954939295074076e-05, "loss": 6.2102, "step": 440 }, { "epoch": 0.03600576092174748, "grad_norm": 2.441235303878784, "learning_rate": 4.9536021821682626e-05, "loss": 6.1023, "step": 450 }, { "epoch": 0.036805888942230754, "grad_norm": 2.997591972351074, "learning_rate": 4.952265069262449e-05, "loss": 6.1147, "step": 460 }, { "epoch": 0.03760601696271403, "grad_norm": 3.950436592102051, "learning_rate": 4.950927956356635e-05, "loss": 6.0725, "step": 470 }, { "epoch": 0.03840614498319731, "grad_norm": 3.4340896606445312, "learning_rate": 4.9495908434508214e-05, "loss": 6.1336, "step": 480 }, { "epoch": 0.03920627300368059, "grad_norm": 3.28839373588562, "learning_rate": 4.948253730545007e-05, "loss": 6.1709, "step": 490 }, { "epoch": 0.04000640102416387, "grad_norm": 2.976365566253662, "learning_rate": 4.946916617639193e-05, "loss": 6.2074, "step": 500 }, { "epoch": 0.04080652904464714, "grad_norm": 4.156027793884277, "learning_rate": 4.9455795047333795e-05, "loss": 6.1694, "step": 510 }, { "epoch": 0.04160665706513042, "grad_norm": 3.4855797290802, "learning_rate": 4.944242391827566e-05, "loss": 6.1218, "step": 520 }, { "epoch": 0.0424067850856137, "grad_norm": 4.489185333251953, "learning_rate": 4.942905278921752e-05, "loss": 6.1507, "step": 530 }, { "epoch": 0.04320691310609698, "grad_norm": 3.2751166820526123, "learning_rate": 4.941568166015938e-05, "loss": 6.1055, "step": 540 }, { "epoch": 0.04400704112658025, "grad_norm": 2.4234585762023926, "learning_rate": 4.9402310531101246e-05, "loss": 6.1755, "step": 550 }, { "epoch": 0.04480716914706353, "grad_norm": 3.4436991214752197, "learning_rate": 4.938893940204311e-05, "loss": 6.1882, "step": 560 }, { "epoch": 0.04560729716754681, "grad_norm": 3.3731908798217773, "learning_rate": 
4.937556827298497e-05, "loss": 6.0648, "step": 570 }, { "epoch": 0.04640742518803009, "grad_norm": 3.8733670711517334, "learning_rate": 4.9362197143926834e-05, "loss": 6.0621, "step": 580 }, { "epoch": 0.04720755320851336, "grad_norm": 4.126636505126953, "learning_rate": 4.9348826014868696e-05, "loss": 6.122, "step": 590 }, { "epoch": 0.04800768122899664, "grad_norm": 3.8605775833129883, "learning_rate": 4.933545488581056e-05, "loss": 5.9788, "step": 600 }, { "epoch": 0.048807809249479917, "grad_norm": 2.9509966373443604, "learning_rate": 4.932208375675242e-05, "loss": 6.2045, "step": 610 }, { "epoch": 0.049607937269963195, "grad_norm": 4.4266510009765625, "learning_rate": 4.9308712627694285e-05, "loss": 5.9981, "step": 620 }, { "epoch": 0.050408065290446474, "grad_norm": 2.79042649269104, "learning_rate": 4.929534149863615e-05, "loss": 6.1882, "step": 630 }, { "epoch": 0.051208193310929746, "grad_norm": 2.8986568450927734, "learning_rate": 4.928197036957801e-05, "loss": 6.1739, "step": 640 }, { "epoch": 0.052008321331413025, "grad_norm": 4.294217586517334, "learning_rate": 4.926859924051987e-05, "loss": 6.0566, "step": 650 }, { "epoch": 0.052808449351896304, "grad_norm": 8.848836898803711, "learning_rate": 4.9255228111461735e-05, "loss": 6.2994, "step": 660 }, { "epoch": 0.05360857737237958, "grad_norm": 3.2204337120056152, "learning_rate": 4.92418569824036e-05, "loss": 6.0573, "step": 670 }, { "epoch": 0.054408705392862855, "grad_norm": 4.775251865386963, "learning_rate": 4.922848585334546e-05, "loss": 5.9764, "step": 680 }, { "epoch": 0.055208833413346134, "grad_norm": 3.5426905155181885, "learning_rate": 4.921511472428732e-05, "loss": 6.0402, "step": 690 }, { "epoch": 0.05600896143382941, "grad_norm": 10.72481632232666, "learning_rate": 4.9201743595229186e-05, "loss": 6.0024, "step": 700 }, { "epoch": 0.05680908945431269, "grad_norm": 2.441681385040283, "learning_rate": 4.918837246617105e-05, "loss": 6.1122, "step": 710 }, { "epoch": 0.05760921747479597, 
"grad_norm": 3.375319480895996, "learning_rate": 4.917500133711291e-05, "loss": 6.058, "step": 720 }, { "epoch": 0.05840934549527924, "grad_norm": 2.821507453918457, "learning_rate": 4.9161630208054774e-05, "loss": 6.0586, "step": 730 }, { "epoch": 0.05920947351576252, "grad_norm": 2.8658957481384277, "learning_rate": 4.914825907899664e-05, "loss": 6.0115, "step": 740 }, { "epoch": 0.0600096015362458, "grad_norm": 2.239774227142334, "learning_rate": 4.91348879499385e-05, "loss": 6.0669, "step": 750 }, { "epoch": 0.06080972955672908, "grad_norm": 3.5249900817871094, "learning_rate": 4.912151682088036e-05, "loss": 6.1013, "step": 760 }, { "epoch": 0.06160985757721235, "grad_norm": 2.790356159210205, "learning_rate": 4.9108145691822225e-05, "loss": 6.0099, "step": 770 }, { "epoch": 0.06240998559769563, "grad_norm": 3.0729963779449463, "learning_rate": 4.909477456276409e-05, "loss": 6.1376, "step": 780 }, { "epoch": 0.06321011361817891, "grad_norm": 2.9490275382995605, "learning_rate": 4.908140343370595e-05, "loss": 6.1457, "step": 790 }, { "epoch": 0.06401024163866219, "grad_norm": 2.7475438117980957, "learning_rate": 4.9068032304647806e-05, "loss": 6.0041, "step": 800 }, { "epoch": 0.06481036965914547, "grad_norm": 2.755703926086426, "learning_rate": 4.905466117558967e-05, "loss": 6.0242, "step": 810 }, { "epoch": 0.06561049767962875, "grad_norm": 2.724515676498413, "learning_rate": 4.904129004653153e-05, "loss": 6.1827, "step": 820 }, { "epoch": 0.06641062570011202, "grad_norm": 4.498260974884033, "learning_rate": 4.9027918917473394e-05, "loss": 6.0892, "step": 830 }, { "epoch": 0.06721075372059529, "grad_norm": 2.4399070739746094, "learning_rate": 4.901454778841526e-05, "loss": 6.0197, "step": 840 }, { "epoch": 0.06801088174107857, "grad_norm": 2.7584304809570312, "learning_rate": 4.900117665935712e-05, "loss": 5.9056, "step": 850 }, { "epoch": 0.06881100976156185, "grad_norm": 2.8177144527435303, "learning_rate": 4.898780553029898e-05, "loss": 6.1484, "step": 860 
}, { "epoch": 0.06961113778204513, "grad_norm": 4.181133270263672, "learning_rate": 4.8974434401240845e-05, "loss": 5.9376, "step": 870 }, { "epoch": 0.0704112658025284, "grad_norm": 3.677849769592285, "learning_rate": 4.896106327218271e-05, "loss": 6.0403, "step": 880 }, { "epoch": 0.07121139382301168, "grad_norm": 3.1553192138671875, "learning_rate": 4.894769214312457e-05, "loss": 6.0488, "step": 890 }, { "epoch": 0.07201152184349496, "grad_norm": 3.2580947875976562, "learning_rate": 4.893432101406643e-05, "loss": 6.1002, "step": 900 }, { "epoch": 0.07281164986397824, "grad_norm": 6.328150749206543, "learning_rate": 4.8920949885008296e-05, "loss": 6.0225, "step": 910 }, { "epoch": 0.07361177788446151, "grad_norm": 2.7467615604400635, "learning_rate": 4.890757875595016e-05, "loss": 5.9622, "step": 920 }, { "epoch": 0.07441190590494479, "grad_norm": 2.86570405960083, "learning_rate": 4.889420762689202e-05, "loss": 5.9718, "step": 930 }, { "epoch": 0.07521203392542807, "grad_norm": 2.544917106628418, "learning_rate": 4.8880836497833884e-05, "loss": 5.8697, "step": 940 }, { "epoch": 0.07601216194591134, "grad_norm": 2.5245840549468994, "learning_rate": 4.8867465368775746e-05, "loss": 5.9973, "step": 950 }, { "epoch": 0.07681228996639462, "grad_norm": 3.6830902099609375, "learning_rate": 4.88540942397176e-05, "loss": 5.943, "step": 960 }, { "epoch": 0.0776124179868779, "grad_norm": 2.6643354892730713, "learning_rate": 4.8840723110659465e-05, "loss": 5.8958, "step": 970 }, { "epoch": 0.07841254600736118, "grad_norm": 6.4623565673828125, "learning_rate": 4.882735198160133e-05, "loss": 6.0236, "step": 980 }, { "epoch": 0.07921267402784446, "grad_norm": 2.186974048614502, "learning_rate": 4.881398085254319e-05, "loss": 6.0481, "step": 990 }, { "epoch": 0.08001280204832774, "grad_norm": 2.4983859062194824, "learning_rate": 4.880060972348505e-05, "loss": 6.075, "step": 1000 }, { "epoch": 0.080812930068811, "grad_norm": 2.778280258178711, "learning_rate": 
4.8787238594426916e-05, "loss": 6.0757, "step": 1010 }, { "epoch": 0.08161305808929428, "grad_norm": 2.706965923309326, "learning_rate": 4.877386746536878e-05, "loss": 6.1504, "step": 1020 }, { "epoch": 0.08241318610977756, "grad_norm": 3.4069600105285645, "learning_rate": 4.876049633631064e-05, "loss": 6.0889, "step": 1030 }, { "epoch": 0.08321331413026084, "grad_norm": 3.179551124572754, "learning_rate": 4.8747125207252504e-05, "loss": 6.0057, "step": 1040 }, { "epoch": 0.08401344215074412, "grad_norm": 2.924018383026123, "learning_rate": 4.873375407819437e-05, "loss": 5.8406, "step": 1050 }, { "epoch": 0.0848135701712274, "grad_norm": 3.103912115097046, "learning_rate": 4.872038294913623e-05, "loss": 6.0351, "step": 1060 }, { "epoch": 0.08561369819171068, "grad_norm": 2.8037219047546387, "learning_rate": 4.870701182007809e-05, "loss": 6.0272, "step": 1070 }, { "epoch": 0.08641382621219396, "grad_norm": 2.477062940597534, "learning_rate": 4.8693640691019955e-05, "loss": 5.9269, "step": 1080 }, { "epoch": 0.08721395423267723, "grad_norm": 2.748488187789917, "learning_rate": 4.868026956196182e-05, "loss": 5.943, "step": 1090 }, { "epoch": 0.0880140822531605, "grad_norm": 3.3991920948028564, "learning_rate": 4.866689843290368e-05, "loss": 6.1455, "step": 1100 }, { "epoch": 0.08881421027364378, "grad_norm": 3.208509683609009, "learning_rate": 4.8653527303845536e-05, "loss": 5.9746, "step": 1110 }, { "epoch": 0.08961433829412706, "grad_norm": 3.3378469944000244, "learning_rate": 4.86401561747874e-05, "loss": 5.9185, "step": 1120 }, { "epoch": 0.09041446631461034, "grad_norm": 2.269606113433838, "learning_rate": 4.862678504572926e-05, "loss": 5.9369, "step": 1130 }, { "epoch": 0.09121459433509362, "grad_norm": 2.749335765838623, "learning_rate": 4.8613413916671124e-05, "loss": 6.0648, "step": 1140 }, { "epoch": 0.0920147223555769, "grad_norm": 2.821913480758667, "learning_rate": 4.860004278761299e-05, "loss": 5.952, "step": 1150 }, { "epoch": 0.09281485037606017, 
"grad_norm": 2.640990734100342, "learning_rate": 4.858667165855485e-05, "loss": 6.0537, "step": 1160 }, { "epoch": 0.09361497839654345, "grad_norm": 3.570896625518799, "learning_rate": 4.857330052949671e-05, "loss": 5.7721, "step": 1170 }, { "epoch": 0.09441510641702672, "grad_norm": 3.245318651199341, "learning_rate": 4.8559929400438575e-05, "loss": 5.7305, "step": 1180 }, { "epoch": 0.09521523443751, "grad_norm": 4.075076580047607, "learning_rate": 4.854655827138044e-05, "loss": 5.974, "step": 1190 }, { "epoch": 0.09601536245799328, "grad_norm": 2.429893732070923, "learning_rate": 4.85331871423223e-05, "loss": 5.7828, "step": 1200 }, { "epoch": 0.09681549047847655, "grad_norm": 2.7077040672302246, "learning_rate": 4.851981601326416e-05, "loss": 5.9143, "step": 1210 }, { "epoch": 0.09761561849895983, "grad_norm": 2.767918586730957, "learning_rate": 4.8506444884206026e-05, "loss": 5.9449, "step": 1220 }, { "epoch": 0.09841574651944311, "grad_norm": 2.4544034004211426, "learning_rate": 4.849307375514789e-05, "loss": 6.0034, "step": 1230 }, { "epoch": 0.09921587453992639, "grad_norm": 5.215607643127441, "learning_rate": 4.847970262608975e-05, "loss": 5.867, "step": 1240 }, { "epoch": 0.10001600256040967, "grad_norm": 2.7856080532073975, "learning_rate": 4.8466331497031614e-05, "loss": 6.0213, "step": 1250 }, { "epoch": 0.10081613058089295, "grad_norm": 2.5528719425201416, "learning_rate": 4.8452960367973476e-05, "loss": 5.9634, "step": 1260 }, { "epoch": 0.10161625860137621, "grad_norm": 2.4917409420013428, "learning_rate": 4.843958923891533e-05, "loss": 5.887, "step": 1270 }, { "epoch": 0.10241638662185949, "grad_norm": 6.125699520111084, "learning_rate": 4.8426218109857195e-05, "loss": 6.1189, "step": 1280 }, { "epoch": 0.10321651464234277, "grad_norm": 2.783156156539917, "learning_rate": 4.841284698079906e-05, "loss": 5.9064, "step": 1290 }, { "epoch": 0.10401664266282605, "grad_norm": 3.611070156097412, "learning_rate": 4.839947585174092e-05, "loss": 5.9405, 
"step": 1300 }, { "epoch": 0.10481677068330933, "grad_norm": 4.296909809112549, "learning_rate": 4.838610472268278e-05, "loss": 5.9067, "step": 1310 }, { "epoch": 0.10561689870379261, "grad_norm": 2.4273040294647217, "learning_rate": 4.8372733593624646e-05, "loss": 5.888, "step": 1320 }, { "epoch": 0.10641702672427589, "grad_norm": 2.6499924659729004, "learning_rate": 4.835936246456651e-05, "loss": 5.9683, "step": 1330 }, { "epoch": 0.10721715474475917, "grad_norm": 3.1474297046661377, "learning_rate": 4.834599133550837e-05, "loss": 5.8946, "step": 1340 }, { "epoch": 0.10801728276524244, "grad_norm": 3.5050199031829834, "learning_rate": 4.8332620206450234e-05, "loss": 5.9179, "step": 1350 }, { "epoch": 0.10881741078572571, "grad_norm": 2.693700075149536, "learning_rate": 4.8319249077392096e-05, "loss": 5.7965, "step": 1360 }, { "epoch": 0.10961753880620899, "grad_norm": 2.8202953338623047, "learning_rate": 4.830587794833396e-05, "loss": 5.9526, "step": 1370 }, { "epoch": 0.11041766682669227, "grad_norm": 2.514862060546875, "learning_rate": 4.829250681927582e-05, "loss": 5.936, "step": 1380 }, { "epoch": 0.11121779484717555, "grad_norm": 3.18804931640625, "learning_rate": 4.8279135690217685e-05, "loss": 5.9246, "step": 1390 }, { "epoch": 0.11201792286765883, "grad_norm": 2.77697491645813, "learning_rate": 4.826576456115955e-05, "loss": 5.9576, "step": 1400 }, { "epoch": 0.1128180508881421, "grad_norm": 2.762524127960205, "learning_rate": 4.825239343210141e-05, "loss": 5.9085, "step": 1410 }, { "epoch": 0.11361817890862538, "grad_norm": 2.4407670497894287, "learning_rate": 4.8239022303043266e-05, "loss": 5.9518, "step": 1420 }, { "epoch": 0.11441830692910866, "grad_norm": 3.1036713123321533, "learning_rate": 4.822565117398513e-05, "loss": 5.8412, "step": 1430 }, { "epoch": 0.11521843494959194, "grad_norm": 3.319058418273926, "learning_rate": 4.821228004492699e-05, "loss": 5.9733, "step": 1440 }, { "epoch": 0.1160185629700752, "grad_norm": 2.13468599319458, 
"learning_rate": 4.8198908915868854e-05, "loss": 5.9193, "step": 1450 }, { "epoch": 0.11681869099055849, "grad_norm": 2.6057028770446777, "learning_rate": 4.8185537786810717e-05, "loss": 5.9807, "step": 1460 }, { "epoch": 0.11761881901104176, "grad_norm": 2.7509753704071045, "learning_rate": 4.817216665775258e-05, "loss": 5.9534, "step": 1470 }, { "epoch": 0.11841894703152504, "grad_norm": 2.111055850982666, "learning_rate": 4.815879552869444e-05, "loss": 5.9207, "step": 1480 }, { "epoch": 0.11921907505200832, "grad_norm": 2.5271990299224854, "learning_rate": 4.8145424399636305e-05, "loss": 5.7148, "step": 1490 }, { "epoch": 0.1200192030724916, "grad_norm": 2.814138174057007, "learning_rate": 4.813205327057817e-05, "loss": 5.9498, "step": 1500 }, { "epoch": 0.12081933109297488, "grad_norm": 3.449355363845825, "learning_rate": 4.811868214152003e-05, "loss": 5.7814, "step": 1510 }, { "epoch": 0.12161945911345816, "grad_norm": 2.813746213912964, "learning_rate": 4.810531101246189e-05, "loss": 5.9517, "step": 1520 }, { "epoch": 0.12241958713394142, "grad_norm": 2.529242753982544, "learning_rate": 4.8091939883403755e-05, "loss": 5.8227, "step": 1530 }, { "epoch": 0.1232197151544247, "grad_norm": 2.2425034046173096, "learning_rate": 4.807856875434562e-05, "loss": 6.1064, "step": 1540 }, { "epoch": 0.12401984317490798, "grad_norm": 2.7732784748077393, "learning_rate": 4.806519762528748e-05, "loss": 5.8888, "step": 1550 }, { "epoch": 0.12481997119539126, "grad_norm": 2.5558009147644043, "learning_rate": 4.8051826496229343e-05, "loss": 5.8185, "step": 1560 }, { "epoch": 0.12562009921587455, "grad_norm": 2.884411096572876, "learning_rate": 4.8038455367171206e-05, "loss": 6.0534, "step": 1570 }, { "epoch": 0.12642022723635782, "grad_norm": 2.5747668743133545, "learning_rate": 4.802508423811307e-05, "loss": 5.8186, "step": 1580 }, { "epoch": 0.12722035525684108, "grad_norm": 2.324767827987671, "learning_rate": 4.801171310905493e-05, "loss": 5.8642, "step": 1590 }, { "epoch": 
0.12802048327732438, "grad_norm": 2.2255160808563232, "learning_rate": 4.7998341979996794e-05, "loss": 5.8559, "step": 1600 }, { "epoch": 0.12882061129780764, "grad_norm": 2.97525954246521, "learning_rate": 4.798497085093866e-05, "loss": 5.8744, "step": 1610 }, { "epoch": 0.12962073931829093, "grad_norm": 2.23962664604187, "learning_rate": 4.797159972188052e-05, "loss": 5.7545, "step": 1620 }, { "epoch": 0.1304208673387742, "grad_norm": 3.6182124614715576, "learning_rate": 4.795822859282238e-05, "loss": 5.8872, "step": 1630 }, { "epoch": 0.1312209953592575, "grad_norm": 4.068545341491699, "learning_rate": 4.7944857463764245e-05, "loss": 5.9008, "step": 1640 }, { "epoch": 0.13202112337974076, "grad_norm": 3.627082109451294, "learning_rate": 4.793148633470611e-05, "loss": 5.8215, "step": 1650 }, { "epoch": 0.13282125140022405, "grad_norm": 3.0080721378326416, "learning_rate": 4.791811520564797e-05, "loss": 5.9086, "step": 1660 }, { "epoch": 0.13362137942070731, "grad_norm": 2.5463860034942627, "learning_rate": 4.790474407658983e-05, "loss": 5.776, "step": 1670 }, { "epoch": 0.13442150744119058, "grad_norm": 2.212488889694214, "learning_rate": 4.7891372947531696e-05, "loss": 6.006, "step": 1680 }, { "epoch": 0.13522163546167387, "grad_norm": 4.147563934326172, "learning_rate": 4.787800181847356e-05, "loss": 5.886, "step": 1690 }, { "epoch": 0.13602176348215714, "grad_norm": 2.6021018028259277, "learning_rate": 4.786463068941542e-05, "loss": 5.9182, "step": 1700 }, { "epoch": 0.13682189150264043, "grad_norm": 2.3109893798828125, "learning_rate": 4.7851259560357284e-05, "loss": 5.8084, "step": 1710 }, { "epoch": 0.1376220195231237, "grad_norm": 2.8678529262542725, "learning_rate": 4.7837888431299147e-05, "loss": 6.0363, "step": 1720 }, { "epoch": 0.138422147543607, "grad_norm": 2.1921958923339844, "learning_rate": 4.7824517302241e-05, "loss": 5.7667, "step": 1730 }, { "epoch": 0.13922227556409025, "grad_norm": 2.6883316040039062, "learning_rate": 4.7811146173182865e-05, 
"loss": 5.7906, "step": 1740 }, { "epoch": 0.14002240358457352, "grad_norm": 2.4079957008361816, "learning_rate": 4.779777504412473e-05, "loss": 5.7698, "step": 1750 }, { "epoch": 0.1408225316050568, "grad_norm": 4.29390287399292, "learning_rate": 4.778440391506659e-05, "loss": 5.9639, "step": 1760 }, { "epoch": 0.14162265962554008, "grad_norm": 4.133132457733154, "learning_rate": 4.777103278600845e-05, "loss": 6.0901, "step": 1770 }, { "epoch": 0.14242278764602337, "grad_norm": 3.871561288833618, "learning_rate": 4.7757661656950316e-05, "loss": 5.7455, "step": 1780 }, { "epoch": 0.14322291566650663, "grad_norm": 4.266111850738525, "learning_rate": 4.774429052789218e-05, "loss": 5.9971, "step": 1790 }, { "epoch": 0.14402304368698993, "grad_norm": 2.9000513553619385, "learning_rate": 4.773091939883404e-05, "loss": 5.9025, "step": 1800 }, { "epoch": 0.1448231717074732, "grad_norm": 2.549964189529419, "learning_rate": 4.7717548269775904e-05, "loss": 5.768, "step": 1810 }, { "epoch": 0.14562329972795648, "grad_norm": 2.2882704734802246, "learning_rate": 4.770417714071777e-05, "loss": 6.022, "step": 1820 }, { "epoch": 0.14642342774843975, "grad_norm": 2.6501784324645996, "learning_rate": 4.769080601165963e-05, "loss": 5.8539, "step": 1830 }, { "epoch": 0.14722355576892301, "grad_norm": 2.3417108058929443, "learning_rate": 4.767743488260149e-05, "loss": 5.7734, "step": 1840 }, { "epoch": 0.1480236837894063, "grad_norm": 2.2151668071746826, "learning_rate": 4.7664063753543355e-05, "loss": 5.84, "step": 1850 }, { "epoch": 0.14882381180988957, "grad_norm": 3.114260196685791, "learning_rate": 4.765069262448522e-05, "loss": 5.9409, "step": 1860 }, { "epoch": 0.14962393983037287, "grad_norm": 2.4931910037994385, "learning_rate": 4.763732149542708e-05, "loss": 5.9396, "step": 1870 }, { "epoch": 0.15042406785085613, "grad_norm": 3.736487865447998, "learning_rate": 4.7623950366368936e-05, "loss": 5.7427, "step": 1880 }, { "epoch": 0.15122419587133942, "grad_norm": 
4.730785846710205, "learning_rate": 4.76105792373108e-05, "loss": 5.9181, "step": 1890 }, { "epoch": 0.1520243238918227, "grad_norm": 2.9264132976531982, "learning_rate": 4.759720810825266e-05, "loss": 5.8967, "step": 1900 }, { "epoch": 0.15282445191230598, "grad_norm": 3.2538132667541504, "learning_rate": 4.7583836979194524e-05, "loss": 5.8459, "step": 1910 }, { "epoch": 0.15362457993278925, "grad_norm": 2.7208549976348877, "learning_rate": 4.757046585013639e-05, "loss": 5.7038, "step": 1920 }, { "epoch": 0.1544247079532725, "grad_norm": 2.7510788440704346, "learning_rate": 4.755709472107825e-05, "loss": 5.8524, "step": 1930 }, { "epoch": 0.1552248359737558, "grad_norm": 2.6565892696380615, "learning_rate": 4.754372359202011e-05, "loss": 5.6324, "step": 1940 }, { "epoch": 0.15602496399423907, "grad_norm": 2.954798936843872, "learning_rate": 4.7530352462961975e-05, "loss": 5.8388, "step": 1950 }, { "epoch": 0.15682509201472236, "grad_norm": 2.291714668273926, "learning_rate": 4.751698133390384e-05, "loss": 5.7504, "step": 1960 }, { "epoch": 0.15762522003520563, "grad_norm": 2.1387598514556885, "learning_rate": 4.75036102048457e-05, "loss": 5.7556, "step": 1970 }, { "epoch": 0.15842534805568892, "grad_norm": 2.290407180786133, "learning_rate": 4.749023907578756e-05, "loss": 5.7089, "step": 1980 }, { "epoch": 0.15922547607617218, "grad_norm": 2.852696657180786, "learning_rate": 4.7476867946729426e-05, "loss": 5.8656, "step": 1990 }, { "epoch": 0.16002560409665548, "grad_norm": 2.8190526962280273, "learning_rate": 4.746349681767129e-05, "loss": 6.0134, "step": 2000 }, { "epoch": 0.16082573211713874, "grad_norm": 2.705008029937744, "learning_rate": 4.745012568861315e-05, "loss": 5.8713, "step": 2010 }, { "epoch": 0.161625860137622, "grad_norm": 3.571394205093384, "learning_rate": 4.7436754559555014e-05, "loss": 5.8329, "step": 2020 }, { "epoch": 0.1624259881581053, "grad_norm": 2.687455177307129, "learning_rate": 4.7423383430496876e-05, "loss": 5.8355, "step": 2030 }, 
{ "epoch": 0.16322611617858857, "grad_norm": 2.6158690452575684, "learning_rate": 4.741001230143873e-05, "loss": 5.6938, "step": 2040 }, { "epoch": 0.16402624419907186, "grad_norm": 2.9657154083251953, "learning_rate": 4.7396641172380595e-05, "loss": 5.7514, "step": 2050 }, { "epoch": 0.16482637221955512, "grad_norm": 2.310607433319092, "learning_rate": 4.738327004332246e-05, "loss": 5.7397, "step": 2060 }, { "epoch": 0.16562650024003842, "grad_norm": 2.855271339416504, "learning_rate": 4.736989891426432e-05, "loss": 5.7645, "step": 2070 }, { "epoch": 0.16642662826052168, "grad_norm": 2.778768301010132, "learning_rate": 4.735652778520618e-05, "loss": 5.9582, "step": 2080 }, { "epoch": 0.16722675628100497, "grad_norm": 3.069973945617676, "learning_rate": 4.7343156656148046e-05, "loss": 5.8205, "step": 2090 }, { "epoch": 0.16802688430148824, "grad_norm": 3.5799551010131836, "learning_rate": 4.732978552708991e-05, "loss": 5.9001, "step": 2100 }, { "epoch": 0.1688270123219715, "grad_norm": 2.556668758392334, "learning_rate": 4.731641439803177e-05, "loss": 5.7258, "step": 2110 }, { "epoch": 0.1696271403424548, "grad_norm": 2.7847707271575928, "learning_rate": 4.7303043268973634e-05, "loss": 5.9007, "step": 2120 }, { "epoch": 0.17042726836293806, "grad_norm": 4.071508407592773, "learning_rate": 4.7289672139915496e-05, "loss": 5.7035, "step": 2130 }, { "epoch": 0.17122739638342135, "grad_norm": 2.6188418865203857, "learning_rate": 4.727630101085736e-05, "loss": 5.651, "step": 2140 }, { "epoch": 0.17202752440390462, "grad_norm": 1.952249526977539, "learning_rate": 4.726292988179922e-05, "loss": 6.1107, "step": 2150 }, { "epoch": 0.1728276524243879, "grad_norm": 2.299018144607544, "learning_rate": 4.7249558752741085e-05, "loss": 5.7609, "step": 2160 }, { "epoch": 0.17362778044487118, "grad_norm": 2.5578439235687256, "learning_rate": 4.723618762368295e-05, "loss": 5.792, "step": 2170 }, { "epoch": 0.17442790846535447, "grad_norm": 3.9921529293060303, "learning_rate": 
4.722281649462481e-05, "loss": 5.7233, "step": 2180 }, { "epoch": 0.17522803648583773, "grad_norm": 2.5521302223205566, "learning_rate": 4.7209445365566666e-05, "loss": 5.807, "step": 2190 }, { "epoch": 0.176028164506321, "grad_norm": 2.71401047706604, "learning_rate": 4.719607423650853e-05, "loss": 5.6689, "step": 2200 }, { "epoch": 0.1768282925268043, "grad_norm": 3.782607316970825, "learning_rate": 4.718270310745039e-05, "loss": 5.734, "step": 2210 }, { "epoch": 0.17762842054728756, "grad_norm": 2.57356333732605, "learning_rate": 4.7169331978392254e-05, "loss": 5.8101, "step": 2220 }, { "epoch": 0.17842854856777085, "grad_norm": 2.7005815505981445, "learning_rate": 4.715596084933412e-05, "loss": 6.0603, "step": 2230 }, { "epoch": 0.17922867658825412, "grad_norm": 2.081550359725952, "learning_rate": 4.714258972027598e-05, "loss": 5.7677, "step": 2240 }, { "epoch": 0.1800288046087374, "grad_norm": 3.6565728187561035, "learning_rate": 4.712921859121784e-05, "loss": 5.9672, "step": 2250 }, { "epoch": 0.18082893262922067, "grad_norm": 2.4702320098876953, "learning_rate": 4.7115847462159705e-05, "loss": 5.8397, "step": 2260 }, { "epoch": 0.18162906064970397, "grad_norm": 3.335736036300659, "learning_rate": 4.710247633310157e-05, "loss": 5.7021, "step": 2270 }, { "epoch": 0.18242918867018723, "grad_norm": 3.3939075469970703, "learning_rate": 4.708910520404343e-05, "loss": 5.8464, "step": 2280 }, { "epoch": 0.1832293166906705, "grad_norm": 2.4869279861450195, "learning_rate": 4.707573407498529e-05, "loss": 5.6904, "step": 2290 }, { "epoch": 0.1840294447111538, "grad_norm": 2.4240360260009766, "learning_rate": 4.7062362945927155e-05, "loss": 5.7227, "step": 2300 }, { "epoch": 0.18482957273163705, "grad_norm": 2.428786039352417, "learning_rate": 4.704899181686902e-05, "loss": 5.8295, "step": 2310 }, { "epoch": 0.18562970075212035, "grad_norm": 3.3214187622070312, "learning_rate": 4.703562068781088e-05, "loss": 5.8341, "step": 2320 }, { "epoch": 0.1864298287726036, 
"grad_norm": 3.2146456241607666, "learning_rate": 4.7022249558752744e-05, "loss": 5.7217, "step": 2330 }, { "epoch": 0.1872299567930869, "grad_norm": 4.442914009094238, "learning_rate": 4.7008878429694606e-05, "loss": 5.9003, "step": 2340 }, { "epoch": 0.18803008481357017, "grad_norm": 1.9268267154693604, "learning_rate": 4.699550730063646e-05, "loss": 5.8292, "step": 2350 }, { "epoch": 0.18883021283405343, "grad_norm": 3.130021095275879, "learning_rate": 4.6982136171578325e-05, "loss": 5.6864, "step": 2360 }, { "epoch": 0.18963034085453673, "grad_norm": 2.8835690021514893, "learning_rate": 4.696876504252019e-05, "loss": 5.829, "step": 2370 }, { "epoch": 0.19043046887502, "grad_norm": 2.4171135425567627, "learning_rate": 4.695539391346205e-05, "loss": 5.7972, "step": 2380 }, { "epoch": 0.19123059689550329, "grad_norm": 3.782817840576172, "learning_rate": 4.694202278440391e-05, "loss": 5.8497, "step": 2390 }, { "epoch": 0.19203072491598655, "grad_norm": 2.475249767303467, "learning_rate": 4.6928651655345776e-05, "loss": 5.9237, "step": 2400 }, { "epoch": 0.19283085293646984, "grad_norm": 2.5809242725372314, "learning_rate": 4.691528052628764e-05, "loss": 5.7756, "step": 2410 }, { "epoch": 0.1936309809569531, "grad_norm": 2.6922059059143066, "learning_rate": 4.69019093972295e-05, "loss": 5.9326, "step": 2420 }, { "epoch": 0.1944311089774364, "grad_norm": 2.7542431354522705, "learning_rate": 4.6888538268171364e-05, "loss": 5.6279, "step": 2430 }, { "epoch": 0.19523123699791967, "grad_norm": 2.4063303470611572, "learning_rate": 4.6875167139113226e-05, "loss": 5.91, "step": 2440 }, { "epoch": 0.19603136501840293, "grad_norm": 4.855547904968262, "learning_rate": 4.686179601005509e-05, "loss": 5.7286, "step": 2450 }, { "epoch": 0.19683149303888622, "grad_norm": 2.9875595569610596, "learning_rate": 4.684842488099695e-05, "loss": 5.8299, "step": 2460 }, { "epoch": 0.1976316210593695, "grad_norm": 4.467639923095703, "learning_rate": 4.6835053751938814e-05, "loss": 5.8469, 
"step": 2470 }, { "epoch": 0.19843174907985278, "grad_norm": 2.2144124507904053, "learning_rate": 4.682168262288068e-05, "loss": 5.7871, "step": 2480 }, { "epoch": 0.19923187710033605, "grad_norm": 2.4507012367248535, "learning_rate": 4.680831149382254e-05, "loss": 5.7529, "step": 2490 }, { "epoch": 0.20003200512081934, "grad_norm": 2.208648681640625, "learning_rate": 4.67949403647644e-05, "loss": 5.7265, "step": 2500 }, { "epoch": 0.2008321331413026, "grad_norm": 2.560302257537842, "learning_rate": 4.6781569235706265e-05, "loss": 5.7842, "step": 2510 }, { "epoch": 0.2016322611617859, "grad_norm": 2.354292154312134, "learning_rate": 4.676819810664813e-05, "loss": 5.8468, "step": 2520 }, { "epoch": 0.20243238918226916, "grad_norm": 2.9559860229492188, "learning_rate": 4.675482697758999e-05, "loss": 5.7003, "step": 2530 }, { "epoch": 0.20323251720275243, "grad_norm": 3.251077651977539, "learning_rate": 4.674145584853185e-05, "loss": 5.8129, "step": 2540 }, { "epoch": 0.20403264522323572, "grad_norm": 2.7863471508026123, "learning_rate": 4.6728084719473716e-05, "loss": 5.6814, "step": 2550 }, { "epoch": 0.20483277324371899, "grad_norm": 2.9006989002227783, "learning_rate": 4.671471359041558e-05, "loss": 5.8292, "step": 2560 }, { "epoch": 0.20563290126420228, "grad_norm": 2.930689573287964, "learning_rate": 4.670134246135744e-05, "loss": 5.8825, "step": 2570 }, { "epoch": 0.20643302928468554, "grad_norm": 2.3105032444000244, "learning_rate": 4.6687971332299304e-05, "loss": 5.7039, "step": 2580 }, { "epoch": 0.20723315730516884, "grad_norm": 3.1141879558563232, "learning_rate": 4.667460020324117e-05, "loss": 5.8692, "step": 2590 }, { "epoch": 0.2080332853256521, "grad_norm": 3.5017199516296387, "learning_rate": 4.666122907418303e-05, "loss": 5.7922, "step": 2600 }, { "epoch": 0.2088334133461354, "grad_norm": 2.657975912094116, "learning_rate": 4.664785794512489e-05, "loss": 5.7736, "step": 2610 }, { "epoch": 0.20963354136661866, "grad_norm": 3.246952772140503, 
"learning_rate": 4.6634486816066755e-05, "loss": 5.768, "step": 2620 }, { "epoch": 0.21043366938710192, "grad_norm": 6.832335948944092, "learning_rate": 4.662111568700862e-05, "loss": 5.6752, "step": 2630 }, { "epoch": 0.21123379740758522, "grad_norm": 3.2479753494262695, "learning_rate": 4.660774455795048e-05, "loss": 5.8015, "step": 2640 }, { "epoch": 0.21203392542806848, "grad_norm": 2.809082508087158, "learning_rate": 4.659437342889234e-05, "loss": 5.8663, "step": 2650 }, { "epoch": 0.21283405344855177, "grad_norm": 3.7948036193847656, "learning_rate": 4.65810022998342e-05, "loss": 5.889, "step": 2660 }, { "epoch": 0.21363418146903504, "grad_norm": 2.836090564727783, "learning_rate": 4.656763117077606e-05, "loss": 5.7516, "step": 2670 }, { "epoch": 0.21443430948951833, "grad_norm": 3.0940232276916504, "learning_rate": 4.6554260041717924e-05, "loss": 5.7033, "step": 2680 }, { "epoch": 0.2152344375100016, "grad_norm": 2.436757802963257, "learning_rate": 4.654088891265979e-05, "loss": 5.746, "step": 2690 }, { "epoch": 0.2160345655304849, "grad_norm": 2.4339609146118164, "learning_rate": 4.652751778360165e-05, "loss": 5.828, "step": 2700 }, { "epoch": 0.21683469355096816, "grad_norm": 2.379366874694824, "learning_rate": 4.651414665454351e-05, "loss": 5.719, "step": 2710 }, { "epoch": 0.21763482157145142, "grad_norm": 2.1722371578216553, "learning_rate": 4.6500775525485375e-05, "loss": 5.7875, "step": 2720 }, { "epoch": 0.2184349495919347, "grad_norm": 3.633279800415039, "learning_rate": 4.648740439642724e-05, "loss": 5.802, "step": 2730 }, { "epoch": 0.21923507761241798, "grad_norm": 2.4091219902038574, "learning_rate": 4.64740332673691e-05, "loss": 5.8197, "step": 2740 }, { "epoch": 0.22003520563290127, "grad_norm": 2.7289021015167236, "learning_rate": 4.646066213831096e-05, "loss": 5.9445, "step": 2750 }, { "epoch": 0.22083533365338454, "grad_norm": 2.376481294631958, "learning_rate": 4.6447291009252826e-05, "loss": 5.9943, "step": 2760 }, { "epoch": 
0.22163546167386783, "grad_norm": 2.6542563438415527, "learning_rate": 4.643391988019469e-05, "loss": 5.6049, "step": 2770 }, { "epoch": 0.2224355896943511, "grad_norm": 2.320472240447998, "learning_rate": 4.642054875113655e-05, "loss": 5.7637, "step": 2780 }, { "epoch": 0.2232357177148344, "grad_norm": 2.8923239707946777, "learning_rate": 4.6407177622078414e-05, "loss": 5.9666, "step": 2790 }, { "epoch": 0.22403584573531765, "grad_norm": 4.277271270751953, "learning_rate": 4.6393806493020276e-05, "loss": 5.8393, "step": 2800 }, { "epoch": 0.22483597375580092, "grad_norm": 2.797428607940674, "learning_rate": 4.638043536396213e-05, "loss": 5.759, "step": 2810 }, { "epoch": 0.2256361017762842, "grad_norm": 2.1849517822265625, "learning_rate": 4.6367064234903995e-05, "loss": 5.7514, "step": 2820 }, { "epoch": 0.22643622979676747, "grad_norm": 2.8607492446899414, "learning_rate": 4.635369310584586e-05, "loss": 5.7545, "step": 2830 }, { "epoch": 0.22723635781725077, "grad_norm": 3.722041130065918, "learning_rate": 4.634032197678772e-05, "loss": 5.8011, "step": 2840 }, { "epoch": 0.22803648583773403, "grad_norm": 2.8563833236694336, "learning_rate": 4.632695084772958e-05, "loss": 5.8569, "step": 2850 }, { "epoch": 0.22883661385821732, "grad_norm": 3.5724806785583496, "learning_rate": 4.6313579718671446e-05, "loss": 5.9649, "step": 2860 }, { "epoch": 0.2296367418787006, "grad_norm": 2.380469560623169, "learning_rate": 4.630020858961331e-05, "loss": 5.7467, "step": 2870 }, { "epoch": 0.23043686989918388, "grad_norm": 3.1629838943481445, "learning_rate": 4.628683746055517e-05, "loss": 5.642, "step": 2880 }, { "epoch": 0.23123699791966715, "grad_norm": 2.1239373683929443, "learning_rate": 4.6273466331497034e-05, "loss": 5.6483, "step": 2890 }, { "epoch": 0.2320371259401504, "grad_norm": 3.049079418182373, "learning_rate": 4.6260095202438897e-05, "loss": 5.9736, "step": 2900 }, { "epoch": 0.2328372539606337, "grad_norm": 2.556830406188965, "learning_rate": 
4.624672407338076e-05, "loss": 5.6037, "step": 2910 }, { "epoch": 0.23363738198111697, "grad_norm": 2.8762035369873047, "learning_rate": 4.623335294432262e-05, "loss": 5.6345, "step": 2920 }, { "epoch": 0.23443751000160026, "grad_norm": 2.11167573928833, "learning_rate": 4.6219981815264485e-05, "loss": 5.7822, "step": 2930 }, { "epoch": 0.23523763802208353, "grad_norm": 4.623869895935059, "learning_rate": 4.620661068620635e-05, "loss": 5.7063, "step": 2940 }, { "epoch": 0.23603776604256682, "grad_norm": 2.4420578479766846, "learning_rate": 4.619323955714821e-05, "loss": 5.686, "step": 2950 }, { "epoch": 0.2368378940630501, "grad_norm": 2.6543869972229004, "learning_rate": 4.617986842809007e-05, "loss": 5.7802, "step": 2960 }, { "epoch": 0.23763802208353338, "grad_norm": 2.6264312267303467, "learning_rate": 4.616649729903193e-05, "loss": 5.6667, "step": 2970 }, { "epoch": 0.23843815010401664, "grad_norm": 2.4579195976257324, "learning_rate": 4.615312616997379e-05, "loss": 5.6738, "step": 2980 }, { "epoch": 0.2392382781244999, "grad_norm": 2.299448251724243, "learning_rate": 4.6139755040915654e-05, "loss": 5.8622, "step": 2990 }, { "epoch": 0.2400384061449832, "grad_norm": 3.6527328491210938, "learning_rate": 4.612638391185752e-05, "loss": 5.6346, "step": 3000 }, { "epoch": 0.24083853416546647, "grad_norm": 2.217876434326172, "learning_rate": 4.611301278279938e-05, "loss": 5.7892, "step": 3010 }, { "epoch": 0.24163866218594976, "grad_norm": 3.500544309616089, "learning_rate": 4.609964165374124e-05, "loss": 5.8026, "step": 3020 }, { "epoch": 0.24243879020643302, "grad_norm": 3.1694483757019043, "learning_rate": 4.6086270524683105e-05, "loss": 5.827, "step": 3030 }, { "epoch": 0.24323891822691632, "grad_norm": 2.899625778198242, "learning_rate": 4.607289939562497e-05, "loss": 5.7384, "step": 3040 }, { "epoch": 0.24403904624739958, "grad_norm": 2.8286776542663574, "learning_rate": 4.605952826656683e-05, "loss": 5.7629, "step": 3050 }, { "epoch": 0.24483917426788285, 
"grad_norm": 2.7585489749908447, "learning_rate": 4.604615713750869e-05, "loss": 5.7462, "step": 3060 }, { "epoch": 0.24563930228836614, "grad_norm": 2.2017667293548584, "learning_rate": 4.6032786008450555e-05, "loss": 5.844, "step": 3070 }, { "epoch": 0.2464394303088494, "grad_norm": 4.679725170135498, "learning_rate": 4.601941487939242e-05, "loss": 5.7254, "step": 3080 }, { "epoch": 0.2472395583293327, "grad_norm": 2.923884868621826, "learning_rate": 4.600604375033428e-05, "loss": 5.703, "step": 3090 }, { "epoch": 0.24803968634981596, "grad_norm": 2.2205090522766113, "learning_rate": 4.5992672621276144e-05, "loss": 5.7185, "step": 3100 }, { "epoch": 0.24883981437029926, "grad_norm": 2.852313280105591, "learning_rate": 4.5979301492218006e-05, "loss": 5.5653, "step": 3110 }, { "epoch": 0.24963994239078252, "grad_norm": 2.7683911323547363, "learning_rate": 4.596593036315986e-05, "loss": 5.7262, "step": 3120 }, { "epoch": 0.2504400704112658, "grad_norm": 3.1315665245056152, "learning_rate": 4.5952559234101725e-05, "loss": 5.7524, "step": 3130 }, { "epoch": 0.2512401984317491, "grad_norm": 2.5233592987060547, "learning_rate": 4.593918810504359e-05, "loss": 5.7443, "step": 3140 }, { "epoch": 0.25204032645223234, "grad_norm": 2.3802831172943115, "learning_rate": 4.592581697598545e-05, "loss": 5.8091, "step": 3150 }, { "epoch": 0.25284045447271564, "grad_norm": 2.378218412399292, "learning_rate": 4.591244584692731e-05, "loss": 5.7741, "step": 3160 }, { "epoch": 0.25364058249319893, "grad_norm": 4.712483882904053, "learning_rate": 4.5899074717869176e-05, "loss": 5.8643, "step": 3170 }, { "epoch": 0.25444071051368217, "grad_norm": 2.798752784729004, "learning_rate": 4.588570358881104e-05, "loss": 5.7984, "step": 3180 }, { "epoch": 0.25524083853416546, "grad_norm": 2.302037477493286, "learning_rate": 4.58723324597529e-05, "loss": 5.6548, "step": 3190 }, { "epoch": 0.25604096655464875, "grad_norm": 2.8621273040771484, "learning_rate": 4.5858961330694764e-05, "loss": 5.6875, 
"step": 3200 }, { "epoch": 0.25684109457513205, "grad_norm": 2.9079480171203613, "learning_rate": 4.5845590201636626e-05, "loss": 5.8801, "step": 3210 }, { "epoch": 0.2576412225956153, "grad_norm": 2.9576847553253174, "learning_rate": 4.583221907257849e-05, "loss": 5.6646, "step": 3220 }, { "epoch": 0.2584413506160986, "grad_norm": 4.085951805114746, "learning_rate": 4.581884794352035e-05, "loss": 5.9078, "step": 3230 }, { "epoch": 0.25924147863658187, "grad_norm": 2.622903347015381, "learning_rate": 4.5805476814462214e-05, "loss": 5.6821, "step": 3240 }, { "epoch": 0.2600416066570651, "grad_norm": 1.794255256652832, "learning_rate": 4.579210568540408e-05, "loss": 5.751, "step": 3250 }, { "epoch": 0.2608417346775484, "grad_norm": 3.074042558670044, "learning_rate": 4.577873455634594e-05, "loss": 5.7864, "step": 3260 }, { "epoch": 0.2616418626980317, "grad_norm": 2.3138844966888428, "learning_rate": 4.57653634272878e-05, "loss": 5.693, "step": 3270 }, { "epoch": 0.262441990718515, "grad_norm": 3.8877549171447754, "learning_rate": 4.5751992298229665e-05, "loss": 5.7154, "step": 3280 }, { "epoch": 0.2632421187389982, "grad_norm": 2.9623680114746094, "learning_rate": 4.573862116917153e-05, "loss": 5.7514, "step": 3290 }, { "epoch": 0.2640422467594815, "grad_norm": 2.840122938156128, "learning_rate": 4.572525004011339e-05, "loss": 5.7397, "step": 3300 }, { "epoch": 0.2648423747799648, "grad_norm": 2.9699277877807617, "learning_rate": 4.571187891105525e-05, "loss": 5.7626, "step": 3310 }, { "epoch": 0.2656425028004481, "grad_norm": 2.6493773460388184, "learning_rate": 4.5698507781997116e-05, "loss": 5.7619, "step": 3320 }, { "epoch": 0.26644263082093134, "grad_norm": 2.283259868621826, "learning_rate": 4.568513665293898e-05, "loss": 5.8409, "step": 3330 }, { "epoch": 0.26724275884141463, "grad_norm": 1.9254164695739746, "learning_rate": 4.567176552388084e-05, "loss": 5.8218, "step": 3340 }, { "epoch": 0.2680428868618979, "grad_norm": 2.382345676422119, "learning_rate": 
4.5658394394822704e-05, "loss": 5.6865, "step": 3350 }, { "epoch": 0.26884301488238116, "grad_norm": 2.6039271354675293, "learning_rate": 4.564502326576457e-05, "loss": 5.7254, "step": 3360 }, { "epoch": 0.26964314290286445, "grad_norm": 2.0948996543884277, "learning_rate": 4.563165213670643e-05, "loss": 5.7589, "step": 3370 }, { "epoch": 0.27044327092334774, "grad_norm": 2.939955711364746, "learning_rate": 4.561828100764829e-05, "loss": 5.8298, "step": 3380 }, { "epoch": 0.27124339894383104, "grad_norm": 2.748307466506958, "learning_rate": 4.5604909878590155e-05, "loss": 5.8505, "step": 3390 }, { "epoch": 0.2720435269643143, "grad_norm": 2.7122459411621094, "learning_rate": 4.559153874953202e-05, "loss": 5.9027, "step": 3400 }, { "epoch": 0.27284365498479757, "grad_norm": 3.6053593158721924, "learning_rate": 4.557816762047388e-05, "loss": 5.6746, "step": 3410 }, { "epoch": 0.27364378300528086, "grad_norm": 4.433299541473389, "learning_rate": 4.556479649141574e-05, "loss": 5.7713, "step": 3420 }, { "epoch": 0.2744439110257641, "grad_norm": 2.5253539085388184, "learning_rate": 4.55514253623576e-05, "loss": 5.8219, "step": 3430 }, { "epoch": 0.2752440390462474, "grad_norm": 4.9358062744140625, "learning_rate": 4.553805423329946e-05, "loss": 5.7971, "step": 3440 }, { "epoch": 0.2760441670667307, "grad_norm": 2.6247594356536865, "learning_rate": 4.5524683104241324e-05, "loss": 5.1528, "step": 3450 }, { "epoch": 0.276844295087214, "grad_norm": 2.8152048587799072, "learning_rate": 4.551131197518319e-05, "loss": 5.7955, "step": 3460 }, { "epoch": 0.2776444231076972, "grad_norm": 2.143275499343872, "learning_rate": 4.549794084612505e-05, "loss": 5.6875, "step": 3470 }, { "epoch": 0.2784445511281805, "grad_norm": 2.9896023273468018, "learning_rate": 4.548456971706691e-05, "loss": 5.7981, "step": 3480 }, { "epoch": 0.2792446791486638, "grad_norm": 3.5231759548187256, "learning_rate": 4.5471198588008775e-05, "loss": 5.7343, "step": 3490 }, { "epoch": 0.28004480716914704, 
"grad_norm": 2.391721487045288, "learning_rate": 4.545782745895064e-05, "loss": 5.6821, "step": 3500 }, { "epoch": 0.28084493518963033, "grad_norm": 2.414992332458496, "learning_rate": 4.54444563298925e-05, "loss": 5.7357, "step": 3510 }, { "epoch": 0.2816450632101136, "grad_norm": 2.7502214908599854, "learning_rate": 4.543108520083436e-05, "loss": 5.6511, "step": 3520 }, { "epoch": 0.2824451912305969, "grad_norm": 2.1601436138153076, "learning_rate": 4.5417714071776226e-05, "loss": 5.6249, "step": 3530 }, { "epoch": 0.28324531925108015, "grad_norm": 2.89013671875, "learning_rate": 4.540434294271809e-05, "loss": 5.7583, "step": 3540 }, { "epoch": 0.28404544727156344, "grad_norm": 2.4915778636932373, "learning_rate": 4.539097181365995e-05, "loss": 5.6957, "step": 3550 }, { "epoch": 0.28484557529204674, "grad_norm": 5.053386688232422, "learning_rate": 4.5377600684601814e-05, "loss": 5.632, "step": 3560 }, { "epoch": 0.28564570331253003, "grad_norm": 2.6207687854766846, "learning_rate": 4.5364229555543676e-05, "loss": 5.8514, "step": 3570 }, { "epoch": 0.28644583133301327, "grad_norm": 4.157670497894287, "learning_rate": 4.535085842648553e-05, "loss": 5.7608, "step": 3580 }, { "epoch": 0.28724595935349656, "grad_norm": 3.4464797973632812, "learning_rate": 4.5337487297427395e-05, "loss": 5.6737, "step": 3590 }, { "epoch": 0.28804608737397985, "grad_norm": 4.255002498626709, "learning_rate": 4.532411616836926e-05, "loss": 5.7977, "step": 3600 }, { "epoch": 0.2888462153944631, "grad_norm": 2.7926547527313232, "learning_rate": 4.531074503931112e-05, "loss": 5.6891, "step": 3610 }, { "epoch": 0.2896463434149464, "grad_norm": 3.150400400161743, "learning_rate": 4.529737391025298e-05, "loss": 5.7931, "step": 3620 }, { "epoch": 0.2904464714354297, "grad_norm": 2.1223199367523193, "learning_rate": 4.5284002781194846e-05, "loss": 5.8646, "step": 3630 }, { "epoch": 0.29124659945591297, "grad_norm": 3.950665235519409, "learning_rate": 4.527063165213671e-05, "loss": 5.7008, 
"step": 3640 }, { "epoch": 0.2920467274763962, "grad_norm": 2.995692729949951, "learning_rate": 4.525726052307857e-05, "loss": 5.688, "step": 3650 }, { "epoch": 0.2928468554968795, "grad_norm": 2.041736125946045, "learning_rate": 4.5243889394020434e-05, "loss": 5.7301, "step": 3660 }, { "epoch": 0.2936469835173628, "grad_norm": 2.541757106781006, "learning_rate": 4.5230518264962297e-05, "loss": 5.5606, "step": 3670 }, { "epoch": 0.29444711153784603, "grad_norm": 2.140761613845825, "learning_rate": 4.521714713590416e-05, "loss": 5.7671, "step": 3680 }, { "epoch": 0.2952472395583293, "grad_norm": 2.6869146823883057, "learning_rate": 4.520377600684602e-05, "loss": 5.6452, "step": 3690 }, { "epoch": 0.2960473675788126, "grad_norm": 3.072376012802124, "learning_rate": 4.5190404877787885e-05, "loss": 5.6956, "step": 3700 }, { "epoch": 0.2968474955992959, "grad_norm": 2.5933837890625, "learning_rate": 4.517703374872975e-05, "loss": 5.6212, "step": 3710 }, { "epoch": 0.29764762361977914, "grad_norm": 3.0443103313446045, "learning_rate": 4.516366261967161e-05, "loss": 5.7849, "step": 3720 }, { "epoch": 0.29844775164026244, "grad_norm": 2.673583745956421, "learning_rate": 4.515029149061347e-05, "loss": 5.6186, "step": 3730 }, { "epoch": 0.29924787966074573, "grad_norm": 2.3276283740997314, "learning_rate": 4.513692036155533e-05, "loss": 5.9188, "step": 3740 }, { "epoch": 0.300048007681229, "grad_norm": 5.504491329193115, "learning_rate": 4.512354923249719e-05, "loss": 5.5676, "step": 3750 }, { "epoch": 0.30084813570171226, "grad_norm": 2.4181482791900635, "learning_rate": 4.5110178103439054e-05, "loss": 5.6852, "step": 3760 }, { "epoch": 0.30164826372219555, "grad_norm": 2.2489006519317627, "learning_rate": 4.509680697438092e-05, "loss": 5.7003, "step": 3770 }, { "epoch": 0.30244839174267885, "grad_norm": 2.6925253868103027, "learning_rate": 4.508343584532278e-05, "loss": 5.8176, "step": 3780 }, { "epoch": 0.3032485197631621, "grad_norm": 2.904318332672119, "learning_rate": 
4.507006471626464e-05, "loss": 5.6912, "step": 3790 }, { "epoch": 0.3040486477836454, "grad_norm": 3.3189070224761963, "learning_rate": 4.5056693587206505e-05, "loss": 5.8706, "step": 3800 }, { "epoch": 0.30484877580412867, "grad_norm": 2.8324170112609863, "learning_rate": 4.504332245814837e-05, "loss": 5.8795, "step": 3810 }, { "epoch": 0.30564890382461196, "grad_norm": 3.113417148590088, "learning_rate": 4.502995132909023e-05, "loss": 5.8689, "step": 3820 }, { "epoch": 0.3064490318450952, "grad_norm": 2.469269275665283, "learning_rate": 4.501658020003209e-05, "loss": 5.7934, "step": 3830 }, { "epoch": 0.3072491598655785, "grad_norm": 2.778571128845215, "learning_rate": 4.5003209070973956e-05, "loss": 5.8577, "step": 3840 }, { "epoch": 0.3080492878860618, "grad_norm": 3.4269161224365234, "learning_rate": 4.498983794191582e-05, "loss": 5.8378, "step": 3850 }, { "epoch": 0.308849415906545, "grad_norm": 3.417850971221924, "learning_rate": 4.497646681285768e-05, "loss": 5.6532, "step": 3860 }, { "epoch": 0.3096495439270283, "grad_norm": 2.389784097671509, "learning_rate": 4.4963095683799544e-05, "loss": 5.5454, "step": 3870 }, { "epoch": 0.3104496719475116, "grad_norm": 2.384453296661377, "learning_rate": 4.4949724554741406e-05, "loss": 5.8014, "step": 3880 }, { "epoch": 0.3112497999679949, "grad_norm": 1.913668155670166, "learning_rate": 4.493635342568326e-05, "loss": 5.6033, "step": 3890 }, { "epoch": 0.31204992798847814, "grad_norm": 3.4930074214935303, "learning_rate": 4.4922982296625125e-05, "loss": 5.7649, "step": 3900 }, { "epoch": 0.31285005600896143, "grad_norm": 3.517458200454712, "learning_rate": 4.490961116756699e-05, "loss": 5.5635, "step": 3910 }, { "epoch": 0.3136501840294447, "grad_norm": 2.611274480819702, "learning_rate": 4.489624003850885e-05, "loss": 5.8121, "step": 3920 }, { "epoch": 0.314450312049928, "grad_norm": 2.373997926712036, "learning_rate": 4.488286890945071e-05, "loss": 5.6002, "step": 3930 }, { "epoch": 0.31525044007041125, 
"grad_norm": 2.554847002029419, "learning_rate": 4.4869497780392576e-05, "loss": 5.6432, "step": 3940 }, { "epoch": 0.31605056809089455, "grad_norm": 3.3720595836639404, "learning_rate": 4.485612665133444e-05, "loss": 5.5794, "step": 3950 }, { "epoch": 0.31685069611137784, "grad_norm": 2.2308788299560547, "learning_rate": 4.48427555222763e-05, "loss": 5.794, "step": 3960 }, { "epoch": 0.3176508241318611, "grad_norm": 2.0659661293029785, "learning_rate": 4.4829384393218164e-05, "loss": 5.5383, "step": 3970 }, { "epoch": 0.31845095215234437, "grad_norm": 3.2644894123077393, "learning_rate": 4.4816013264160026e-05, "loss": 5.6979, "step": 3980 }, { "epoch": 0.31925108017282766, "grad_norm": 2.3485729694366455, "learning_rate": 4.480264213510189e-05, "loss": 5.7214, "step": 3990 }, { "epoch": 0.32005120819331095, "grad_norm": 2.7470600605010986, "learning_rate": 4.478927100604375e-05, "loss": 5.6032, "step": 4000 }, { "epoch": 0.3208513362137942, "grad_norm": 2.1622989177703857, "learning_rate": 4.4775899876985614e-05, "loss": 5.7976, "step": 4010 }, { "epoch": 0.3216514642342775, "grad_norm": 2.7463905811309814, "learning_rate": 4.476252874792748e-05, "loss": 5.7181, "step": 4020 }, { "epoch": 0.3224515922547608, "grad_norm": 3.503662109375, "learning_rate": 4.474915761886934e-05, "loss": 5.8092, "step": 4030 }, { "epoch": 0.323251720275244, "grad_norm": 2.6073853969573975, "learning_rate": 4.47357864898112e-05, "loss": 5.7876, "step": 4040 }, { "epoch": 0.3240518482957273, "grad_norm": 3.354768991470337, "learning_rate": 4.472241536075306e-05, "loss": 5.7741, "step": 4050 }, { "epoch": 0.3248519763162106, "grad_norm": 2.648145914077759, "learning_rate": 4.470904423169492e-05, "loss": 5.7522, "step": 4060 }, { "epoch": 0.3256521043366939, "grad_norm": 3.086655378341675, "learning_rate": 4.4695673102636784e-05, "loss": 5.81, "step": 4070 }, { "epoch": 0.32645223235717713, "grad_norm": 2.230905771255493, "learning_rate": 4.4682301973578647e-05, "loss": 5.8839, "step": 
4080 }, { "epoch": 0.3272523603776604, "grad_norm": 2.5391674041748047, "learning_rate": 4.466893084452051e-05, "loss": 5.5535, "step": 4090 }, { "epoch": 0.3280524883981437, "grad_norm": 2.7574117183685303, "learning_rate": 4.465555971546237e-05, "loss": 5.8275, "step": 4100 }, { "epoch": 0.32885261641862695, "grad_norm": 3.1114678382873535, "learning_rate": 4.4642188586404235e-05, "loss": 5.6876, "step": 4110 }, { "epoch": 0.32965274443911025, "grad_norm": 2.404892683029175, "learning_rate": 4.46288174573461e-05, "loss": 5.6876, "step": 4120 }, { "epoch": 0.33045287245959354, "grad_norm": 2.590759754180908, "learning_rate": 4.461544632828796e-05, "loss": 5.802, "step": 4130 }, { "epoch": 0.33125300048007683, "grad_norm": 2.4358649253845215, "learning_rate": 4.460207519922982e-05, "loss": 5.632, "step": 4140 }, { "epoch": 0.33205312850056007, "grad_norm": 3.9567458629608154, "learning_rate": 4.4588704070171685e-05, "loss": 5.8761, "step": 4150 }, { "epoch": 0.33285325652104336, "grad_norm": 2.3808743953704834, "learning_rate": 4.457533294111355e-05, "loss": 5.6815, "step": 4160 }, { "epoch": 0.33365338454152665, "grad_norm": 2.6527156829833984, "learning_rate": 4.456196181205541e-05, "loss": 5.805, "step": 4170 }, { "epoch": 0.33445351256200995, "grad_norm": 2.351062536239624, "learning_rate": 4.4548590682997273e-05, "loss": 5.6681, "step": 4180 }, { "epoch": 0.3352536405824932, "grad_norm": 2.3213460445404053, "learning_rate": 4.4535219553939136e-05, "loss": 5.6363, "step": 4190 }, { "epoch": 0.3360537686029765, "grad_norm": 1.9470767974853516, "learning_rate": 4.4521848424881e-05, "loss": 5.8772, "step": 4200 }, { "epoch": 0.33685389662345977, "grad_norm": 4.303500652313232, "learning_rate": 4.450847729582286e-05, "loss": 5.6185, "step": 4210 }, { "epoch": 0.337654024643943, "grad_norm": 2.713275909423828, "learning_rate": 4.4495106166764724e-05, "loss": 5.6754, "step": 4220 }, { "epoch": 0.3384541526644263, "grad_norm": 2.34993314743042, "learning_rate": 
4.448173503770659e-05, "loss": 5.7003, "step": 4230 }, { "epoch": 0.3392542806849096, "grad_norm": 2.276228666305542, "learning_rate": 4.446836390864845e-05, "loss": 5.6, "step": 4240 }, { "epoch": 0.3400544087053929, "grad_norm": 2.3635685443878174, "learning_rate": 4.445499277959031e-05, "loss": 5.7373, "step": 4250 }, { "epoch": 0.3408545367258761, "grad_norm": 3.100604772567749, "learning_rate": 4.4441621650532175e-05, "loss": 5.7354, "step": 4260 }, { "epoch": 0.3416546647463594, "grad_norm": 2.6743876934051514, "learning_rate": 4.442825052147404e-05, "loss": 5.7544, "step": 4270 }, { "epoch": 0.3424547927668427, "grad_norm": 2.5783612728118896, "learning_rate": 4.44148793924159e-05, "loss": 5.8826, "step": 4280 }, { "epoch": 0.34325492078732595, "grad_norm": 2.8976659774780273, "learning_rate": 4.440150826335776e-05, "loss": 5.5418, "step": 4290 }, { "epoch": 0.34405504880780924, "grad_norm": 2.1061089038848877, "learning_rate": 4.4388137134299626e-05, "loss": 5.6406, "step": 4300 }, { "epoch": 0.34485517682829253, "grad_norm": 2.1303789615631104, "learning_rate": 4.437476600524149e-05, "loss": 5.6491, "step": 4310 }, { "epoch": 0.3456553048487758, "grad_norm": 2.6240499019622803, "learning_rate": 4.436139487618335e-05, "loss": 5.7161, "step": 4320 }, { "epoch": 0.34645543286925906, "grad_norm": 2.325155019760132, "learning_rate": 4.4348023747125214e-05, "loss": 5.6172, "step": 4330 }, { "epoch": 0.34725556088974235, "grad_norm": 2.8844404220581055, "learning_rate": 4.4334652618067076e-05, "loss": 5.7438, "step": 4340 }, { "epoch": 0.34805568891022565, "grad_norm": 2.375324249267578, "learning_rate": 4.432128148900894e-05, "loss": 5.8335, "step": 4350 }, { "epoch": 0.34885581693070894, "grad_norm": 2.1572377681732178, "learning_rate": 4.4307910359950795e-05, "loss": 5.706, "step": 4360 }, { "epoch": 0.3496559449511922, "grad_norm": 2.5218889713287354, "learning_rate": 4.429453923089266e-05, "loss": 5.7487, "step": 4370 }, { "epoch": 0.35045607297167547, 
"grad_norm": 2.636223554611206, "learning_rate": 4.428116810183452e-05, "loss": 5.8327, "step": 4380 }, { "epoch": 0.35125620099215876, "grad_norm": 2.436155080795288, "learning_rate": 4.426779697277638e-05, "loss": 5.6895, "step": 4390 }, { "epoch": 0.352056329012642, "grad_norm": 3.4435484409332275, "learning_rate": 4.4254425843718246e-05, "loss": 5.6171, "step": 4400 }, { "epoch": 0.3528564570331253, "grad_norm": 2.3990628719329834, "learning_rate": 4.424105471466011e-05, "loss": 5.7574, "step": 4410 }, { "epoch": 0.3536565850536086, "grad_norm": 2.544774293899536, "learning_rate": 4.422768358560197e-05, "loss": 5.558, "step": 4420 }, { "epoch": 0.3544567130740919, "grad_norm": 2.389491081237793, "learning_rate": 4.4214312456543834e-05, "loss": 5.6628, "step": 4430 }, { "epoch": 0.3552568410945751, "grad_norm": 5.203212261199951, "learning_rate": 4.4200941327485697e-05, "loss": 5.5403, "step": 4440 }, { "epoch": 0.3560569691150584, "grad_norm": 2.0861873626708984, "learning_rate": 4.418757019842756e-05, "loss": 5.625, "step": 4450 }, { "epoch": 0.3568570971355417, "grad_norm": 2.2355470657348633, "learning_rate": 4.417419906936942e-05, "loss": 5.614, "step": 4460 }, { "epoch": 0.35765722515602494, "grad_norm": 2.2239274978637695, "learning_rate": 4.4160827940311285e-05, "loss": 5.6885, "step": 4470 }, { "epoch": 0.35845735317650823, "grad_norm": 4.571592807769775, "learning_rate": 4.414745681125315e-05, "loss": 5.8495, "step": 4480 }, { "epoch": 0.3592574811969915, "grad_norm": 2.6501150131225586, "learning_rate": 4.413408568219501e-05, "loss": 5.6158, "step": 4490 }, { "epoch": 0.3600576092174748, "grad_norm": 2.8568902015686035, "learning_rate": 4.412071455313687e-05, "loss": 5.6403, "step": 4500 }, { "epoch": 0.36085773723795805, "grad_norm": 2.4179179668426514, "learning_rate": 4.410734342407873e-05, "loss": 5.749, "step": 4510 }, { "epoch": 0.36165786525844135, "grad_norm": 2.950491189956665, "learning_rate": 4.409397229502059e-05, "loss": 5.7128, "step": 
4520 }, { "epoch": 0.36245799327892464, "grad_norm": 3.731049060821533, "learning_rate": 4.4080601165962454e-05, "loss": 5.6397, "step": 4530 }, { "epoch": 0.36325812129940793, "grad_norm": 2.255730390548706, "learning_rate": 4.406723003690432e-05, "loss": 5.626, "step": 4540 }, { "epoch": 0.36405824931989117, "grad_norm": 2.623455047607422, "learning_rate": 4.405385890784618e-05, "loss": 5.6792, "step": 4550 }, { "epoch": 0.36485837734037446, "grad_norm": 2.366481065750122, "learning_rate": 4.404048777878804e-05, "loss": 5.5455, "step": 4560 }, { "epoch": 0.36565850536085776, "grad_norm": 2.56351375579834, "learning_rate": 4.4027116649729905e-05, "loss": 5.7982, "step": 4570 }, { "epoch": 0.366458633381341, "grad_norm": 2.3203811645507812, "learning_rate": 4.401374552067177e-05, "loss": 5.7969, "step": 4580 }, { "epoch": 0.3672587614018243, "grad_norm": 2.3838179111480713, "learning_rate": 4.400037439161363e-05, "loss": 5.7484, "step": 4590 }, { "epoch": 0.3680588894223076, "grad_norm": 2.0725440979003906, "learning_rate": 4.398700326255549e-05, "loss": 5.8405, "step": 4600 }, { "epoch": 0.36885901744279087, "grad_norm": 3.49495005607605, "learning_rate": 4.3973632133497356e-05, "loss": 5.7151, "step": 4610 }, { "epoch": 0.3696591454632741, "grad_norm": 2.643007755279541, "learning_rate": 4.396026100443922e-05, "loss": 5.6374, "step": 4620 }, { "epoch": 0.3704592734837574, "grad_norm": 2.282304286956787, "learning_rate": 4.394688987538108e-05, "loss": 5.589, "step": 4630 }, { "epoch": 0.3712594015042407, "grad_norm": 2.244058609008789, "learning_rate": 4.3933518746322944e-05, "loss": 5.7516, "step": 4640 }, { "epoch": 0.37205952952472393, "grad_norm": 2.44496488571167, "learning_rate": 4.3920147617264806e-05, "loss": 5.8393, "step": 4650 }, { "epoch": 0.3728596575452072, "grad_norm": 2.6613078117370605, "learning_rate": 4.390677648820667e-05, "loss": 5.6764, "step": 4660 }, { "epoch": 0.3736597855656905, "grad_norm": 3.99092173576355, "learning_rate": 
4.3893405359148525e-05, "loss": 5.8658, "step": 4670 }, { "epoch": 0.3744599135861738, "grad_norm": 1.6338485479354858, "learning_rate": 4.388003423009039e-05, "loss": 5.7527, "step": 4680 }, { "epoch": 0.37526004160665705, "grad_norm": 2.3723371028900146, "learning_rate": 4.386666310103225e-05, "loss": 5.7482, "step": 4690 }, { "epoch": 0.37606016962714034, "grad_norm": 2.630424976348877, "learning_rate": 4.385329197197411e-05, "loss": 5.7539, "step": 4700 }, { "epoch": 0.37686029764762363, "grad_norm": 2.3873038291931152, "learning_rate": 4.3839920842915976e-05, "loss": 5.6729, "step": 4710 }, { "epoch": 0.37766042566810687, "grad_norm": 1.9391748905181885, "learning_rate": 4.382654971385784e-05, "loss": 5.6794, "step": 4720 }, { "epoch": 0.37846055368859016, "grad_norm": 2.103975296020508, "learning_rate": 4.38131785847997e-05, "loss": 5.5104, "step": 4730 }, { "epoch": 0.37926068170907346, "grad_norm": 3.731184959411621, "learning_rate": 4.3799807455741564e-05, "loss": 5.6699, "step": 4740 }, { "epoch": 0.38006080972955675, "grad_norm": 2.881068468093872, "learning_rate": 4.3786436326683426e-05, "loss": 5.6394, "step": 4750 }, { "epoch": 0.38086093775004, "grad_norm": 2.5963799953460693, "learning_rate": 4.377306519762529e-05, "loss": 5.784, "step": 4760 }, { "epoch": 0.3816610657705233, "grad_norm": 1.9520230293273926, "learning_rate": 4.375969406856715e-05, "loss": 5.7608, "step": 4770 }, { "epoch": 0.38246119379100657, "grad_norm": 2.386702537536621, "learning_rate": 4.374766005241483e-05, "loss": 5.5725, "step": 4780 }, { "epoch": 0.38326132181148986, "grad_norm": 2.3830511569976807, "learning_rate": 4.3734288923356694e-05, "loss": 5.5584, "step": 4790 }, { "epoch": 0.3840614498319731, "grad_norm": 2.1514739990234375, "learning_rate": 4.3720917794298556e-05, "loss": 5.6621, "step": 4800 }, { "epoch": 0.3848615778524564, "grad_norm": 2.5376317501068115, "learning_rate": 4.370754666524042e-05, "loss": 5.4138, "step": 4810 }, { "epoch": 0.3856617058729397, 
"grad_norm": 3.425899028778076, "learning_rate": 4.3694175536182275e-05, "loss": 5.6478, "step": 4820 }, { "epoch": 0.3864618338934229, "grad_norm": 2.7518632411956787, "learning_rate": 4.368080440712414e-05, "loss": 5.6556, "step": 4830 }, { "epoch": 0.3872619619139062, "grad_norm": 3.119227647781372, "learning_rate": 4.3667433278066e-05, "loss": 5.7925, "step": 4840 }, { "epoch": 0.3880620899343895, "grad_norm": 3.2664616107940674, "learning_rate": 4.365406214900786e-05, "loss": 5.7176, "step": 4850 }, { "epoch": 0.3888622179548728, "grad_norm": 2.5125045776367188, "learning_rate": 4.3640691019949726e-05, "loss": 5.6511, "step": 4860 }, { "epoch": 0.38966234597535604, "grad_norm": 2.992112874984741, "learning_rate": 4.362731989089159e-05, "loss": 5.6426, "step": 4870 }, { "epoch": 0.39046247399583933, "grad_norm": 4.46783971786499, "learning_rate": 4.361394876183345e-05, "loss": 5.736, "step": 4880 }, { "epoch": 0.3912626020163226, "grad_norm": 1.8372838497161865, "learning_rate": 4.3600577632775314e-05, "loss": 5.7603, "step": 4890 }, { "epoch": 0.39206273003680586, "grad_norm": 2.1635375022888184, "learning_rate": 4.3587206503717176e-05, "loss": 5.6019, "step": 4900 }, { "epoch": 0.39286285805728915, "grad_norm": 2.2425310611724854, "learning_rate": 4.357383537465904e-05, "loss": 5.6829, "step": 4910 }, { "epoch": 0.39366298607777245, "grad_norm": 2.408907413482666, "learning_rate": 4.35604642456009e-05, "loss": 5.6821, "step": 4920 }, { "epoch": 0.39446311409825574, "grad_norm": 3.012258291244507, "learning_rate": 4.3547093116542765e-05, "loss": 5.7503, "step": 4930 }, { "epoch": 0.395263242118739, "grad_norm": 3.187053680419922, "learning_rate": 4.353372198748463e-05, "loss": 5.6459, "step": 4940 }, { "epoch": 0.39606337013922227, "grad_norm": 2.7528955936431885, "learning_rate": 4.352035085842649e-05, "loss": 5.6386, "step": 4950 }, { "epoch": 0.39686349815970556, "grad_norm": 2.9744699001312256, "learning_rate": 4.350697972936835e-05, "loss": 5.5938, 
"step": 4960 }, { "epoch": 0.39766362618018886, "grad_norm": 2.779604196548462, "learning_rate": 4.3493608600310215e-05, "loss": 5.5459, "step": 4970 }, { "epoch": 0.3984637542006721, "grad_norm": 2.9092133045196533, "learning_rate": 4.348023747125207e-05, "loss": 5.7695, "step": 4980 }, { "epoch": 0.3992638822211554, "grad_norm": 2.800872802734375, "learning_rate": 4.3466866342193934e-05, "loss": 5.6943, "step": 4990 }, { "epoch": 0.4000640102416387, "grad_norm": 3.299595832824707, "learning_rate": 4.3453495213135797e-05, "loss": 5.4432, "step": 5000 }, { "epoch": 0.4008641382621219, "grad_norm": 2.2425456047058105, "learning_rate": 4.344012408407766e-05, "loss": 5.6688, "step": 5010 }, { "epoch": 0.4016642662826052, "grad_norm": 2.269378423690796, "learning_rate": 4.342675295501952e-05, "loss": 5.7713, "step": 5020 }, { "epoch": 0.4024643943030885, "grad_norm": 2.3903868198394775, "learning_rate": 4.3413381825961385e-05, "loss": 5.5926, "step": 5030 }, { "epoch": 0.4032645223235718, "grad_norm": 3.267918109893799, "learning_rate": 4.340001069690325e-05, "loss": 5.6806, "step": 5040 }, { "epoch": 0.40406465034405503, "grad_norm": 3.2075066566467285, "learning_rate": 4.338663956784511e-05, "loss": 5.6582, "step": 5050 }, { "epoch": 0.4048647783645383, "grad_norm": 2.5458226203918457, "learning_rate": 4.337326843878697e-05, "loss": 5.6576, "step": 5060 }, { "epoch": 0.4056649063850216, "grad_norm": 2.0331077575683594, "learning_rate": 4.3359897309728835e-05, "loss": 5.6725, "step": 5070 }, { "epoch": 0.40646503440550485, "grad_norm": 2.406907796859741, "learning_rate": 4.33465261806707e-05, "loss": 5.5168, "step": 5080 }, { "epoch": 0.40726516242598815, "grad_norm": 2.661137580871582, "learning_rate": 4.333315505161256e-05, "loss": 5.5953, "step": 5090 }, { "epoch": 0.40806529044647144, "grad_norm": 2.857725143432617, "learning_rate": 4.3319783922554423e-05, "loss": 5.6702, "step": 5100 }, { "epoch": 0.40886541846695473, "grad_norm": 2.7894747257232666, 
"learning_rate": 4.3306412793496286e-05, "loss": 5.6228, "step": 5110 }, { "epoch": 0.40966554648743797, "grad_norm": 2.8865861892700195, "learning_rate": 4.329304166443815e-05, "loss": 5.6859, "step": 5120 }, { "epoch": 0.41046567450792126, "grad_norm": 2.1493608951568604, "learning_rate": 4.3279670535380005e-05, "loss": 5.5516, "step": 5130 }, { "epoch": 0.41126580252840456, "grad_norm": 3.112820863723755, "learning_rate": 4.326629940632187e-05, "loss": 5.6409, "step": 5140 }, { "epoch": 0.41206593054888785, "grad_norm": 2.778876543045044, "learning_rate": 4.325292827726373e-05, "loss": 5.6948, "step": 5150 }, { "epoch": 0.4128660585693711, "grad_norm": 2.0409047603607178, "learning_rate": 4.323955714820559e-05, "loss": 5.5458, "step": 5160 }, { "epoch": 0.4136661865898544, "grad_norm": 3.1058828830718994, "learning_rate": 4.3226186019147456e-05, "loss": 5.8437, "step": 5170 }, { "epoch": 0.41446631461033767, "grad_norm": 3.306704044342041, "learning_rate": 4.321281489008932e-05, "loss": 5.691, "step": 5180 }, { "epoch": 0.4152664426308209, "grad_norm": 2.9495625495910645, "learning_rate": 4.319944376103118e-05, "loss": 5.6364, "step": 5190 }, { "epoch": 0.4160665706513042, "grad_norm": 2.1773974895477295, "learning_rate": 4.3186072631973044e-05, "loss": 5.6713, "step": 5200 }, { "epoch": 0.4168666986717875, "grad_norm": 2.0897533893585205, "learning_rate": 4.3172701502914906e-05, "loss": 5.6022, "step": 5210 }, { "epoch": 0.4176668266922708, "grad_norm": 2.2131927013397217, "learning_rate": 4.315933037385677e-05, "loss": 5.5728, "step": 5220 }, { "epoch": 0.418466954712754, "grad_norm": 2.225728750228882, "learning_rate": 4.314595924479863e-05, "loss": 5.5374, "step": 5230 }, { "epoch": 0.4192670827332373, "grad_norm": 2.219791889190674, "learning_rate": 4.3132588115740494e-05, "loss": 5.6986, "step": 5240 }, { "epoch": 0.4200672107537206, "grad_norm": 2.720323085784912, "learning_rate": 4.311921698668236e-05, "loss": 5.6046, "step": 5250 }, { "epoch": 
0.42086733877420385, "grad_norm": 2.4254257678985596, "learning_rate": 4.310584585762422e-05, "loss": 5.5566, "step": 5260 }, { "epoch": 0.42166746679468714, "grad_norm": 2.2297472953796387, "learning_rate": 4.309247472856608e-05, "loss": 5.7431, "step": 5270 }, { "epoch": 0.42246759481517043, "grad_norm": 2.2767512798309326, "learning_rate": 4.3079103599507945e-05, "loss": 5.6661, "step": 5280 }, { "epoch": 0.4232677228356537, "grad_norm": 2.8959579467773438, "learning_rate": 4.30657324704498e-05, "loss": 5.6584, "step": 5290 }, { "epoch": 0.42406785085613696, "grad_norm": 2.49867844581604, "learning_rate": 4.3052361341391664e-05, "loss": 5.7564, "step": 5300 }, { "epoch": 0.42486797887662026, "grad_norm": 2.1820337772369385, "learning_rate": 4.3038990212333526e-05, "loss": 5.6288, "step": 5310 }, { "epoch": 0.42566810689710355, "grad_norm": 2.7174227237701416, "learning_rate": 4.302561908327539e-05, "loss": 5.6496, "step": 5320 }, { "epoch": 0.42646823491758684, "grad_norm": 2.7261149883270264, "learning_rate": 4.301224795421725e-05, "loss": 5.6557, "step": 5330 }, { "epoch": 0.4272683629380701, "grad_norm": 2.581760883331299, "learning_rate": 4.2998876825159114e-05, "loss": 5.604, "step": 5340 }, { "epoch": 0.42806849095855337, "grad_norm": 2.43254017829895, "learning_rate": 4.298550569610098e-05, "loss": 5.6041, "step": 5350 }, { "epoch": 0.42886861897903666, "grad_norm": 4.465782165527344, "learning_rate": 4.297213456704284e-05, "loss": 5.7158, "step": 5360 }, { "epoch": 0.4296687469995199, "grad_norm": 2.6434614658355713, "learning_rate": 4.29587634379847e-05, "loss": 5.6347, "step": 5370 }, { "epoch": 0.4304688750200032, "grad_norm": 2.344190835952759, "learning_rate": 4.2945392308926565e-05, "loss": 5.6062, "step": 5380 }, { "epoch": 0.4312690030404865, "grad_norm": 4.311372756958008, "learning_rate": 4.293202117986843e-05, "loss": 5.7356, "step": 5390 }, { "epoch": 0.4320691310609698, "grad_norm": 2.8204123973846436, "learning_rate": 4.291865005081029e-05, 
"loss": 5.63, "step": 5400 }, { "epoch": 0.432869259081453, "grad_norm": 3.333059072494507, "learning_rate": 4.290527892175215e-05, "loss": 5.5992, "step": 5410 }, { "epoch": 0.4336693871019363, "grad_norm": 2.0647048950195312, "learning_rate": 4.2891907792694016e-05, "loss": 5.691, "step": 5420 }, { "epoch": 0.4344695151224196, "grad_norm": 2.5100045204162598, "learning_rate": 4.287853666363588e-05, "loss": 5.615, "step": 5430 }, { "epoch": 0.43526964314290284, "grad_norm": 2.6120762825012207, "learning_rate": 4.286516553457774e-05, "loss": 5.746, "step": 5440 }, { "epoch": 0.43606977116338613, "grad_norm": 2.2886853218078613, "learning_rate": 4.2851794405519604e-05, "loss": 5.6783, "step": 5450 }, { "epoch": 0.4368698991838694, "grad_norm": 2.6724119186401367, "learning_rate": 4.283842327646147e-05, "loss": 5.6526, "step": 5460 }, { "epoch": 0.4376700272043527, "grad_norm": 2.2408151626586914, "learning_rate": 4.282505214740333e-05, "loss": 5.6314, "step": 5470 }, { "epoch": 0.43847015522483596, "grad_norm": 3.0294084548950195, "learning_rate": 4.281168101834519e-05, "loss": 5.6669, "step": 5480 }, { "epoch": 0.43927028324531925, "grad_norm": 2.1664011478424072, "learning_rate": 4.2798309889287055e-05, "loss": 5.4856, "step": 5490 }, { "epoch": 0.44007041126580254, "grad_norm": 3.4465417861938477, "learning_rate": 4.278493876022892e-05, "loss": 5.5859, "step": 5500 }, { "epoch": 0.4408705392862858, "grad_norm": 2.0116310119628906, "learning_rate": 4.277156763117078e-05, "loss": 5.5982, "step": 5510 }, { "epoch": 0.44167066730676907, "grad_norm": 2.578658103942871, "learning_rate": 4.275819650211264e-05, "loss": 5.4026, "step": 5520 }, { "epoch": 0.44247079532725236, "grad_norm": 3.1201677322387695, "learning_rate": 4.2744825373054506e-05, "loss": 5.7024, "step": 5530 }, { "epoch": 0.44327092334773566, "grad_norm": 2.2246837615966797, "learning_rate": 4.273145424399637e-05, "loss": 5.5842, "step": 5540 }, { "epoch": 0.4440710513682189, "grad_norm": 
2.1593568325042725, "learning_rate": 4.271808311493823e-05, "loss": 5.5099, "step": 5550 }, { "epoch": 0.4448711793887022, "grad_norm": 3.082218885421753, "learning_rate": 4.2704711985880094e-05, "loss": 5.5539, "step": 5560 }, { "epoch": 0.4456713074091855, "grad_norm": 3.2272634506225586, "learning_rate": 4.2691340856821956e-05, "loss": 5.73, "step": 5570 }, { "epoch": 0.4464714354296688, "grad_norm": 2.301713466644287, "learning_rate": 4.267796972776382e-05, "loss": 5.5444, "step": 5580 }, { "epoch": 0.447271563450152, "grad_norm": 3.2985429763793945, "learning_rate": 4.2664598598705675e-05, "loss": 5.7499, "step": 5590 }, { "epoch": 0.4480716914706353, "grad_norm": 2.103994607925415, "learning_rate": 4.265122746964754e-05, "loss": 5.5627, "step": 5600 }, { "epoch": 0.4488718194911186, "grad_norm": 3.260099172592163, "learning_rate": 4.26378563405894e-05, "loss": 5.5692, "step": 5610 }, { "epoch": 0.44967194751160183, "grad_norm": 2.740907907485962, "learning_rate": 4.262448521153126e-05, "loss": 5.4984, "step": 5620 }, { "epoch": 0.4504720755320851, "grad_norm": 5.314218997955322, "learning_rate": 4.2611114082473126e-05, "loss": 5.5641, "step": 5630 }, { "epoch": 0.4512722035525684, "grad_norm": 3.0524938106536865, "learning_rate": 4.259774295341499e-05, "loss": 5.6375, "step": 5640 }, { "epoch": 0.4520723315730517, "grad_norm": 3.57781982421875, "learning_rate": 4.258437182435685e-05, "loss": 5.6726, "step": 5650 }, { "epoch": 0.45287245959353495, "grad_norm": 3.094510793685913, "learning_rate": 4.2571000695298714e-05, "loss": 5.7328, "step": 5660 }, { "epoch": 0.45367258761401824, "grad_norm": 2.731092929840088, "learning_rate": 4.2557629566240576e-05, "loss": 5.6667, "step": 5670 }, { "epoch": 0.45447271563450153, "grad_norm": 3.6701395511627197, "learning_rate": 4.254425843718244e-05, "loss": 5.641, "step": 5680 }, { "epoch": 0.45527284365498477, "grad_norm": 1.9017853736877441, "learning_rate": 4.25308873081243e-05, "loss": 5.6521, "step": 5690 }, { 
"epoch": 0.45607297167546806, "grad_norm": 3.2658119201660156, "learning_rate": 4.2517516179066165e-05, "loss": 5.6431, "step": 5700 }, { "epoch": 0.45687309969595136, "grad_norm": 2.227353572845459, "learning_rate": 4.250414505000803e-05, "loss": 5.6198, "step": 5710 }, { "epoch": 0.45767322771643465, "grad_norm": 1.7804296016693115, "learning_rate": 4.249077392094989e-05, "loss": 5.618, "step": 5720 }, { "epoch": 0.4584733557369179, "grad_norm": 2.9357879161834717, "learning_rate": 4.247740279189175e-05, "loss": 5.5222, "step": 5730 }, { "epoch": 0.4592734837574012, "grad_norm": 5.074959754943848, "learning_rate": 4.2464031662833615e-05, "loss": 5.7604, "step": 5740 }, { "epoch": 0.4600736117778845, "grad_norm": 2.4961061477661133, "learning_rate": 4.245066053377547e-05, "loss": 5.5699, "step": 5750 }, { "epoch": 0.46087373979836777, "grad_norm": 2.636403799057007, "learning_rate": 4.2437289404717334e-05, "loss": 5.745, "step": 5760 }, { "epoch": 0.461673867818851, "grad_norm": 2.4829630851745605, "learning_rate": 4.2423918275659197e-05, "loss": 5.9779, "step": 5770 }, { "epoch": 0.4624739958393343, "grad_norm": 2.389112710952759, "learning_rate": 4.241054714660106e-05, "loss": 5.696, "step": 5780 }, { "epoch": 0.4632741238598176, "grad_norm": 2.3053462505340576, "learning_rate": 4.239717601754292e-05, "loss": 5.6567, "step": 5790 }, { "epoch": 0.4640742518803008, "grad_norm": 2.9635446071624756, "learning_rate": 4.2383804888484785e-05, "loss": 5.7643, "step": 5800 }, { "epoch": 0.4648743799007841, "grad_norm": 3.3227570056915283, "learning_rate": 4.237043375942665e-05, "loss": 5.5425, "step": 5810 }, { "epoch": 0.4656745079212674, "grad_norm": 3.2959067821502686, "learning_rate": 4.235706263036851e-05, "loss": 5.5886, "step": 5820 }, { "epoch": 0.4664746359417507, "grad_norm": 2.497953176498413, "learning_rate": 4.234369150131037e-05, "loss": 5.6248, "step": 5830 }, { "epoch": 0.46727476396223394, "grad_norm": 3.5957205295562744, "learning_rate": 
4.2330320372252235e-05, "loss": 5.5345, "step": 5840 }, { "epoch": 0.46807489198271723, "grad_norm": 2.9113316535949707, "learning_rate": 4.23169492431941e-05, "loss": 5.7358, "step": 5850 }, { "epoch": 0.4688750200032005, "grad_norm": 3.8617255687713623, "learning_rate": 4.230357811413596e-05, "loss": 5.7451, "step": 5860 }, { "epoch": 0.46967514802368376, "grad_norm": 2.5546538829803467, "learning_rate": 4.2290206985077824e-05, "loss": 5.5874, "step": 5870 }, { "epoch": 0.47047527604416706, "grad_norm": 3.7215869426727295, "learning_rate": 4.2276835856019686e-05, "loss": 5.5462, "step": 5880 }, { "epoch": 0.47127540406465035, "grad_norm": 3.3122622966766357, "learning_rate": 4.226346472696155e-05, "loss": 5.7368, "step": 5890 }, { "epoch": 0.47207553208513364, "grad_norm": 2.3962459564208984, "learning_rate": 4.2250093597903405e-05, "loss": 5.7328, "step": 5900 }, { "epoch": 0.4728756601056169, "grad_norm": 2.497668504714966, "learning_rate": 4.223672246884527e-05, "loss": 5.7063, "step": 5910 }, { "epoch": 0.4736757881261002, "grad_norm": 2.301725387573242, "learning_rate": 4.222335133978713e-05, "loss": 5.6029, "step": 5920 }, { "epoch": 0.47447591614658347, "grad_norm": 3.840155839920044, "learning_rate": 4.220998021072899e-05, "loss": 5.825, "step": 5930 }, { "epoch": 0.47527604416706676, "grad_norm": 3.1776278018951416, "learning_rate": 4.2196609081670856e-05, "loss": 5.6421, "step": 5940 }, { "epoch": 0.47607617218755, "grad_norm": 2.1823127269744873, "learning_rate": 4.218323795261272e-05, "loss": 5.7154, "step": 5950 }, { "epoch": 0.4768763002080333, "grad_norm": 2.944390058517456, "learning_rate": 4.216986682355458e-05, "loss": 5.5429, "step": 5960 }, { "epoch": 0.4776764282285166, "grad_norm": 2.035430431365967, "learning_rate": 4.2156495694496444e-05, "loss": 5.8187, "step": 5970 }, { "epoch": 0.4784765562489998, "grad_norm": 3.167098045349121, "learning_rate": 4.2143124565438306e-05, "loss": 5.5891, "step": 5980 }, { "epoch": 0.4792766842694831, 
"grad_norm": 1.9377233982086182, "learning_rate": 4.212975343638017e-05, "loss": 5.7428, "step": 5990 }, { "epoch": 0.4800768122899664, "grad_norm": 2.759096622467041, "learning_rate": 4.211638230732203e-05, "loss": 5.5572, "step": 6000 }, { "epoch": 0.4808769403104497, "grad_norm": 2.074033498764038, "learning_rate": 4.2103011178263894e-05, "loss": 5.517, "step": 6010 }, { "epoch": 0.48167706833093293, "grad_norm": 2.2866854667663574, "learning_rate": 4.208964004920576e-05, "loss": 5.6539, "step": 6020 }, { "epoch": 0.4824771963514162, "grad_norm": 1.9909095764160156, "learning_rate": 4.207626892014762e-05, "loss": 5.5532, "step": 6030 }, { "epoch": 0.4832773243718995, "grad_norm": 3.245906114578247, "learning_rate": 4.206289779108948e-05, "loss": 5.6797, "step": 6040 }, { "epoch": 0.48407745239238276, "grad_norm": 2.013009786605835, "learning_rate": 4.2049526662031345e-05, "loss": 5.6378, "step": 6050 }, { "epoch": 0.48487758041286605, "grad_norm": 2.5478925704956055, "learning_rate": 4.20361555329732e-05, "loss": 5.555, "step": 6060 }, { "epoch": 0.48567770843334934, "grad_norm": 3.079225778579712, "learning_rate": 4.2022784403915064e-05, "loss": 5.7618, "step": 6070 }, { "epoch": 0.48647783645383263, "grad_norm": 2.2639927864074707, "learning_rate": 4.2009413274856926e-05, "loss": 5.8063, "step": 6080 }, { "epoch": 0.48727796447431587, "grad_norm": 4.630524158477783, "learning_rate": 4.199604214579879e-05, "loss": 5.6403, "step": 6090 }, { "epoch": 0.48807809249479917, "grad_norm": 3.11018967628479, "learning_rate": 4.198267101674065e-05, "loss": 5.7517, "step": 6100 }, { "epoch": 0.48887822051528246, "grad_norm": 8.462982177734375, "learning_rate": 4.1969299887682515e-05, "loss": 5.7311, "step": 6110 }, { "epoch": 0.4896783485357657, "grad_norm": 2.418065071105957, "learning_rate": 4.195592875862438e-05, "loss": 5.6239, "step": 6120 }, { "epoch": 0.490478476556249, "grad_norm": 2.5452466011047363, "learning_rate": 4.194255762956624e-05, "loss": 5.7417, "step": 
6130 }, { "epoch": 0.4912786045767323, "grad_norm": 2.986041307449341, "learning_rate": 4.19291865005081e-05, "loss": 5.663, "step": 6140 }, { "epoch": 0.4920787325972156, "grad_norm": 2.7642807960510254, "learning_rate": 4.1915815371449965e-05, "loss": 5.5379, "step": 6150 }, { "epoch": 0.4928788606176988, "grad_norm": 4.326907157897949, "learning_rate": 4.190244424239183e-05, "loss": 5.8058, "step": 6160 }, { "epoch": 0.4936789886381821, "grad_norm": 1.9514706134796143, "learning_rate": 4.188907311333369e-05, "loss": 5.7004, "step": 6170 }, { "epoch": 0.4944791166586654, "grad_norm": 2.5721428394317627, "learning_rate": 4.187570198427555e-05, "loss": 5.6959, "step": 6180 }, { "epoch": 0.4952792446791487, "grad_norm": 2.6619083881378174, "learning_rate": 4.1862330855217416e-05, "loss": 5.7196, "step": 6190 }, { "epoch": 0.4960793726996319, "grad_norm": 2.322341203689575, "learning_rate": 4.184895972615928e-05, "loss": 5.5998, "step": 6200 }, { "epoch": 0.4968795007201152, "grad_norm": 2.280777931213379, "learning_rate": 4.183558859710114e-05, "loss": 5.5171, "step": 6210 }, { "epoch": 0.4976796287405985, "grad_norm": 1.9774320125579834, "learning_rate": 4.1822217468043004e-05, "loss": 5.6368, "step": 6220 }, { "epoch": 0.49847975676108175, "grad_norm": 2.199708938598633, "learning_rate": 4.180884633898487e-05, "loss": 5.4638, "step": 6230 }, { "epoch": 0.49927988478156504, "grad_norm": 2.0054879188537598, "learning_rate": 4.179547520992673e-05, "loss": 5.4624, "step": 6240 }, { "epoch": 0.5000800128020483, "grad_norm": 2.0623903274536133, "learning_rate": 4.178210408086859e-05, "loss": 5.6554, "step": 6250 }, { "epoch": 0.5008801408225316, "grad_norm": 2.5907487869262695, "learning_rate": 4.1768732951810455e-05, "loss": 5.4989, "step": 6260 }, { "epoch": 0.5016802688430149, "grad_norm": 2.181987762451172, "learning_rate": 4.175536182275232e-05, "loss": 5.624, "step": 6270 }, { "epoch": 0.5024803968634982, "grad_norm": 2.9678001403808594, "learning_rate": 
4.174199069369418e-05, "loss": 5.6545, "step": 6280 }, { "epoch": 0.5032805248839815, "grad_norm": 5.213638782501221, "learning_rate": 4.172861956463604e-05, "loss": 5.7048, "step": 6290 }, { "epoch": 0.5040806529044647, "grad_norm": 2.465900182723999, "learning_rate": 4.1715248435577906e-05, "loss": 5.646, "step": 6300 }, { "epoch": 0.504880780924948, "grad_norm": 2.94570255279541, "learning_rate": 4.170187730651977e-05, "loss": 5.6274, "step": 6310 }, { "epoch": 0.5056809089454313, "grad_norm": 3.5255651473999023, "learning_rate": 4.168850617746163e-05, "loss": 5.5336, "step": 6320 }, { "epoch": 0.5064810369659145, "grad_norm": 2.3499608039855957, "learning_rate": 4.1675135048403494e-05, "loss": 5.7768, "step": 6330 }, { "epoch": 0.5072811649863979, "grad_norm": 2.0476951599121094, "learning_rate": 4.1661763919345356e-05, "loss": 5.5927, "step": 6340 }, { "epoch": 0.5080812930068811, "grad_norm": 2.4708118438720703, "learning_rate": 4.164839279028722e-05, "loss": 5.6458, "step": 6350 }, { "epoch": 0.5088814210273643, "grad_norm": 2.465075731277466, "learning_rate": 4.163502166122908e-05, "loss": 5.5744, "step": 6360 }, { "epoch": 0.5096815490478477, "grad_norm": 2.9378490447998047, "learning_rate": 4.162165053217094e-05, "loss": 5.6963, "step": 6370 }, { "epoch": 0.5104816770683309, "grad_norm": 2.201359987258911, "learning_rate": 4.16082794031128e-05, "loss": 5.613, "step": 6380 }, { "epoch": 0.5112818050888142, "grad_norm": 1.8427401781082153, "learning_rate": 4.159490827405466e-05, "loss": 5.5494, "step": 6390 }, { "epoch": 0.5120819331092975, "grad_norm": 1.9969813823699951, "learning_rate": 4.1581537144996526e-05, "loss": 5.5783, "step": 6400 }, { "epoch": 0.5128820611297807, "grad_norm": 2.9670321941375732, "learning_rate": 4.156816601593839e-05, "loss": 5.7176, "step": 6410 }, { "epoch": 0.5136821891502641, "grad_norm": 2.76875901222229, "learning_rate": 4.155479488688025e-05, "loss": 5.5584, "step": 6420 }, { "epoch": 0.5144823171707473, "grad_norm": 
3.2874600887298584, "learning_rate": 4.1541423757822114e-05, "loss": 5.8726, "step": 6430 }, { "epoch": 0.5152824451912306, "grad_norm": 2.4672482013702393, "learning_rate": 4.1528052628763977e-05, "loss": 5.764, "step": 6440 }, { "epoch": 0.5160825732117139, "grad_norm": 3.5424506664276123, "learning_rate": 4.151468149970584e-05, "loss": 5.6612, "step": 6450 }, { "epoch": 0.5168827012321972, "grad_norm": 2.7947871685028076, "learning_rate": 4.15013103706477e-05, "loss": 5.668, "step": 6460 }, { "epoch": 0.5176828292526804, "grad_norm": 2.624370574951172, "learning_rate": 4.1487939241589565e-05, "loss": 5.577, "step": 6470 }, { "epoch": 0.5184829572731637, "grad_norm": 2.276289701461792, "learning_rate": 4.147456811253143e-05, "loss": 5.7592, "step": 6480 }, { "epoch": 0.519283085293647, "grad_norm": 2.751945972442627, "learning_rate": 4.146119698347329e-05, "loss": 5.6251, "step": 6490 }, { "epoch": 0.5200832133141302, "grad_norm": 2.1990444660186768, "learning_rate": 4.144782585441515e-05, "loss": 5.5141, "step": 6500 }, { "epoch": 0.5208833413346136, "grad_norm": 2.732024908065796, "learning_rate": 4.1434454725357015e-05, "loss": 5.5938, "step": 6510 }, { "epoch": 0.5216834693550968, "grad_norm": 2.6876533031463623, "learning_rate": 4.142108359629887e-05, "loss": 5.7126, "step": 6520 }, { "epoch": 0.5224835973755801, "grad_norm": 2.660323143005371, "learning_rate": 4.1407712467240734e-05, "loss": 5.6261, "step": 6530 }, { "epoch": 0.5232837253960634, "grad_norm": 2.567084550857544, "learning_rate": 4.13943413381826e-05, "loss": 5.5248, "step": 6540 }, { "epoch": 0.5240838534165466, "grad_norm": 4.317018032073975, "learning_rate": 4.138097020912446e-05, "loss": 5.4444, "step": 6550 }, { "epoch": 0.52488398143703, "grad_norm": 2.0361647605895996, "learning_rate": 4.136759908006632e-05, "loss": 5.7532, "step": 6560 }, { "epoch": 0.5256841094575132, "grad_norm": 2.0946271419525146, "learning_rate": 4.1354227951008185e-05, "loss": 5.6343, "step": 6570 }, { "epoch": 
0.5264842374779964, "grad_norm": 3.3724842071533203, "learning_rate": 4.134085682195005e-05, "loss": 5.6455, "step": 6580 }, { "epoch": 0.5272843654984798, "grad_norm": 4.078947067260742, "learning_rate": 4.132748569289191e-05, "loss": 5.6681, "step": 6590 }, { "epoch": 0.528084493518963, "grad_norm": 4.288105010986328, "learning_rate": 4.131411456383377e-05, "loss": 5.7152, "step": 6600 }, { "epoch": 0.5288846215394463, "grad_norm": 2.5208754539489746, "learning_rate": 4.1300743434775635e-05, "loss": 5.5715, "step": 6610 }, { "epoch": 0.5296847495599296, "grad_norm": 2.6902217864990234, "learning_rate": 4.12873723057175e-05, "loss": 5.4997, "step": 6620 }, { "epoch": 0.5304848775804129, "grad_norm": 2.4580068588256836, "learning_rate": 4.127400117665936e-05, "loss": 5.7656, "step": 6630 }, { "epoch": 0.5312850056008962, "grad_norm": 2.5117955207824707, "learning_rate": 4.1260630047601224e-05, "loss": 5.6373, "step": 6640 }, { "epoch": 0.5320851336213794, "grad_norm": 2.660921096801758, "learning_rate": 4.1247258918543086e-05, "loss": 5.6829, "step": 6650 }, { "epoch": 0.5328852616418627, "grad_norm": 2.4601287841796875, "learning_rate": 4.123388778948495e-05, "loss": 5.7702, "step": 6660 }, { "epoch": 0.533685389662346, "grad_norm": 2.9025120735168457, "learning_rate": 4.122051666042681e-05, "loss": 5.6374, "step": 6670 }, { "epoch": 0.5344855176828293, "grad_norm": 2.8221569061279297, "learning_rate": 4.120714553136867e-05, "loss": 5.5568, "step": 6680 }, { "epoch": 0.5352856457033125, "grad_norm": 2.3035178184509277, "learning_rate": 4.119377440231053e-05, "loss": 5.5845, "step": 6690 }, { "epoch": 0.5360857737237958, "grad_norm": 2.0955657958984375, "learning_rate": 4.118040327325239e-05, "loss": 5.687, "step": 6700 }, { "epoch": 0.5368859017442791, "grad_norm": 2.530156135559082, "learning_rate": 4.1167032144194256e-05, "loss": 5.5772, "step": 6710 }, { "epoch": 0.5376860297647623, "grad_norm": 2.2060387134552, "learning_rate": 4.115366101513612e-05, "loss": 
5.5964, "step": 6720 }, { "epoch": 0.5384861577852457, "grad_norm": 2.720702886581421, "learning_rate": 4.114028988607798e-05, "loss": 5.5432, "step": 6730 }, { "epoch": 0.5392862858057289, "grad_norm": 2.2585232257843018, "learning_rate": 4.1126918757019844e-05, "loss": 5.77, "step": 6740 }, { "epoch": 0.5400864138262121, "grad_norm": 2.052316904067993, "learning_rate": 4.1113547627961706e-05, "loss": 5.5679, "step": 6750 }, { "epoch": 0.5408865418466955, "grad_norm": 2.772500991821289, "learning_rate": 4.110017649890357e-05, "loss": 5.5608, "step": 6760 }, { "epoch": 0.5416866698671787, "grad_norm": 2.158129930496216, "learning_rate": 4.108680536984543e-05, "loss": 5.6612, "step": 6770 }, { "epoch": 0.5424867978876621, "grad_norm": 2.874685287475586, "learning_rate": 4.1073434240787294e-05, "loss": 5.5999, "step": 6780 }, { "epoch": 0.5432869259081453, "grad_norm": 2.2797632217407227, "learning_rate": 4.106006311172916e-05, "loss": 5.7243, "step": 6790 }, { "epoch": 0.5440870539286286, "grad_norm": 2.998309850692749, "learning_rate": 4.1048029095576836e-05, "loss": 5.5031, "step": 6800 }, { "epoch": 0.5448871819491119, "grad_norm": 2.8155364990234375, "learning_rate": 4.10346579665187e-05, "loss": 5.7631, "step": 6810 }, { "epoch": 0.5456873099695951, "grad_norm": 2.327279806137085, "learning_rate": 4.102128683746056e-05, "loss": 5.6293, "step": 6820 }, { "epoch": 0.5464874379900784, "grad_norm": 3.3200621604919434, "learning_rate": 4.100791570840242e-05, "loss": 5.717, "step": 6830 }, { "epoch": 0.5472875660105617, "grad_norm": 2.521144390106201, "learning_rate": 4.099454457934428e-05, "loss": 5.5705, "step": 6840 }, { "epoch": 0.548087694031045, "grad_norm": 2.7198219299316406, "learning_rate": 4.098117345028614e-05, "loss": 5.5931, "step": 6850 }, { "epoch": 0.5488878220515282, "grad_norm": 2.701251268386841, "learning_rate": 4.0967802321228006e-05, "loss": 5.4706, "step": 6860 }, { "epoch": 0.5496879500720115, "grad_norm": 2.2789149284362793, "learning_rate": 
4.095443119216987e-05, "loss": 5.5883, "step": 6870 }, { "epoch": 0.5504880780924948, "grad_norm": 2.8821568489074707, "learning_rate": 4.094106006311173e-05, "loss": 5.7525, "step": 6880 }, { "epoch": 0.5512882061129781, "grad_norm": 2.3450064659118652, "learning_rate": 4.0927688934053594e-05, "loss": 5.5166, "step": 6890 }, { "epoch": 0.5520883341334614, "grad_norm": 2.639960527420044, "learning_rate": 4.0914317804995456e-05, "loss": 5.7001, "step": 6900 }, { "epoch": 0.5528884621539446, "grad_norm": 2.6743710041046143, "learning_rate": 4.090094667593732e-05, "loss": 5.7049, "step": 6910 }, { "epoch": 0.553688590174428, "grad_norm": 2.7540199756622314, "learning_rate": 4.088757554687918e-05, "loss": 5.5705, "step": 6920 }, { "epoch": 0.5544887181949112, "grad_norm": 3.2703442573547363, "learning_rate": 4.0874204417821044e-05, "loss": 5.5585, "step": 6930 }, { "epoch": 0.5552888462153944, "grad_norm": 3.684135913848877, "learning_rate": 4.086083328876291e-05, "loss": 5.6561, "step": 6940 }, { "epoch": 0.5560889742358778, "grad_norm": 2.918989896774292, "learning_rate": 4.084746215970477e-05, "loss": 5.5171, "step": 6950 }, { "epoch": 0.556889102256361, "grad_norm": 2.5902323722839355, "learning_rate": 4.083409103064663e-05, "loss": 5.6703, "step": 6960 }, { "epoch": 0.5576892302768442, "grad_norm": 2.23820161819458, "learning_rate": 4.0820719901588495e-05, "loss": 5.7048, "step": 6970 }, { "epoch": 0.5584893582973276, "grad_norm": 2.4339401721954346, "learning_rate": 4.080734877253036e-05, "loss": 5.4264, "step": 6980 }, { "epoch": 0.5592894863178108, "grad_norm": 3.3097031116485596, "learning_rate": 4.0793977643472214e-05, "loss": 5.5931, "step": 6990 }, { "epoch": 0.5600896143382941, "grad_norm": 2.6903202533721924, "learning_rate": 4.0780606514414077e-05, "loss": 5.5349, "step": 7000 }, { "epoch": 0.5600896143382941, "eval_loss": 5.870830535888672, "eval_runtime": 13.3044, "eval_samples_per_second": 3.007, "eval_steps_per_second": 0.376, "step": 7000 }, { 
"epoch": 0.5608897423587774, "grad_norm": 2.144684314727783, "learning_rate": 4.076723538535594e-05, "loss": 5.6295, "step": 7010 }, { "epoch": 0.5616898703792607, "grad_norm": 3.227046489715576, "learning_rate": 4.07538642562978e-05, "loss": 5.5506, "step": 7020 }, { "epoch": 0.562489998399744, "grad_norm": 2.7323713302612305, "learning_rate": 4.0740493127239665e-05, "loss": 5.5441, "step": 7030 }, { "epoch": 0.5632901264202272, "grad_norm": 2.3682384490966797, "learning_rate": 4.072712199818153e-05, "loss": 5.6632, "step": 7040 }, { "epoch": 0.5640902544407105, "grad_norm": 3.006518602371216, "learning_rate": 4.071375086912339e-05, "loss": 5.5702, "step": 7050 }, { "epoch": 0.5648903824611938, "grad_norm": 2.554481029510498, "learning_rate": 4.070037974006525e-05, "loss": 5.4405, "step": 7060 }, { "epoch": 0.5656905104816771, "grad_norm": 2.2349042892456055, "learning_rate": 4.0687008611007115e-05, "loss": 5.5774, "step": 7070 }, { "epoch": 0.5664906385021603, "grad_norm": 2.24906325340271, "learning_rate": 4.067363748194898e-05, "loss": 5.6362, "step": 7080 }, { "epoch": 0.5672907665226437, "grad_norm": 2.2345407009124756, "learning_rate": 4.066026635289084e-05, "loss": 5.642, "step": 7090 }, { "epoch": 0.5680908945431269, "grad_norm": 3.2273216247558594, "learning_rate": 4.0646895223832703e-05, "loss": 5.5204, "step": 7100 }, { "epoch": 0.5688910225636101, "grad_norm": 2.689624071121216, "learning_rate": 4.0633524094774566e-05, "loss": 5.5565, "step": 7110 }, { "epoch": 0.5696911505840935, "grad_norm": 3.4473490715026855, "learning_rate": 4.062015296571643e-05, "loss": 5.4041, "step": 7120 }, { "epoch": 0.5704912786045767, "grad_norm": 2.528700590133667, "learning_rate": 4.060678183665829e-05, "loss": 5.4294, "step": 7130 }, { "epoch": 0.5712914066250601, "grad_norm": 2.6679399013519287, "learning_rate": 4.059341070760015e-05, "loss": 5.6018, "step": 7140 }, { "epoch": 0.5720915346455433, "grad_norm": 2.0572123527526855, "learning_rate": 4.058003957854201e-05, 
"loss": 5.6527, "step": 7150 }, { "epoch": 0.5728916626660265, "grad_norm": 2.446279287338257, "learning_rate": 4.056666844948387e-05, "loss": 5.5862, "step": 7160 }, { "epoch": 0.5736917906865099, "grad_norm": 2.067232131958008, "learning_rate": 4.0553297320425735e-05, "loss": 5.5159, "step": 7170 }, { "epoch": 0.5744919187069931, "grad_norm": 2.225755214691162, "learning_rate": 4.05399261913676e-05, "loss": 5.6483, "step": 7180 }, { "epoch": 0.5752920467274764, "grad_norm": 2.3613367080688477, "learning_rate": 4.052655506230946e-05, "loss": 5.6226, "step": 7190 }, { "epoch": 0.5760921747479597, "grad_norm": 2.4239625930786133, "learning_rate": 4.0513183933251324e-05, "loss": 5.6164, "step": 7200 }, { "epoch": 0.5768923027684429, "grad_norm": 3.5525450706481934, "learning_rate": 4.0499812804193186e-05, "loss": 5.4503, "step": 7210 }, { "epoch": 0.5776924307889262, "grad_norm": 2.664311170578003, "learning_rate": 4.048644167513505e-05, "loss": 5.5188, "step": 7220 }, { "epoch": 0.5784925588094095, "grad_norm": 2.4020540714263916, "learning_rate": 4.047307054607691e-05, "loss": 5.5481, "step": 7230 }, { "epoch": 0.5792926868298928, "grad_norm": 2.256044626235962, "learning_rate": 4.0459699417018774e-05, "loss": 5.6097, "step": 7240 }, { "epoch": 0.5800928148503761, "grad_norm": 2.1168150901794434, "learning_rate": 4.044632828796064e-05, "loss": 5.5249, "step": 7250 }, { "epoch": 0.5808929428708594, "grad_norm": 2.329375743865967, "learning_rate": 4.04329571589025e-05, "loss": 5.504, "step": 7260 }, { "epoch": 0.5816930708913426, "grad_norm": 2.1734092235565186, "learning_rate": 4.041958602984436e-05, "loss": 5.5017, "step": 7270 }, { "epoch": 0.5824931989118259, "grad_norm": 3.232649564743042, "learning_rate": 4.0406214900786225e-05, "loss": 5.6462, "step": 7280 }, { "epoch": 0.5832933269323092, "grad_norm": 3.140702724456787, "learning_rate": 4.039284377172809e-05, "loss": 5.4393, "step": 7290 }, { "epoch": 0.5840934549527924, "grad_norm": 2.284515619277954, 
"learning_rate": 4.0379472642669944e-05, "loss": 5.4891, "step": 7300 }, { "epoch": 0.5848935829732758, "grad_norm": 4.518533706665039, "learning_rate": 4.0366101513611806e-05, "loss": 5.7371, "step": 7310 }, { "epoch": 0.585693710993759, "grad_norm": 2.2323620319366455, "learning_rate": 4.035273038455367e-05, "loss": 5.6324, "step": 7320 }, { "epoch": 0.5864938390142422, "grad_norm": 3.123394012451172, "learning_rate": 4.033935925549553e-05, "loss": 5.6266, "step": 7330 }, { "epoch": 0.5872939670347256, "grad_norm": 2.577545642852783, "learning_rate": 4.0325988126437394e-05, "loss": 5.6541, "step": 7340 }, { "epoch": 0.5880940950552088, "grad_norm": 2.8590281009674072, "learning_rate": 4.031261699737926e-05, "loss": 5.6927, "step": 7350 }, { "epoch": 0.5888942230756921, "grad_norm": 3.0693793296813965, "learning_rate": 4.029924586832112e-05, "loss": 5.5101, "step": 7360 }, { "epoch": 0.5896943510961754, "grad_norm": 2.5813119411468506, "learning_rate": 4.028587473926298e-05, "loss": 5.625, "step": 7370 }, { "epoch": 0.5904944791166586, "grad_norm": 2.7804691791534424, "learning_rate": 4.0272503610204845e-05, "loss": 5.6264, "step": 7380 }, { "epoch": 0.591294607137142, "grad_norm": 2.4291296005249023, "learning_rate": 4.025913248114671e-05, "loss": 5.5024, "step": 7390 }, { "epoch": 0.5920947351576252, "grad_norm": 2.6989386081695557, "learning_rate": 4.024576135208857e-05, "loss": 5.4484, "step": 7400 }, { "epoch": 0.5928948631781085, "grad_norm": 2.42767596244812, "learning_rate": 4.023239022303043e-05, "loss": 5.5537, "step": 7410 }, { "epoch": 0.5936949911985918, "grad_norm": 2.492577075958252, "learning_rate": 4.0219019093972296e-05, "loss": 5.616, "step": 7420 }, { "epoch": 0.594495119219075, "grad_norm": 2.4696478843688965, "learning_rate": 4.020564796491416e-05, "loss": 5.62, "step": 7430 }, { "epoch": 0.5952952472395583, "grad_norm": 3.2339985370635986, "learning_rate": 4.019227683585602e-05, "loss": 5.485, "step": 7440 }, { "epoch": 0.5960953752600416, 
"grad_norm": 3.9647512435913086, "learning_rate": 4.0178905706797884e-05, "loss": 5.5868, "step": 7450 }, { "epoch": 0.5968955032805249, "grad_norm": 2.36417293548584, "learning_rate": 4.016553457773975e-05, "loss": 5.5179, "step": 7460 }, { "epoch": 0.5976956313010081, "grad_norm": 2.1484084129333496, "learning_rate": 4.015216344868161e-05, "loss": 5.6915, "step": 7470 }, { "epoch": 0.5984957593214915, "grad_norm": 2.5233757495880127, "learning_rate": 4.013879231962347e-05, "loss": 5.4879, "step": 7480 }, { "epoch": 0.5992958873419747, "grad_norm": 3.3730146884918213, "learning_rate": 4.0125421190565335e-05, "loss": 5.6531, "step": 7490 }, { "epoch": 0.600096015362458, "grad_norm": 3.0788846015930176, "learning_rate": 4.01120500615072e-05, "loss": 5.6058, "step": 7500 }, { "epoch": 0.6008961433829413, "grad_norm": 2.93515944480896, "learning_rate": 4.009867893244906e-05, "loss": 5.4777, "step": 7510 }, { "epoch": 0.6016962714034245, "grad_norm": 2.6020236015319824, "learning_rate": 4.008530780339092e-05, "loss": 5.6444, "step": 7520 }, { "epoch": 0.6024963994239079, "grad_norm": 2.4522392749786377, "learning_rate": 4.0071936674332786e-05, "loss": 5.6157, "step": 7530 }, { "epoch": 0.6032965274443911, "grad_norm": 3.1317343711853027, "learning_rate": 4.005856554527465e-05, "loss": 5.5527, "step": 7540 }, { "epoch": 0.6040966554648743, "grad_norm": 2.485154390335083, "learning_rate": 4.004519441621651e-05, "loss": 5.6467, "step": 7550 }, { "epoch": 0.6048967834853577, "grad_norm": 2.2032833099365234, "learning_rate": 4.0031823287158374e-05, "loss": 5.4957, "step": 7560 }, { "epoch": 0.6056969115058409, "grad_norm": 3.1787898540496826, "learning_rate": 4.0018452158100236e-05, "loss": 5.6204, "step": 7570 }, { "epoch": 0.6064970395263242, "grad_norm": 2.9925789833068848, "learning_rate": 4.00050810290421e-05, "loss": 5.6732, "step": 7580 }, { "epoch": 0.6072971675468075, "grad_norm": 2.7631521224975586, "learning_rate": 3.999170989998396e-05, "loss": 5.6743, "step": 
7590 }, { "epoch": 0.6080972955672908, "grad_norm": 2.808265447616577, "learning_rate": 3.997833877092582e-05, "loss": 5.5951, "step": 7600 }, { "epoch": 0.608897423587774, "grad_norm": 3.6244983673095703, "learning_rate": 3.996496764186768e-05, "loss": 5.5216, "step": 7610 }, { "epoch": 0.6096975516082573, "grad_norm": 2.4245145320892334, "learning_rate": 3.995159651280954e-05, "loss": 5.5844, "step": 7620 }, { "epoch": 0.6104976796287406, "grad_norm": 2.2855565547943115, "learning_rate": 3.9938225383751406e-05, "loss": 5.5674, "step": 7630 }, { "epoch": 0.6112978076492239, "grad_norm": 2.2801260948181152, "learning_rate": 3.992485425469327e-05, "loss": 5.4406, "step": 7640 }, { "epoch": 0.6120979356697072, "grad_norm": 2.0117592811584473, "learning_rate": 3.991148312563513e-05, "loss": 5.5463, "step": 7650 }, { "epoch": 0.6128980636901904, "grad_norm": 3.110349655151367, "learning_rate": 3.9898111996576994e-05, "loss": 5.6124, "step": 7660 }, { "epoch": 0.6136981917106737, "grad_norm": 2.9789066314697266, "learning_rate": 3.9884740867518856e-05, "loss": 5.789, "step": 7670 }, { "epoch": 0.614498319731157, "grad_norm": 2.641871213912964, "learning_rate": 3.987136973846072e-05, "loss": 5.4838, "step": 7680 }, { "epoch": 0.6152984477516402, "grad_norm": 3.82928466796875, "learning_rate": 3.985799860940258e-05, "loss": 5.7108, "step": 7690 }, { "epoch": 0.6160985757721236, "grad_norm": 3.2533349990844727, "learning_rate": 3.9844627480344444e-05, "loss": 5.4167, "step": 7700 }, { "epoch": 0.6168987037926068, "grad_norm": 2.4259872436523438, "learning_rate": 3.983125635128631e-05, "loss": 5.5539, "step": 7710 }, { "epoch": 0.61769883181309, "grad_norm": 3.5356359481811523, "learning_rate": 3.981788522222817e-05, "loss": 5.4643, "step": 7720 }, { "epoch": 0.6184989598335734, "grad_norm": 2.5774996280670166, "learning_rate": 3.980451409317003e-05, "loss": 5.5389, "step": 7730 }, { "epoch": 0.6192990878540566, "grad_norm": 2.3197529315948486, "learning_rate": 
3.9791142964111895e-05, "loss": 5.5724, "step": 7740 }, { "epoch": 0.62009921587454, "grad_norm": 2.2660646438598633, "learning_rate": 3.977777183505376e-05, "loss": 5.5675, "step": 7750 }, { "epoch": 0.6208993438950232, "grad_norm": 2.7596511840820312, "learning_rate": 3.9764400705995614e-05, "loss": 5.6168, "step": 7760 }, { "epoch": 0.6216994719155065, "grad_norm": 2.4579806327819824, "learning_rate": 3.9751029576937477e-05, "loss": 5.4243, "step": 7770 }, { "epoch": 0.6224995999359898, "grad_norm": 2.7039647102355957, "learning_rate": 3.973765844787934e-05, "loss": 5.633, "step": 7780 }, { "epoch": 0.623299727956473, "grad_norm": 2.274777412414551, "learning_rate": 3.97242873188212e-05, "loss": 5.5945, "step": 7790 }, { "epoch": 0.6240998559769563, "grad_norm": 2.4263217449188232, "learning_rate": 3.9710916189763065e-05, "loss": 5.6763, "step": 7800 }, { "epoch": 0.6248999839974396, "grad_norm": 3.420625686645508, "learning_rate": 3.969754506070493e-05, "loss": 5.4884, "step": 7810 }, { "epoch": 0.6257001120179229, "grad_norm": 2.1576149463653564, "learning_rate": 3.968417393164679e-05, "loss": 5.6325, "step": 7820 }, { "epoch": 0.6265002400384061, "grad_norm": 2.4189348220825195, "learning_rate": 3.967080280258865e-05, "loss": 5.5113, "step": 7830 }, { "epoch": 0.6273003680588894, "grad_norm": 2.533433675765991, "learning_rate": 3.9657431673530515e-05, "loss": 5.3743, "step": 7840 }, { "epoch": 0.6281004960793727, "grad_norm": 2.2747883796691895, "learning_rate": 3.964406054447238e-05, "loss": 5.4912, "step": 7850 }, { "epoch": 0.628900624099856, "grad_norm": 2.546261787414551, "learning_rate": 3.963068941541424e-05, "loss": 5.6571, "step": 7860 }, { "epoch": 0.6297007521203393, "grad_norm": 2.5970914363861084, "learning_rate": 3.9617318286356103e-05, "loss": 5.6732, "step": 7870 }, { "epoch": 0.6305008801408225, "grad_norm": 2.956646680831909, "learning_rate": 3.9603947157297966e-05, "loss": 5.4769, "step": 7880 }, { "epoch": 0.6313010081613059, "grad_norm": 
2.9553463459014893, "learning_rate": 3.959057602823983e-05, "loss": 5.4675, "step": 7890 }, { "epoch": 0.6321011361817891, "grad_norm": 2.6471643447875977, "learning_rate": 3.957720489918169e-05, "loss": 5.4538, "step": 7900 }, { "epoch": 0.6329012642022723, "grad_norm": 2.847944736480713, "learning_rate": 3.956383377012355e-05, "loss": 5.384, "step": 7910 }, { "epoch": 0.6337013922227557, "grad_norm": 3.6218080520629883, "learning_rate": 3.955046264106541e-05, "loss": 5.56, "step": 7920 }, { "epoch": 0.6345015202432389, "grad_norm": 2.396426200866699, "learning_rate": 3.953709151200727e-05, "loss": 5.6353, "step": 7930 }, { "epoch": 0.6353016482637222, "grad_norm": 2.4465904235839844, "learning_rate": 3.9523720382949135e-05, "loss": 5.6698, "step": 7940 }, { "epoch": 0.6361017762842055, "grad_norm": 2.6707208156585693, "learning_rate": 3.9510349253891e-05, "loss": 5.4316, "step": 7950 }, { "epoch": 0.6369019043046887, "grad_norm": 2.982117176055908, "learning_rate": 3.949697812483286e-05, "loss": 5.6359, "step": 7960 }, { "epoch": 0.637702032325172, "grad_norm": 2.6343331336975098, "learning_rate": 3.9483606995774724e-05, "loss": 5.6188, "step": 7970 }, { "epoch": 0.6385021603456553, "grad_norm": 2.290728807449341, "learning_rate": 3.9470235866716586e-05, "loss": 5.5824, "step": 7980 }, { "epoch": 0.6393022883661386, "grad_norm": 2.3056259155273438, "learning_rate": 3.945686473765845e-05, "loss": 5.5314, "step": 7990 }, { "epoch": 0.6401024163866219, "grad_norm": 2.301790714263916, "learning_rate": 3.944349360860031e-05, "loss": 5.497, "step": 8000 }, { "epoch": 0.6409025444071051, "grad_norm": 2.2784414291381836, "learning_rate": 3.9430122479542174e-05, "loss": 5.6482, "step": 8010 }, { "epoch": 0.6417026724275884, "grad_norm": 2.3686752319335938, "learning_rate": 3.941675135048404e-05, "loss": 5.449, "step": 8020 }, { "epoch": 0.6425028004480717, "grad_norm": 3.0353329181671143, "learning_rate": 3.94033802214259e-05, "loss": 5.4544, "step": 8030 }, { "epoch": 
0.643302928468555, "grad_norm": 3.035477876663208, "learning_rate": 3.939000909236776e-05, "loss": 5.4641, "step": 8040 }, { "epoch": 0.6441030564890382, "grad_norm": 2.6078028678894043, "learning_rate": 3.9376637963309625e-05, "loss": 5.6181, "step": 8050 }, { "epoch": 0.6449031845095216, "grad_norm": 2.7835607528686523, "learning_rate": 3.936326683425149e-05, "loss": 5.459, "step": 8060 }, { "epoch": 0.6457033125300048, "grad_norm": 2.465331792831421, "learning_rate": 3.9349895705193344e-05, "loss": 5.5365, "step": 8070 }, { "epoch": 0.646503440550488, "grad_norm": 2.0666961669921875, "learning_rate": 3.9336524576135206e-05, "loss": 5.5158, "step": 8080 }, { "epoch": 0.6473035685709714, "grad_norm": 2.2512967586517334, "learning_rate": 3.932315344707707e-05, "loss": 5.4235, "step": 8090 }, { "epoch": 0.6481036965914546, "grad_norm": 2.081125497817993, "learning_rate": 3.930978231801893e-05, "loss": 5.4172, "step": 8100 }, { "epoch": 0.648903824611938, "grad_norm": 2.0393776893615723, "learning_rate": 3.9296411188960794e-05, "loss": 5.5454, "step": 8110 }, { "epoch": 0.6497039526324212, "grad_norm": 2.671065092086792, "learning_rate": 3.928304005990266e-05, "loss": 5.4562, "step": 8120 }, { "epoch": 0.6505040806529044, "grad_norm": 2.3266165256500244, "learning_rate": 3.926966893084452e-05, "loss": 5.5839, "step": 8130 }, { "epoch": 0.6513042086733878, "grad_norm": 2.400386333465576, "learning_rate": 3.925629780178638e-05, "loss": 5.7815, "step": 8140 }, { "epoch": 0.652104336693871, "grad_norm": 2.3798139095306396, "learning_rate": 3.9242926672728245e-05, "loss": 5.5736, "step": 8150 }, { "epoch": 0.6529044647143543, "grad_norm": 2.4090096950531006, "learning_rate": 3.922955554367011e-05, "loss": 5.4634, "step": 8160 }, { "epoch": 0.6537045927348376, "grad_norm": 3.5072951316833496, "learning_rate": 3.921618441461197e-05, "loss": 5.5608, "step": 8170 }, { "epoch": 0.6545047207553208, "grad_norm": 2.364222526550293, "learning_rate": 3.920281328555383e-05, "loss": 
5.7275, "step": 8180 }, { "epoch": 0.6553048487758041, "grad_norm": 4.594448566436768, "learning_rate": 3.9189442156495696e-05, "loss": 5.7235, "step": 8190 }, { "epoch": 0.6561049767962874, "grad_norm": 3.863098621368408, "learning_rate": 3.917607102743756e-05, "loss": 5.5359, "step": 8200 }, { "epoch": 0.6569051048167707, "grad_norm": 3.201704978942871, "learning_rate": 3.916269989837942e-05, "loss": 5.645, "step": 8210 }, { "epoch": 0.6577052328372539, "grad_norm": 2.697448492050171, "learning_rate": 3.9149328769321284e-05, "loss": 5.523, "step": 8220 }, { "epoch": 0.6585053608577373, "grad_norm": 2.4561972618103027, "learning_rate": 3.913595764026315e-05, "loss": 5.734, "step": 8230 }, { "epoch": 0.6593054888782205, "grad_norm": 4.527692794799805, "learning_rate": 3.912258651120501e-05, "loss": 5.4594, "step": 8240 }, { "epoch": 0.6601056168987038, "grad_norm": 2.8713691234588623, "learning_rate": 3.910921538214687e-05, "loss": 5.7247, "step": 8250 }, { "epoch": 0.6609057449191871, "grad_norm": 2.167921304702759, "learning_rate": 3.9095844253088735e-05, "loss": 5.6405, "step": 8260 }, { "epoch": 0.6617058729396703, "grad_norm": 2.8967878818511963, "learning_rate": 3.90824731240306e-05, "loss": 5.4989, "step": 8270 }, { "epoch": 0.6625060009601537, "grad_norm": 2.002103090286255, "learning_rate": 3.906910199497246e-05, "loss": 5.4434, "step": 8280 }, { "epoch": 0.6633061289806369, "grad_norm": 2.187889575958252, "learning_rate": 3.905573086591432e-05, "loss": 5.4078, "step": 8290 }, { "epoch": 0.6641062570011201, "grad_norm": 2.4078755378723145, "learning_rate": 3.9042359736856186e-05, "loss": 5.5381, "step": 8300 }, { "epoch": 0.6649063850216035, "grad_norm": 3.071484327316284, "learning_rate": 3.902898860779805e-05, "loss": 5.4298, "step": 8310 }, { "epoch": 0.6657065130420867, "grad_norm": 3.8413217067718506, "learning_rate": 3.901561747873991e-05, "loss": 5.4844, "step": 8320 }, { "epoch": 0.66650664106257, "grad_norm": 3.0394554138183594, "learning_rate": 
3.9002246349681774e-05, "loss": 5.5524, "step": 8330 }, { "epoch": 0.6673067690830533, "grad_norm": 2.635354518890381, "learning_rate": 3.8988875220623636e-05, "loss": 5.5727, "step": 8340 }, { "epoch": 0.6681068971035365, "grad_norm": 2.2557764053344727, "learning_rate": 3.89755040915655e-05, "loss": 5.3455, "step": 8350 }, { "epoch": 0.6689070251240199, "grad_norm": 2.837040662765503, "learning_rate": 3.896213296250736e-05, "loss": 5.3729, "step": 8360 }, { "epoch": 0.6697071531445031, "grad_norm": 6.783266067504883, "learning_rate": 3.8948761833449224e-05, "loss": 5.4372, "step": 8370 }, { "epoch": 0.6705072811649864, "grad_norm": 2.20611310005188, "learning_rate": 3.893539070439108e-05, "loss": 5.4983, "step": 8380 }, { "epoch": 0.6713074091854697, "grad_norm": 2.378692626953125, "learning_rate": 3.892201957533294e-05, "loss": 5.6309, "step": 8390 }, { "epoch": 0.672107537205953, "grad_norm": 2.7219278812408447, "learning_rate": 3.8908648446274806e-05, "loss": 5.67, "step": 8400 }, { "epoch": 0.6729076652264362, "grad_norm": 2.7383148670196533, "learning_rate": 3.889527731721667e-05, "loss": 5.5648, "step": 8410 }, { "epoch": 0.6737077932469195, "grad_norm": 1.882124423980713, "learning_rate": 3.888190618815853e-05, "loss": 5.5879, "step": 8420 }, { "epoch": 0.6745079212674028, "grad_norm": 2.5975465774536133, "learning_rate": 3.8868535059100394e-05, "loss": 5.5644, "step": 8430 }, { "epoch": 0.675308049287886, "grad_norm": 3.4361534118652344, "learning_rate": 3.8855163930042256e-05, "loss": 5.6302, "step": 8440 }, { "epoch": 0.6761081773083694, "grad_norm": 2.241267442703247, "learning_rate": 3.884179280098412e-05, "loss": 5.5003, "step": 8450 }, { "epoch": 0.6769083053288526, "grad_norm": 1.9234975576400757, "learning_rate": 3.882842167192598e-05, "loss": 5.4739, "step": 8460 }, { "epoch": 0.677708433349336, "grad_norm": 2.05928897857666, "learning_rate": 3.8815050542867845e-05, "loss": 5.5566, "step": 8470 }, { "epoch": 0.6785085613698192, "grad_norm": 
2.5602006912231445, "learning_rate": 3.880167941380971e-05, "loss": 5.6363, "step": 8480 }, { "epoch": 0.6793086893903024, "grad_norm": 2.36325740814209, "learning_rate": 3.878830828475157e-05, "loss": 5.4635, "step": 8490 }, { "epoch": 0.6801088174107858, "grad_norm": 3.087769031524658, "learning_rate": 3.877493715569343e-05, "loss": 5.5537, "step": 8500 }, { "epoch": 0.680908945431269, "grad_norm": 2.759660482406616, "learning_rate": 3.8761566026635295e-05, "loss": 5.5427, "step": 8510 }, { "epoch": 0.6817090734517522, "grad_norm": 2.7726991176605225, "learning_rate": 3.874819489757716e-05, "loss": 5.4868, "step": 8520 }, { "epoch": 0.6825092014722356, "grad_norm": 3.408202648162842, "learning_rate": 3.8734823768519014e-05, "loss": 5.5416, "step": 8530 }, { "epoch": 0.6833093294927188, "grad_norm": 3.801959753036499, "learning_rate": 3.8721452639460877e-05, "loss": 5.5577, "step": 8540 }, { "epoch": 0.6841094575132021, "grad_norm": 2.7447824478149414, "learning_rate": 3.870808151040274e-05, "loss": 5.5837, "step": 8550 }, { "epoch": 0.6849095855336854, "grad_norm": 3.7551326751708984, "learning_rate": 3.86947103813446e-05, "loss": 5.4772, "step": 8560 }, { "epoch": 0.6857097135541687, "grad_norm": 2.036146640777588, "learning_rate": 3.8681339252286465e-05, "loss": 5.659, "step": 8570 }, { "epoch": 0.6865098415746519, "grad_norm": 2.392986536026001, "learning_rate": 3.866796812322833e-05, "loss": 5.3913, "step": 8580 }, { "epoch": 0.6873099695951352, "grad_norm": 2.7194063663482666, "learning_rate": 3.865459699417019e-05, "loss": 5.418, "step": 8590 }, { "epoch": 0.6881100976156185, "grad_norm": 2.2499608993530273, "learning_rate": 3.864122586511205e-05, "loss": 5.4924, "step": 8600 }, { "epoch": 0.6889102256361018, "grad_norm": 3.661318302154541, "learning_rate": 3.8627854736053915e-05, "loss": 5.5578, "step": 8610 }, { "epoch": 0.6897103536565851, "grad_norm": 3.076019048690796, "learning_rate": 3.861448360699578e-05, "loss": 5.6017, "step": 8620 }, { "epoch": 
0.6905104816770683, "grad_norm": 2.133923053741455, "learning_rate": 3.860111247793764e-05, "loss": 5.5295, "step": 8630 }, { "epoch": 0.6913106096975516, "grad_norm": 3.3584773540496826, "learning_rate": 3.8587741348879503e-05, "loss": 5.4534, "step": 8640 }, { "epoch": 0.6921107377180349, "grad_norm": 2.499058723449707, "learning_rate": 3.8574370219821366e-05, "loss": 5.3402, "step": 8650 }, { "epoch": 0.6929108657385181, "grad_norm": 2.5099146366119385, "learning_rate": 3.856099909076323e-05, "loss": 5.3765, "step": 8660 }, { "epoch": 0.6937109937590015, "grad_norm": 2.9601941108703613, "learning_rate": 3.854762796170509e-05, "loss": 5.5139, "step": 8670 }, { "epoch": 0.6945111217794847, "grad_norm": 3.2487246990203857, "learning_rate": 3.8534256832646954e-05, "loss": 5.5665, "step": 8680 }, { "epoch": 0.695311249799968, "grad_norm": 2.8433704376220703, "learning_rate": 3.852088570358881e-05, "loss": 5.4445, "step": 8690 }, { "epoch": 0.6961113778204513, "grad_norm": 2.204953670501709, "learning_rate": 3.850751457453067e-05, "loss": 5.5415, "step": 8700 }, { "epoch": 0.6969115058409345, "grad_norm": 2.7477571964263916, "learning_rate": 3.8494143445472536e-05, "loss": 5.5603, "step": 8710 }, { "epoch": 0.6977116338614179, "grad_norm": 3.2059755325317383, "learning_rate": 3.84807723164144e-05, "loss": 5.5524, "step": 8720 }, { "epoch": 0.6985117618819011, "grad_norm": 3.2654213905334473, "learning_rate": 3.846740118735626e-05, "loss": 5.5482, "step": 8730 }, { "epoch": 0.6993118899023844, "grad_norm": 2.3536834716796875, "learning_rate": 3.8454030058298124e-05, "loss": 5.6251, "step": 8740 }, { "epoch": 0.7001120179228677, "grad_norm": 3.132542371749878, "learning_rate": 3.8440658929239986e-05, "loss": 5.762, "step": 8750 }, { "epoch": 0.7009121459433509, "grad_norm": 2.3961470127105713, "learning_rate": 3.842728780018185e-05, "loss": 5.4919, "step": 8760 }, { "epoch": 0.7017122739638342, "grad_norm": 1.9365229606628418, "learning_rate": 3.841391667112371e-05, 
"loss": 5.4369, "step": 8770 }, { "epoch": 0.7025124019843175, "grad_norm": 2.227877140045166, "learning_rate": 3.8400545542065574e-05, "loss": 5.4361, "step": 8780 }, { "epoch": 0.7033125300048008, "grad_norm": 2.521822452545166, "learning_rate": 3.838717441300744e-05, "loss": 5.6763, "step": 8790 }, { "epoch": 0.704112658025284, "grad_norm": 2.4155185222625732, "learning_rate": 3.83738032839493e-05, "loss": 5.7041, "step": 8800 }, { "epoch": 0.7049127860457673, "grad_norm": 1.9704358577728271, "learning_rate": 3.836043215489116e-05, "loss": 5.5136, "step": 8810 }, { "epoch": 0.7057129140662506, "grad_norm": 3.447098731994629, "learning_rate": 3.8347061025833025e-05, "loss": 5.5963, "step": 8820 }, { "epoch": 0.7065130420867338, "grad_norm": 2.0857930183410645, "learning_rate": 3.833368989677489e-05, "loss": 5.5328, "step": 8830 }, { "epoch": 0.7073131701072172, "grad_norm": 5.354836940765381, "learning_rate": 3.8320318767716744e-05, "loss": 5.561, "step": 8840 }, { "epoch": 0.7081132981277004, "grad_norm": 2.1317214965820312, "learning_rate": 3.8306947638658606e-05, "loss": 5.7044, "step": 8850 }, { "epoch": 0.7089134261481838, "grad_norm": 2.163472890853882, "learning_rate": 3.829357650960047e-05, "loss": 5.4564, "step": 8860 }, { "epoch": 0.709713554168667, "grad_norm": 2.155075788497925, "learning_rate": 3.828020538054233e-05, "loss": 5.5767, "step": 8870 }, { "epoch": 0.7105136821891502, "grad_norm": 2.225407361984253, "learning_rate": 3.8266834251484194e-05, "loss": 5.574, "step": 8880 }, { "epoch": 0.7113138102096336, "grad_norm": 2.737126350402832, "learning_rate": 3.825346312242606e-05, "loss": 5.5425, "step": 8890 }, { "epoch": 0.7121139382301168, "grad_norm": 3.4771502017974854, "learning_rate": 3.824009199336792e-05, "loss": 5.6085, "step": 8900 }, { "epoch": 0.7129140662506, "grad_norm": 3.2826528549194336, "learning_rate": 3.822672086430978e-05, "loss": 5.5632, "step": 8910 }, { "epoch": 0.7137141942710834, "grad_norm": 2.4936113357543945, 
"learning_rate": 3.8213349735251645e-05, "loss": 5.4818, "step": 8920 }, { "epoch": 0.7145143222915666, "grad_norm": 3.6719648838043213, "learning_rate": 3.819997860619351e-05, "loss": 5.5637, "step": 8930 }, { "epoch": 0.7153144503120499, "grad_norm": 2.7252962589263916, "learning_rate": 3.818660747713537e-05, "loss": 5.5623, "step": 8940 }, { "epoch": 0.7161145783325332, "grad_norm": 3.8873820304870605, "learning_rate": 3.817323634807723e-05, "loss": 5.5009, "step": 8950 }, { "epoch": 0.7169147063530165, "grad_norm": 2.6248092651367188, "learning_rate": 3.8159865219019096e-05, "loss": 5.6683, "step": 8960 }, { "epoch": 0.7177148343734998, "grad_norm": 2.1327767372131348, "learning_rate": 3.814649408996096e-05, "loss": 5.373, "step": 8970 }, { "epoch": 0.718514962393983, "grad_norm": 3.1641392707824707, "learning_rate": 3.813312296090282e-05, "loss": 5.6192, "step": 8980 }, { "epoch": 0.7193150904144663, "grad_norm": 2.533423662185669, "learning_rate": 3.811975183184468e-05, "loss": 5.4736, "step": 8990 }, { "epoch": 0.7201152184349496, "grad_norm": 2.892228841781616, "learning_rate": 3.810638070278654e-05, "loss": 5.437, "step": 9000 }, { "epoch": 0.7209153464554329, "grad_norm": 2.295328140258789, "learning_rate": 3.80930095737284e-05, "loss": 5.4327, "step": 9010 }, { "epoch": 0.7217154744759161, "grad_norm": 2.4300477504730225, "learning_rate": 3.8079638444670265e-05, "loss": 5.6341, "step": 9020 }, { "epoch": 0.7225156024963995, "grad_norm": 4.092593669891357, "learning_rate": 3.806626731561213e-05, "loss": 5.5062, "step": 9030 }, { "epoch": 0.7233157305168827, "grad_norm": 2.7330925464630127, "learning_rate": 3.805289618655399e-05, "loss": 5.4915, "step": 9040 }, { "epoch": 0.7241158585373659, "grad_norm": 2.0372865200042725, "learning_rate": 3.8039525057495853e-05, "loss": 5.5056, "step": 9050 }, { "epoch": 0.7249159865578493, "grad_norm": 2.5585618019104004, "learning_rate": 3.8026153928437716e-05, "loss": 5.4614, "step": 9060 }, { "epoch": 
0.7257161145783325, "grad_norm": 2.653251886367798, "learning_rate": 3.801278279937958e-05, "loss": 5.4437, "step": 9070 }, { "epoch": 0.7265162425988159, "grad_norm": 2.7902703285217285, "learning_rate": 3.799941167032144e-05, "loss": 5.4927, "step": 9080 }, { "epoch": 0.7273163706192991, "grad_norm": 3.366363525390625, "learning_rate": 3.7986040541263304e-05, "loss": 5.382, "step": 9090 }, { "epoch": 0.7281164986397823, "grad_norm": 2.065732479095459, "learning_rate": 3.797266941220517e-05, "loss": 5.5663, "step": 9100 }, { "epoch": 0.7289166266602657, "grad_norm": 3.823241710662842, "learning_rate": 3.795929828314703e-05, "loss": 5.4697, "step": 9110 }, { "epoch": 0.7297167546807489, "grad_norm": 2.3972017765045166, "learning_rate": 3.794592715408889e-05, "loss": 5.5508, "step": 9120 }, { "epoch": 0.7305168827012322, "grad_norm": 2.4955368041992188, "learning_rate": 3.7932556025030755e-05, "loss": 5.5437, "step": 9130 }, { "epoch": 0.7313170107217155, "grad_norm": 5.454606533050537, "learning_rate": 3.791918489597262e-05, "loss": 5.4974, "step": 9140 }, { "epoch": 0.7321171387421987, "grad_norm": 2.6541287899017334, "learning_rate": 3.790581376691448e-05, "loss": 5.5327, "step": 9150 }, { "epoch": 0.732917266762682, "grad_norm": 2.974902391433716, "learning_rate": 3.789244263785634e-05, "loss": 5.5352, "step": 9160 }, { "epoch": 0.7337173947831653, "grad_norm": 7.2000274658203125, "learning_rate": 3.7879071508798206e-05, "loss": 5.5946, "step": 9170 }, { "epoch": 0.7345175228036486, "grad_norm": 2.418121576309204, "learning_rate": 3.786570037974007e-05, "loss": 5.4985, "step": 9180 }, { "epoch": 0.7353176508241318, "grad_norm": 2.3174428939819336, "learning_rate": 3.785232925068193e-05, "loss": 5.6393, "step": 9190 }, { "epoch": 0.7361177788446152, "grad_norm": 2.172489643096924, "learning_rate": 3.7838958121623794e-05, "loss": 5.6173, "step": 9200 }, { "epoch": 0.7369179068650984, "grad_norm": 3.9107019901275635, "learning_rate": 3.7825586992565656e-05, "loss": 
5.4436, "step": 9210 }, { "epoch": 0.7377180348855817, "grad_norm": 2.3483355045318604, "learning_rate": 3.781221586350752e-05, "loss": 5.4981, "step": 9220 }, { "epoch": 0.738518162906065, "grad_norm": 3.839348077774048, "learning_rate": 3.779884473444938e-05, "loss": 5.5541, "step": 9230 }, { "epoch": 0.7393182909265482, "grad_norm": 1.686996579170227, "learning_rate": 3.7785473605391245e-05, "loss": 5.6328, "step": 9240 }, { "epoch": 0.7401184189470316, "grad_norm": 2.7277584075927734, "learning_rate": 3.777210247633311e-05, "loss": 5.5787, "step": 9250 }, { "epoch": 0.7409185469675148, "grad_norm": 2.60896635055542, "learning_rate": 3.775873134727497e-05, "loss": 5.5082, "step": 9260 }, { "epoch": 0.741718674987998, "grad_norm": 2.957674264907837, "learning_rate": 3.774669733112264e-05, "loss": 5.516, "step": 9270 }, { "epoch": 0.7425188030084814, "grad_norm": 2.223433017730713, "learning_rate": 3.7733326202064505e-05, "loss": 5.502, "step": 9280 }, { "epoch": 0.7433189310289646, "grad_norm": 2.6075685024261475, "learning_rate": 3.771995507300637e-05, "loss": 5.5067, "step": 9290 }, { "epoch": 0.7441190590494479, "grad_norm": 2.6572721004486084, "learning_rate": 3.7706583943948224e-05, "loss": 5.6304, "step": 9300 }, { "epoch": 0.7449191870699312, "grad_norm": 2.0563318729400635, "learning_rate": 3.7693212814890086e-05, "loss": 5.4974, "step": 9310 }, { "epoch": 0.7457193150904144, "grad_norm": 2.032820463180542, "learning_rate": 3.767984168583195e-05, "loss": 5.6016, "step": 9320 }, { "epoch": 0.7465194431108978, "grad_norm": 5.646316051483154, "learning_rate": 3.766647055677381e-05, "loss": 5.6661, "step": 9330 }, { "epoch": 0.747319571131381, "grad_norm": 2.5043859481811523, "learning_rate": 3.7653099427715674e-05, "loss": 5.6445, "step": 9340 }, { "epoch": 0.7481196991518643, "grad_norm": 2.817434787750244, "learning_rate": 3.763972829865754e-05, "loss": 5.3901, "step": 9350 }, { "epoch": 0.7489198271723476, "grad_norm": 2.4041759967803955, "learning_rate": 
3.76263571695994e-05, "loss": 5.7132, "step": 9360 }, { "epoch": 0.7497199551928309, "grad_norm": 1.8806638717651367, "learning_rate": 3.761298604054126e-05, "loss": 5.5203, "step": 9370 }, { "epoch": 0.7505200832133141, "grad_norm": 2.088700532913208, "learning_rate": 3.7599614911483125e-05, "loss": 5.4414, "step": 9380 }, { "epoch": 0.7513202112337974, "grad_norm": 2.519188165664673, "learning_rate": 3.758624378242499e-05, "loss": 5.4094, "step": 9390 }, { "epoch": 0.7521203392542807, "grad_norm": 4.597784042358398, "learning_rate": 3.757287265336685e-05, "loss": 5.6246, "step": 9400 }, { "epoch": 0.7529204672747639, "grad_norm": 2.0422868728637695, "learning_rate": 3.755950152430871e-05, "loss": 5.3393, "step": 9410 }, { "epoch": 0.7537205952952473, "grad_norm": 3.0451338291168213, "learning_rate": 3.7546130395250576e-05, "loss": 5.618, "step": 9420 }, { "epoch": 0.7545207233157305, "grad_norm": 2.3379099369049072, "learning_rate": 3.753275926619244e-05, "loss": 5.4859, "step": 9430 }, { "epoch": 0.7553208513362137, "grad_norm": 2.6721060276031494, "learning_rate": 3.75193881371343e-05, "loss": 5.5349, "step": 9440 }, { "epoch": 0.7561209793566971, "grad_norm": 2.495716094970703, "learning_rate": 3.7506017008076164e-05, "loss": 5.626, "step": 9450 }, { "epoch": 0.7569211073771803, "grad_norm": 2.9002442359924316, "learning_rate": 3.749264587901803e-05, "loss": 5.5438, "step": 9460 }, { "epoch": 0.7577212353976637, "grad_norm": 2.3616931438446045, "learning_rate": 3.747927474995989e-05, "loss": 5.6381, "step": 9470 }, { "epoch": 0.7585213634181469, "grad_norm": 2.389329433441162, "learning_rate": 3.746590362090175e-05, "loss": 5.4326, "step": 9480 }, { "epoch": 0.7593214914386301, "grad_norm": 2.1870810985565186, "learning_rate": 3.7452532491843615e-05, "loss": 5.5129, "step": 9490 }, { "epoch": 0.7601216194591135, "grad_norm": 2.2454891204833984, "learning_rate": 3.743916136278548e-05, "loss": 5.3963, "step": 9500 }, { "epoch": 0.7609217474795967, "grad_norm": 
2.5803539752960205, "learning_rate": 3.742579023372734e-05, "loss": 5.5237, "step": 9510 }, { "epoch": 0.76172187550008, "grad_norm": 2.5508155822753906, "learning_rate": 3.74124191046692e-05, "loss": 5.4525, "step": 9520 }, { "epoch": 0.7625220035205633, "grad_norm": 3.693437337875366, "learning_rate": 3.7399047975611065e-05, "loss": 5.5101, "step": 9530 }, { "epoch": 0.7633221315410466, "grad_norm": 2.4398484230041504, "learning_rate": 3.738567684655293e-05, "loss": 5.5372, "step": 9540 }, { "epoch": 0.7641222595615298, "grad_norm": 2.226680278778076, "learning_rate": 3.737230571749479e-05, "loss": 5.3711, "step": 9550 }, { "epoch": 0.7649223875820131, "grad_norm": 2.182704210281372, "learning_rate": 3.7358934588436654e-05, "loss": 5.4957, "step": 9560 }, { "epoch": 0.7657225156024964, "grad_norm": 3.145799398422241, "learning_rate": 3.7345563459378516e-05, "loss": 5.5411, "step": 9570 }, { "epoch": 0.7665226436229797, "grad_norm": 2.656719923019409, "learning_rate": 3.733219233032038e-05, "loss": 5.4737, "step": 9580 }, { "epoch": 0.767322771643463, "grad_norm": 2.2230639457702637, "learning_rate": 3.731882120126224e-05, "loss": 5.5192, "step": 9590 }, { "epoch": 0.7681228996639462, "grad_norm": 4.286400318145752, "learning_rate": 3.7305450072204104e-05, "loss": 5.6413, "step": 9600 }, { "epoch": 0.7689230276844295, "grad_norm": 2.3106577396392822, "learning_rate": 3.729207894314596e-05, "loss": 5.5998, "step": 9610 }, { "epoch": 0.7697231557049128, "grad_norm": 2.7155752182006836, "learning_rate": 3.727870781408782e-05, "loss": 5.4494, "step": 9620 }, { "epoch": 0.770523283725396, "grad_norm": 2.082399368286133, "learning_rate": 3.7265336685029686e-05, "loss": 5.4897, "step": 9630 }, { "epoch": 0.7713234117458794, "grad_norm": 2.0752410888671875, "learning_rate": 3.725196555597155e-05, "loss": 5.537, "step": 9640 }, { "epoch": 0.7721235397663626, "grad_norm": 2.258284091949463, "learning_rate": 3.723859442691341e-05, "loss": 5.6481, "step": 9650 }, { "epoch": 
0.7729236677868458, "grad_norm": 2.8548264503479004, "learning_rate": 3.7225223297855274e-05, "loss": 5.5508, "step": 9660 }, { "epoch": 0.7737237958073292, "grad_norm": 3.375497579574585, "learning_rate": 3.7211852168797136e-05, "loss": 5.3847, "step": 9670 }, { "epoch": 0.7745239238278124, "grad_norm": 2.6680548191070557, "learning_rate": 3.7198481039739e-05, "loss": 5.3742, "step": 9680 }, { "epoch": 0.7753240518482958, "grad_norm": 2.2915420532226562, "learning_rate": 3.718510991068086e-05, "loss": 5.5593, "step": 9690 }, { "epoch": 0.776124179868779, "grad_norm": 3.224327325820923, "learning_rate": 3.7171738781622724e-05, "loss": 5.5711, "step": 9700 }, { "epoch": 0.7769243078892623, "grad_norm": 3.025899887084961, "learning_rate": 3.715836765256459e-05, "loss": 5.3164, "step": 9710 }, { "epoch": 0.7777244359097456, "grad_norm": 1.9424941539764404, "learning_rate": 3.714499652350645e-05, "loss": 5.4804, "step": 9720 }, { "epoch": 0.7785245639302288, "grad_norm": 2.863312005996704, "learning_rate": 3.713162539444831e-05, "loss": 5.3353, "step": 9730 }, { "epoch": 0.7793246919507121, "grad_norm": 2.0607283115386963, "learning_rate": 3.7118254265390175e-05, "loss": 5.5311, "step": 9740 }, { "epoch": 0.7801248199711954, "grad_norm": 2.225666046142578, "learning_rate": 3.710488313633204e-05, "loss": 5.5315, "step": 9750 }, { "epoch": 0.7809249479916787, "grad_norm": 2.1531851291656494, "learning_rate": 3.70915120072739e-05, "loss": 5.5311, "step": 9760 }, { "epoch": 0.7817250760121619, "grad_norm": 2.6129846572875977, "learning_rate": 3.7078140878215756e-05, "loss": 5.5927, "step": 9770 }, { "epoch": 0.7825252040326452, "grad_norm": 3.1822173595428467, "learning_rate": 3.706476974915762e-05, "loss": 5.5403, "step": 9780 }, { "epoch": 0.7833253320531285, "grad_norm": 5.453544616699219, "learning_rate": 3.705139862009948e-05, "loss": 5.4393, "step": 9790 }, { "epoch": 0.7841254600736117, "grad_norm": 2.573024272918701, "learning_rate": 3.7038027491041345e-05, "loss": 
5.5677, "step": 9800 }, { "epoch": 0.7849255880940951, "grad_norm": 2.283381700515747, "learning_rate": 3.702465636198321e-05, "loss": 5.3814, "step": 9810 }, { "epoch": 0.7857257161145783, "grad_norm": 3.119277238845825, "learning_rate": 3.701128523292507e-05, "loss": 5.5022, "step": 9820 }, { "epoch": 0.7865258441350617, "grad_norm": 5.085709571838379, "learning_rate": 3.699791410386693e-05, "loss": 5.5322, "step": 9830 }, { "epoch": 0.7873259721555449, "grad_norm": 2.4339115619659424, "learning_rate": 3.6984542974808795e-05, "loss": 5.5885, "step": 9840 }, { "epoch": 0.7881261001760281, "grad_norm": 2.2715206146240234, "learning_rate": 3.697117184575066e-05, "loss": 5.4657, "step": 9850 }, { "epoch": 0.7889262281965115, "grad_norm": 2.1434290409088135, "learning_rate": 3.695780071669252e-05, "loss": 5.5571, "step": 9860 }, { "epoch": 0.7897263562169947, "grad_norm": 2.235814094543457, "learning_rate": 3.694442958763438e-05, "loss": 5.5054, "step": 9870 }, { "epoch": 0.790526484237478, "grad_norm": 4.322607517242432, "learning_rate": 3.6931058458576246e-05, "loss": 5.3727, "step": 9880 }, { "epoch": 0.7913266122579613, "grad_norm": 2.0876612663269043, "learning_rate": 3.691768732951811e-05, "loss": 5.5682, "step": 9890 }, { "epoch": 0.7921267402784445, "grad_norm": 1.9573509693145752, "learning_rate": 3.690431620045997e-05, "loss": 5.4981, "step": 9900 }, { "epoch": 0.7929268682989278, "grad_norm": 2.527776002883911, "learning_rate": 3.6890945071401834e-05, "loss": 5.3799, "step": 9910 }, { "epoch": 0.7937269963194111, "grad_norm": 3.043266773223877, "learning_rate": 3.687757394234369e-05, "loss": 5.5366, "step": 9920 }, { "epoch": 0.7945271243398944, "grad_norm": 2.502704381942749, "learning_rate": 3.686420281328555e-05, "loss": 5.576, "step": 9930 }, { "epoch": 0.7953272523603777, "grad_norm": 2.863032817840576, "learning_rate": 3.6850831684227415e-05, "loss": 5.4838, "step": 9940 }, { "epoch": 0.796127380380861, "grad_norm": 2.4610373973846436, 
"learning_rate": 3.683746055516928e-05, "loss": 5.6119, "step": 9950 }, { "epoch": 0.7969275084013442, "grad_norm": 2.193134069442749, "learning_rate": 3.682408942611114e-05, "loss": 5.3948, "step": 9960 }, { "epoch": 0.7977276364218275, "grad_norm": 3.6384451389312744, "learning_rate": 3.6810718297053003e-05, "loss": 5.5381, "step": 9970 }, { "epoch": 0.7985277644423108, "grad_norm": 2.5201289653778076, "learning_rate": 3.6797347167994866e-05, "loss": 5.4386, "step": 9980 }, { "epoch": 0.799327892462794, "grad_norm": 2.3459038734436035, "learning_rate": 3.678397603893673e-05, "loss": 5.8173, "step": 9990 }, { "epoch": 0.8001280204832774, "grad_norm": 2.575666904449463, "learning_rate": 3.677060490987859e-05, "loss": 5.4436, "step": 10000 }, { "epoch": 0.8009281485037606, "grad_norm": 4.0012712478637695, "learning_rate": 3.6757233780820454e-05, "loss": 5.5222, "step": 10010 }, { "epoch": 0.8017282765242438, "grad_norm": 2.3244402408599854, "learning_rate": 3.674386265176232e-05, "loss": 5.398, "step": 10020 }, { "epoch": 0.8025284045447272, "grad_norm": 2.2298974990844727, "learning_rate": 3.673049152270418e-05, "loss": 5.4749, "step": 10030 }, { "epoch": 0.8033285325652104, "grad_norm": 3.589245080947876, "learning_rate": 3.671712039364604e-05, "loss": 5.5091, "step": 10040 }, { "epoch": 0.8041286605856938, "grad_norm": 2.2426655292510986, "learning_rate": 3.6703749264587905e-05, "loss": 5.5136, "step": 10050 }, { "epoch": 0.804928788606177, "grad_norm": 2.5258290767669678, "learning_rate": 3.669037813552977e-05, "loss": 5.522, "step": 10060 }, { "epoch": 0.8057289166266602, "grad_norm": 3.040107250213623, "learning_rate": 3.667700700647163e-05, "loss": 5.5748, "step": 10070 }, { "epoch": 0.8065290446471436, "grad_norm": 2.561196804046631, "learning_rate": 3.6663635877413486e-05, "loss": 5.5973, "step": 10080 }, { "epoch": 0.8073291726676268, "grad_norm": 2.4179880619049072, "learning_rate": 3.665026474835535e-05, "loss": 5.5915, "step": 10090 }, { "epoch": 
0.8081293006881101, "grad_norm": 2.393134593963623, "learning_rate": 3.663689361929721e-05, "loss": 5.4809, "step": 10100 }, { "epoch": 0.8089294287085934, "grad_norm": 3.107543468475342, "learning_rate": 3.6623522490239074e-05, "loss": 5.6127, "step": 10110 }, { "epoch": 0.8097295567290766, "grad_norm": 2.8467986583709717, "learning_rate": 3.661015136118094e-05, "loss": 5.5274, "step": 10120 }, { "epoch": 0.8105296847495599, "grad_norm": 2.49955153465271, "learning_rate": 3.65967802321228e-05, "loss": 5.4469, "step": 10130 }, { "epoch": 0.8113298127700432, "grad_norm": 2.817401885986328, "learning_rate": 3.658340910306466e-05, "loss": 5.5901, "step": 10140 }, { "epoch": 0.8121299407905265, "grad_norm": 2.284855842590332, "learning_rate": 3.6570037974006525e-05, "loss": 5.588, "step": 10150 }, { "epoch": 0.8129300688110097, "grad_norm": 3.13712739944458, "learning_rate": 3.655666684494839e-05, "loss": 5.5035, "step": 10160 }, { "epoch": 0.8137301968314931, "grad_norm": 2.7964253425598145, "learning_rate": 3.654329571589025e-05, "loss": 5.4622, "step": 10170 }, { "epoch": 0.8145303248519763, "grad_norm": 3.7489845752716064, "learning_rate": 3.652992458683211e-05, "loss": 5.6106, "step": 10180 }, { "epoch": 0.8153304528724596, "grad_norm": 2.0697953701019287, "learning_rate": 3.6516553457773976e-05, "loss": 5.4128, "step": 10190 }, { "epoch": 0.8161305808929429, "grad_norm": 2.495635986328125, "learning_rate": 3.650318232871584e-05, "loss": 5.3183, "step": 10200 }, { "epoch": 0.8169307089134261, "grad_norm": 1.9717586040496826, "learning_rate": 3.64898111996577e-05, "loss": 5.4251, "step": 10210 }, { "epoch": 0.8177308369339095, "grad_norm": 2.591371774673462, "learning_rate": 3.6476440070599564e-05, "loss": 5.3903, "step": 10220 }, { "epoch": 0.8185309649543927, "grad_norm": 2.9142751693725586, "learning_rate": 3.646306894154142e-05, "loss": 5.4119, "step": 10230 }, { "epoch": 0.8193310929748759, "grad_norm": 2.1791203022003174, "learning_rate": 
3.644969781248328e-05, "loss": 5.5931, "step": 10240 }, { "epoch": 0.8201312209953593, "grad_norm": 2.787339925765991, "learning_rate": 3.6436326683425145e-05, "loss": 5.5301, "step": 10250 }, { "epoch": 0.8209313490158425, "grad_norm": 2.722717523574829, "learning_rate": 3.642295555436701e-05, "loss": 5.5608, "step": 10260 }, { "epoch": 0.8217314770363258, "grad_norm": 2.937549114227295, "learning_rate": 3.640958442530887e-05, "loss": 5.5967, "step": 10270 }, { "epoch": 0.8225316050568091, "grad_norm": 3.0384104251861572, "learning_rate": 3.639621329625073e-05, "loss": 5.5901, "step": 10280 }, { "epoch": 0.8233317330772923, "grad_norm": 2.6817758083343506, "learning_rate": 3.6382842167192596e-05, "loss": 5.4188, "step": 10290 }, { "epoch": 0.8241318610977757, "grad_norm": 2.6184494495391846, "learning_rate": 3.636947103813446e-05, "loss": 5.5194, "step": 10300 }, { "epoch": 0.8249319891182589, "grad_norm": 2.613208293914795, "learning_rate": 3.635609990907632e-05, "loss": 5.4968, "step": 10310 }, { "epoch": 0.8257321171387422, "grad_norm": 6.223053932189941, "learning_rate": 3.6342728780018184e-05, "loss": 5.3478, "step": 10320 }, { "epoch": 0.8265322451592255, "grad_norm": 3.294417381286621, "learning_rate": 3.632935765096005e-05, "loss": 5.5736, "step": 10330 }, { "epoch": 0.8273323731797088, "grad_norm": 2.3347206115722656, "learning_rate": 3.631598652190191e-05, "loss": 5.6787, "step": 10340 }, { "epoch": 0.828132501200192, "grad_norm": 3.219491958618164, "learning_rate": 3.630261539284377e-05, "loss": 5.5125, "step": 10350 }, { "epoch": 0.8289326292206753, "grad_norm": 2.5759575366973877, "learning_rate": 3.6289244263785635e-05, "loss": 5.4405, "step": 10360 }, { "epoch": 0.8297327572411586, "grad_norm": 2.4145963191986084, "learning_rate": 3.62758731347275e-05, "loss": 5.479, "step": 10370 }, { "epoch": 0.8305328852616418, "grad_norm": 2.7548952102661133, "learning_rate": 3.626250200566936e-05, "loss": 5.5466, "step": 10380 }, { "epoch": 0.8313330132821252, 
"grad_norm": 1.9488781690597534, "learning_rate": 3.624913087661122e-05, "loss": 5.5063, "step": 10390 }, { "epoch": 0.8321331413026084, "grad_norm": 2.648233652114868, "learning_rate": 3.6235759747553086e-05, "loss": 5.4158, "step": 10400 }, { "epoch": 0.8329332693230916, "grad_norm": 2.8808720111846924, "learning_rate": 3.622238861849495e-05, "loss": 5.5431, "step": 10410 }, { "epoch": 0.833733397343575, "grad_norm": 3.4570131301879883, "learning_rate": 3.620901748943681e-05, "loss": 5.4842, "step": 10420 }, { "epoch": 0.8345335253640582, "grad_norm": 4.246754169464111, "learning_rate": 3.6195646360378674e-05, "loss": 5.5809, "step": 10430 }, { "epoch": 0.8353336533845416, "grad_norm": 1.8645952939987183, "learning_rate": 3.6182275231320536e-05, "loss": 5.4272, "step": 10440 }, { "epoch": 0.8361337814050248, "grad_norm": 3.3832550048828125, "learning_rate": 3.61689041022624e-05, "loss": 5.4291, "step": 10450 }, { "epoch": 0.836933909425508, "grad_norm": 2.1454830169677734, "learning_rate": 3.615553297320426e-05, "loss": 5.4457, "step": 10460 }, { "epoch": 0.8377340374459914, "grad_norm": 2.9275059700012207, "learning_rate": 3.6142161844146124e-05, "loss": 5.3577, "step": 10470 }, { "epoch": 0.8385341654664746, "grad_norm": 2.9177403450012207, "learning_rate": 3.612879071508799e-05, "loss": 5.5011, "step": 10480 }, { "epoch": 0.8393342934869579, "grad_norm": 2.9115045070648193, "learning_rate": 3.611541958602985e-05, "loss": 5.4961, "step": 10490 }, { "epoch": 0.8401344215074412, "grad_norm": 3.270296335220337, "learning_rate": 3.610204845697171e-05, "loss": 5.4651, "step": 10500 }, { "epoch": 0.8409345495279245, "grad_norm": 2.2930686473846436, "learning_rate": 3.6088677327913575e-05, "loss": 5.4363, "step": 10510 }, { "epoch": 0.8417346775484077, "grad_norm": 3.168717622756958, "learning_rate": 3.607530619885544e-05, "loss": 5.361, "step": 10520 }, { "epoch": 0.842534805568891, "grad_norm": 2.009021759033203, "learning_rate": 3.60619350697973e-05, "loss": 
5.4435, "step": 10530 }, { "epoch": 0.8433349335893743, "grad_norm": 3.454181432723999, "learning_rate": 3.6048563940739156e-05, "loss": 5.4134, "step": 10540 }, { "epoch": 0.8441350616098576, "grad_norm": 2.8601911067962646, "learning_rate": 3.603519281168102e-05, "loss": 5.3224, "step": 10550 }, { "epoch": 0.8449351896303409, "grad_norm": 2.612689733505249, "learning_rate": 3.602182168262288e-05, "loss": 5.3947, "step": 10560 }, { "epoch": 0.8457353176508241, "grad_norm": 2.813868284225464, "learning_rate": 3.6008450553564745e-05, "loss": 5.4598, "step": 10570 }, { "epoch": 0.8465354456713075, "grad_norm": 2.226395606994629, "learning_rate": 3.599507942450661e-05, "loss": 5.4401, "step": 10580 }, { "epoch": 0.8473355736917907, "grad_norm": 3.4722280502319336, "learning_rate": 3.598170829544847e-05, "loss": 5.4831, "step": 10590 }, { "epoch": 0.8481357017122739, "grad_norm": 3.270322799682617, "learning_rate": 3.596833716639033e-05, "loss": 5.6256, "step": 10600 }, { "epoch": 0.8489358297327573, "grad_norm": 1.9735034704208374, "learning_rate": 3.5954966037332195e-05, "loss": 5.491, "step": 10610 }, { "epoch": 0.8497359577532405, "grad_norm": 2.9609665870666504, "learning_rate": 3.594159490827406e-05, "loss": 5.5421, "step": 10620 }, { "epoch": 0.8505360857737237, "grad_norm": 3.1109185218811035, "learning_rate": 3.592822377921592e-05, "loss": 5.5718, "step": 10630 }, { "epoch": 0.8513362137942071, "grad_norm": 2.68784761428833, "learning_rate": 3.5914852650157783e-05, "loss": 5.4769, "step": 10640 }, { "epoch": 0.8521363418146903, "grad_norm": 2.2947535514831543, "learning_rate": 3.5901481521099646e-05, "loss": 5.4901, "step": 10650 }, { "epoch": 0.8529364698351737, "grad_norm": 1.894142746925354, "learning_rate": 3.588811039204151e-05, "loss": 5.5021, "step": 10660 }, { "epoch": 0.8537365978556569, "grad_norm": 2.800260543823242, "learning_rate": 3.587473926298337e-05, "loss": 5.6767, "step": 10670 }, { "epoch": 0.8545367258761402, "grad_norm": 
3.055172920227051, "learning_rate": 3.5861368133925234e-05, "loss": 5.5765, "step": 10680 }, { "epoch": 0.8553368538966235, "grad_norm": 2.3778443336486816, "learning_rate": 3.58479970048671e-05, "loss": 5.5377, "step": 10690 }, { "epoch": 0.8561369819171067, "grad_norm": 4.772058486938477, "learning_rate": 3.583462587580895e-05, "loss": 5.432, "step": 10700 }, { "epoch": 0.85693710993759, "grad_norm": 1.9563825130462646, "learning_rate": 3.5821254746750815e-05, "loss": 5.4832, "step": 10710 }, { "epoch": 0.8577372379580733, "grad_norm": 2.149519205093384, "learning_rate": 3.580788361769268e-05, "loss": 5.491, "step": 10720 }, { "epoch": 0.8585373659785566, "grad_norm": 3.5061347484588623, "learning_rate": 3.579451248863454e-05, "loss": 5.5747, "step": 10730 }, { "epoch": 0.8593374939990398, "grad_norm": 2.74947452545166, "learning_rate": 3.5781141359576404e-05, "loss": 5.3591, "step": 10740 }, { "epoch": 0.8601376220195232, "grad_norm": 2.818753719329834, "learning_rate": 3.5767770230518266e-05, "loss": 5.4722, "step": 10750 }, { "epoch": 0.8609377500400064, "grad_norm": 2.7501718997955322, "learning_rate": 3.575439910146013e-05, "loss": 5.4531, "step": 10760 }, { "epoch": 0.8617378780604896, "grad_norm": 2.314549207687378, "learning_rate": 3.574102797240199e-05, "loss": 5.5488, "step": 10770 }, { "epoch": 0.862538006080973, "grad_norm": 2.583895683288574, "learning_rate": 3.5727656843343854e-05, "loss": 5.5101, "step": 10780 }, { "epoch": 0.8633381341014562, "grad_norm": 2.778087854385376, "learning_rate": 3.571428571428572e-05, "loss": 5.421, "step": 10790 }, { "epoch": 0.8641382621219396, "grad_norm": 3.679514169692993, "learning_rate": 3.570091458522758e-05, "loss": 5.5277, "step": 10800 }, { "epoch": 0.8649383901424228, "grad_norm": 3.3869597911834717, "learning_rate": 3.568754345616944e-05, "loss": 5.5185, "step": 10810 }, { "epoch": 0.865738518162906, "grad_norm": 3.1094346046447754, "learning_rate": 3.5674172327111305e-05, "loss": 5.396, "step": 10820 }, { 
"epoch": 0.8665386461833894, "grad_norm": 2.3561792373657227, "learning_rate": 3.566080119805317e-05, "loss": 5.5995, "step": 10830 }, { "epoch": 0.8673387742038726, "grad_norm": 2.7533133029937744, "learning_rate": 3.564743006899503e-05, "loss": 5.4848, "step": 10840 }, { "epoch": 0.8681389022243559, "grad_norm": 2.923741579055786, "learning_rate": 3.5634058939936886e-05, "loss": 5.5549, "step": 10850 }, { "epoch": 0.8689390302448392, "grad_norm": 2.002704381942749, "learning_rate": 3.562068781087875e-05, "loss": 5.4354, "step": 10860 }, { "epoch": 0.8697391582653224, "grad_norm": 2.277064085006714, "learning_rate": 3.560731668182061e-05, "loss": 5.4404, "step": 10870 }, { "epoch": 0.8705392862858057, "grad_norm": 2.23490047454834, "learning_rate": 3.5593945552762474e-05, "loss": 5.7253, "step": 10880 }, { "epoch": 0.871339414306289, "grad_norm": 2.42874813079834, "learning_rate": 3.558057442370434e-05, "loss": 5.4351, "step": 10890 }, { "epoch": 0.8721395423267723, "grad_norm": 2.097278118133545, "learning_rate": 3.55672032946462e-05, "loss": 5.4772, "step": 10900 }, { "epoch": 0.8729396703472556, "grad_norm": 2.045832395553589, "learning_rate": 3.555383216558806e-05, "loss": 5.4132, "step": 10910 }, { "epoch": 0.8737397983677389, "grad_norm": 2.695033550262451, "learning_rate": 3.5540461036529925e-05, "loss": 5.3975, "step": 10920 }, { "epoch": 0.8745399263882221, "grad_norm": 2.62748384475708, "learning_rate": 3.552708990747179e-05, "loss": 5.5843, "step": 10930 }, { "epoch": 0.8753400544087054, "grad_norm": 2.6703569889068604, "learning_rate": 3.551371877841365e-05, "loss": 5.548, "step": 10940 }, { "epoch": 0.8761401824291887, "grad_norm": 2.7184908390045166, "learning_rate": 3.550034764935551e-05, "loss": 5.4833, "step": 10950 }, { "epoch": 0.8769403104496719, "grad_norm": 2.6194417476654053, "learning_rate": 3.5486976520297376e-05, "loss": 5.3647, "step": 10960 }, { "epoch": 0.8777404384701553, "grad_norm": 2.5021440982818604, "learning_rate": 
3.547360539123924e-05, "loss": 5.4775, "step": 10970 }, { "epoch": 0.8785405664906385, "grad_norm": 3.3758370876312256, "learning_rate": 3.54602342621811e-05, "loss": 5.4144, "step": 10980 }, { "epoch": 0.8793406945111217, "grad_norm": 2.7361087799072266, "learning_rate": 3.5446863133122964e-05, "loss": 5.3614, "step": 10990 }, { "epoch": 0.8801408225316051, "grad_norm": 3.831631660461426, "learning_rate": 3.543349200406482e-05, "loss": 5.4672, "step": 11000 }, { "epoch": 0.8809409505520883, "grad_norm": 2.9705264568328857, "learning_rate": 3.542012087500668e-05, "loss": 5.5334, "step": 11010 }, { "epoch": 0.8817410785725716, "grad_norm": 3.578693389892578, "learning_rate": 3.5406749745948545e-05, "loss": 5.4943, "step": 11020 }, { "epoch": 0.8825412065930549, "grad_norm": 2.0674843788146973, "learning_rate": 3.539337861689041e-05, "loss": 5.4054, "step": 11030 }, { "epoch": 0.8833413346135381, "grad_norm": 2.1904194355010986, "learning_rate": 3.538000748783227e-05, "loss": 5.37, "step": 11040 }, { "epoch": 0.8841414626340215, "grad_norm": 3.7718141078948975, "learning_rate": 3.536663635877413e-05, "loss": 5.6004, "step": 11050 }, { "epoch": 0.8849415906545047, "grad_norm": 2.7325282096862793, "learning_rate": 3.5353265229715996e-05, "loss": 5.4552, "step": 11060 }, { "epoch": 0.885741718674988, "grad_norm": 3.3750839233398438, "learning_rate": 3.533989410065786e-05, "loss": 5.5041, "step": 11070 }, { "epoch": 0.8865418466954713, "grad_norm": 2.5617001056671143, "learning_rate": 3.532652297159972e-05, "loss": 5.4912, "step": 11080 }, { "epoch": 0.8873419747159546, "grad_norm": 1.9870737791061401, "learning_rate": 3.5313151842541584e-05, "loss": 5.4576, "step": 11090 }, { "epoch": 0.8881421027364378, "grad_norm": 2.458249568939209, "learning_rate": 3.529978071348345e-05, "loss": 5.7306, "step": 11100 }, { "epoch": 0.8889422307569211, "grad_norm": 3.1406562328338623, "learning_rate": 3.528640958442531e-05, "loss": 5.5833, "step": 11110 }, { "epoch": 
0.8897423587774044, "grad_norm": 2.4337878227233887, "learning_rate": 3.527303845536717e-05, "loss": 5.4938, "step": 11120 }, { "epoch": 0.8905424867978876, "grad_norm": 2.925147294998169, "learning_rate": 3.5259667326309035e-05, "loss": 5.5591, "step": 11130 }, { "epoch": 0.891342614818371, "grad_norm": 2.5177969932556152, "learning_rate": 3.52462961972509e-05, "loss": 5.5199, "step": 11140 }, { "epoch": 0.8921427428388542, "grad_norm": 2.3133068084716797, "learning_rate": 3.523292506819276e-05, "loss": 5.3506, "step": 11150 }, { "epoch": 0.8929428708593375, "grad_norm": 2.1670310497283936, "learning_rate": 3.521955393913462e-05, "loss": 5.3459, "step": 11160 }, { "epoch": 0.8937429988798208, "grad_norm": 2.875126838684082, "learning_rate": 3.5206182810076486e-05, "loss": 5.3948, "step": 11170 }, { "epoch": 0.894543126900304, "grad_norm": 2.3784403800964355, "learning_rate": 3.519281168101835e-05, "loss": 5.431, "step": 11180 }, { "epoch": 0.8953432549207874, "grad_norm": 2.400426149368286, "learning_rate": 3.517944055196021e-05, "loss": 5.4228, "step": 11190 }, { "epoch": 0.8961433829412706, "grad_norm": 2.2166919708251953, "learning_rate": 3.5166069422902074e-05, "loss": 5.6408, "step": 11200 }, { "epoch": 0.8969435109617538, "grad_norm": 1.7938240766525269, "learning_rate": 3.5152698293843936e-05, "loss": 5.3972, "step": 11210 }, { "epoch": 0.8977436389822372, "grad_norm": 2.4942996501922607, "learning_rate": 3.51393271647858e-05, "loss": 5.5523, "step": 11220 }, { "epoch": 0.8985437670027204, "grad_norm": 2.706131935119629, "learning_rate": 3.512595603572766e-05, "loss": 5.6029, "step": 11230 }, { "epoch": 0.8993438950232037, "grad_norm": 3.6749794483184814, "learning_rate": 3.5112584906669524e-05, "loss": 5.5903, "step": 11240 }, { "epoch": 0.900144023043687, "grad_norm": 2.8764829635620117, "learning_rate": 3.509921377761139e-05, "loss": 5.392, "step": 11250 }, { "epoch": 0.9009441510641703, "grad_norm": 1.9971251487731934, "learning_rate": 
3.508584264855325e-05, "loss": 5.5115, "step": 11260 }, { "epoch": 0.9017442790846536, "grad_norm": 1.9127808809280396, "learning_rate": 3.507247151949511e-05, "loss": 5.6273, "step": 11270 }, { "epoch": 0.9025444071051368, "grad_norm": 2.679152727127075, "learning_rate": 3.5059100390436975e-05, "loss": 5.5216, "step": 11280 }, { "epoch": 0.9033445351256201, "grad_norm": 3.1412837505340576, "learning_rate": 3.504572926137884e-05, "loss": 5.5665, "step": 11290 }, { "epoch": 0.9041446631461034, "grad_norm": 3.2604153156280518, "learning_rate": 3.50323581323207e-05, "loss": 5.6283, "step": 11300 }, { "epoch": 0.9049447911665867, "grad_norm": 2.2050578594207764, "learning_rate": 3.5018987003262557e-05, "loss": 5.429, "step": 11310 }, { "epoch": 0.9057449191870699, "grad_norm": 3.6569366455078125, "learning_rate": 3.500561587420442e-05, "loss": 5.5833, "step": 11320 }, { "epoch": 0.9065450472075532, "grad_norm": 2.38771653175354, "learning_rate": 3.499224474514628e-05, "loss": 5.4127, "step": 11330 }, { "epoch": 0.9073451752280365, "grad_norm": 2.1471800804138184, "learning_rate": 3.4978873616088145e-05, "loss": 5.4064, "step": 11340 }, { "epoch": 0.9081453032485197, "grad_norm": 2.340174674987793, "learning_rate": 3.496550248703001e-05, "loss": 5.5581, "step": 11350 }, { "epoch": 0.9089454312690031, "grad_norm": 2.771235466003418, "learning_rate": 3.495213135797187e-05, "loss": 5.4221, "step": 11360 }, { "epoch": 0.9097455592894863, "grad_norm": 2.7797491550445557, "learning_rate": 3.493876022891373e-05, "loss": 5.5604, "step": 11370 }, { "epoch": 0.9105456873099695, "grad_norm": 2.0206966400146484, "learning_rate": 3.4925389099855595e-05, "loss": 5.3382, "step": 11380 }, { "epoch": 0.9113458153304529, "grad_norm": 3.5101125240325928, "learning_rate": 3.491201797079746e-05, "loss": 5.5358, "step": 11390 }, { "epoch": 0.9121459433509361, "grad_norm": 2.3375003337860107, "learning_rate": 3.489864684173932e-05, "loss": 5.5492, "step": 11400 }, { "epoch": 
0.9129460713714195, "grad_norm": 2.4977264404296875, "learning_rate": 3.4885275712681183e-05, "loss": 5.507, "step": 11410 }, { "epoch": 0.9137461993919027, "grad_norm": 2.0408174991607666, "learning_rate": 3.4871904583623046e-05, "loss": 5.5587, "step": 11420 }, { "epoch": 0.914546327412386, "grad_norm": 2.525320053100586, "learning_rate": 3.485853345456491e-05, "loss": 5.5013, "step": 11430 }, { "epoch": 0.9153464554328693, "grad_norm": 2.946377992630005, "learning_rate": 3.484516232550677e-05, "loss": 5.5959, "step": 11440 }, { "epoch": 0.9161465834533525, "grad_norm": 2.138331174850464, "learning_rate": 3.4831791196448634e-05, "loss": 5.4817, "step": 11450 }, { "epoch": 0.9169467114738358, "grad_norm": 1.7159631252288818, "learning_rate": 3.48184200673905e-05, "loss": 5.5036, "step": 11460 }, { "epoch": 0.9177468394943191, "grad_norm": 2.5576088428497314, "learning_rate": 3.480504893833235e-05, "loss": 5.4721, "step": 11470 }, { "epoch": 0.9185469675148024, "grad_norm": 2.057349443435669, "learning_rate": 3.4791677809274215e-05, "loss": 5.5468, "step": 11480 }, { "epoch": 0.9193470955352856, "grad_norm": 2.4942944049835205, "learning_rate": 3.477830668021608e-05, "loss": 5.5999, "step": 11490 }, { "epoch": 0.920147223555769, "grad_norm": 3.3070192337036133, "learning_rate": 3.476493555115794e-05, "loss": 5.5418, "step": 11500 }, { "epoch": 0.9209473515762522, "grad_norm": 2.2323672771453857, "learning_rate": 3.4751564422099804e-05, "loss": 5.398, "step": 11510 }, { "epoch": 0.9217474795967355, "grad_norm": 1.9982457160949707, "learning_rate": 3.4738193293041666e-05, "loss": 5.4668, "step": 11520 }, { "epoch": 0.9225476076172188, "grad_norm": 3.4668660163879395, "learning_rate": 3.472482216398353e-05, "loss": 5.5433, "step": 11530 }, { "epoch": 0.923347735637702, "grad_norm": 2.7247307300567627, "learning_rate": 3.471145103492539e-05, "loss": 5.4156, "step": 11540 }, { "epoch": 0.9241478636581854, "grad_norm": 2.42948317527771, "learning_rate": 
3.4698079905867254e-05, "loss": 5.4336, "step": 11550 }, { "epoch": 0.9249479916786686, "grad_norm": 4.134993076324463, "learning_rate": 3.468470877680912e-05, "loss": 5.3362, "step": 11560 }, { "epoch": 0.9257481196991518, "grad_norm": 2.0852134227752686, "learning_rate": 3.467133764775098e-05, "loss": 5.4117, "step": 11570 }, { "epoch": 0.9265482477196352, "grad_norm": 2.224235773086548, "learning_rate": 3.465796651869284e-05, "loss": 5.4132, "step": 11580 }, { "epoch": 0.9273483757401184, "grad_norm": 2.0093464851379395, "learning_rate": 3.4644595389634705e-05, "loss": 5.3876, "step": 11590 }, { "epoch": 0.9281485037606017, "grad_norm": 1.9892866611480713, "learning_rate": 3.463122426057657e-05, "loss": 5.4069, "step": 11600 }, { "epoch": 0.928948631781085, "grad_norm": 3.9974398612976074, "learning_rate": 3.461785313151843e-05, "loss": 5.4892, "step": 11610 }, { "epoch": 0.9297487598015682, "grad_norm": 1.9878896474838257, "learning_rate": 3.4604482002460286e-05, "loss": 5.5017, "step": 11620 }, { "epoch": 0.9305488878220515, "grad_norm": 3.1477320194244385, "learning_rate": 3.459111087340215e-05, "loss": 5.3199, "step": 11630 }, { "epoch": 0.9313490158425348, "grad_norm": 2.434946298599243, "learning_rate": 3.457773974434401e-05, "loss": 5.4885, "step": 11640 }, { "epoch": 0.9321491438630181, "grad_norm": 3.2463152408599854, "learning_rate": 3.4564368615285874e-05, "loss": 5.5232, "step": 11650 }, { "epoch": 0.9329492718835014, "grad_norm": 3.733612537384033, "learning_rate": 3.455099748622774e-05, "loss": 5.4918, "step": 11660 }, { "epoch": 0.9337493999039846, "grad_norm": 3.3726518154144287, "learning_rate": 3.45376263571696e-05, "loss": 5.3887, "step": 11670 }, { "epoch": 0.9345495279244679, "grad_norm": 2.527639627456665, "learning_rate": 3.452425522811146e-05, "loss": 5.4, "step": 11680 }, { "epoch": 0.9353496559449512, "grad_norm": 3.3945000171661377, "learning_rate": 3.4510884099053325e-05, "loss": 5.4835, "step": 11690 }, { "epoch": 0.9361497839654345, 
"grad_norm": 2.492178201675415, "learning_rate": 3.449751296999519e-05, "loss": 5.5472, "step": 11700 }, { "epoch": 0.9369499119859177, "grad_norm": 2.2719671726226807, "learning_rate": 3.448414184093705e-05, "loss": 5.3069, "step": 11710 }, { "epoch": 0.937750040006401, "grad_norm": 4.121431350708008, "learning_rate": 3.447077071187891e-05, "loss": 5.3377, "step": 11720 }, { "epoch": 0.9385501680268843, "grad_norm": 2.2480831146240234, "learning_rate": 3.4457399582820776e-05, "loss": 5.3888, "step": 11730 }, { "epoch": 0.9393502960473675, "grad_norm": 3.118621349334717, "learning_rate": 3.444402845376264e-05, "loss": 5.3225, "step": 11740 }, { "epoch": 0.9401504240678509, "grad_norm": 2.513777494430542, "learning_rate": 3.44306573247045e-05, "loss": 5.4971, "step": 11750 }, { "epoch": 0.9409505520883341, "grad_norm": 2.491767406463623, "learning_rate": 3.4417286195646364e-05, "loss": 5.5061, "step": 11760 }, { "epoch": 0.9417506801088175, "grad_norm": 2.8964290618896484, "learning_rate": 3.440391506658823e-05, "loss": 5.3395, "step": 11770 }, { "epoch": 0.9425508081293007, "grad_norm": 2.1613073348999023, "learning_rate": 3.439054393753008e-05, "loss": 5.512, "step": 11780 }, { "epoch": 0.9433509361497839, "grad_norm": 3.5444371700286865, "learning_rate": 3.4377172808471945e-05, "loss": 5.4804, "step": 11790 }, { "epoch": 0.9441510641702673, "grad_norm": 3.0833287239074707, "learning_rate": 3.436380167941381e-05, "loss": 5.5711, "step": 11800 }, { "epoch": 0.9449511921907505, "grad_norm": 2.2267260551452637, "learning_rate": 3.435043055035567e-05, "loss": 5.3964, "step": 11810 }, { "epoch": 0.9457513202112338, "grad_norm": 3.114546537399292, "learning_rate": 3.4337059421297533e-05, "loss": 5.4296, "step": 11820 }, { "epoch": 0.9465514482317171, "grad_norm": 3.316612958908081, "learning_rate": 3.4323688292239396e-05, "loss": 5.451, "step": 11830 }, { "epoch": 0.9473515762522003, "grad_norm": 2.97145414352417, "learning_rate": 3.431031716318126e-05, "loss": 5.6184, 
"step": 11840 }, { "epoch": 0.9481517042726836, "grad_norm": 2.2837045192718506, "learning_rate": 3.429694603412312e-05, "loss": 5.3398, "step": 11850 }, { "epoch": 0.9489518322931669, "grad_norm": 2.2095916271209717, "learning_rate": 3.4283574905064984e-05, "loss": 5.3933, "step": 11860 }, { "epoch": 0.9497519603136502, "grad_norm": 1.9592795372009277, "learning_rate": 3.427020377600685e-05, "loss": 5.4423, "step": 11870 }, { "epoch": 0.9505520883341335, "grad_norm": 2.9245188236236572, "learning_rate": 3.425683264694871e-05, "loss": 5.4927, "step": 11880 }, { "epoch": 0.9513522163546168, "grad_norm": 2.5000531673431396, "learning_rate": 3.424346151789057e-05, "loss": 5.3523, "step": 11890 }, { "epoch": 0.9521523443751, "grad_norm": 2.4692375659942627, "learning_rate": 3.4230090388832435e-05, "loss": 5.5949, "step": 11900 }, { "epoch": 0.9529524723955833, "grad_norm": 2.387812852859497, "learning_rate": 3.42167192597743e-05, "loss": 5.4971, "step": 11910 }, { "epoch": 0.9537526004160666, "grad_norm": 2.938291072845459, "learning_rate": 3.420334813071616e-05, "loss": 5.3849, "step": 11920 }, { "epoch": 0.9545527284365498, "grad_norm": 2.608431339263916, "learning_rate": 3.4189977001658016e-05, "loss": 5.3414, "step": 11930 }, { "epoch": 0.9553528564570332, "grad_norm": 2.695615530014038, "learning_rate": 3.417660587259988e-05, "loss": 5.2343, "step": 11940 }, { "epoch": 0.9561529844775164, "grad_norm": 3.0142087936401367, "learning_rate": 3.416323474354174e-05, "loss": 5.3293, "step": 11950 }, { "epoch": 0.9569531124979996, "grad_norm": 2.5953242778778076, "learning_rate": 3.4149863614483604e-05, "loss": 5.459, "step": 11960 }, { "epoch": 0.957753240518483, "grad_norm": 2.2795822620391846, "learning_rate": 3.413649248542547e-05, "loss": 5.5305, "step": 11970 }, { "epoch": 0.9585533685389662, "grad_norm": 2.5979270935058594, "learning_rate": 3.412312135636733e-05, "loss": 5.4866, "step": 11980 }, { "epoch": 0.9593534965594495, "grad_norm": 2.66823673248291, 
"learning_rate": 3.410975022730919e-05, "loss": 5.5734, "step": 11990 }, { "epoch": 0.9601536245799328, "grad_norm": 2.3899004459381104, "learning_rate": 3.4096379098251055e-05, "loss": 5.5367, "step": 12000 }, { "epoch": 0.960953752600416, "grad_norm": 2.233553171157837, "learning_rate": 3.408300796919292e-05, "loss": 5.3773, "step": 12010 }, { "epoch": 0.9617538806208994, "grad_norm": 2.2967305183410645, "learning_rate": 3.406963684013478e-05, "loss": 5.4409, "step": 12020 }, { "epoch": 0.9625540086413826, "grad_norm": 2.4291601181030273, "learning_rate": 3.405626571107664e-05, "loss": 5.4198, "step": 12030 }, { "epoch": 0.9633541366618659, "grad_norm": 2.6325435638427734, "learning_rate": 3.4042894582018506e-05, "loss": 5.6044, "step": 12040 }, { "epoch": 0.9641542646823492, "grad_norm": 2.4688518047332764, "learning_rate": 3.402952345296037e-05, "loss": 5.3633, "step": 12050 }, { "epoch": 0.9649543927028325, "grad_norm": 2.3974521160125732, "learning_rate": 3.401615232390223e-05, "loss": 5.3022, "step": 12060 }, { "epoch": 0.9657545207233157, "grad_norm": 2.146742105484009, "learning_rate": 3.4002781194844094e-05, "loss": 5.2753, "step": 12070 }, { "epoch": 0.966554648743799, "grad_norm": 2.1239147186279297, "learning_rate": 3.3989410065785957e-05, "loss": 5.466, "step": 12080 }, { "epoch": 0.9673547767642823, "grad_norm": 2.939096450805664, "learning_rate": 3.397603893672782e-05, "loss": 5.5288, "step": 12090 }, { "epoch": 0.9681549047847655, "grad_norm": 2.6875243186950684, "learning_rate": 3.396266780766968e-05, "loss": 5.4279, "step": 12100 }, { "epoch": 0.9689550328052489, "grad_norm": 3.1991941928863525, "learning_rate": 3.3949296678611545e-05, "loss": 5.5397, "step": 12110 }, { "epoch": 0.9697551608257321, "grad_norm": 2.4558470249176025, "learning_rate": 3.393592554955341e-05, "loss": 5.3246, "step": 12120 }, { "epoch": 0.9705552888462154, "grad_norm": 2.2693309783935547, "learning_rate": 3.392255442049527e-05, "loss": 5.5941, "step": 12130 }, { 
"epoch": 0.9713554168666987, "grad_norm": 2.8864657878875732, "learning_rate": 3.390918329143713e-05, "loss": 5.4632, "step": 12140 }, { "epoch": 0.9721555448871819, "grad_norm": 2.3996002674102783, "learning_rate": 3.3895812162378995e-05, "loss": 5.4724, "step": 12150 }, { "epoch": 0.9729556729076653, "grad_norm": 1.979028582572937, "learning_rate": 3.388244103332086e-05, "loss": 5.4229, "step": 12160 }, { "epoch": 0.9737558009281485, "grad_norm": 2.0203795433044434, "learning_rate": 3.386906990426272e-05, "loss": 5.5592, "step": 12170 }, { "epoch": 0.9745559289486317, "grad_norm": 2.0890145301818848, "learning_rate": 3.3855698775204583e-05, "loss": 5.4313, "step": 12180 }, { "epoch": 0.9753560569691151, "grad_norm": 2.4817287921905518, "learning_rate": 3.3842327646146446e-05, "loss": 5.5, "step": 12190 }, { "epoch": 0.9761561849895983, "grad_norm": 2.2497968673706055, "learning_rate": 3.382895651708831e-05, "loss": 5.3126, "step": 12200 }, { "epoch": 0.9769563130100816, "grad_norm": 3.2818548679351807, "learning_rate": 3.381558538803017e-05, "loss": 5.3421, "step": 12210 }, { "epoch": 0.9777564410305649, "grad_norm": 7.580129623413086, "learning_rate": 3.3802214258972034e-05, "loss": 5.6585, "step": 12220 }, { "epoch": 0.9785565690510482, "grad_norm": 3.0450634956359863, "learning_rate": 3.37888431299139e-05, "loss": 5.4403, "step": 12230 }, { "epoch": 0.9793566970715314, "grad_norm": 2.5230050086975098, "learning_rate": 3.377547200085575e-05, "loss": 5.5331, "step": 12240 }, { "epoch": 0.9801568250920147, "grad_norm": 3.398266315460205, "learning_rate": 3.3762100871797616e-05, "loss": 5.3996, "step": 12250 }, { "epoch": 0.980956953112498, "grad_norm": 2.2126028537750244, "learning_rate": 3.374872974273948e-05, "loss": 5.4175, "step": 12260 }, { "epoch": 0.9817570811329813, "grad_norm": 3.0015792846679688, "learning_rate": 3.373535861368134e-05, "loss": 5.3961, "step": 12270 }, { "epoch": 0.9825572091534646, "grad_norm": 2.5461559295654297, "learning_rate": 
3.3721987484623204e-05, "loss": 5.6026, "step": 12280 }, { "epoch": 0.9833573371739478, "grad_norm": 2.498425245285034, "learning_rate": 3.3708616355565066e-05, "loss": 5.3524, "step": 12290 }, { "epoch": 0.9841574651944311, "grad_norm": 2.9614803791046143, "learning_rate": 3.369524522650693e-05, "loss": 5.5101, "step": 12300 }, { "epoch": 0.9849575932149144, "grad_norm": 2.7508606910705566, "learning_rate": 3.368187409744879e-05, "loss": 5.3776, "step": 12310 }, { "epoch": 0.9857577212353976, "grad_norm": 2.0286755561828613, "learning_rate": 3.3668502968390654e-05, "loss": 5.4913, "step": 12320 }, { "epoch": 0.986557849255881, "grad_norm": 3.728842258453369, "learning_rate": 3.365513183933252e-05, "loss": 5.4477, "step": 12330 }, { "epoch": 0.9873579772763642, "grad_norm": 3.3132193088531494, "learning_rate": 3.364176071027438e-05, "loss": 5.2361, "step": 12340 }, { "epoch": 0.9881581052968474, "grad_norm": 2.515298843383789, "learning_rate": 3.362838958121624e-05, "loss": 5.4632, "step": 12350 }, { "epoch": 0.9889582333173308, "grad_norm": 2.0937442779541016, "learning_rate": 3.3615018452158105e-05, "loss": 5.5075, "step": 12360 }, { "epoch": 0.989758361337814, "grad_norm": 3.3019323348999023, "learning_rate": 3.360164732309997e-05, "loss": 5.4566, "step": 12370 }, { "epoch": 0.9905584893582974, "grad_norm": 3.502408266067505, "learning_rate": 3.358827619404183e-05, "loss": 5.464, "step": 12380 }, { "epoch": 0.9913586173787806, "grad_norm": 2.3667659759521484, "learning_rate": 3.357490506498369e-05, "loss": 5.5423, "step": 12390 }, { "epoch": 0.9921587453992639, "grad_norm": 2.15498423576355, "learning_rate": 3.356153393592555e-05, "loss": 5.4031, "step": 12400 }, { "epoch": 0.9929588734197472, "grad_norm": 2.733090877532959, "learning_rate": 3.354816280686741e-05, "loss": 5.4771, "step": 12410 }, { "epoch": 0.9937590014402304, "grad_norm": 2.595238208770752, "learning_rate": 3.3534791677809274e-05, "loss": 5.4538, "step": 12420 }, { "epoch": 0.9945591294607137, 
"grad_norm": 2.3755598068237305, "learning_rate": 3.352142054875114e-05, "loss": 5.432, "step": 12430 }, { "epoch": 0.995359257481197, "grad_norm": 2.2179529666900635, "learning_rate": 3.3508049419693e-05, "loss": 5.4359, "step": 12440 }, { "epoch": 0.9961593855016803, "grad_norm": 2.264469623565674, "learning_rate": 3.349467829063486e-05, "loss": 5.4514, "step": 12450 }, { "epoch": 0.9969595135221635, "grad_norm": 2.9361791610717773, "learning_rate": 3.3481307161576725e-05, "loss": 5.4411, "step": 12460 }, { "epoch": 0.9977596415426468, "grad_norm": 2.6548573970794678, "learning_rate": 3.346793603251859e-05, "loss": 5.4368, "step": 12470 }, { "epoch": 0.9985597695631301, "grad_norm": 3.5749149322509766, "learning_rate": 3.345456490346045e-05, "loss": 5.6314, "step": 12480 }, { "epoch": 0.9993598975836134, "grad_norm": 2.848527193069458, "learning_rate": 3.344119377440231e-05, "loss": 5.3849, "step": 12490 }, { "epoch": 1.0001600256040966, "grad_norm": 2.036498546600342, "learning_rate": 3.3427822645344176e-05, "loss": 5.5973, "step": 12500 }, { "epoch": 1.00096015362458, "grad_norm": 3.499455451965332, "learning_rate": 3.341445151628604e-05, "loss": 5.1882, "step": 12510 }, { "epoch": 1.0017602816450633, "grad_norm": 2.4391655921936035, "learning_rate": 3.34010803872279e-05, "loss": 5.0281, "step": 12520 }, { "epoch": 1.0025604096655465, "grad_norm": 2.522850513458252, "learning_rate": 3.3387709258169764e-05, "loss": 5.1038, "step": 12530 }, { "epoch": 1.0033605376860297, "grad_norm": 2.631127119064331, "learning_rate": 3.337433812911163e-05, "loss": 4.9671, "step": 12540 }, { "epoch": 1.004160665706513, "grad_norm": 2.9861068725585938, "learning_rate": 3.336096700005348e-05, "loss": 5.2225, "step": 12550 }, { "epoch": 1.0049607937269964, "grad_norm": 2.59002423286438, "learning_rate": 3.3347595870995345e-05, "loss": 5.142, "step": 12560 }, { "epoch": 1.0057609217474797, "grad_norm": 2.830385208129883, "learning_rate": 3.333422474193721e-05, "loss": 5.0919, 
"step": 12570 }, { "epoch": 1.006561049767963, "grad_norm": 2.6355655193328857, "learning_rate": 3.332085361287907e-05, "loss": 5.0604, "step": 12580 }, { "epoch": 1.0073611777884461, "grad_norm": 2.8990426063537598, "learning_rate": 3.3307482483820933e-05, "loss": 5.0488, "step": 12590 }, { "epoch": 1.0081613058089294, "grad_norm": 2.657283067703247, "learning_rate": 3.3294111354762796e-05, "loss": 5.157, "step": 12600 }, { "epoch": 1.0089614338294126, "grad_norm": 3.652735710144043, "learning_rate": 3.328074022570466e-05, "loss": 5.1629, "step": 12610 }, { "epoch": 1.009761561849896, "grad_norm": 2.9064295291900635, "learning_rate": 3.326736909664652e-05, "loss": 5.1757, "step": 12620 }, { "epoch": 1.0105616898703793, "grad_norm": 3.015488386154175, "learning_rate": 3.3253997967588384e-05, "loss": 5.2311, "step": 12630 }, { "epoch": 1.0113618178908625, "grad_norm": 9.49726390838623, "learning_rate": 3.324062683853025e-05, "loss": 5.1402, "step": 12640 }, { "epoch": 1.0121619459113458, "grad_norm": 6.71565055847168, "learning_rate": 3.322725570947211e-05, "loss": 4.7297, "step": 12650 }, { "epoch": 1.012962073931829, "grad_norm": 4.39326286315918, "learning_rate": 3.321388458041397e-05, "loss": 5.1663, "step": 12660 }, { "epoch": 1.0137622019523125, "grad_norm": 2.8973264694213867, "learning_rate": 3.3200513451355835e-05, "loss": 5.0674, "step": 12670 }, { "epoch": 1.0145623299727957, "grad_norm": 3.1058743000030518, "learning_rate": 3.31871423222977e-05, "loss": 4.9689, "step": 12680 }, { "epoch": 1.015362457993279, "grad_norm": 2.688951253890991, "learning_rate": 3.317377119323956e-05, "loss": 5.0916, "step": 12690 }, { "epoch": 1.0161625860137622, "grad_norm": 2.9495773315429688, "learning_rate": 3.3160400064181416e-05, "loss": 5.0939, "step": 12700 }, { "epoch": 1.0169627140342454, "grad_norm": 2.5915777683258057, "learning_rate": 3.314702893512328e-05, "loss": 5.134, "step": 12710 }, { "epoch": 1.0177628420547287, "grad_norm": 2.703012228012085, 
"learning_rate": 3.313365780606514e-05, "loss": 5.1285, "step": 12720 }, { "epoch": 1.0185629700752121, "grad_norm": 3.0492970943450928, "learning_rate": 3.3120286677007004e-05, "loss": 5.1477, "step": 12730 }, { "epoch": 1.0193630980956954, "grad_norm": 2.756546974182129, "learning_rate": 3.310691554794887e-05, "loss": 5.0668, "step": 12740 }, { "epoch": 1.0201632261161786, "grad_norm": 4.764959335327148, "learning_rate": 3.309354441889073e-05, "loss": 5.1243, "step": 12750 }, { "epoch": 1.0209633541366618, "grad_norm": 5.539842128753662, "learning_rate": 3.308017328983259e-05, "loss": 5.0519, "step": 12760 }, { "epoch": 1.021763482157145, "grad_norm": 3.8945937156677246, "learning_rate": 3.3066802160774455e-05, "loss": 5.1758, "step": 12770 }, { "epoch": 1.0225636101776283, "grad_norm": 2.5580265522003174, "learning_rate": 3.305343103171632e-05, "loss": 5.0893, "step": 12780 }, { "epoch": 1.0233637381981118, "grad_norm": 2.8203110694885254, "learning_rate": 3.304005990265818e-05, "loss": 5.2472, "step": 12790 }, { "epoch": 1.024163866218595, "grad_norm": 3.5090975761413574, "learning_rate": 3.302668877360004e-05, "loss": 5.0594, "step": 12800 }, { "epoch": 1.0249639942390782, "grad_norm": 2.915062189102173, "learning_rate": 3.3013317644541906e-05, "loss": 5.0673, "step": 12810 }, { "epoch": 1.0257641222595615, "grad_norm": 2.648737668991089, "learning_rate": 3.299994651548377e-05, "loss": 4.8937, "step": 12820 }, { "epoch": 1.0265642502800447, "grad_norm": 3.2576730251312256, "learning_rate": 3.298657538642563e-05, "loss": 5.1564, "step": 12830 }, { "epoch": 1.0273643783005282, "grad_norm": 5.624968528747559, "learning_rate": 3.2973204257367494e-05, "loss": 5.3011, "step": 12840 }, { "epoch": 1.0281645063210114, "grad_norm": 2.492978811264038, "learning_rate": 3.2959833128309357e-05, "loss": 5.0935, "step": 12850 }, { "epoch": 1.0289646343414947, "grad_norm": 2.4655046463012695, "learning_rate": 3.294646199925121e-05, "loss": 5.1768, "step": 12860 }, { "epoch": 
1.029764762361978, "grad_norm": 3.4421567916870117, "learning_rate": 3.2933090870193075e-05, "loss": 5.0756, "step": 12870 }, { "epoch": 1.0305648903824611, "grad_norm": 2.6774377822875977, "learning_rate": 3.291971974113494e-05, "loss": 5.036, "step": 12880 }, { "epoch": 1.0313650184029444, "grad_norm": 2.665099859237671, "learning_rate": 3.29063486120768e-05, "loss": 5.1284, "step": 12890 }, { "epoch": 1.0321651464234278, "grad_norm": 3.7092061042785645, "learning_rate": 3.289297748301866e-05, "loss": 4.8892, "step": 12900 }, { "epoch": 1.032965274443911, "grad_norm": 2.875427484512329, "learning_rate": 3.2879606353960526e-05, "loss": 4.928, "step": 12910 }, { "epoch": 1.0337654024643943, "grad_norm": 2.409395694732666, "learning_rate": 3.286623522490239e-05, "loss": 5.0545, "step": 12920 }, { "epoch": 1.0345655304848775, "grad_norm": 3.936565637588501, "learning_rate": 3.285286409584425e-05, "loss": 5.0556, "step": 12930 }, { "epoch": 1.0353656585053608, "grad_norm": 3.52986216545105, "learning_rate": 3.2839492966786114e-05, "loss": 5.0738, "step": 12940 }, { "epoch": 1.0361657865258442, "grad_norm": 3.0732507705688477, "learning_rate": 3.282612183772798e-05, "loss": 5.0852, "step": 12950 }, { "epoch": 1.0369659145463275, "grad_norm": 2.800020217895508, "learning_rate": 3.281275070866984e-05, "loss": 4.9983, "step": 12960 }, { "epoch": 1.0377660425668107, "grad_norm": 2.682191848754883, "learning_rate": 3.27993795796117e-05, "loss": 4.8372, "step": 12970 }, { "epoch": 1.038566170587294, "grad_norm": 5.331565856933594, "learning_rate": 3.2786008450553565e-05, "loss": 5.1444, "step": 12980 }, { "epoch": 1.0393662986077772, "grad_norm": 3.530069589614868, "learning_rate": 3.277263732149543e-05, "loss": 5.1467, "step": 12990 }, { "epoch": 1.0401664266282604, "grad_norm": 2.296837568283081, "learning_rate": 3.275926619243729e-05, "loss": 5.0782, "step": 13000 }, { "epoch": 1.0409665546487439, "grad_norm": 4.3493146896362305, "learning_rate": 3.274589506337915e-05, 
"loss": 5.0574, "step": 13010 }, { "epoch": 1.0417666826692271, "grad_norm": 3.2167856693267822, "learning_rate": 3.2732523934321016e-05, "loss": 5.1219, "step": 13020 }, { "epoch": 1.0425668106897104, "grad_norm": 3.200861692428589, "learning_rate": 3.271915280526288e-05, "loss": 5.0674, "step": 13030 }, { "epoch": 1.0433669387101936, "grad_norm": 2.286841869354248, "learning_rate": 3.270578167620474e-05, "loss": 5.0125, "step": 13040 }, { "epoch": 1.0441670667306768, "grad_norm": 3.6788413524627686, "learning_rate": 3.2692410547146604e-05, "loss": 5.2975, "step": 13050 }, { "epoch": 1.0449671947511603, "grad_norm": 2.77284574508667, "learning_rate": 3.2679039418088466e-05, "loss": 5.0099, "step": 13060 }, { "epoch": 1.0457673227716435, "grad_norm": 4.33493185043335, "learning_rate": 3.266566828903033e-05, "loss": 5.0362, "step": 13070 }, { "epoch": 1.0465674507921268, "grad_norm": 3.2839553356170654, "learning_rate": 3.265229715997219e-05, "loss": 4.9569, "step": 13080 }, { "epoch": 1.04736757881261, "grad_norm": 2.9086809158325195, "learning_rate": 3.2638926030914054e-05, "loss": 5.0341, "step": 13090 }, { "epoch": 1.0481677068330932, "grad_norm": 2.565225124359131, "learning_rate": 3.262555490185592e-05, "loss": 5.0601, "step": 13100 }, { "epoch": 1.0489678348535765, "grad_norm": 2.8457388877868652, "learning_rate": 3.261218377279778e-05, "loss": 4.9952, "step": 13110 }, { "epoch": 1.04976796287406, "grad_norm": 2.5370593070983887, "learning_rate": 3.259881264373964e-05, "loss": 5.1425, "step": 13120 }, { "epoch": 1.0505680908945432, "grad_norm": 2.504817008972168, "learning_rate": 3.2585441514681505e-05, "loss": 4.9605, "step": 13130 }, { "epoch": 1.0513682189150264, "grad_norm": 2.9582226276397705, "learning_rate": 3.257207038562337e-05, "loss": 5.1436, "step": 13140 }, { "epoch": 1.0521683469355096, "grad_norm": 3.7598915100097656, "learning_rate": 3.255869925656523e-05, "loss": 5.0743, "step": 13150 }, { "epoch": 1.0529684749559929, "grad_norm": 
3.2642862796783447, "learning_rate": 3.254532812750709e-05, "loss": 5.139, "step": 13160 }, { "epoch": 1.0537686029764763, "grad_norm": 3.4917502403259277, "learning_rate": 3.253195699844895e-05, "loss": 5.0566, "step": 13170 }, { "epoch": 1.0545687309969596, "grad_norm": 2.9878995418548584, "learning_rate": 3.251858586939081e-05, "loss": 5.2385, "step": 13180 }, { "epoch": 1.0553688590174428, "grad_norm": 2.9996213912963867, "learning_rate": 3.2505214740332674e-05, "loss": 5.1138, "step": 13190 }, { "epoch": 1.056168987037926, "grad_norm": 5.470676422119141, "learning_rate": 3.249184361127454e-05, "loss": 5.0921, "step": 13200 }, { "epoch": 1.0569691150584093, "grad_norm": 2.9724602699279785, "learning_rate": 3.24784724822164e-05, "loss": 4.9315, "step": 13210 }, { "epoch": 1.0577692430788925, "grad_norm": 3.191342353820801, "learning_rate": 3.246510135315826e-05, "loss": 5.1095, "step": 13220 }, { "epoch": 1.058569371099376, "grad_norm": 4.010619163513184, "learning_rate": 3.2451730224100125e-05, "loss": 5.1697, "step": 13230 }, { "epoch": 1.0593694991198592, "grad_norm": 2.828768253326416, "learning_rate": 3.243835909504199e-05, "loss": 5.1114, "step": 13240 }, { "epoch": 1.0601696271403425, "grad_norm": 4.081239223480225, "learning_rate": 3.242498796598385e-05, "loss": 5.0629, "step": 13250 }, { "epoch": 1.0609697551608257, "grad_norm": 3.347407817840576, "learning_rate": 3.241161683692571e-05, "loss": 5.0355, "step": 13260 }, { "epoch": 1.061769883181309, "grad_norm": 2.902289390563965, "learning_rate": 3.2398245707867576e-05, "loss": 5.1561, "step": 13270 }, { "epoch": 1.0625700112017924, "grad_norm": 15.202888488769531, "learning_rate": 3.238487457880944e-05, "loss": 5.2149, "step": 13280 }, { "epoch": 1.0633701392222756, "grad_norm": 3.353285551071167, "learning_rate": 3.23715034497513e-05, "loss": 4.8566, "step": 13290 }, { "epoch": 1.0641702672427589, "grad_norm": 4.258049011230469, "learning_rate": 3.2358132320693164e-05, "loss": 5.0358, "step": 13300 }, 
{ "epoch": 1.064970395263242, "grad_norm": 2.727367639541626, "learning_rate": 3.234476119163503e-05, "loss": 4.9733, "step": 13310 }, { "epoch": 1.0657705232837253, "grad_norm": 4.626856803894043, "learning_rate": 3.233139006257688e-05, "loss": 5.162, "step": 13320 }, { "epoch": 1.0665706513042086, "grad_norm": 3.074949264526367, "learning_rate": 3.2318018933518745e-05, "loss": 5.1322, "step": 13330 }, { "epoch": 1.067370779324692, "grad_norm": 4.150319576263428, "learning_rate": 3.230464780446061e-05, "loss": 5.0567, "step": 13340 }, { "epoch": 1.0681709073451753, "grad_norm": 5.132182598114014, "learning_rate": 3.229127667540247e-05, "loss": 5.1743, "step": 13350 }, { "epoch": 1.0689710353656585, "grad_norm": 4.4582839012146, "learning_rate": 3.2277905546344333e-05, "loss": 5.2236, "step": 13360 }, { "epoch": 1.0697711633861418, "grad_norm": 2.9640562534332275, "learning_rate": 3.2264534417286196e-05, "loss": 5.0974, "step": 13370 }, { "epoch": 1.070571291406625, "grad_norm": 2.8978335857391357, "learning_rate": 3.225116328822806e-05, "loss": 5.1591, "step": 13380 }, { "epoch": 1.0713714194271082, "grad_norm": 2.773488759994507, "learning_rate": 3.223779215916992e-05, "loss": 5.0814, "step": 13390 }, { "epoch": 1.0721715474475917, "grad_norm": 2.719374656677246, "learning_rate": 3.2224421030111784e-05, "loss": 5.0352, "step": 13400 }, { "epoch": 1.072971675468075, "grad_norm": 2.918991804122925, "learning_rate": 3.221104990105365e-05, "loss": 5.0955, "step": 13410 }, { "epoch": 1.0737718034885582, "grad_norm": 3.3438122272491455, "learning_rate": 3.219767877199551e-05, "loss": 5.0205, "step": 13420 }, { "epoch": 1.0745719315090414, "grad_norm": 2.915687322616577, "learning_rate": 3.218430764293737e-05, "loss": 5.0708, "step": 13430 }, { "epoch": 1.0753720595295246, "grad_norm": 2.3897652626037598, "learning_rate": 3.2170936513879235e-05, "loss": 5.0898, "step": 13440 }, { "epoch": 1.076172187550008, "grad_norm": 2.5261075496673584, "learning_rate": 
3.21575653848211e-05, "loss": 5.0002, "step": 13450 }, { "epoch": 1.0769723155704913, "grad_norm": 4.839473247528076, "learning_rate": 3.214419425576296e-05, "loss": 5.1853, "step": 13460 }, { "epoch": 1.0777724435909746, "grad_norm": 2.396831512451172, "learning_rate": 3.213082312670482e-05, "loss": 5.0397, "step": 13470 }, { "epoch": 1.0785725716114578, "grad_norm": 4.165911674499512, "learning_rate": 3.211745199764668e-05, "loss": 5.2065, "step": 13480 }, { "epoch": 1.079372699631941, "grad_norm": 2.74873423576355, "learning_rate": 3.210408086858854e-05, "loss": 5.2217, "step": 13490 }, { "epoch": 1.0801728276524245, "grad_norm": 3.480703353881836, "learning_rate": 3.2090709739530404e-05, "loss": 4.9929, "step": 13500 }, { "epoch": 1.0809729556729077, "grad_norm": 3.747199773788452, "learning_rate": 3.207733861047227e-05, "loss": 5.1235, "step": 13510 }, { "epoch": 1.081773083693391, "grad_norm": 3.634990692138672, "learning_rate": 3.206396748141413e-05, "loss": 4.9466, "step": 13520 }, { "epoch": 1.0825732117138742, "grad_norm": 3.6419565677642822, "learning_rate": 3.205059635235599e-05, "loss": 5.1791, "step": 13530 }, { "epoch": 1.0833733397343575, "grad_norm": 3.413770914077759, "learning_rate": 3.2037225223297855e-05, "loss": 5.1777, "step": 13540 }, { "epoch": 1.0841734677548407, "grad_norm": 5.771011829376221, "learning_rate": 3.202385409423972e-05, "loss": 5.0543, "step": 13550 }, { "epoch": 1.0849735957753242, "grad_norm": 2.9491965770721436, "learning_rate": 3.201048296518158e-05, "loss": 4.9719, "step": 13560 }, { "epoch": 1.0857737237958074, "grad_norm": 3.3095767498016357, "learning_rate": 3.199711183612344e-05, "loss": 5.2155, "step": 13570 }, { "epoch": 1.0865738518162906, "grad_norm": 4.941197395324707, "learning_rate": 3.1983740707065306e-05, "loss": 5.073, "step": 13580 }, { "epoch": 1.0873739798367739, "grad_norm": 2.3605270385742188, "learning_rate": 3.197036957800717e-05, "loss": 5.1746, "step": 13590 }, { "epoch": 1.088174107857257, 
"grad_norm": 2.9810526371002197, "learning_rate": 3.195699844894903e-05, "loss": 5.157, "step": 13600 }, { "epoch": 1.0889742358777403, "grad_norm": 2.767223358154297, "learning_rate": 3.1943627319890894e-05, "loss": 5.0831, "step": 13610 }, { "epoch": 1.0897743638982238, "grad_norm": 6.959831714630127, "learning_rate": 3.193025619083276e-05, "loss": 4.883, "step": 13620 }, { "epoch": 1.090574491918707, "grad_norm": 6.120983123779297, "learning_rate": 3.191688506177461e-05, "loss": 5.0368, "step": 13630 }, { "epoch": 1.0913746199391903, "grad_norm": 2.680748462677002, "learning_rate": 3.1903513932716475e-05, "loss": 5.1996, "step": 13640 }, { "epoch": 1.0921747479596735, "grad_norm": 4.287043571472168, "learning_rate": 3.189014280365834e-05, "loss": 4.9824, "step": 13650 }, { "epoch": 1.0929748759801567, "grad_norm": 2.647005319595337, "learning_rate": 3.18767716746002e-05, "loss": 4.9845, "step": 13660 }, { "epoch": 1.0937750040006402, "grad_norm": 2.9568288326263428, "learning_rate": 3.186340054554206e-05, "loss": 5.0804, "step": 13670 }, { "epoch": 1.0945751320211234, "grad_norm": 4.118317127227783, "learning_rate": 3.1850029416483926e-05, "loss": 5.0375, "step": 13680 }, { "epoch": 1.0953752600416067, "grad_norm": 3.7457168102264404, "learning_rate": 3.183665828742579e-05, "loss": 5.0193, "step": 13690 }, { "epoch": 1.09617538806209, "grad_norm": 2.829274892807007, "learning_rate": 3.182328715836765e-05, "loss": 5.1896, "step": 13700 }, { "epoch": 1.0969755160825732, "grad_norm": 3.568166971206665, "learning_rate": 3.1809916029309514e-05, "loss": 5.0527, "step": 13710 }, { "epoch": 1.0977756441030564, "grad_norm": 2.8555142879486084, "learning_rate": 3.179654490025138e-05, "loss": 5.0873, "step": 13720 }, { "epoch": 1.0985757721235399, "grad_norm": 2.9258460998535156, "learning_rate": 3.178317377119324e-05, "loss": 4.9293, "step": 13730 }, { "epoch": 1.099375900144023, "grad_norm": 3.3614535331726074, "learning_rate": 3.17698026421351e-05, "loss": 4.992, 
"step": 13740 }, { "epoch": 1.1001760281645063, "grad_norm": 3.859238624572754, "learning_rate": 3.1756431513076965e-05, "loss": 4.9695, "step": 13750 }, { "epoch": 1.1009761561849896, "grad_norm": 2.9869918823242188, "learning_rate": 3.174306038401883e-05, "loss": 5.0833, "step": 13760 }, { "epoch": 1.1017762842054728, "grad_norm": 2.874736785888672, "learning_rate": 3.172968925496069e-05, "loss": 5.0329, "step": 13770 }, { "epoch": 1.102576412225956, "grad_norm": 3.2926857471466064, "learning_rate": 3.171631812590255e-05, "loss": 5.0431, "step": 13780 }, { "epoch": 1.1033765402464395, "grad_norm": 3.0349912643432617, "learning_rate": 3.1702946996844416e-05, "loss": 5.0485, "step": 13790 }, { "epoch": 1.1041766682669227, "grad_norm": 3.0139970779418945, "learning_rate": 3.168957586778628e-05, "loss": 5.0519, "step": 13800 }, { "epoch": 1.104976796287406, "grad_norm": 3.5662894248962402, "learning_rate": 3.167620473872814e-05, "loss": 5.2053, "step": 13810 }, { "epoch": 1.1057769243078892, "grad_norm": 3.348515033721924, "learning_rate": 3.1662833609670004e-05, "loss": 5.0588, "step": 13820 }, { "epoch": 1.1065770523283724, "grad_norm": 2.439892292022705, "learning_rate": 3.1649462480611866e-05, "loss": 5.0894, "step": 13830 }, { "epoch": 1.107377180348856, "grad_norm": 3.85776948928833, "learning_rate": 3.163609135155373e-05, "loss": 5.0345, "step": 13840 }, { "epoch": 1.1081773083693391, "grad_norm": 2.6576287746429443, "learning_rate": 3.162272022249559e-05, "loss": 5.0607, "step": 13850 }, { "epoch": 1.1089774363898224, "grad_norm": 2.6049861907958984, "learning_rate": 3.1609349093437454e-05, "loss": 5.0033, "step": 13860 }, { "epoch": 1.1097775644103056, "grad_norm": 2.5496983528137207, "learning_rate": 3.159597796437932e-05, "loss": 5.2102, "step": 13870 }, { "epoch": 1.1105776924307889, "grad_norm": 4.300173282623291, "learning_rate": 3.158260683532118e-05, "loss": 5.1137, "step": 13880 }, { "epoch": 1.1113778204512723, "grad_norm": 2.4413559436798096, 
"learning_rate": 3.156923570626304e-05, "loss": 4.9222, "step": 13890 }, { "epoch": 1.1121779484717556, "grad_norm": 2.4938573837280273, "learning_rate": 3.1555864577204905e-05, "loss": 5.1414, "step": 13900 }, { "epoch": 1.1129780764922388, "grad_norm": 3.333294153213501, "learning_rate": 3.154249344814677e-05, "loss": 5.1243, "step": 13910 }, { "epoch": 1.113778204512722, "grad_norm": 3.8718490600585938, "learning_rate": 3.152912231908863e-05, "loss": 5.2178, "step": 13920 }, { "epoch": 1.1145783325332053, "grad_norm": 4.667349338531494, "learning_rate": 3.151575119003049e-05, "loss": 5.185, "step": 13930 }, { "epoch": 1.1153784605536885, "grad_norm": 3.7269580364227295, "learning_rate": 3.150238006097235e-05, "loss": 4.9231, "step": 13940 }, { "epoch": 1.116178588574172, "grad_norm": 3.8037633895874023, "learning_rate": 3.148900893191421e-05, "loss": 5.0166, "step": 13950 }, { "epoch": 1.1169787165946552, "grad_norm": 3.2636613845825195, "learning_rate": 3.1475637802856075e-05, "loss": 5.0339, "step": 13960 }, { "epoch": 1.1177788446151384, "grad_norm": 4.069303035736084, "learning_rate": 3.146226667379794e-05, "loss": 5.1558, "step": 13970 }, { "epoch": 1.1185789726356217, "grad_norm": 3.160214424133301, "learning_rate": 3.14488955447398e-05, "loss": 5.0048, "step": 13980 }, { "epoch": 1.119379100656105, "grad_norm": 2.7678611278533936, "learning_rate": 3.143552441568166e-05, "loss": 5.0992, "step": 13990 }, { "epoch": 1.1201792286765881, "grad_norm": 3.162316083908081, "learning_rate": 3.1422153286623525e-05, "loss": 5.0398, "step": 14000 }, { "epoch": 1.1201792286765881, "eval_loss": 5.684463977813721, "eval_runtime": 11.9219, "eval_samples_per_second": 3.355, "eval_steps_per_second": 0.419, "step": 14000 }, { "epoch": 1.1209793566970716, "grad_norm": 3.6854958534240723, "learning_rate": 3.140878215756539e-05, "loss": 5.0345, "step": 14010 }, { "epoch": 1.1217794847175548, "grad_norm": 2.577242851257324, "learning_rate": 3.139541102850725e-05, "loss": 4.9944, 
"step": 14020 }, { "epoch": 1.122579612738038, "grad_norm": 2.181784152984619, "learning_rate": 3.138203989944911e-05, "loss": 5.0055, "step": 14030 }, { "epoch": 1.1233797407585213, "grad_norm": 4.235867500305176, "learning_rate": 3.1368668770390976e-05, "loss": 5.2758, "step": 14040 }, { "epoch": 1.1241798687790046, "grad_norm": 2.5550196170806885, "learning_rate": 3.135529764133284e-05, "loss": 5.0281, "step": 14050 }, { "epoch": 1.124979996799488, "grad_norm": 3.513957977294922, "learning_rate": 3.13419265122747e-05, "loss": 5.0031, "step": 14060 }, { "epoch": 1.1257801248199713, "grad_norm": 2.731046676635742, "learning_rate": 3.1328555383216564e-05, "loss": 5.061, "step": 14070 }, { "epoch": 1.1265802528404545, "grad_norm": 2.664210557937622, "learning_rate": 3.131518425415843e-05, "loss": 5.0907, "step": 14080 }, { "epoch": 1.1273803808609377, "grad_norm": 3.6753573417663574, "learning_rate": 3.130181312510029e-05, "loss": 4.9645, "step": 14090 }, { "epoch": 1.128180508881421, "grad_norm": 4.166867733001709, "learning_rate": 3.1288441996042145e-05, "loss": 5.1689, "step": 14100 }, { "epoch": 1.1289806369019044, "grad_norm": 3.4742753505706787, "learning_rate": 3.127507086698401e-05, "loss": 5.0116, "step": 14110 }, { "epoch": 1.1297807649223877, "grad_norm": 2.653500556945801, "learning_rate": 3.126169973792587e-05, "loss": 5.0203, "step": 14120 }, { "epoch": 1.130580892942871, "grad_norm": 4.123785495758057, "learning_rate": 3.1248328608867733e-05, "loss": 4.8747, "step": 14130 }, { "epoch": 1.1313810209633541, "grad_norm": 2.67256498336792, "learning_rate": 3.1234957479809596e-05, "loss": 5.0739, "step": 14140 }, { "epoch": 1.1321811489838374, "grad_norm": 3.181354284286499, "learning_rate": 3.122158635075146e-05, "loss": 5.0416, "step": 14150 }, { "epoch": 1.1329812770043206, "grad_norm": 3.475081443786621, "learning_rate": 3.120821522169332e-05, "loss": 5.126, "step": 14160 }, { "epoch": 1.1337814050248038, "grad_norm": 3.9857749938964844, 
"learning_rate": 3.1194844092635184e-05, "loss": 5.088, "step": 14170 }, { "epoch": 1.1345815330452873, "grad_norm": 2.8515474796295166, "learning_rate": 3.118147296357705e-05, "loss": 5.0137, "step": 14180 }, { "epoch": 1.1353816610657705, "grad_norm": 3.1561362743377686, "learning_rate": 3.116810183451891e-05, "loss": 5.1099, "step": 14190 }, { "epoch": 1.1361817890862538, "grad_norm": 3.4241456985473633, "learning_rate": 3.115473070546077e-05, "loss": 5.2076, "step": 14200 }, { "epoch": 1.136981917106737, "grad_norm": 5.489968776702881, "learning_rate": 3.1141359576402635e-05, "loss": 5.0493, "step": 14210 }, { "epoch": 1.1377820451272203, "grad_norm": 4.867628574371338, "learning_rate": 3.11279884473445e-05, "loss": 5.0169, "step": 14220 }, { "epoch": 1.1385821731477037, "grad_norm": 4.413990497589111, "learning_rate": 3.111461731828636e-05, "loss": 5.227, "step": 14230 }, { "epoch": 1.139382301168187, "grad_norm": 2.8556582927703857, "learning_rate": 3.110124618922822e-05, "loss": 5.0318, "step": 14240 }, { "epoch": 1.1401824291886702, "grad_norm": 3.325246572494507, "learning_rate": 3.108787506017008e-05, "loss": 5.1337, "step": 14250 }, { "epoch": 1.1409825572091534, "grad_norm": 2.323495864868164, "learning_rate": 3.107450393111194e-05, "loss": 5.05, "step": 14260 }, { "epoch": 1.1417826852296367, "grad_norm": 3.933109998703003, "learning_rate": 3.1061132802053804e-05, "loss": 5.0799, "step": 14270 }, { "epoch": 1.1425828132501201, "grad_norm": 4.54990816116333, "learning_rate": 3.104776167299567e-05, "loss": 5.0432, "step": 14280 }, { "epoch": 1.1433829412706034, "grad_norm": 3.6891863346099854, "learning_rate": 3.103439054393753e-05, "loss": 5.281, "step": 14290 }, { "epoch": 1.1441830692910866, "grad_norm": 3.3995590209960938, "learning_rate": 3.102101941487939e-05, "loss": 5.0141, "step": 14300 }, { "epoch": 1.1449831973115698, "grad_norm": 3.7183680534362793, "learning_rate": 3.1007648285821255e-05, "loss": 5.1297, "step": 14310 }, { "epoch": 
1.145783325332053, "grad_norm": 4.312760353088379, "learning_rate": 3.099427715676312e-05, "loss": 5.1593, "step": 14320 }, { "epoch": 1.1465834533525365, "grad_norm": 3.545175552368164, "learning_rate": 3.098090602770498e-05, "loss": 5.0797, "step": 14330 }, { "epoch": 1.1473835813730198, "grad_norm": 2.8631224632263184, "learning_rate": 3.096753489864684e-05, "loss": 4.8917, "step": 14340 }, { "epoch": 1.148183709393503, "grad_norm": 4.92164945602417, "learning_rate": 3.0954163769588706e-05, "loss": 5.1036, "step": 14350 }, { "epoch": 1.1489838374139862, "grad_norm": 3.5669424533843994, "learning_rate": 3.094079264053057e-05, "loss": 5.0415, "step": 14360 }, { "epoch": 1.1497839654344695, "grad_norm": 2.990910291671753, "learning_rate": 3.092742151147243e-05, "loss": 5.031, "step": 14370 }, { "epoch": 1.1505840934549527, "grad_norm": 3.9870245456695557, "learning_rate": 3.0914050382414294e-05, "loss": 5.1785, "step": 14380 }, { "epoch": 1.151384221475436, "grad_norm": 2.9064061641693115, "learning_rate": 3.090067925335616e-05, "loss": 4.8864, "step": 14390 }, { "epoch": 1.1521843494959194, "grad_norm": 2.7341842651367188, "learning_rate": 3.088730812429801e-05, "loss": 5.0189, "step": 14400 }, { "epoch": 1.1529844775164027, "grad_norm": 3.3455512523651123, "learning_rate": 3.0873936995239875e-05, "loss": 4.998, "step": 14410 }, { "epoch": 1.1537846055368859, "grad_norm": 3.155400276184082, "learning_rate": 3.086056586618174e-05, "loss": 4.9555, "step": 14420 }, { "epoch": 1.1545847335573691, "grad_norm": 3.9464547634124756, "learning_rate": 3.08471947371236e-05, "loss": 4.9177, "step": 14430 }, { "epoch": 1.1553848615778524, "grad_norm": 3.159940242767334, "learning_rate": 3.083382360806546e-05, "loss": 5.113, "step": 14440 }, { "epoch": 1.1561849895983358, "grad_norm": 5.134779930114746, "learning_rate": 3.0820452479007326e-05, "loss": 5.054, "step": 14450 }, { "epoch": 1.156985117618819, "grad_norm": 3.6196463108062744, "learning_rate": 3.080708134994919e-05, 
"loss": 5.0461, "step": 14460 }, { "epoch": 1.1577852456393023, "grad_norm": 3.5549261569976807, "learning_rate": 3.079371022089105e-05, "loss": 5.0545, "step": 14470 }, { "epoch": 1.1585853736597855, "grad_norm": 2.656663656234741, "learning_rate": 3.0780339091832914e-05, "loss": 5.0538, "step": 14480 }, { "epoch": 1.1593855016802688, "grad_norm": 3.927999973297119, "learning_rate": 3.076696796277478e-05, "loss": 4.95, "step": 14490 }, { "epoch": 1.1601856297007522, "grad_norm": 3.3732807636260986, "learning_rate": 3.075359683371664e-05, "loss": 5.1581, "step": 14500 }, { "epoch": 1.1609857577212355, "grad_norm": 3.2441952228546143, "learning_rate": 3.07402257046585e-05, "loss": 4.9357, "step": 14510 }, { "epoch": 1.1617858857417187, "grad_norm": 3.2283682823181152, "learning_rate": 3.0726854575600365e-05, "loss": 4.9143, "step": 14520 }, { "epoch": 1.162586013762202, "grad_norm": 4.4284234046936035, "learning_rate": 3.071348344654223e-05, "loss": 5.0004, "step": 14530 }, { "epoch": 1.1633861417826852, "grad_norm": 2.5761866569519043, "learning_rate": 3.070011231748409e-05, "loss": 5.08, "step": 14540 }, { "epoch": 1.1641862698031684, "grad_norm": 3.1167516708374023, "learning_rate": 3.068674118842595e-05, "loss": 5.0459, "step": 14550 }, { "epoch": 1.1649863978236519, "grad_norm": 4.732173442840576, "learning_rate": 3.067337005936781e-05, "loss": 5.092, "step": 14560 }, { "epoch": 1.1657865258441351, "grad_norm": 3.620969772338867, "learning_rate": 3.065999893030967e-05, "loss": 5.0032, "step": 14570 }, { "epoch": 1.1665866538646184, "grad_norm": 4.062996864318848, "learning_rate": 3.0646627801251534e-05, "loss": 5.1082, "step": 14580 }, { "epoch": 1.1673867818851016, "grad_norm": 4.451529502868652, "learning_rate": 3.06332566721934e-05, "loss": 4.7973, "step": 14590 }, { "epoch": 1.1681869099055848, "grad_norm": 2.944892406463623, "learning_rate": 3.061988554313526e-05, "loss": 5.1265, "step": 14600 }, { "epoch": 1.168987037926068, "grad_norm": 
2.3681206703186035, "learning_rate": 3.060651441407712e-05, "loss": 5.036, "step": 14610 }, { "epoch": 1.1697871659465515, "grad_norm": 3.83963680267334, "learning_rate": 3.0593143285018985e-05, "loss": 5.0027, "step": 14620 }, { "epoch": 1.1705872939670348, "grad_norm": 2.4702982902526855, "learning_rate": 3.057977215596085e-05, "loss": 5.1936, "step": 14630 }, { "epoch": 1.171387421987518, "grad_norm": 3.8022842407226562, "learning_rate": 3.056640102690271e-05, "loss": 4.9908, "step": 14640 }, { "epoch": 1.1721875500080012, "grad_norm": 7.476479530334473, "learning_rate": 3.055302989784457e-05, "loss": 4.9345, "step": 14650 }, { "epoch": 1.1729876780284845, "grad_norm": 4.326262474060059, "learning_rate": 3.0539658768786436e-05, "loss": 5.0183, "step": 14660 }, { "epoch": 1.173787806048968, "grad_norm": 3.567706346511841, "learning_rate": 3.05262876397283e-05, "loss": 4.9711, "step": 14670 }, { "epoch": 1.1745879340694512, "grad_norm": 5.6367011070251465, "learning_rate": 3.0512916510670165e-05, "loss": 5.2, "step": 14680 }, { "epoch": 1.1753880620899344, "grad_norm": 3.228248357772827, "learning_rate": 3.0499545381612027e-05, "loss": 5.0736, "step": 14690 }, { "epoch": 1.1761881901104176, "grad_norm": 3.1230525970458984, "learning_rate": 3.048617425255389e-05, "loss": 5.046, "step": 14700 }, { "epoch": 1.1769883181309009, "grad_norm": 3.1620965003967285, "learning_rate": 3.0472803123495746e-05, "loss": 5.2402, "step": 14710 }, { "epoch": 1.1777884461513843, "grad_norm": 2.9617507457733154, "learning_rate": 3.045943199443761e-05, "loss": 4.9144, "step": 14720 }, { "epoch": 1.1785885741718676, "grad_norm": 4.437487602233887, "learning_rate": 3.044606086537947e-05, "loss": 5.1531, "step": 14730 }, { "epoch": 1.1793887021923508, "grad_norm": 4.128335952758789, "learning_rate": 3.0432689736321334e-05, "loss": 5.152, "step": 14740 }, { "epoch": 1.180188830212834, "grad_norm": 3.35322642326355, "learning_rate": 3.0419318607263197e-05, "loss": 5.1341, "step": 14750 }, { 
"epoch": 1.1809889582333173, "grad_norm": 3.4951529502868652, "learning_rate": 3.040594747820506e-05, "loss": 5.1443, "step": 14760 }, { "epoch": 1.1817890862538005, "grad_norm": 3.2248058319091797, "learning_rate": 3.0392576349146922e-05, "loss": 5.221, "step": 14770 }, { "epoch": 1.1825892142742838, "grad_norm": 4.069246292114258, "learning_rate": 3.0379205220088785e-05, "loss": 5.1644, "step": 14780 }, { "epoch": 1.1833893422947672, "grad_norm": 3.8001856803894043, "learning_rate": 3.0365834091030647e-05, "loss": 5.1899, "step": 14790 }, { "epoch": 1.1841894703152505, "grad_norm": 2.359663248062134, "learning_rate": 3.035246296197251e-05, "loss": 5.1497, "step": 14800 }, { "epoch": 1.1849895983357337, "grad_norm": 2.8918564319610596, "learning_rate": 3.0339091832914373e-05, "loss": 5.0774, "step": 14810 }, { "epoch": 1.185789726356217, "grad_norm": 2.6341822147369385, "learning_rate": 3.0325720703856235e-05, "loss": 5.0062, "step": 14820 }, { "epoch": 1.1865898543767002, "grad_norm": 2.6374053955078125, "learning_rate": 3.0312349574798098e-05, "loss": 5.0424, "step": 14830 }, { "epoch": 1.1873899823971836, "grad_norm": 3.653303623199463, "learning_rate": 3.029897844573996e-05, "loss": 5.1633, "step": 14840 }, { "epoch": 1.1881901104176669, "grad_norm": 3.1818668842315674, "learning_rate": 3.0286944429587637e-05, "loss": 5.0724, "step": 14850 }, { "epoch": 1.18899023843815, "grad_norm": 3.803523540496826, "learning_rate": 3.02735733005295e-05, "loss": 5.0358, "step": 14860 }, { "epoch": 1.1897903664586333, "grad_norm": 2.8528194427490234, "learning_rate": 3.026020217147136e-05, "loss": 5.2583, "step": 14870 }, { "epoch": 1.1905904944791166, "grad_norm": 3.8696224689483643, "learning_rate": 3.024683104241322e-05, "loss": 5.1876, "step": 14880 }, { "epoch": 1.1913906224996, "grad_norm": 2.9173583984375, "learning_rate": 3.0233459913355084e-05, "loss": 5.0444, "step": 14890 }, { "epoch": 1.1921907505200833, "grad_norm": 3.349648952484131, "learning_rate": 
3.0220088784296947e-05, "loss": 5.1046, "step": 14900 }, { "epoch": 1.1929908785405665, "grad_norm": 3.1171865463256836, "learning_rate": 3.020671765523881e-05, "loss": 5.1088, "step": 14910 }, { "epoch": 1.1937910065610498, "grad_norm": 3.1871280670166016, "learning_rate": 3.0193346526180672e-05, "loss": 4.9422, "step": 14920 }, { "epoch": 1.194591134581533, "grad_norm": 5.893035888671875, "learning_rate": 3.0179975397122535e-05, "loss": 5.1122, "step": 14930 }, { "epoch": 1.1953912626020164, "grad_norm": 2.8081605434417725, "learning_rate": 3.0166604268064397e-05, "loss": 5.0883, "step": 14940 }, { "epoch": 1.1961913906224997, "grad_norm": 3.0510361194610596, "learning_rate": 3.015323313900626e-05, "loss": 5.1283, "step": 14950 }, { "epoch": 1.196991518642983, "grad_norm": 4.671666622161865, "learning_rate": 3.0139862009948123e-05, "loss": 5.0935, "step": 14960 }, { "epoch": 1.1977916466634662, "grad_norm": 4.178012847900391, "learning_rate": 3.0126490880889985e-05, "loss": 5.0072, "step": 14970 }, { "epoch": 1.1985917746839494, "grad_norm": 3.84997820854187, "learning_rate": 3.0113119751831848e-05, "loss": 5.01, "step": 14980 }, { "epoch": 1.1993919027044326, "grad_norm": 3.262110710144043, "learning_rate": 3.009974862277371e-05, "loss": 5.1383, "step": 14990 }, { "epoch": 1.2001920307249159, "grad_norm": 2.7314374446868896, "learning_rate": 3.0086377493715573e-05, "loss": 5.0762, "step": 15000 }, { "epoch": 1.2009921587453993, "grad_norm": 2.8299612998962402, "learning_rate": 3.0073006364657436e-05, "loss": 5.0924, "step": 15010 }, { "epoch": 1.2017922867658826, "grad_norm": 3.1154122352600098, "learning_rate": 3.0059635235599292e-05, "loss": 5.1659, "step": 15020 }, { "epoch": 1.2025924147863658, "grad_norm": 3.96730375289917, "learning_rate": 3.0046264106541155e-05, "loss": 5.0078, "step": 15030 }, { "epoch": 1.203392542806849, "grad_norm": 3.298128604888916, "learning_rate": 3.0032892977483017e-05, "loss": 4.9144, "step": 15040 }, { "epoch": 
1.2041926708273323, "grad_norm": 3.2130002975463867, "learning_rate": 3.001952184842488e-05, "loss": 5.0833, "step": 15050 }, { "epoch": 1.2049927988478157, "grad_norm": 3.294297456741333, "learning_rate": 3.0006150719366743e-05, "loss": 5.0306, "step": 15060 }, { "epoch": 1.205792926868299, "grad_norm": 4.316399574279785, "learning_rate": 2.9992779590308606e-05, "loss": 5.1813, "step": 15070 }, { "epoch": 1.2065930548887822, "grad_norm": 2.5436484813690186, "learning_rate": 2.9979408461250468e-05, "loss": 5.0249, "step": 15080 }, { "epoch": 1.2073931829092654, "grad_norm": 3.144047737121582, "learning_rate": 2.996603733219233e-05, "loss": 5.1768, "step": 15090 }, { "epoch": 1.2081933109297487, "grad_norm": 2.7458813190460205, "learning_rate": 2.9952666203134194e-05, "loss": 5.136, "step": 15100 }, { "epoch": 1.2089934389502321, "grad_norm": 5.8104095458984375, "learning_rate": 2.9939295074076056e-05, "loss": 5.0067, "step": 15110 }, { "epoch": 1.2097935669707154, "grad_norm": 4.127492904663086, "learning_rate": 2.992592394501792e-05, "loss": 5.1984, "step": 15120 }, { "epoch": 1.2105936949911986, "grad_norm": 2.4212141036987305, "learning_rate": 2.991255281595978e-05, "loss": 5.0106, "step": 15130 }, { "epoch": 1.2113938230116819, "grad_norm": 2.9509193897247314, "learning_rate": 2.9899181686901644e-05, "loss": 5.3269, "step": 15140 }, { "epoch": 1.212193951032165, "grad_norm": 3.008227825164795, "learning_rate": 2.9885810557843507e-05, "loss": 5.2664, "step": 15150 }, { "epoch": 1.2129940790526483, "grad_norm": 4.015718936920166, "learning_rate": 2.987243942878537e-05, "loss": 4.9637, "step": 15160 }, { "epoch": 1.2137942070731318, "grad_norm": 4.144075870513916, "learning_rate": 2.9859068299727232e-05, "loss": 5.2009, "step": 15170 }, { "epoch": 1.214594335093615, "grad_norm": 2.7135703563690186, "learning_rate": 2.9845697170669092e-05, "loss": 5.0274, "step": 15180 }, { "epoch": 1.2153944631140983, "grad_norm": 2.703979015350342, "learning_rate": 
2.9832326041610954e-05, "loss": 4.9492, "step": 15190 }, { "epoch": 1.2161945911345815, "grad_norm": 3.669940948486328, "learning_rate": 2.9818954912552817e-05, "loss": 5.1698, "step": 15200 }, { "epoch": 1.2169947191550647, "grad_norm": 3.5432851314544678, "learning_rate": 2.980558378349468e-05, "loss": 5.0233, "step": 15210 }, { "epoch": 1.217794847175548, "grad_norm": 7.245898723602295, "learning_rate": 2.9792212654436542e-05, "loss": 5.0977, "step": 15220 }, { "epoch": 1.2185949751960314, "grad_norm": 3.1283178329467773, "learning_rate": 2.9778841525378405e-05, "loss": 4.9224, "step": 15230 }, { "epoch": 1.2193951032165147, "grad_norm": 4.550083160400391, "learning_rate": 2.9765470396320268e-05, "loss": 5.1493, "step": 15240 }, { "epoch": 1.220195231236998, "grad_norm": 2.429172992706299, "learning_rate": 2.975209926726213e-05, "loss": 4.8708, "step": 15250 }, { "epoch": 1.2209953592574811, "grad_norm": 3.3362350463867188, "learning_rate": 2.9738728138203993e-05, "loss": 5.1264, "step": 15260 }, { "epoch": 1.2217954872779644, "grad_norm": 3.737426996231079, "learning_rate": 2.9725357009145856e-05, "loss": 5.0933, "step": 15270 }, { "epoch": 1.2225956152984478, "grad_norm": 3.679044723510742, "learning_rate": 2.971198588008772e-05, "loss": 5.1628, "step": 15280 }, { "epoch": 1.223395743318931, "grad_norm": 3.072758436203003, "learning_rate": 2.969861475102958e-05, "loss": 5.0404, "step": 15290 }, { "epoch": 1.2241958713394143, "grad_norm": 2.881199598312378, "learning_rate": 2.9685243621971444e-05, "loss": 5.1415, "step": 15300 }, { "epoch": 1.2249959993598976, "grad_norm": 2.628080129623413, "learning_rate": 2.9671872492913307e-05, "loss": 4.8588, "step": 15310 }, { "epoch": 1.2257961273803808, "grad_norm": 3.700045108795166, "learning_rate": 2.965850136385517e-05, "loss": 4.8126, "step": 15320 }, { "epoch": 1.2265962554008643, "grad_norm": 3.386517286300659, "learning_rate": 2.9645130234797025e-05, "loss": 5.0336, "step": 15330 }, { "epoch": 
1.2273963834213475, "grad_norm": 3.7533726692199707, "learning_rate": 2.9631759105738888e-05, "loss": 4.9332, "step": 15340 }, { "epoch": 1.2281965114418307, "grad_norm": 3.1511549949645996, "learning_rate": 2.961838797668075e-05, "loss": 5.1175, "step": 15350 }, { "epoch": 1.228996639462314, "grad_norm": 2.8970258235931396, "learning_rate": 2.9605016847622613e-05, "loss": 4.9315, "step": 15360 }, { "epoch": 1.2297967674827972, "grad_norm": 4.416659355163574, "learning_rate": 2.9591645718564476e-05, "loss": 4.9091, "step": 15370 }, { "epoch": 1.2305968955032804, "grad_norm": 3.8760738372802734, "learning_rate": 2.957827458950634e-05, "loss": 5.0697, "step": 15380 }, { "epoch": 1.2313970235237637, "grad_norm": 2.789149761199951, "learning_rate": 2.95649034604482e-05, "loss": 5.129, "step": 15390 }, { "epoch": 1.2321971515442471, "grad_norm": 2.7208750247955322, "learning_rate": 2.9551532331390064e-05, "loss": 5.1578, "step": 15400 }, { "epoch": 1.2329972795647304, "grad_norm": 3.365661144256592, "learning_rate": 2.9538161202331927e-05, "loss": 5.0808, "step": 15410 }, { "epoch": 1.2337974075852136, "grad_norm": 3.0695741176605225, "learning_rate": 2.952479007327379e-05, "loss": 4.7112, "step": 15420 }, { "epoch": 1.2345975356056968, "grad_norm": 3.3540539741516113, "learning_rate": 2.9511418944215652e-05, "loss": 5.0135, "step": 15430 }, { "epoch": 1.23539766362618, "grad_norm": 3.529069185256958, "learning_rate": 2.9498047815157515e-05, "loss": 5.001, "step": 15440 }, { "epoch": 1.2361977916466635, "grad_norm": 2.7868266105651855, "learning_rate": 2.9484676686099378e-05, "loss": 5.0339, "step": 15450 }, { "epoch": 1.2369979196671468, "grad_norm": 4.2665839195251465, "learning_rate": 2.947130555704124e-05, "loss": 5.0819, "step": 15460 }, { "epoch": 1.23779804768763, "grad_norm": 3.5087594985961914, "learning_rate": 2.9457934427983103e-05, "loss": 5.1543, "step": 15470 }, { "epoch": 1.2385981757081133, "grad_norm": 2.8320281505584717, "learning_rate": 
2.9444563298924966e-05, "loss": 5.0284, "step": 15480 }, { "epoch": 1.2393983037285965, "grad_norm": 3.7067158222198486, "learning_rate": 2.943119216986682e-05, "loss": 5.1457, "step": 15490 }, { "epoch": 1.24019843174908, "grad_norm": 2.533123016357422, "learning_rate": 2.9417821040808684e-05, "loss": 4.9353, "step": 15500 }, { "epoch": 1.2409985597695632, "grad_norm": 5.649012565612793, "learning_rate": 2.9404449911750547e-05, "loss": 4.9736, "step": 15510 }, { "epoch": 1.2417986877900464, "grad_norm": 2.7992377281188965, "learning_rate": 2.939107878269241e-05, "loss": 5.0501, "step": 15520 }, { "epoch": 1.2425988158105297, "grad_norm": 4.733801364898682, "learning_rate": 2.9377707653634272e-05, "loss": 5.0347, "step": 15530 }, { "epoch": 1.243398943831013, "grad_norm": 3.681401014328003, "learning_rate": 2.9364336524576135e-05, "loss": 5.0521, "step": 15540 }, { "epoch": 1.2441990718514964, "grad_norm": 3.267540693283081, "learning_rate": 2.9350965395517998e-05, "loss": 5.1938, "step": 15550 }, { "epoch": 1.2449991998719796, "grad_norm": 3.616941452026367, "learning_rate": 2.933759426645986e-05, "loss": 4.959, "step": 15560 }, { "epoch": 1.2457993278924628, "grad_norm": 2.2835965156555176, "learning_rate": 2.9324223137401723e-05, "loss": 5.1389, "step": 15570 }, { "epoch": 1.246599455912946, "grad_norm": 4.935213565826416, "learning_rate": 2.9310852008343586e-05, "loss": 5.0684, "step": 15580 }, { "epoch": 1.2473995839334293, "grad_norm": 2.770784616470337, "learning_rate": 2.929748087928545e-05, "loss": 5.0231, "step": 15590 }, { "epoch": 1.2481997119539125, "grad_norm": 3.6176304817199707, "learning_rate": 2.928410975022731e-05, "loss": 5.1103, "step": 15600 }, { "epoch": 1.2489998399743958, "grad_norm": 2.7759737968444824, "learning_rate": 2.9270738621169174e-05, "loss": 5.2312, "step": 15610 }, { "epoch": 1.2497999679948792, "grad_norm": 3.1174516677856445, "learning_rate": 2.9257367492111037e-05, "loss": 5.1001, "step": 15620 }, { "epoch": 
1.2506000960153625, "grad_norm": 3.429516315460205, "learning_rate": 2.92439963630529e-05, "loss": 4.9854, "step": 15630 }, { "epoch": 1.2514002240358457, "grad_norm": 3.940547227859497, "learning_rate": 2.923062523399476e-05, "loss": 5.1451, "step": 15640 }, { "epoch": 1.252200352056329, "grad_norm": 3.9238924980163574, "learning_rate": 2.921725410493662e-05, "loss": 5.0231, "step": 15650 }, { "epoch": 1.2530004800768122, "grad_norm": 3.668210983276367, "learning_rate": 2.9203882975878484e-05, "loss": 4.8446, "step": 15660 }, { "epoch": 1.2538006080972957, "grad_norm": 3.138932704925537, "learning_rate": 2.9190511846820347e-05, "loss": 5.1351, "step": 15670 }, { "epoch": 1.254600736117779, "grad_norm": 2.6472814083099365, "learning_rate": 2.917714071776221e-05, "loss": 5.0462, "step": 15680 }, { "epoch": 1.2554008641382621, "grad_norm": 3.304532766342163, "learning_rate": 2.9163769588704072e-05, "loss": 5.1271, "step": 15690 }, { "epoch": 1.2562009921587454, "grad_norm": 3.478391170501709, "learning_rate": 2.9150398459645935e-05, "loss": 5.0127, "step": 15700 }, { "epoch": 1.2570011201792286, "grad_norm": 2.867481231689453, "learning_rate": 2.9137027330587797e-05, "loss": 5.061, "step": 15710 }, { "epoch": 1.257801248199712, "grad_norm": 3.4853031635284424, "learning_rate": 2.912365620152966e-05, "loss": 4.8585, "step": 15720 }, { "epoch": 1.2586013762201953, "grad_norm": 2.6660521030426025, "learning_rate": 2.9110285072471523e-05, "loss": 4.906, "step": 15730 }, { "epoch": 1.2594015042406785, "grad_norm": 4.05171012878418, "learning_rate": 2.9096913943413385e-05, "loss": 4.9971, "step": 15740 }, { "epoch": 1.2602016322611618, "grad_norm": 2.448624849319458, "learning_rate": 2.9083542814355248e-05, "loss": 5.2265, "step": 15750 }, { "epoch": 1.261001760281645, "grad_norm": 3.1989893913269043, "learning_rate": 2.907017168529711e-05, "loss": 5.0903, "step": 15760 }, { "epoch": 1.2618018883021285, "grad_norm": 3.377147674560547, "learning_rate": 
2.9056800556238974e-05, "loss": 4.9618, "step": 15770 }, { "epoch": 1.2626020163226115, "grad_norm": 2.9969663619995117, "learning_rate": 2.9043429427180836e-05, "loss": 5.1172, "step": 15780 }, { "epoch": 1.263402144343095, "grad_norm": 2.4437100887298584, "learning_rate": 2.90300582981227e-05, "loss": 4.9518, "step": 15790 }, { "epoch": 1.2642022723635782, "grad_norm": 2.806199073791504, "learning_rate": 2.9016687169064555e-05, "loss": 5.0077, "step": 15800 }, { "epoch": 1.2650024003840614, "grad_norm": 3.0119595527648926, "learning_rate": 2.9003316040006417e-05, "loss": 5.1922, "step": 15810 }, { "epoch": 1.2658025284045447, "grad_norm": 4.022315502166748, "learning_rate": 2.898994491094828e-05, "loss": 5.0947, "step": 15820 }, { "epoch": 1.266602656425028, "grad_norm": 3.117807388305664, "learning_rate": 2.8976573781890143e-05, "loss": 5.1026, "step": 15830 }, { "epoch": 1.2674027844455114, "grad_norm": 5.5977935791015625, "learning_rate": 2.8963202652832006e-05, "loss": 4.9931, "step": 15840 }, { "epoch": 1.2682029124659946, "grad_norm": 4.070805549621582, "learning_rate": 2.8949831523773868e-05, "loss": 5.0492, "step": 15850 }, { "epoch": 1.2690030404864778, "grad_norm": 2.844937324523926, "learning_rate": 2.893646039471573e-05, "loss": 5.0453, "step": 15860 }, { "epoch": 1.269803168506961, "grad_norm": 3.0533149242401123, "learning_rate": 2.8923089265657594e-05, "loss": 5.129, "step": 15870 }, { "epoch": 1.2706032965274443, "grad_norm": 4.0250043869018555, "learning_rate": 2.8909718136599456e-05, "loss": 4.9406, "step": 15880 }, { "epoch": 1.2714034245479278, "grad_norm": 3.149026870727539, "learning_rate": 2.889634700754132e-05, "loss": 5.0056, "step": 15890 }, { "epoch": 1.272203552568411, "grad_norm": 4.144321918487549, "learning_rate": 2.8882975878483182e-05, "loss": 4.9246, "step": 15900 }, { "epoch": 1.2730036805888942, "grad_norm": 2.5918688774108887, "learning_rate": 2.8869604749425044e-05, "loss": 4.8537, "step": 15910 }, { "epoch": 
1.2738038086093775, "grad_norm": 2.5930793285369873, "learning_rate": 2.8856233620366907e-05, "loss": 4.9614, "step": 15920 }, { "epoch": 1.2746039366298607, "grad_norm": 2.6094300746917725, "learning_rate": 2.884286249130877e-05, "loss": 4.9636, "step": 15930 }, { "epoch": 1.2754040646503442, "grad_norm": 3.7304527759552, "learning_rate": 2.8829491362250632e-05, "loss": 5.0208, "step": 15940 }, { "epoch": 1.2762041926708274, "grad_norm": 3.657022714614868, "learning_rate": 2.881612023319249e-05, "loss": 5.0232, "step": 15950 }, { "epoch": 1.2770043206913106, "grad_norm": 2.3147027492523193, "learning_rate": 2.880274910413435e-05, "loss": 5.0987, "step": 15960 }, { "epoch": 1.2778044487117939, "grad_norm": 2.9794461727142334, "learning_rate": 2.8789377975076214e-05, "loss": 5.0097, "step": 15970 }, { "epoch": 1.2786045767322771, "grad_norm": 4.13859748840332, "learning_rate": 2.8776006846018076e-05, "loss": 5.0925, "step": 15980 }, { "epoch": 1.2794047047527606, "grad_norm": 3.289497137069702, "learning_rate": 2.876263571695994e-05, "loss": 4.9943, "step": 15990 }, { "epoch": 1.2802048327732436, "grad_norm": 4.03442907333374, "learning_rate": 2.8749264587901802e-05, "loss": 5.1402, "step": 16000 }, { "epoch": 1.281004960793727, "grad_norm": 2.7994818687438965, "learning_rate": 2.8735893458843665e-05, "loss": 5.1223, "step": 16010 }, { "epoch": 1.2818050888142103, "grad_norm": 2.323319673538208, "learning_rate": 2.8722522329785527e-05, "loss": 4.9936, "step": 16020 }, { "epoch": 1.2826052168346935, "grad_norm": 3.254915952682495, "learning_rate": 2.870915120072739e-05, "loss": 4.9216, "step": 16030 }, { "epoch": 1.2834053448551768, "grad_norm": 2.253689765930176, "learning_rate": 2.8695780071669253e-05, "loss": 4.9689, "step": 16040 }, { "epoch": 1.28420547287566, "grad_norm": 2.4922399520874023, "learning_rate": 2.8682408942611115e-05, "loss": 5.0267, "step": 16050 }, { "epoch": 1.2850056008961435, "grad_norm": 3.0966029167175293, "learning_rate": 
2.8669037813552978e-05, "loss": 4.9785, "step": 16060 }, { "epoch": 1.2858057289166267, "grad_norm": 3.88032603263855, "learning_rate": 2.865566668449484e-05, "loss": 4.8793, "step": 16070 }, { "epoch": 1.28660585693711, "grad_norm": 3.152989625930786, "learning_rate": 2.8642295555436703e-05, "loss": 5.0864, "step": 16080 }, { "epoch": 1.2874059849575932, "grad_norm": 2.652728796005249, "learning_rate": 2.8628924426378566e-05, "loss": 5.0895, "step": 16090 }, { "epoch": 1.2882061129780764, "grad_norm": 3.099534273147583, "learning_rate": 2.8615553297320425e-05, "loss": 5.245, "step": 16100 }, { "epoch": 1.2890062409985599, "grad_norm": 3.588867664337158, "learning_rate": 2.8602182168262288e-05, "loss": 5.0127, "step": 16110 }, { "epoch": 1.289806369019043, "grad_norm": 3.4451441764831543, "learning_rate": 2.858881103920415e-05, "loss": 5.1003, "step": 16120 }, { "epoch": 1.2906064970395263, "grad_norm": 3.004190683364868, "learning_rate": 2.8575439910146013e-05, "loss": 5.0977, "step": 16130 }, { "epoch": 1.2914066250600096, "grad_norm": 2.912879467010498, "learning_rate": 2.8562068781087876e-05, "loss": 4.8822, "step": 16140 }, { "epoch": 1.2922067530804928, "grad_norm": 3.078666925430298, "learning_rate": 2.854869765202974e-05, "loss": 4.9094, "step": 16150 }, { "epoch": 1.2930068811009763, "grad_norm": 3.8617477416992188, "learning_rate": 2.85353265229716e-05, "loss": 5.0841, "step": 16160 }, { "epoch": 1.2938070091214593, "grad_norm": 3.9073874950408936, "learning_rate": 2.8521955393913464e-05, "loss": 4.9613, "step": 16170 }, { "epoch": 1.2946071371419428, "grad_norm": 4.929398059844971, "learning_rate": 2.8508584264855327e-05, "loss": 5.0763, "step": 16180 }, { "epoch": 1.295407265162426, "grad_norm": 2.9813883304595947, "learning_rate": 2.849521313579719e-05, "loss": 5.0864, "step": 16190 }, { "epoch": 1.2962073931829092, "grad_norm": 2.894916296005249, "learning_rate": 2.8481842006739052e-05, "loss": 5.0887, "step": 16200 }, { "epoch": 1.2970075212033925, 
"grad_norm": 3.0193276405334473, "learning_rate": 2.8468470877680915e-05, "loss": 5.1709, "step": 16210 }, { "epoch": 1.2978076492238757, "grad_norm": 2.6719863414764404, "learning_rate": 2.8455099748622778e-05, "loss": 5.1043, "step": 16220 }, { "epoch": 1.2986077772443592, "grad_norm": 5.449126243591309, "learning_rate": 2.844172861956464e-05, "loss": 5.0786, "step": 16230 }, { "epoch": 1.2994079052648424, "grad_norm": 3.498746633529663, "learning_rate": 2.8428357490506503e-05, "loss": 5.1796, "step": 16240 }, { "epoch": 1.3002080332853256, "grad_norm": 3.3248443603515625, "learning_rate": 2.8414986361448366e-05, "loss": 5.0488, "step": 16250 }, { "epoch": 1.3010081613058089, "grad_norm": 3.4345290660858154, "learning_rate": 2.840161523239022e-05, "loss": 5.0962, "step": 16260 }, { "epoch": 1.3018082893262921, "grad_norm": 3.6769347190856934, "learning_rate": 2.8388244103332084e-05, "loss": 5.1032, "step": 16270 }, { "epoch": 1.3026084173467756, "grad_norm": 6.013178825378418, "learning_rate": 2.8374872974273947e-05, "loss": 5.2127, "step": 16280 }, { "epoch": 1.3034085453672588, "grad_norm": 3.117189884185791, "learning_rate": 2.836150184521581e-05, "loss": 5.0172, "step": 16290 }, { "epoch": 1.304208673387742, "grad_norm": 2.8342814445495605, "learning_rate": 2.8348130716157672e-05, "loss": 5.185, "step": 16300 }, { "epoch": 1.3050088014082253, "grad_norm": 3.0346531867980957, "learning_rate": 2.8334759587099535e-05, "loss": 5.0422, "step": 16310 }, { "epoch": 1.3058089294287085, "grad_norm": 2.4778048992156982, "learning_rate": 2.8321388458041398e-05, "loss": 5.2192, "step": 16320 }, { "epoch": 1.306609057449192, "grad_norm": 2.3309547901153564, "learning_rate": 2.830801732898326e-05, "loss": 4.8922, "step": 16330 }, { "epoch": 1.3074091854696752, "grad_norm": 5.362358093261719, "learning_rate": 2.8294646199925123e-05, "loss": 5.1077, "step": 16340 }, { "epoch": 1.3082093134901585, "grad_norm": 3.8114402294158936, "learning_rate": 2.8281275070866986e-05, 
"loss": 5.1087, "step": 16350 }, { "epoch": 1.3090094415106417, "grad_norm": 3.24176025390625, "learning_rate": 2.826790394180885e-05, "loss": 4.9285, "step": 16360 }, { "epoch": 1.309809569531125, "grad_norm": 3.4344241619110107, "learning_rate": 2.825453281275071e-05, "loss": 5.0988, "step": 16370 }, { "epoch": 1.3106096975516084, "grad_norm": 4.151029109954834, "learning_rate": 2.8241161683692574e-05, "loss": 5.027, "step": 16380 }, { "epoch": 1.3114098255720914, "grad_norm": 3.6375935077667236, "learning_rate": 2.8227790554634437e-05, "loss": 4.9798, "step": 16390 }, { "epoch": 1.3122099535925749, "grad_norm": 4.077208042144775, "learning_rate": 2.82144194255763e-05, "loss": 4.9757, "step": 16400 }, { "epoch": 1.313010081613058, "grad_norm": 2.053307056427002, "learning_rate": 2.8201048296518155e-05, "loss": 5.0139, "step": 16410 }, { "epoch": 1.3138102096335413, "grad_norm": 3.136188268661499, "learning_rate": 2.8187677167460018e-05, "loss": 5.2215, "step": 16420 }, { "epoch": 1.3146103376540246, "grad_norm": 3.371777296066284, "learning_rate": 2.817430603840188e-05, "loss": 5.1862, "step": 16430 }, { "epoch": 1.3154104656745078, "grad_norm": 2.6995646953582764, "learning_rate": 2.8160934909343743e-05, "loss": 5.0416, "step": 16440 }, { "epoch": 1.3162105936949913, "grad_norm": 4.669895172119141, "learning_rate": 2.8147563780285606e-05, "loss": 5.1037, "step": 16450 }, { "epoch": 1.3170107217154745, "grad_norm": 3.6712257862091064, "learning_rate": 2.813419265122747e-05, "loss": 5.0958, "step": 16460 }, { "epoch": 1.3178108497359577, "grad_norm": 3.1598026752471924, "learning_rate": 2.812082152216933e-05, "loss": 4.8109, "step": 16470 }, { "epoch": 1.318610977756441, "grad_norm": 3.089665412902832, "learning_rate": 2.8107450393111194e-05, "loss": 5.1861, "step": 16480 }, { "epoch": 1.3194111057769242, "grad_norm": 2.7782411575317383, "learning_rate": 2.8094079264053057e-05, "loss": 5.0381, "step": 16490 }, { "epoch": 1.3202112337974077, "grad_norm": 
2.675037145614624, "learning_rate": 2.808070813499492e-05, "loss": 5.0606, "step": 16500 }, { "epoch": 1.321011361817891, "grad_norm": 2.1817705631256104, "learning_rate": 2.8067337005936782e-05, "loss": 4.9736, "step": 16510 }, { "epoch": 1.3218114898383742, "grad_norm": 4.562685012817383, "learning_rate": 2.8053965876878645e-05, "loss": 5.2282, "step": 16520 }, { "epoch": 1.3226116178588574, "grad_norm": 3.4051151275634766, "learning_rate": 2.8040594747820507e-05, "loss": 5.0466, "step": 16530 }, { "epoch": 1.3234117458793406, "grad_norm": 3.512916088104248, "learning_rate": 2.802722361876237e-05, "loss": 5.1473, "step": 16540 }, { "epoch": 1.324211873899824, "grad_norm": 2.6192626953125, "learning_rate": 2.8013852489704233e-05, "loss": 4.9312, "step": 16550 }, { "epoch": 1.3250120019203073, "grad_norm": 2.950780153274536, "learning_rate": 2.8000481360646096e-05, "loss": 5.0987, "step": 16560 }, { "epoch": 1.3258121299407906, "grad_norm": 3.2263593673706055, "learning_rate": 2.7987110231587955e-05, "loss": 5.041, "step": 16570 }, { "epoch": 1.3266122579612738, "grad_norm": 3.2121188640594482, "learning_rate": 2.7973739102529818e-05, "loss": 4.9174, "step": 16580 }, { "epoch": 1.327412385981757, "grad_norm": 5.739203929901123, "learning_rate": 2.796036797347168e-05, "loss": 4.9011, "step": 16590 }, { "epoch": 1.3282125140022405, "grad_norm": 3.4774510860443115, "learning_rate": 2.7946996844413543e-05, "loss": 4.87, "step": 16600 }, { "epoch": 1.3290126420227235, "grad_norm": 3.5726733207702637, "learning_rate": 2.7933625715355406e-05, "loss": 4.9914, "step": 16610 }, { "epoch": 1.329812770043207, "grad_norm": 6.456293106079102, "learning_rate": 2.7920254586297268e-05, "loss": 4.998, "step": 16620 }, { "epoch": 1.3306128980636902, "grad_norm": 3.2161965370178223, "learning_rate": 2.790688345723913e-05, "loss": 5.1975, "step": 16630 }, { "epoch": 1.3314130260841734, "grad_norm": 2.956698179244995, "learning_rate": 2.7893512328180994e-05, "loss": 4.8964, "step": 
16640 }, { "epoch": 1.3322131541046567, "grad_norm": 2.859546661376953, "learning_rate": 2.7880141199122856e-05, "loss": 4.9889, "step": 16650 }, { "epoch": 1.33301328212514, "grad_norm": 2.9399912357330322, "learning_rate": 2.786677007006472e-05, "loss": 5.1039, "step": 16660 }, { "epoch": 1.3338134101456234, "grad_norm": 2.7161428928375244, "learning_rate": 2.7853398941006582e-05, "loss": 5.02, "step": 16670 }, { "epoch": 1.3346135381661066, "grad_norm": 3.227506399154663, "learning_rate": 2.7840027811948444e-05, "loss": 4.859, "step": 16680 }, { "epoch": 1.3354136661865899, "grad_norm": 2.404874086380005, "learning_rate": 2.7826656682890307e-05, "loss": 5.0912, "step": 16690 }, { "epoch": 1.336213794207073, "grad_norm": 3.3921666145324707, "learning_rate": 2.781328555383217e-05, "loss": 5.1004, "step": 16700 }, { "epoch": 1.3370139222275563, "grad_norm": 3.100717306137085, "learning_rate": 2.7799914424774032e-05, "loss": 5.0184, "step": 16710 }, { "epoch": 1.3378140502480398, "grad_norm": 4.171880722045898, "learning_rate": 2.778654329571589e-05, "loss": 4.9575, "step": 16720 }, { "epoch": 1.338614178268523, "grad_norm": 2.831758975982666, "learning_rate": 2.777317216665775e-05, "loss": 4.8927, "step": 16730 }, { "epoch": 1.3394143062890063, "grad_norm": 3.1294634342193604, "learning_rate": 2.7759801037599614e-05, "loss": 5.023, "step": 16740 }, { "epoch": 1.3402144343094895, "grad_norm": 3.0629208087921143, "learning_rate": 2.7746429908541476e-05, "loss": 5.1852, "step": 16750 }, { "epoch": 1.3410145623299727, "grad_norm": 3.2801673412323, "learning_rate": 2.773305877948334e-05, "loss": 5.1116, "step": 16760 }, { "epoch": 1.3418146903504562, "grad_norm": 3.6318020820617676, "learning_rate": 2.7719687650425202e-05, "loss": 4.8689, "step": 16770 }, { "epoch": 1.3426148183709394, "grad_norm": 3.2776827812194824, "learning_rate": 2.7706316521367065e-05, "loss": 4.8672, "step": 16780 }, { "epoch": 1.3434149463914227, "grad_norm": 2.7179038524627686, "learning_rate": 
2.7692945392308927e-05, "loss": 5.0371, "step": 16790 }, { "epoch": 1.344215074411906, "grad_norm": 5.088667869567871, "learning_rate": 2.767957426325079e-05, "loss": 5.0763, "step": 16800 }, { "epoch": 1.3450152024323891, "grad_norm": 4.196096420288086, "learning_rate": 2.7666203134192653e-05, "loss": 4.9903, "step": 16810 }, { "epoch": 1.3458153304528724, "grad_norm": 2.5765233039855957, "learning_rate": 2.7652832005134515e-05, "loss": 5.0393, "step": 16820 }, { "epoch": 1.3466154584733556, "grad_norm": 2.857628583908081, "learning_rate": 2.7639460876076378e-05, "loss": 5.1609, "step": 16830 }, { "epoch": 1.347415586493839, "grad_norm": 4.593959808349609, "learning_rate": 2.762608974701824e-05, "loss": 5.3302, "step": 16840 }, { "epoch": 1.3482157145143223, "grad_norm": 2.7411885261535645, "learning_rate": 2.7612718617960103e-05, "loss": 5.0989, "step": 16850 }, { "epoch": 1.3490158425348056, "grad_norm": 3.586278200149536, "learning_rate": 2.7599347488901966e-05, "loss": 5.0393, "step": 16860 }, { "epoch": 1.3498159705552888, "grad_norm": 4.603923797607422, "learning_rate": 2.758597635984383e-05, "loss": 5.1441, "step": 16870 }, { "epoch": 1.350616098575772, "grad_norm": 5.242514133453369, "learning_rate": 2.7572605230785685e-05, "loss": 5.1488, "step": 16880 }, { "epoch": 1.3514162265962555, "grad_norm": 5.148528575897217, "learning_rate": 2.7559234101727547e-05, "loss": 5.1064, "step": 16890 }, { "epoch": 1.3522163546167387, "grad_norm": 3.759023904800415, "learning_rate": 2.754586297266941e-05, "loss": 4.9888, "step": 16900 }, { "epoch": 1.353016482637222, "grad_norm": 3.1770131587982178, "learning_rate": 2.7532491843611273e-05, "loss": 4.972, "step": 16910 }, { "epoch": 1.3538166106577052, "grad_norm": 3.5174386501312256, "learning_rate": 2.7519120714553135e-05, "loss": 5.1717, "step": 16920 }, { "epoch": 1.3546167386781884, "grad_norm": 4.057755470275879, "learning_rate": 2.7505749585494998e-05, "loss": 5.1299, "step": 16930 }, { "epoch": 1.355416866698672, 
"grad_norm": 3.446735382080078, "learning_rate": 2.749237845643686e-05, "loss": 4.9329, "step": 16940 }, { "epoch": 1.3562169947191551, "grad_norm": 3.140084743499756, "learning_rate": 2.7479007327378724e-05, "loss": 5.1446, "step": 16950 }, { "epoch": 1.3570171227396384, "grad_norm": 3.435009241104126, "learning_rate": 2.7465636198320586e-05, "loss": 5.095, "step": 16960 }, { "epoch": 1.3578172507601216, "grad_norm": 3.1479451656341553, "learning_rate": 2.745226506926245e-05, "loss": 4.9733, "step": 16970 }, { "epoch": 1.3586173787806048, "grad_norm": 2.3184094429016113, "learning_rate": 2.743889394020431e-05, "loss": 5.0376, "step": 16980 }, { "epoch": 1.3594175068010883, "grad_norm": 3.5338847637176514, "learning_rate": 2.7425522811146174e-05, "loss": 4.9706, "step": 16990 }, { "epoch": 1.3602176348215713, "grad_norm": 3.1620054244995117, "learning_rate": 2.7412151682088037e-05, "loss": 5.0098, "step": 17000 }, { "epoch": 1.3610177628420548, "grad_norm": 3.9371540546417236, "learning_rate": 2.73987805530299e-05, "loss": 5.2918, "step": 17010 }, { "epoch": 1.361817890862538, "grad_norm": 3.9547667503356934, "learning_rate": 2.7385409423971762e-05, "loss": 4.9147, "step": 17020 }, { "epoch": 1.3626180188830213, "grad_norm": 6.235637664794922, "learning_rate": 2.737203829491362e-05, "loss": 5.1119, "step": 17030 }, { "epoch": 1.3634181469035045, "grad_norm": 7.610907077789307, "learning_rate": 2.7358667165855484e-05, "loss": 5.0062, "step": 17040 }, { "epoch": 1.3642182749239877, "grad_norm": 2.9273130893707275, "learning_rate": 2.7345296036797347e-05, "loss": 5.1216, "step": 17050 }, { "epoch": 1.3650184029444712, "grad_norm": 3.5307564735412598, "learning_rate": 2.733192490773921e-05, "loss": 5.283, "step": 17060 }, { "epoch": 1.3658185309649544, "grad_norm": 4.250466346740723, "learning_rate": 2.7318553778681072e-05, "loss": 5.1638, "step": 17070 }, { "epoch": 1.3666186589854377, "grad_norm": 3.930469512939453, "learning_rate": 2.7305182649622935e-05, "loss": 
5.0906, "step": 17080 }, { "epoch": 1.367418787005921, "grad_norm": 2.873779773712158, "learning_rate": 2.7291811520564798e-05, "loss": 5.106, "step": 17090 }, { "epoch": 1.3682189150264041, "grad_norm": 3.261646270751953, "learning_rate": 2.727844039150666e-05, "loss": 4.9115, "step": 17100 }, { "epoch": 1.3690190430468876, "grad_norm": 3.0499074459075928, "learning_rate": 2.7265069262448523e-05, "loss": 5.035, "step": 17110 }, { "epoch": 1.3698191710673708, "grad_norm": 3.5967090129852295, "learning_rate": 2.7251698133390386e-05, "loss": 5.0455, "step": 17120 }, { "epoch": 1.370619299087854, "grad_norm": 5.134222507476807, "learning_rate": 2.723832700433225e-05, "loss": 5.0828, "step": 17130 }, { "epoch": 1.3714194271083373, "grad_norm": 2.415818929672241, "learning_rate": 2.722495587527411e-05, "loss": 4.9838, "step": 17140 }, { "epoch": 1.3722195551288205, "grad_norm": 3.8739757537841797, "learning_rate": 2.7211584746215974e-05, "loss": 5.145, "step": 17150 }, { "epoch": 1.373019683149304, "grad_norm": 3.7621076107025146, "learning_rate": 2.7198213617157837e-05, "loss": 5.0735, "step": 17160 }, { "epoch": 1.3738198111697872, "grad_norm": 4.789724349975586, "learning_rate": 2.71848424880997e-05, "loss": 4.9842, "step": 17170 }, { "epoch": 1.3746199391902705, "grad_norm": 2.3499200344085693, "learning_rate": 2.7171471359041562e-05, "loss": 5.1707, "step": 17180 }, { "epoch": 1.3754200672107537, "grad_norm": 3.083209753036499, "learning_rate": 2.7158100229983418e-05, "loss": 5.1676, "step": 17190 }, { "epoch": 1.376220195231237, "grad_norm": 3.0781798362731934, "learning_rate": 2.714472910092528e-05, "loss": 4.9164, "step": 17200 }, { "epoch": 1.3770203232517204, "grad_norm": 2.9193978309631348, "learning_rate": 2.7131357971867143e-05, "loss": 4.9213, "step": 17210 }, { "epoch": 1.3778204512722034, "grad_norm": 5.844182014465332, "learning_rate": 2.7117986842809006e-05, "loss": 5.2283, "step": 17220 }, { "epoch": 1.3786205792926869, "grad_norm": 2.885617733001709, 
"learning_rate": 2.710461571375087e-05, "loss": 4.9839, "step": 17230 }, { "epoch": 1.3794207073131701, "grad_norm": 2.9228098392486572, "learning_rate": 2.709124458469273e-05, "loss": 5.1437, "step": 17240 }, { "epoch": 1.3802208353336534, "grad_norm": 2.6880314350128174, "learning_rate": 2.7077873455634594e-05, "loss": 5.0111, "step": 17250 }, { "epoch": 1.3810209633541366, "grad_norm": 4.601352691650391, "learning_rate": 2.7064502326576457e-05, "loss": 4.9988, "step": 17260 }, { "epoch": 1.3818210913746198, "grad_norm": 4.172126293182373, "learning_rate": 2.705113119751832e-05, "loss": 5.022, "step": 17270 }, { "epoch": 1.3826212193951033, "grad_norm": 2.526599407196045, "learning_rate": 2.7037760068460182e-05, "loss": 5.138, "step": 17280 }, { "epoch": 1.3834213474155865, "grad_norm": 2.531637191772461, "learning_rate": 2.7024388939402045e-05, "loss": 4.9327, "step": 17290 }, { "epoch": 1.3842214754360698, "grad_norm": 2.816145896911621, "learning_rate": 2.7011017810343907e-05, "loss": 5.0343, "step": 17300 }, { "epoch": 1.385021603456553, "grad_norm": 3.2603354454040527, "learning_rate": 2.699764668128577e-05, "loss": 5.0129, "step": 17310 }, { "epoch": 1.3858217314770362, "grad_norm": 2.7474968433380127, "learning_rate": 2.6984275552227633e-05, "loss": 5.0807, "step": 17320 }, { "epoch": 1.3866218594975197, "grad_norm": 3.539966344833374, "learning_rate": 2.6970904423169496e-05, "loss": 5.0476, "step": 17330 }, { "epoch": 1.387421987518003, "grad_norm": 3.2417261600494385, "learning_rate": 2.6957533294111355e-05, "loss": 4.9799, "step": 17340 }, { "epoch": 1.3882221155384862, "grad_norm": 3.1689510345458984, "learning_rate": 2.6944162165053218e-05, "loss": 5.131, "step": 17350 }, { "epoch": 1.3890222435589694, "grad_norm": 3.3699281215667725, "learning_rate": 2.693079103599508e-05, "loss": 4.9792, "step": 17360 }, { "epoch": 1.3898223715794527, "grad_norm": 3.8069379329681396, "learning_rate": 2.6917419906936943e-05, "loss": 5.2615, "step": 17370 }, { 
"epoch": 1.3906224995999361, "grad_norm": 3.1969141960144043, "learning_rate": 2.6904048777878806e-05, "loss": 4.8507, "step": 17380 }, { "epoch": 1.3914226276204193, "grad_norm": 2.7576959133148193, "learning_rate": 2.689067764882067e-05, "loss": 5.1096, "step": 17390 }, { "epoch": 1.3922227556409026, "grad_norm": 4.1617608070373535, "learning_rate": 2.687730651976253e-05, "loss": 4.9802, "step": 17400 }, { "epoch": 1.3930228836613858, "grad_norm": 4.2402567863464355, "learning_rate": 2.6863935390704394e-05, "loss": 5.1928, "step": 17410 }, { "epoch": 1.393823011681869, "grad_norm": 5.131753921508789, "learning_rate": 2.6850564261646256e-05, "loss": 5.099, "step": 17420 }, { "epoch": 1.3946231397023523, "grad_norm": 2.932196617126465, "learning_rate": 2.683719313258812e-05, "loss": 4.8787, "step": 17430 }, { "epoch": 1.3954232677228355, "grad_norm": 3.221860885620117, "learning_rate": 2.6823822003529982e-05, "loss": 5.2802, "step": 17440 }, { "epoch": 1.396223395743319, "grad_norm": 3.539557456970215, "learning_rate": 2.6810450874471844e-05, "loss": 4.8109, "step": 17450 }, { "epoch": 1.3970235237638022, "grad_norm": 4.26516056060791, "learning_rate": 2.6797079745413707e-05, "loss": 5.1081, "step": 17460 }, { "epoch": 1.3978236517842855, "grad_norm": 6.4403605461120605, "learning_rate": 2.678370861635557e-05, "loss": 5.1178, "step": 17470 }, { "epoch": 1.3986237798047687, "grad_norm": 3.0701773166656494, "learning_rate": 2.6770337487297433e-05, "loss": 5.1088, "step": 17480 }, { "epoch": 1.399423907825252, "grad_norm": 3.2516419887542725, "learning_rate": 2.6756966358239295e-05, "loss": 5.0524, "step": 17490 }, { "epoch": 1.4002240358457354, "grad_norm": 3.4712700843811035, "learning_rate": 2.674359522918115e-05, "loss": 4.9977, "step": 17500 }, { "epoch": 1.4010241638662186, "grad_norm": 3.4102516174316406, "learning_rate": 2.6730224100123014e-05, "loss": 4.8785, "step": 17510 }, { "epoch": 1.4018242918867019, "grad_norm": 3.1689910888671875, "learning_rate": 
2.6716852971064877e-05, "loss": 5.2968, "step": 17520 }, { "epoch": 1.4026244199071851, "grad_norm": 3.560192823410034, "learning_rate": 2.670348184200674e-05, "loss": 5.1994, "step": 17530 }, { "epoch": 1.4034245479276684, "grad_norm": 4.837986469268799, "learning_rate": 2.6690110712948602e-05, "loss": 4.913, "step": 17540 }, { "epoch": 1.4042246759481518, "grad_norm": 3.8842897415161133, "learning_rate": 2.6676739583890465e-05, "loss": 5.1379, "step": 17550 }, { "epoch": 1.405024803968635, "grad_norm": 2.939554452896118, "learning_rate": 2.6663368454832327e-05, "loss": 4.9999, "step": 17560 }, { "epoch": 1.4058249319891183, "grad_norm": 3.137080192565918, "learning_rate": 2.664999732577419e-05, "loss": 4.9539, "step": 17570 }, { "epoch": 1.4066250600096015, "grad_norm": 3.7876265048980713, "learning_rate": 2.6636626196716053e-05, "loss": 5.0494, "step": 17580 }, { "epoch": 1.4074251880300848, "grad_norm": 3.8008368015289307, "learning_rate": 2.6623255067657915e-05, "loss": 5.0823, "step": 17590 }, { "epoch": 1.4082253160505682, "grad_norm": 2.5244452953338623, "learning_rate": 2.6609883938599778e-05, "loss": 5.0717, "step": 17600 }, { "epoch": 1.4090254440710512, "grad_norm": 2.8249638080596924, "learning_rate": 2.659651280954164e-05, "loss": 4.8676, "step": 17610 }, { "epoch": 1.4098255720915347, "grad_norm": 4.564566135406494, "learning_rate": 2.6583141680483503e-05, "loss": 5.1186, "step": 17620 }, { "epoch": 1.410625700112018, "grad_norm": 3.5436971187591553, "learning_rate": 2.6569770551425366e-05, "loss": 4.9887, "step": 17630 }, { "epoch": 1.4114258281325012, "grad_norm": 3.748399019241333, "learning_rate": 2.655639942236723e-05, "loss": 5.0972, "step": 17640 }, { "epoch": 1.4122259561529844, "grad_norm": 3.347356081008911, "learning_rate": 2.6543028293309085e-05, "loss": 5.1476, "step": 17650 }, { "epoch": 1.4130260841734676, "grad_norm": 4.2582807540893555, "learning_rate": 2.6529657164250947e-05, "loss": 4.893, "step": 17660 }, { "epoch": 
1.413826212193951, "grad_norm": 3.092129945755005, "learning_rate": 2.651628603519281e-05, "loss": 5.0277, "step": 17670 }, { "epoch": 1.4146263402144343, "grad_norm": 3.6835200786590576, "learning_rate": 2.6502914906134673e-05, "loss": 5.0984, "step": 17680 }, { "epoch": 1.4154264682349176, "grad_norm": 2.6071996688842773, "learning_rate": 2.6489543777076535e-05, "loss": 5.1737, "step": 17690 }, { "epoch": 1.4162265962554008, "grad_norm": 2.6273155212402344, "learning_rate": 2.6476172648018398e-05, "loss": 4.9545, "step": 17700 }, { "epoch": 1.417026724275884, "grad_norm": 3.8607401847839355, "learning_rate": 2.646280151896026e-05, "loss": 5.1669, "step": 17710 }, { "epoch": 1.4178268522963675, "grad_norm": 3.5712077617645264, "learning_rate": 2.6449430389902124e-05, "loss": 5.1462, "step": 17720 }, { "epoch": 1.4186269803168507, "grad_norm": 3.0255634784698486, "learning_rate": 2.6436059260843986e-05, "loss": 4.9677, "step": 17730 }, { "epoch": 1.419427108337334, "grad_norm": 2.684609889984131, "learning_rate": 2.642268813178585e-05, "loss": 5.056, "step": 17740 }, { "epoch": 1.4202272363578172, "grad_norm": 3.0845911502838135, "learning_rate": 2.640931700272771e-05, "loss": 4.9412, "step": 17750 }, { "epoch": 1.4210273643783005, "grad_norm": 3.321986198425293, "learning_rate": 2.6395945873669574e-05, "loss": 5.1335, "step": 17760 }, { "epoch": 1.421827492398784, "grad_norm": 3.656062126159668, "learning_rate": 2.6382574744611437e-05, "loss": 4.8692, "step": 17770 }, { "epoch": 1.4226276204192672, "grad_norm": 2.865105152130127, "learning_rate": 2.63692036155533e-05, "loss": 5.0353, "step": 17780 }, { "epoch": 1.4234277484397504, "grad_norm": 3.2242352962493896, "learning_rate": 2.6355832486495162e-05, "loss": 5.0441, "step": 17790 }, { "epoch": 1.4242278764602336, "grad_norm": 2.8633460998535156, "learning_rate": 2.634246135743702e-05, "loss": 5.1531, "step": 17800 }, { "epoch": 1.4250280044807169, "grad_norm": 4.637207508087158, "learning_rate": 
2.6329090228378884e-05, "loss": 4.9218, "step": 17810 }, { "epoch": 1.4258281325012003, "grad_norm": 3.403052806854248, "learning_rate": 2.6315719099320747e-05, "loss": 4.9892, "step": 17820 }, { "epoch": 1.4266282605216833, "grad_norm": 3.6156423091888428, "learning_rate": 2.630234797026261e-05, "loss": 5.0119, "step": 17830 }, { "epoch": 1.4274283885421668, "grad_norm": 4.8892340660095215, "learning_rate": 2.6288976841204472e-05, "loss": 4.9413, "step": 17840 }, { "epoch": 1.42822851656265, "grad_norm": 4.089402198791504, "learning_rate": 2.6275605712146335e-05, "loss": 5.078, "step": 17850 }, { "epoch": 1.4290286445831333, "grad_norm": 4.567444324493408, "learning_rate": 2.6262234583088198e-05, "loss": 4.9818, "step": 17860 }, { "epoch": 1.4298287726036165, "grad_norm": 3.038895845413208, "learning_rate": 2.624886345403006e-05, "loss": 4.8535, "step": 17870 }, { "epoch": 1.4306289006240998, "grad_norm": 3.1715147495269775, "learning_rate": 2.6235492324971923e-05, "loss": 5.118, "step": 17880 }, { "epoch": 1.4314290286445832, "grad_norm": 5.904415607452393, "learning_rate": 2.6222121195913786e-05, "loss": 5.1062, "step": 17890 }, { "epoch": 1.4322291566650664, "grad_norm": 3.092773675918579, "learning_rate": 2.620875006685565e-05, "loss": 5.0615, "step": 17900 }, { "epoch": 1.4330292846855497, "grad_norm": 4.345393180847168, "learning_rate": 2.619537893779751e-05, "loss": 5.0647, "step": 17910 }, { "epoch": 1.433829412706033, "grad_norm": 3.2700984477996826, "learning_rate": 2.6182007808739374e-05, "loss": 5.0142, "step": 17920 }, { "epoch": 1.4346295407265162, "grad_norm": 3.5998690128326416, "learning_rate": 2.6168636679681237e-05, "loss": 5.0143, "step": 17930 }, { "epoch": 1.4354296687469996, "grad_norm": 3.5779027938842773, "learning_rate": 2.61552655506231e-05, "loss": 5.0386, "step": 17940 }, { "epoch": 1.4362297967674829, "grad_norm": 4.815369606018066, "learning_rate": 2.6141894421564962e-05, "loss": 5.0961, "step": 17950 }, { "epoch": 1.437029924787966, 
"grad_norm": 3.695279836654663, "learning_rate": 2.6128523292506818e-05, "loss": 5.048, "step": 17960 }, { "epoch": 1.4378300528084493, "grad_norm": 5.449552536010742, "learning_rate": 2.611515216344868e-05, "loss": 4.9294, "step": 17970 }, { "epoch": 1.4386301808289326, "grad_norm": 3.0460941791534424, "learning_rate": 2.6101781034390543e-05, "loss": 4.9002, "step": 17980 }, { "epoch": 1.439430308849416, "grad_norm": 4.6357951164245605, "learning_rate": 2.6088409905332406e-05, "loss": 5.0084, "step": 17990 }, { "epoch": 1.4402304368698993, "grad_norm": 4.996743679046631, "learning_rate": 2.607503877627427e-05, "loss": 4.9983, "step": 18000 }, { "epoch": 1.4410305648903825, "grad_norm": 3.1133697032928467, "learning_rate": 2.606166764721613e-05, "loss": 5.1251, "step": 18010 }, { "epoch": 1.4418306929108657, "grad_norm": 2.8803775310516357, "learning_rate": 2.6048296518157994e-05, "loss": 5.1401, "step": 18020 }, { "epoch": 1.442630820931349, "grad_norm": 2.200620174407959, "learning_rate": 2.6034925389099857e-05, "loss": 5.0268, "step": 18030 }, { "epoch": 1.4434309489518322, "grad_norm": 6.3594651222229, "learning_rate": 2.602155426004172e-05, "loss": 5.0733, "step": 18040 }, { "epoch": 1.4442310769723155, "grad_norm": 2.7869396209716797, "learning_rate": 2.6008183130983582e-05, "loss": 5.0708, "step": 18050 }, { "epoch": 1.445031204992799, "grad_norm": 4.231550693511963, "learning_rate": 2.5994812001925445e-05, "loss": 5.0453, "step": 18060 }, { "epoch": 1.4458313330132821, "grad_norm": 3.3731446266174316, "learning_rate": 2.5981440872867308e-05, "loss": 5.0811, "step": 18070 }, { "epoch": 1.4466314610337654, "grad_norm": 2.6151371002197266, "learning_rate": 2.596806974380917e-05, "loss": 5.012, "step": 18080 }, { "epoch": 1.4474315890542486, "grad_norm": 4.7653937339782715, "learning_rate": 2.5954698614751033e-05, "loss": 5.0166, "step": 18090 }, { "epoch": 1.4482317170747319, "grad_norm": 2.9431347846984863, "learning_rate": 2.5941327485692896e-05, "loss": 
4.8474, "step": 18100 }, { "epoch": 1.4490318450952153, "grad_norm": 2.9158408641815186, "learning_rate": 2.592795635663475e-05, "loss": 4.9953, "step": 18110 }, { "epoch": 1.4498319731156986, "grad_norm": 2.303264617919922, "learning_rate": 2.5914585227576614e-05, "loss": 4.9232, "step": 18120 }, { "epoch": 1.4506321011361818, "grad_norm": 3.09885835647583, "learning_rate": 2.5901214098518477e-05, "loss": 4.9656, "step": 18130 }, { "epoch": 1.451432229156665, "grad_norm": 4.0016045570373535, "learning_rate": 2.588784296946034e-05, "loss": 5.1111, "step": 18140 }, { "epoch": 1.4522323571771483, "grad_norm": 3.566770076751709, "learning_rate": 2.5874471840402202e-05, "loss": 5.1114, "step": 18150 }, { "epoch": 1.4530324851976317, "grad_norm": 3.5348453521728516, "learning_rate": 2.5861100711344065e-05, "loss": 5.0469, "step": 18160 }, { "epoch": 1.453832613218115, "grad_norm": 4.667888641357422, "learning_rate": 2.5847729582285928e-05, "loss": 4.9915, "step": 18170 }, { "epoch": 1.4546327412385982, "grad_norm": 3.968620777130127, "learning_rate": 2.583435845322779e-05, "loss": 4.9876, "step": 18180 }, { "epoch": 1.4554328692590814, "grad_norm": 3.7125017642974854, "learning_rate": 2.5820987324169653e-05, "loss": 5.1154, "step": 18190 }, { "epoch": 1.4562329972795647, "grad_norm": 4.5525312423706055, "learning_rate": 2.5807616195111516e-05, "loss": 5.0959, "step": 18200 }, { "epoch": 1.4570331253000481, "grad_norm": 3.8833093643188477, "learning_rate": 2.579424506605338e-05, "loss": 5.0564, "step": 18210 }, { "epoch": 1.4578332533205312, "grad_norm": 2.3059375286102295, "learning_rate": 2.578087393699524e-05, "loss": 5.1766, "step": 18220 }, { "epoch": 1.4586333813410146, "grad_norm": 2.98335599899292, "learning_rate": 2.5767502807937104e-05, "loss": 4.9341, "step": 18230 }, { "epoch": 1.4594335093614978, "grad_norm": 2.6796722412109375, "learning_rate": 2.5754131678878966e-05, "loss": 5.1078, "step": 18240 }, { "epoch": 1.460233637381981, "grad_norm": 
3.41542649269104, "learning_rate": 2.574076054982083e-05, "loss": 5.0378, "step": 18250 }, { "epoch": 1.4610337654024643, "grad_norm": 3.245790481567383, "learning_rate": 2.5727389420762692e-05, "loss": 5.1121, "step": 18260 }, { "epoch": 1.4618338934229476, "grad_norm": 3.9725239276885986, "learning_rate": 2.571401829170455e-05, "loss": 4.9956, "step": 18270 }, { "epoch": 1.462634021443431, "grad_norm": 3.3008267879486084, "learning_rate": 2.5700647162646414e-05, "loss": 5.032, "step": 18280 }, { "epoch": 1.4634341494639143, "grad_norm": 2.838578701019287, "learning_rate": 2.5687276033588277e-05, "loss": 4.9847, "step": 18290 }, { "epoch": 1.4642342774843975, "grad_norm": 5.162490367889404, "learning_rate": 2.567390490453014e-05, "loss": 5.0771, "step": 18300 }, { "epoch": 1.4650344055048807, "grad_norm": 3.455357789993286, "learning_rate": 2.5660533775472002e-05, "loss": 5.0616, "step": 18310 }, { "epoch": 1.465834533525364, "grad_norm": 2.9185259342193604, "learning_rate": 2.5647162646413865e-05, "loss": 4.9563, "step": 18320 }, { "epoch": 1.4666346615458474, "grad_norm": 3.7584445476531982, "learning_rate": 2.5633791517355727e-05, "loss": 5.0305, "step": 18330 }, { "epoch": 1.4674347895663307, "grad_norm": 3.1257286071777344, "learning_rate": 2.562042038829759e-05, "loss": 5.0208, "step": 18340 }, { "epoch": 1.468234917586814, "grad_norm": 2.5908803939819336, "learning_rate": 2.5607049259239453e-05, "loss": 5.0216, "step": 18350 }, { "epoch": 1.4690350456072971, "grad_norm": 3.889470100402832, "learning_rate": 2.5593678130181315e-05, "loss": 5.2175, "step": 18360 }, { "epoch": 1.4698351736277804, "grad_norm": 2.6699845790863037, "learning_rate": 2.5580307001123178e-05, "loss": 5.0305, "step": 18370 }, { "epoch": 1.4706353016482638, "grad_norm": 4.451345920562744, "learning_rate": 2.556693587206504e-05, "loss": 4.9873, "step": 18380 }, { "epoch": 1.471435429668747, "grad_norm": 3.441887855529785, "learning_rate": 2.5553564743006903e-05, "loss": 4.958, "step": 
18390 }, { "epoch": 1.4722355576892303, "grad_norm": 2.9705936908721924, "learning_rate": 2.5540193613948766e-05, "loss": 4.9719, "step": 18400 }, { "epoch": 1.4730356857097135, "grad_norm": 4.436412334442139, "learning_rate": 2.552682248489063e-05, "loss": 4.899, "step": 18410 }, { "epoch": 1.4738358137301968, "grad_norm": 3.027207136154175, "learning_rate": 2.5513451355832485e-05, "loss": 5.0222, "step": 18420 }, { "epoch": 1.4746359417506802, "grad_norm": 3.141206741333008, "learning_rate": 2.5500080226774347e-05, "loss": 4.8712, "step": 18430 }, { "epoch": 1.4754360697711633, "grad_norm": 2.8075220584869385, "learning_rate": 2.548670909771621e-05, "loss": 5.1235, "step": 18440 }, { "epoch": 1.4762361977916467, "grad_norm": 3.697413921356201, "learning_rate": 2.5473337968658073e-05, "loss": 4.9516, "step": 18450 }, { "epoch": 1.47703632581213, "grad_norm": 2.754885673522949, "learning_rate": 2.5459966839599936e-05, "loss": 5.0219, "step": 18460 }, { "epoch": 1.4778364538326132, "grad_norm": 2.937675714492798, "learning_rate": 2.5446595710541798e-05, "loss": 5.0053, "step": 18470 }, { "epoch": 1.4786365818530964, "grad_norm": 4.04290771484375, "learning_rate": 2.543322458148366e-05, "loss": 5.0199, "step": 18480 }, { "epoch": 1.4794367098735797, "grad_norm": 3.605565309524536, "learning_rate": 2.5419853452425524e-05, "loss": 4.9443, "step": 18490 }, { "epoch": 1.4802368378940631, "grad_norm": 3.089582920074463, "learning_rate": 2.5406482323367386e-05, "loss": 5.1402, "step": 18500 }, { "epoch": 1.4810369659145464, "grad_norm": 2.7486472129821777, "learning_rate": 2.539311119430925e-05, "loss": 4.9903, "step": 18510 }, { "epoch": 1.4818370939350296, "grad_norm": 3.1345067024230957, "learning_rate": 2.537974006525111e-05, "loss": 4.867, "step": 18520 }, { "epoch": 1.4826372219555128, "grad_norm": 2.698291540145874, "learning_rate": 2.5366368936192974e-05, "loss": 5.0244, "step": 18530 }, { "epoch": 1.483437349975996, "grad_norm": 3.02305006980896, "learning_rate": 
2.5352997807134837e-05, "loss": 5.1695, "step": 18540 }, { "epoch": 1.4842374779964795, "grad_norm": 3.198091745376587, "learning_rate": 2.53396266780767e-05, "loss": 5.0686, "step": 18550 }, { "epoch": 1.4850376060169628, "grad_norm": 3.5371475219726562, "learning_rate": 2.5326255549018562e-05, "loss": 5.1243, "step": 18560 }, { "epoch": 1.485837734037446, "grad_norm": 6.168661594390869, "learning_rate": 2.5312884419960425e-05, "loss": 5.041, "step": 18570 }, { "epoch": 1.4866378620579292, "grad_norm": 2.842280864715576, "learning_rate": 2.529951329090228e-05, "loss": 5.0337, "step": 18580 }, { "epoch": 1.4874379900784125, "grad_norm": 2.6275157928466797, "learning_rate": 2.5286142161844144e-05, "loss": 4.8418, "step": 18590 }, { "epoch": 1.488238118098896, "grad_norm": 3.1988492012023926, "learning_rate": 2.5272771032786006e-05, "loss": 4.9543, "step": 18600 }, { "epoch": 1.4890382461193792, "grad_norm": 2.890760660171509, "learning_rate": 2.525939990372787e-05, "loss": 5.0444, "step": 18610 }, { "epoch": 1.4898383741398624, "grad_norm": 3.2027745246887207, "learning_rate": 2.5246028774669732e-05, "loss": 5.1103, "step": 18620 }, { "epoch": 1.4906385021603457, "grad_norm": 2.3930492401123047, "learning_rate": 2.5232657645611594e-05, "loss": 5.0071, "step": 18630 }, { "epoch": 1.491438630180829, "grad_norm": 2.7484025955200195, "learning_rate": 2.5219286516553457e-05, "loss": 5.1018, "step": 18640 }, { "epoch": 1.4922387582013121, "grad_norm": 4.265021800994873, "learning_rate": 2.520591538749532e-05, "loss": 4.9677, "step": 18650 }, { "epoch": 1.4930388862217954, "grad_norm": 3.4047820568084717, "learning_rate": 2.5192544258437183e-05, "loss": 5.0718, "step": 18660 }, { "epoch": 1.4938390142422788, "grad_norm": 3.635500192642212, "learning_rate": 2.5179173129379045e-05, "loss": 5.159, "step": 18670 }, { "epoch": 1.494639142262762, "grad_norm": 3.42348575592041, "learning_rate": 2.5165802000320908e-05, "loss": 5.2187, "step": 18680 }, { "epoch": 
1.4954392702832453, "grad_norm": 3.96763277053833, "learning_rate": 2.515243087126277e-05, "loss": 5.0085, "step": 18690 }, { "epoch": 1.4962393983037285, "grad_norm": 4.471992015838623, "learning_rate": 2.5139059742204633e-05, "loss": 4.8476, "step": 18700 }, { "epoch": 1.4970395263242118, "grad_norm": 3.588132858276367, "learning_rate": 2.5125688613146496e-05, "loss": 4.9779, "step": 18710 }, { "epoch": 1.4978396543446952, "grad_norm": 4.183651924133301, "learning_rate": 2.511231748408836e-05, "loss": 5.0973, "step": 18720 }, { "epoch": 1.4986397823651785, "grad_norm": 4.290589332580566, "learning_rate": 2.5098946355030218e-05, "loss": 5.0105, "step": 18730 }, { "epoch": 1.4994399103856617, "grad_norm": 3.7274177074432373, "learning_rate": 2.508557522597208e-05, "loss": 5.0491, "step": 18740 }, { "epoch": 1.500240038406145, "grad_norm": 4.3094682693481445, "learning_rate": 2.5072204096913943e-05, "loss": 5.0653, "step": 18750 }, { "epoch": 1.5010401664266282, "grad_norm": 3.182372570037842, "learning_rate": 2.5058832967855806e-05, "loss": 5.0284, "step": 18760 }, { "epoch": 1.5018402944471116, "grad_norm": 3.6180033683776855, "learning_rate": 2.504546183879767e-05, "loss": 5.1201, "step": 18770 }, { "epoch": 1.5026404224675947, "grad_norm": 3.979736328125, "learning_rate": 2.503209070973953e-05, "loss": 5.0016, "step": 18780 }, { "epoch": 1.5034405504880781, "grad_norm": 3.0754947662353516, "learning_rate": 2.5018719580681394e-05, "loss": 4.9988, "step": 18790 }, { "epoch": 1.5042406785085614, "grad_norm": 3.5843517780303955, "learning_rate": 2.5005348451623257e-05, "loss": 5.003, "step": 18800 }, { "epoch": 1.5050408065290446, "grad_norm": 4.648735523223877, "learning_rate": 2.499197732256512e-05, "loss": 4.9771, "step": 18810 }, { "epoch": 1.505840934549528, "grad_norm": null, "learning_rate": 2.4979943306412795e-05, "loss": 5.0095, "step": 18820 }, { "epoch": 1.506641062570011, "grad_norm": 3.6529128551483154, "learning_rate": 2.4966572177354658e-05,
"loss": 5.1996, "step": 18830 }, { "epoch": 1.5074411905904945, "grad_norm": 3.5937626361846924, "learning_rate": 2.495320104829652e-05, "loss": 4.9082, "step": 18840 }, { "epoch": 1.5082413186109778, "grad_norm": 3.3477978706359863, "learning_rate": 2.493982991923838e-05, "loss": 5.0897, "step": 18850 }, { "epoch": 1.509041446631461, "grad_norm": 2.748612642288208, "learning_rate": 2.4926458790180243e-05, "loss": 4.8157, "step": 18860 }, { "epoch": 1.5098415746519445, "grad_norm": 3.3521084785461426, "learning_rate": 2.4913087661122105e-05, "loss": 5.1061, "step": 18870 }, { "epoch": 1.5106417026724275, "grad_norm": 3.596810817718506, "learning_rate": 2.4899716532063968e-05, "loss": 4.9414, "step": 18880 }, { "epoch": 1.511441830692911, "grad_norm": 3.07938289642334, "learning_rate": 2.488634540300583e-05, "loss": 4.9793, "step": 18890 }, { "epoch": 1.5122419587133942, "grad_norm": 2.709639310836792, "learning_rate": 2.4872974273947693e-05, "loss": 5.0751, "step": 18900 }, { "epoch": 1.5130420867338774, "grad_norm": 5.074763774871826, "learning_rate": 2.4859603144889556e-05, "loss": 5.0816, "step": 18910 }, { "epoch": 1.5138422147543606, "grad_norm": 3.0176215171813965, "learning_rate": 2.484623201583142e-05, "loss": 5.1346, "step": 18920 }, { "epoch": 1.5146423427748439, "grad_norm": 4.006988525390625, "learning_rate": 2.4832860886773278e-05, "loss": 5.1273, "step": 18930 }, { "epoch": 1.5154424707953273, "grad_norm": 4.101680755615234, "learning_rate": 2.481948975771514e-05, "loss": 4.975, "step": 18940 }, { "epoch": 1.5162425988158106, "grad_norm": 3.672960042953491, "learning_rate": 2.4806118628657003e-05, "loss": 5.0561, "step": 18950 }, { "epoch": 1.5170427268362938, "grad_norm": 3.214320421218872, "learning_rate": 2.4792747499598866e-05, "loss": 4.9711, "step": 18960 }, { "epoch": 1.517842854856777, "grad_norm": 4.82122278213501, "learning_rate": 2.477937637054073e-05, "loss": 5.136, "step": 18970 }, { "epoch": 1.5186429828772603, "grad_norm": 
3.8006436824798584, "learning_rate": 2.476600524148259e-05, "loss": 5.1249, "step": 18980 }, { "epoch": 1.5194431108977438, "grad_norm": 3.1978204250335693, "learning_rate": 2.4752634112424454e-05, "loss": 5.2438, "step": 18990 }, { "epoch": 1.5202432389182268, "grad_norm": 3.2975521087646484, "learning_rate": 2.4739262983366317e-05, "loss": 4.9452, "step": 19000 }, { "epoch": 1.5210433669387102, "grad_norm": 2.568472385406494, "learning_rate": 2.472589185430818e-05, "loss": 5.1489, "step": 19010 }, { "epoch": 1.5218434949591935, "grad_norm": 2.766832113265991, "learning_rate": 2.4712520725250042e-05, "loss": 4.9362, "step": 19020 }, { "epoch": 1.5226436229796767, "grad_norm": 3.665269136428833, "learning_rate": 2.4699149596191905e-05, "loss": 4.9117, "step": 19030 }, { "epoch": 1.5234437510001602, "grad_norm": 3.0590641498565674, "learning_rate": 2.4685778467133768e-05, "loss": 5.03, "step": 19040 }, { "epoch": 1.5242438790206432, "grad_norm": 4.048924446105957, "learning_rate": 2.467240733807563e-05, "loss": 5.2367, "step": 19050 }, { "epoch": 1.5250440070411266, "grad_norm": 3.6891164779663086, "learning_rate": 2.4659036209017493e-05, "loss": 5.0999, "step": 19060 }, { "epoch": 1.5258441350616099, "grad_norm": 6.118479251861572, "learning_rate": 2.4645665079959356e-05, "loss": 5.2272, "step": 19070 }, { "epoch": 1.5266442630820931, "grad_norm": 4.393064498901367, "learning_rate": 2.4632293950901215e-05, "loss": 5.0277, "step": 19080 }, { "epoch": 1.5274443911025766, "grad_norm": 3.447316884994507, "learning_rate": 2.4618922821843078e-05, "loss": 5.1005, "step": 19090 }, { "epoch": 1.5282445191230596, "grad_norm": 2.9048802852630615, "learning_rate": 2.460555169278494e-05, "loss": 5.0995, "step": 19100 }, { "epoch": 1.529044647143543, "grad_norm": 3.22967529296875, "learning_rate": 2.4592180563726803e-05, "loss": 5.1191, "step": 19110 }, { "epoch": 1.5298447751640263, "grad_norm": 5.411147117614746, "learning_rate": 2.4578809434668666e-05, "loss": 4.9711, "step": 
19120 }, { "epoch": 1.5306449031845095, "grad_norm": 2.6100914478302, "learning_rate": 2.456543830561053e-05, "loss": 4.8903, "step": 19130 }, { "epoch": 1.5314450312049928, "grad_norm": 4.481126308441162, "learning_rate": 2.455206717655239e-05, "loss": 5.0353, "step": 19140 }, { "epoch": 1.532245159225476, "grad_norm": 2.8812084197998047, "learning_rate": 2.4538696047494254e-05, "loss": 5.0517, "step": 19150 }, { "epoch": 1.5330452872459595, "grad_norm": 3.40692138671875, "learning_rate": 2.4525324918436113e-05, "loss": 5.1662, "step": 19160 }, { "epoch": 1.5338454152664425, "grad_norm": 2.64628529548645, "learning_rate": 2.4511953789377976e-05, "loss": 5.1305, "step": 19170 }, { "epoch": 1.534645543286926, "grad_norm": 3.343109607696533, "learning_rate": 2.449858266031984e-05, "loss": 5.0441, "step": 19180 }, { "epoch": 1.5354456713074092, "grad_norm": 3.306133270263672, "learning_rate": 2.44852115312617e-05, "loss": 4.9099, "step": 19190 }, { "epoch": 1.5362457993278924, "grad_norm": 3.217883825302124, "learning_rate": 2.4471840402203564e-05, "loss": 5.0963, "step": 19200 }, { "epoch": 1.5370459273483759, "grad_norm": 2.76540470123291, "learning_rate": 2.4458469273145427e-05, "loss": 5.0184, "step": 19210 }, { "epoch": 1.5378460553688589, "grad_norm": 3.376350164413452, "learning_rate": 2.444509814408729e-05, "loss": 5.0837, "step": 19220 }, { "epoch": 1.5386461833893423, "grad_norm": 4.767297744750977, "learning_rate": 2.443172701502915e-05, "loss": 5.0034, "step": 19230 }, { "epoch": 1.5394463114098256, "grad_norm": 3.958462953567505, "learning_rate": 2.441835588597101e-05, "loss": 5.1961, "step": 19240 }, { "epoch": 1.5402464394303088, "grad_norm": 2.8455073833465576, "learning_rate": 2.4404984756912874e-05, "loss": 4.92, "step": 19250 }, { "epoch": 1.5410465674507923, "grad_norm": 2.7165021896362305, "learning_rate": 2.4391613627854737e-05, "loss": 4.9634, "step": 19260 }, { "epoch": 1.5418466954712753, "grad_norm": 4.948694705963135, "learning_rate": 
2.43782424987966e-05, "loss": 4.9641, "step": 19270 }, { "epoch": 1.5426468234917587, "grad_norm": 3.5397555828094482, "learning_rate": 2.4364871369738462e-05, "loss": 4.891, "step": 19280 }, { "epoch": 1.543446951512242, "grad_norm": 3.971384048461914, "learning_rate": 2.4351500240680325e-05, "loss": 5.0002, "step": 19290 }, { "epoch": 1.5442470795327252, "grad_norm": 2.627703905105591, "learning_rate": 2.4338129111622187e-05, "loss": 4.9231, "step": 19300 }, { "epoch": 1.5450472075532087, "grad_norm": 6.132839202880859, "learning_rate": 2.4324757982564047e-05, "loss": 4.9914, "step": 19310 }, { "epoch": 1.5458473355736917, "grad_norm": 2.6523935794830322, "learning_rate": 2.431138685350591e-05, "loss": 4.9752, "step": 19320 }, { "epoch": 1.5466474635941752, "grad_norm": 3.1848411560058594, "learning_rate": 2.4298015724447772e-05, "loss": 5.1569, "step": 19330 }, { "epoch": 1.5474475916146584, "grad_norm": 3.3984134197235107, "learning_rate": 2.4284644595389635e-05, "loss": 4.967, "step": 19340 }, { "epoch": 1.5482477196351416, "grad_norm": 3.3843936920166016, "learning_rate": 2.4271273466331497e-05, "loss": 5.0225, "step": 19350 }, { "epoch": 1.5490478476556249, "grad_norm": 3.611131191253662, "learning_rate": 2.425790233727336e-05, "loss": 4.9721, "step": 19360 }, { "epoch": 1.549847975676108, "grad_norm": 3.0888559818267822, "learning_rate": 2.4244531208215223e-05, "loss": 5.0042, "step": 19370 }, { "epoch": 1.5506481036965916, "grad_norm": 3.0562126636505127, "learning_rate": 2.4231160079157086e-05, "loss": 5.1078, "step": 19380 }, { "epoch": 1.5514482317170746, "grad_norm": 2.9445252418518066, "learning_rate": 2.4217788950098945e-05, "loss": 4.9544, "step": 19390 }, { "epoch": 1.552248359737558, "grad_norm": 2.643602132797241, "learning_rate": 2.4204417821040808e-05, "loss": 5.0008, "step": 19400 }, { "epoch": 1.5530484877580413, "grad_norm": 3.939926862716675, "learning_rate": 2.419104669198267e-05, "loss": 5.0934, "step": 19410 }, { "epoch": 
1.5538486157785245, "grad_norm": 4.864276885986328, "learning_rate": 2.4177675562924533e-05, "loss": 4.7946, "step": 19420 }, { "epoch": 1.554648743799008, "grad_norm": 3.8411593437194824, "learning_rate": 2.4164304433866396e-05, "loss": 4.9201, "step": 19430 }, { "epoch": 1.555448871819491, "grad_norm": 3.1794567108154297, "learning_rate": 2.4150933304808258e-05, "loss": 5.0872, "step": 19440 }, { "epoch": 1.5562489998399744, "grad_norm": 3.6120903491973877, "learning_rate": 2.413756217575012e-05, "loss": 5.091, "step": 19450 }, { "epoch": 1.5570491278604577, "grad_norm": 2.759181022644043, "learning_rate": 2.4124191046691984e-05, "loss": 5.0926, "step": 19460 }, { "epoch": 1.557849255880941, "grad_norm": 3.43410062789917, "learning_rate": 2.4110819917633846e-05, "loss": 5.0082, "step": 19470 }, { "epoch": 1.5586493839014244, "grad_norm": 5.191288948059082, "learning_rate": 2.409744878857571e-05, "loss": 5.1893, "step": 19480 }, { "epoch": 1.5594495119219074, "grad_norm": 2.750684976577759, "learning_rate": 2.4084077659517572e-05, "loss": 5.0553, "step": 19490 }, { "epoch": 1.5602496399423909, "grad_norm": 3.0791661739349365, "learning_rate": 2.4070706530459434e-05, "loss": 5.0705, "step": 19500 }, { "epoch": 1.561049767962874, "grad_norm": 3.4672889709472656, "learning_rate": 2.4057335401401297e-05, "loss": 5.131, "step": 19510 }, { "epoch": 1.5618498959833573, "grad_norm": 4.692345142364502, "learning_rate": 2.404396427234316e-05, "loss": 5.1392, "step": 19520 }, { "epoch": 1.5626500240038406, "grad_norm": 3.5774660110473633, "learning_rate": 2.4030593143285023e-05, "loss": 5.055, "step": 19530 }, { "epoch": 1.5634501520243238, "grad_norm": 2.421363592147827, "learning_rate": 2.4017222014226882e-05, "loss": 5.0437, "step": 19540 }, { "epoch": 1.5642502800448073, "grad_norm": 4.430188179016113, "learning_rate": 2.4003850885168745e-05, "loss": 5.1267, "step": 19550 }, { "epoch": 1.5650504080652905, "grad_norm": 3.8973233699798584, "learning_rate": 
2.3990479756110607e-05, "loss": 4.9341, "step": 19560 }, { "epoch": 1.5658505360857737, "grad_norm": 4.119020462036133, "learning_rate": 2.397710862705247e-05, "loss": 5.0242, "step": 19570 }, { "epoch": 1.566650664106257, "grad_norm": 7.76237154006958, "learning_rate": 2.3963737497994333e-05, "loss": 5.1869, "step": 19580 }, { "epoch": 1.5674507921267402, "grad_norm": 2.831879138946533, "learning_rate": 2.3950366368936195e-05, "loss": 5.043, "step": 19590 }, { "epoch": 1.5682509201472237, "grad_norm": 3.3284389972686768, "learning_rate": 2.3936995239878058e-05, "loss": 4.9872, "step": 19600 }, { "epoch": 1.5690510481677067, "grad_norm": 2.6948013305664062, "learning_rate": 2.392362411081992e-05, "loss": 5.0638, "step": 19610 }, { "epoch": 1.5698511761881901, "grad_norm": 2.664315700531006, "learning_rate": 2.391025298176178e-05, "loss": 4.9393, "step": 19620 }, { "epoch": 1.5706513042086734, "grad_norm": 3.3716866970062256, "learning_rate": 2.3896881852703643e-05, "loss": 4.9731, "step": 19630 }, { "epoch": 1.5714514322291566, "grad_norm": 3.558425188064575, "learning_rate": 2.3883510723645505e-05, "loss": 5.0486, "step": 19640 }, { "epoch": 1.57225156024964, "grad_norm": 3.051264762878418, "learning_rate": 2.3870139594587368e-05, "loss": 5.1662, "step": 19650 }, { "epoch": 1.573051688270123, "grad_norm": 4.363539695739746, "learning_rate": 2.385676846552923e-05, "loss": 4.8295, "step": 19660 }, { "epoch": 1.5738518162906066, "grad_norm": 3.811920166015625, "learning_rate": 2.3843397336471093e-05, "loss": 5.0468, "step": 19670 }, { "epoch": 1.5746519443110898, "grad_norm": 4.834085464477539, "learning_rate": 2.3830026207412956e-05, "loss": 5.0537, "step": 19680 }, { "epoch": 1.575452072331573, "grad_norm": 3.673466444015503, "learning_rate": 2.381665507835482e-05, "loss": 5.0796, "step": 19690 }, { "epoch": 1.5762522003520565, "grad_norm": 2.913201093673706, "learning_rate": 2.3803283949296678e-05, "loss": 5.0079, "step": 19700 }, { "epoch": 1.5770523283725395, 
"grad_norm": 2.7734830379486084, "learning_rate": 2.378991282023854e-05, "loss": 5.0544, "step": 19710 }, { "epoch": 1.577852456393023, "grad_norm": 2.8796842098236084, "learning_rate": 2.3776541691180403e-05, "loss": 4.9974, "step": 19720 }, { "epoch": 1.5786525844135062, "grad_norm": 3.4903788566589355, "learning_rate": 2.3763170562122266e-05, "loss": 5.1065, "step": 19730 }, { "epoch": 1.5794527124339894, "grad_norm": 2.96079683303833, "learning_rate": 2.374979943306413e-05, "loss": 4.8157, "step": 19740 }, { "epoch": 1.5802528404544727, "grad_norm": 3.280221700668335, "learning_rate": 2.373642830400599e-05, "loss": 4.9894, "step": 19750 }, { "epoch": 1.581052968474956, "grad_norm": 3.4602150917053223, "learning_rate": 2.3723057174947854e-05, "loss": 5.0472, "step": 19760 }, { "epoch": 1.5818530964954394, "grad_norm": 2.6225857734680176, "learning_rate": 2.3709686045889717e-05, "loss": 4.9227, "step": 19770 }, { "epoch": 1.5826532245159224, "grad_norm": 2.760751247406006, "learning_rate": 2.3696314916831576e-05, "loss": 4.9783, "step": 19780 }, { "epoch": 1.5834533525364058, "grad_norm": 3.081970691680908, "learning_rate": 2.368294378777344e-05, "loss": 5.1022, "step": 19790 }, { "epoch": 1.584253480556889, "grad_norm": 4.927330017089844, "learning_rate": 2.36695726587153e-05, "loss": 5.0991, "step": 19800 }, { "epoch": 1.5850536085773723, "grad_norm": 3.2381088733673096, "learning_rate": 2.3656201529657164e-05, "loss": 4.9632, "step": 19810 }, { "epoch": 1.5858537365978558, "grad_norm": 3.803109884262085, "learning_rate": 2.3642830400599027e-05, "loss": 5.2309, "step": 19820 }, { "epoch": 1.5866538646183388, "grad_norm": 3.326345682144165, "learning_rate": 2.362945927154089e-05, "loss": 4.9104, "step": 19830 }, { "epoch": 1.5874539926388223, "grad_norm": 3.231013059616089, "learning_rate": 2.3616088142482752e-05, "loss": 4.9701, "step": 19840 }, { "epoch": 1.5882541206593055, "grad_norm": 4.239320278167725, "learning_rate": 2.3602717013424615e-05, "loss": 
4.8555, "step": 19850 }, { "epoch": 1.5890542486797887, "grad_norm": 2.4793667793273926, "learning_rate": 2.3589345884366478e-05, "loss": 4.9989, "step": 19860 }, { "epoch": 1.5898543767002722, "grad_norm": 2.578479528427124, "learning_rate": 2.357597475530834e-05, "loss": 5.0905, "step": 19870 }, { "epoch": 1.5906545047207552, "grad_norm": 3.510375499725342, "learning_rate": 2.3562603626250203e-05, "loss": 5.1081, "step": 19880 }, { "epoch": 1.5914546327412387, "grad_norm": 2.9421463012695312, "learning_rate": 2.3549232497192066e-05, "loss": 5.1245, "step": 19890 }, { "epoch": 1.592254760761722, "grad_norm": 2.8770906925201416, "learning_rate": 2.353586136813393e-05, "loss": 4.8955, "step": 19900 }, { "epoch": 1.5930548887822051, "grad_norm": 3.7492361068725586, "learning_rate": 2.352249023907579e-05, "loss": 4.9209, "step": 19910 }, { "epoch": 1.5938550168026886, "grad_norm": 3.8520376682281494, "learning_rate": 2.3509119110017654e-05, "loss": 5.0284, "step": 19920 }, { "epoch": 1.5946551448231716, "grad_norm": 4.247003555297852, "learning_rate": 2.3495747980959513e-05, "loss": 4.9151, "step": 19930 }, { "epoch": 1.595455272843655, "grad_norm": 2.5604374408721924, "learning_rate": 2.3482376851901376e-05, "loss": 4.9954, "step": 19940 }, { "epoch": 1.5962554008641383, "grad_norm": 3.5611424446105957, "learning_rate": 2.346900572284324e-05, "loss": 5.2244, "step": 19950 }, { "epoch": 1.5970555288846215, "grad_norm": 3.0666303634643555, "learning_rate": 2.34556345937851e-05, "loss": 5.0632, "step": 19960 }, { "epoch": 1.5978556569051048, "grad_norm": 3.0092382431030273, "learning_rate": 2.3442263464726964e-05, "loss": 4.9118, "step": 19970 }, { "epoch": 1.598655784925588, "grad_norm": 3.099194288253784, "learning_rate": 2.3428892335668827e-05, "loss": 4.877, "step": 19980 }, { "epoch": 1.5994559129460715, "grad_norm": 3.5368402004241943, "learning_rate": 2.341552120661069e-05, "loss": 5.1603, "step": 19990 }, { "epoch": 1.6002560409665545, "grad_norm": 
3.597376823425293, "learning_rate": 2.3402150077552552e-05, "loss": 4.974, "step": 20000 }, { "epoch": 1.601056168987038, "grad_norm": 2.8193488121032715, "learning_rate": 2.338877894849441e-05, "loss": 5.0046, "step": 20010 }, { "epoch": 1.6018562970075212, "grad_norm": 3.2118396759033203, "learning_rate": 2.3375407819436274e-05, "loss": 5.1282, "step": 20020 }, { "epoch": 1.6026564250280044, "grad_norm": 3.20345401763916, "learning_rate": 2.3362036690378137e-05, "loss": 4.998, "step": 20030 }, { "epoch": 1.6034565530484879, "grad_norm": 2.5816469192504883, "learning_rate": 2.334866556132e-05, "loss": 4.988, "step": 20040 }, { "epoch": 1.604256681068971, "grad_norm": 3.370908260345459, "learning_rate": 2.3335294432261862e-05, "loss": 4.8653, "step": 20050 }, { "epoch": 1.6050568090894544, "grad_norm": 3.355635404586792, "learning_rate": 2.3321923303203725e-05, "loss": 4.9665, "step": 20060 }, { "epoch": 1.6058569371099376, "grad_norm": 4.375133991241455, "learning_rate": 2.3308552174145587e-05, "loss": 4.8952, "step": 20070 }, { "epoch": 1.6066570651304208, "grad_norm": 3.30694580078125, "learning_rate": 2.3295181045087447e-05, "loss": 4.8787, "step": 20080 }, { "epoch": 1.6074571931509043, "grad_norm": 3.211714267730713, "learning_rate": 2.328180991602931e-05, "loss": 5.0676, "step": 20090 }, { "epoch": 1.6082573211713873, "grad_norm": 2.7749686241149902, "learning_rate": 2.3268438786971172e-05, "loss": 5.0726, "step": 20100 }, { "epoch": 1.6090574491918708, "grad_norm": 2.468235731124878, "learning_rate": 2.3255067657913035e-05, "loss": 5.1256, "step": 20110 }, { "epoch": 1.609857577212354, "grad_norm": 2.8571157455444336, "learning_rate": 2.3241696528854898e-05, "loss": 5.0278, "step": 20120 }, { "epoch": 1.6106577052328372, "grad_norm": 3.286158561706543, "learning_rate": 2.322832539979676e-05, "loss": 5.0154, "step": 20130 }, { "epoch": 1.6114578332533205, "grad_norm": 3.3127472400665283, "learning_rate": 2.3214954270738623e-05, "loss": 5.0027, "step": 20140 
}, { "epoch": 1.6122579612738037, "grad_norm": 2.9354875087738037, "learning_rate": 2.3201583141680486e-05, "loss": 4.9334, "step": 20150 }, { "epoch": 1.6130580892942872, "grad_norm": 5.583163261413574, "learning_rate": 2.3188212012622345e-05, "loss": 5.0311, "step": 20160 }, { "epoch": 1.6138582173147704, "grad_norm": 2.93721079826355, "learning_rate": 2.3174840883564208e-05, "loss": 5.0904, "step": 20170 }, { "epoch": 1.6146583453352537, "grad_norm": 3.2356340885162354, "learning_rate": 2.316146975450607e-05, "loss": 4.8532, "step": 20180 }, { "epoch": 1.615458473355737, "grad_norm": 2.2506487369537354, "learning_rate": 2.3148098625447933e-05, "loss": 4.9251, "step": 20190 }, { "epoch": 1.6162586013762201, "grad_norm": 4.617792129516602, "learning_rate": 2.3134727496389796e-05, "loss": 5.0233, "step": 20200 }, { "epoch": 1.6170587293967036, "grad_norm": 5.237868309020996, "learning_rate": 2.312135636733166e-05, "loss": 4.9328, "step": 20210 }, { "epoch": 1.6178588574171866, "grad_norm": 3.0345842838287354, "learning_rate": 2.310798523827352e-05, "loss": 4.9304, "step": 20220 }, { "epoch": 1.61865898543767, "grad_norm": 4.008810997009277, "learning_rate": 2.3094614109215384e-05, "loss": 5.0552, "step": 20230 }, { "epoch": 1.6194591134581533, "grad_norm": 3.126352310180664, "learning_rate": 2.3081242980157243e-05, "loss": 5.1723, "step": 20240 }, { "epoch": 1.6202592414786365, "grad_norm": 2.8254294395446777, "learning_rate": 2.3067871851099106e-05, "loss": 5.0642, "step": 20250 }, { "epoch": 1.62105936949912, "grad_norm": 5.126392364501953, "learning_rate": 2.305450072204097e-05, "loss": 5.0199, "step": 20260 }, { "epoch": 1.621859497519603, "grad_norm": 3.706127882003784, "learning_rate": 2.304112959298283e-05, "loss": 4.7973, "step": 20270 }, { "epoch": 1.6226596255400865, "grad_norm": 2.41340970993042, "learning_rate": 2.3027758463924694e-05, "loss": 5.0053, "step": 20280 }, { "epoch": 1.6234597535605697, "grad_norm": 3.6797945499420166, "learning_rate": 
2.3014387334866556e-05, "loss": 5.0205, "step": 20290 }, { "epoch": 1.624259881581053, "grad_norm": 3.8211288452148438, "learning_rate": 2.300101620580842e-05, "loss": 4.9126, "step": 20300 }, { "epoch": 1.6250600096015364, "grad_norm": 4.667059421539307, "learning_rate": 2.2987645076750282e-05, "loss": 4.8957, "step": 20310 }, { "epoch": 1.6258601376220194, "grad_norm": 3.175122022628784, "learning_rate": 2.2974273947692145e-05, "loss": 4.9999, "step": 20320 }, { "epoch": 1.6266602656425029, "grad_norm": 5.523460865020752, "learning_rate": 2.2960902818634007e-05, "loss": 5.0519, "step": 20330 }, { "epoch": 1.6274603936629861, "grad_norm": 4.125027656555176, "learning_rate": 2.294753168957587e-05, "loss": 5.0168, "step": 20340 }, { "epoch": 1.6282605216834694, "grad_norm": 3.6286473274230957, "learning_rate": 2.2934160560517733e-05, "loss": 5.0361, "step": 20350 }, { "epoch": 1.6290606497039526, "grad_norm": 3.122196674346924, "learning_rate": 2.2920789431459595e-05, "loss": 4.9964, "step": 20360 }, { "epoch": 1.6298607777244358, "grad_norm": 3.153103828430176, "learning_rate": 2.2907418302401458e-05, "loss": 5.2442, "step": 20370 }, { "epoch": 1.6306609057449193, "grad_norm": 3.096548557281494, "learning_rate": 2.289404717334332e-05, "loss": 4.9076, "step": 20380 }, { "epoch": 1.6314610337654023, "grad_norm": 3.5147552490234375, "learning_rate": 2.288067604428518e-05, "loss": 4.9899, "step": 20390 }, { "epoch": 1.6322611617858858, "grad_norm": 2.5290558338165283, "learning_rate": 2.2867304915227043e-05, "loss": 4.8719, "step": 20400 }, { "epoch": 1.633061289806369, "grad_norm": 3.173197031021118, "learning_rate": 2.2853933786168905e-05, "loss": 5.1526, "step": 20410 }, { "epoch": 1.6338614178268522, "grad_norm": 3.5563409328460693, "learning_rate": 2.2840562657110768e-05, "loss": 5.1405, "step": 20420 }, { "epoch": 1.6346615458473357, "grad_norm": 4.4011335372924805, "learning_rate": 2.282719152805263e-05, "loss": 5.0407, "step": 20430 }, { "epoch": 
1.6354616738678187, "grad_norm": 2.9350881576538086, "learning_rate": 2.2813820398994493e-05, "loss": 5.0016, "step": 20440 }, { "epoch": 1.6362618018883022, "grad_norm": 3.446441650390625, "learning_rate": 2.2800449269936356e-05, "loss": 5.0064, "step": 20450 }, { "epoch": 1.6370619299087854, "grad_norm": 4.476505756378174, "learning_rate": 2.278707814087822e-05, "loss": 4.9877, "step": 20460 }, { "epoch": 1.6378620579292686, "grad_norm": 3.0098717212677, "learning_rate": 2.2773707011820078e-05, "loss": 4.9854, "step": 20470 }, { "epoch": 1.638662185949752, "grad_norm": 3.7491648197174072, "learning_rate": 2.276033588276194e-05, "loss": 4.9484, "step": 20480 }, { "epoch": 1.6394623139702351, "grad_norm": 2.899272918701172, "learning_rate": 2.2746964753703803e-05, "loss": 5.0725, "step": 20490 }, { "epoch": 1.6402624419907186, "grad_norm": 4.026328086853027, "learning_rate": 2.2733593624645666e-05, "loss": 5.1211, "step": 20500 }, { "epoch": 1.6410625700112018, "grad_norm": 3.2892467975616455, "learning_rate": 2.272022249558753e-05, "loss": 5.138, "step": 20510 }, { "epoch": 1.641862698031685, "grad_norm": 3.549945116043091, "learning_rate": 2.270685136652939e-05, "loss": 5.1562, "step": 20520 }, { "epoch": 1.6426628260521685, "grad_norm": 2.596651792526245, "learning_rate": 2.2693480237471254e-05, "loss": 4.9889, "step": 20530 }, { "epoch": 1.6434629540726515, "grad_norm": 4.061959266662598, "learning_rate": 2.2680109108413117e-05, "loss": 4.7925, "step": 20540 }, { "epoch": 1.644263082093135, "grad_norm": 4.479005336761475, "learning_rate": 2.2666737979354976e-05, "loss": 5.0266, "step": 20550 }, { "epoch": 1.6450632101136182, "grad_norm": 3.3911259174346924, "learning_rate": 2.265336685029684e-05, "loss": 5.0193, "step": 20560 }, { "epoch": 1.6458633381341015, "grad_norm": 3.546149969100952, "learning_rate": 2.26399957212387e-05, "loss": 5.1076, "step": 20570 }, { "epoch": 1.6466634661545847, "grad_norm": 3.0582120418548584, "learning_rate": 
2.2626624592180564e-05, "loss": 5.0789, "step": 20580 }, { "epoch": 1.647463594175068, "grad_norm": 3.0280404090881348, "learning_rate": 2.2613253463122427e-05, "loss": 5.1195, "step": 20590 }, { "epoch": 1.6482637221955514, "grad_norm": 3.3636317253112793, "learning_rate": 2.259988233406429e-05, "loss": 4.7961, "step": 20600 }, { "epoch": 1.6490638502160344, "grad_norm": 2.9316253662109375, "learning_rate": 2.2586511205006152e-05, "loss": 4.9303, "step": 20610 }, { "epoch": 1.6498639782365179, "grad_norm": 2.799799919128418, "learning_rate": 2.2573140075948015e-05, "loss": 5.0877, "step": 20620 }, { "epoch": 1.650664106257001, "grad_norm": 4.057336807250977, "learning_rate": 2.2559768946889874e-05, "loss": 5.0723, "step": 20630 }, { "epoch": 1.6514642342774843, "grad_norm": 3.0030903816223145, "learning_rate": 2.2546397817831737e-05, "loss": 5.0231, "step": 20640 }, { "epoch": 1.6522643622979678, "grad_norm": 3.6355438232421875, "learning_rate": 2.25330266887736e-05, "loss": 4.9928, "step": 20650 }, { "epoch": 1.6530644903184508, "grad_norm": 2.6018145084381104, "learning_rate": 2.2519655559715462e-05, "loss": 4.9014, "step": 20660 }, { "epoch": 1.6538646183389343, "grad_norm": 2.9779539108276367, "learning_rate": 2.2506284430657325e-05, "loss": 5.1838, "step": 20670 }, { "epoch": 1.6546647463594175, "grad_norm": 2.8421037197113037, "learning_rate": 2.2492913301599188e-05, "loss": 5.0329, "step": 20680 }, { "epoch": 1.6554648743799008, "grad_norm": 4.846928119659424, "learning_rate": 2.247954217254105e-05, "loss": 4.8454, "step": 20690 }, { "epoch": 1.6562650024003842, "grad_norm": 2.6106202602386475, "learning_rate": 2.246617104348291e-05, "loss": 5.0867, "step": 20700 }, { "epoch": 1.6570651304208672, "grad_norm": 4.7461676597595215, "learning_rate": 2.2452799914424773e-05, "loss": 4.9611, "step": 20710 }, { "epoch": 1.6578652584413507, "grad_norm": 2.6487231254577637, "learning_rate": 2.2439428785366635e-05, "loss": 4.9811, "step": 20720 }, { "epoch": 
1.658665386461834, "grad_norm": 3.8583147525787354, "learning_rate": 2.2426057656308498e-05, "loss": 5.0762, "step": 20730 }, { "epoch": 1.6594655144823172, "grad_norm": 5.3724846839904785, "learning_rate": 2.241268652725036e-05, "loss": 4.9821, "step": 20740 }, { "epoch": 1.6602656425028004, "grad_norm": 3.061331033706665, "learning_rate": 2.2399315398192223e-05, "loss": 4.9907, "step": 20750 }, { "epoch": 1.6610657705232836, "grad_norm": 2.9054677486419678, "learning_rate": 2.2385944269134086e-05, "loss": 5.0538, "step": 20760 }, { "epoch": 1.661865898543767, "grad_norm": 2.819784164428711, "learning_rate": 2.237257314007595e-05, "loss": 5.142, "step": 20770 }, { "epoch": 1.6626660265642503, "grad_norm": 2.849148750305176, "learning_rate": 2.235920201101781e-05, "loss": 4.9267, "step": 20780 }, { "epoch": 1.6634661545847336, "grad_norm": 3.2724263668060303, "learning_rate": 2.2345830881959674e-05, "loss": 4.9058, "step": 20790 }, { "epoch": 1.6642662826052168, "grad_norm": 2.774827480316162, "learning_rate": 2.2332459752901537e-05, "loss": 5.0322, "step": 20800 }, { "epoch": 1.6650664106257, "grad_norm": 5.824799537658691, "learning_rate": 2.23190886238434e-05, "loss": 4.9, "step": 20810 }, { "epoch": 1.6658665386461835, "grad_norm": 2.275923490524292, "learning_rate": 2.2305717494785262e-05, "loss": 5.037, "step": 20820 }, { "epoch": 1.6666666666666665, "grad_norm": 3.7803714275360107, "learning_rate": 2.2292346365727125e-05, "loss": 4.9231, "step": 20830 }, { "epoch": 1.66746679468715, "grad_norm": 3.1279492378234863, "learning_rate": 2.2278975236668987e-05, "loss": 5.1147, "step": 20840 }, { "epoch": 1.6682669227076332, "grad_norm": 2.410947561264038, "learning_rate": 2.226560410761085e-05, "loss": 5.0152, "step": 20850 }, { "epoch": 1.6690670507281165, "grad_norm": 3.348628282546997, "learning_rate": 2.225223297855271e-05, "loss": 4.9903, "step": 20860 }, { "epoch": 1.6698671787486, "grad_norm": 3.7435998916625977, "learning_rate": 2.2238861849494572e-05, 
"loss": 5.1829, "step": 20870 }, { "epoch": 1.670667306769083, "grad_norm": 2.872425079345703, "learning_rate": 2.2225490720436435e-05, "loss": 5.0331, "step": 20880 }, { "epoch": 1.6714674347895664, "grad_norm": 3.688359022140503, "learning_rate": 2.2212119591378298e-05, "loss": 4.9848, "step": 20890 }, { "epoch": 1.6722675628100496, "grad_norm": 2.4183199405670166, "learning_rate": 2.219874846232016e-05, "loss": 4.9103, "step": 20900 }, { "epoch": 1.6730676908305329, "grad_norm": 3.6013360023498535, "learning_rate": 2.2185377333262023e-05, "loss": 4.9898, "step": 20910 }, { "epoch": 1.6738678188510163, "grad_norm": 3.179523229598999, "learning_rate": 2.2172006204203886e-05, "loss": 5.0444, "step": 20920 }, { "epoch": 1.6746679468714993, "grad_norm": 3.9969305992126465, "learning_rate": 2.2158635075145745e-05, "loss": 5.2334, "step": 20930 }, { "epoch": 1.6754680748919828, "grad_norm": 3.4864439964294434, "learning_rate": 2.2145263946087608e-05, "loss": 4.9699, "step": 20940 }, { "epoch": 1.676268202912466, "grad_norm": 3.6860687732696533, "learning_rate": 2.213189281702947e-05, "loss": 5.0731, "step": 20950 }, { "epoch": 1.6770683309329493, "grad_norm": 5.067797660827637, "learning_rate": 2.2118521687971333e-05, "loss": 5.0058, "step": 20960 }, { "epoch": 1.6778684589534325, "grad_norm": 3.295374631881714, "learning_rate": 2.2105150558913196e-05, "loss": 5.2059, "step": 20970 }, { "epoch": 1.6786685869739157, "grad_norm": 2.855257987976074, "learning_rate": 2.209177942985506e-05, "loss": 5.134, "step": 20980 }, { "epoch": 1.6794687149943992, "grad_norm": 3.441474199295044, "learning_rate": 2.207840830079692e-05, "loss": 5.0258, "step": 20990 }, { "epoch": 1.6802688430148822, "grad_norm": 2.9601447582244873, "learning_rate": 2.2065037171738784e-05, "loss": 4.9197, "step": 21000 }, { "epoch": 1.6802688430148822, "eval_loss": 5.565999984741211, "eval_runtime": 13.2003, "eval_samples_per_second": 3.03, "eval_steps_per_second": 0.379, "step": 21000 }, { "epoch": 
1.6810689710353657, "grad_norm": 3.3317766189575195, "learning_rate": 2.2051666042680643e-05, "loss": 4.8904, "step": 21010 }, { "epoch": 1.681869099055849, "grad_norm": 3.8202877044677734, "learning_rate": 2.2038294913622506e-05, "loss": 4.9713, "step": 21020 }, { "epoch": 1.6826692270763322, "grad_norm": 4.571699142456055, "learning_rate": 2.202492378456437e-05, "loss": 4.8879, "step": 21030 }, { "epoch": 1.6834693550968156, "grad_norm": 3.028830051422119, "learning_rate": 2.201155265550623e-05, "loss": 5.1747, "step": 21040 }, { "epoch": 1.6842694831172986, "grad_norm": 3.196197271347046, "learning_rate": 2.1998181526448094e-05, "loss": 5.0718, "step": 21050 }, { "epoch": 1.685069611137782, "grad_norm": 2.611711263656616, "learning_rate": 2.1984810397389957e-05, "loss": 4.8901, "step": 21060 }, { "epoch": 1.6858697391582653, "grad_norm": 2.8445308208465576, "learning_rate": 2.197143926833182e-05, "loss": 4.8063, "step": 21070 }, { "epoch": 1.6866698671787486, "grad_norm": 3.1951494216918945, "learning_rate": 2.1958068139273682e-05, "loss": 4.9334, "step": 21080 }, { "epoch": 1.687469995199232, "grad_norm": 2.3488590717315674, "learning_rate": 2.194469701021554e-05, "loss": 4.9107, "step": 21090 }, { "epoch": 1.688270123219715, "grad_norm": 4.008801460266113, "learning_rate": 2.1931325881157404e-05, "loss": 5.0342, "step": 21100 }, { "epoch": 1.6890702512401985, "grad_norm": 2.185612201690674, "learning_rate": 2.1917954752099267e-05, "loss": 5.0431, "step": 21110 }, { "epoch": 1.6898703792606817, "grad_norm": 2.571093797683716, "learning_rate": 2.190458362304113e-05, "loss": 4.9865, "step": 21120 }, { "epoch": 1.690670507281165, "grad_norm": 3.084378957748413, "learning_rate": 2.1891212493982992e-05, "loss": 4.9913, "step": 21130 }, { "epoch": 1.6914706353016484, "grad_norm": 2.625178813934326, "learning_rate": 2.1877841364924855e-05, "loss": 4.8234, "step": 21140 }, { "epoch": 1.6922707633221314, "grad_norm": 3.9121668338775635, "learning_rate": 
2.1864470235866717e-05, "loss": 4.9822, "step": 21150 }, { "epoch": 1.693070891342615, "grad_norm": 3.599107265472412, "learning_rate": 2.185109910680858e-05, "loss": 5.0993, "step": 21160 }, { "epoch": 1.6938710193630981, "grad_norm": 2.523613929748535, "learning_rate": 2.1837727977750443e-05, "loss": 4.7632, "step": 21170 }, { "epoch": 1.6946711473835814, "grad_norm": 5.175262928009033, "learning_rate": 2.1824356848692305e-05, "loss": 5.0006, "step": 21180 }, { "epoch": 1.6954712754040646, "grad_norm": 2.815415382385254, "learning_rate": 2.1810985719634168e-05, "loss": 4.9402, "step": 21190 }, { "epoch": 1.6962714034245479, "grad_norm": 3.0755605697631836, "learning_rate": 2.179761459057603e-05, "loss": 4.8869, "step": 21200 }, { "epoch": 1.6970715314450313, "grad_norm": 2.684621810913086, "learning_rate": 2.1784243461517893e-05, "loss": 4.9828, "step": 21210 }, { "epoch": 1.6978716594655143, "grad_norm": 2.6758275032043457, "learning_rate": 2.1770872332459756e-05, "loss": 5.1145, "step": 21220 }, { "epoch": 1.6986717874859978, "grad_norm": 3.088541030883789, "learning_rate": 2.175750120340162e-05, "loss": 4.9545, "step": 21230 }, { "epoch": 1.699471915506481, "grad_norm": 2.9712045192718506, "learning_rate": 2.1744130074343478e-05, "loss": 4.8429, "step": 21240 }, { "epoch": 1.7002720435269643, "grad_norm": 2.9696614742279053, "learning_rate": 2.173075894528534e-05, "loss": 5.097, "step": 21250 }, { "epoch": 1.7010721715474477, "grad_norm": 3.073406934738159, "learning_rate": 2.1717387816227204e-05, "loss": 4.9124, "step": 21260 }, { "epoch": 1.7018722995679307, "grad_norm": 2.623845100402832, "learning_rate": 2.1704016687169066e-05, "loss": 4.9771, "step": 21270 }, { "epoch": 1.7026724275884142, "grad_norm": 3.0245361328125, "learning_rate": 2.169064555811093e-05, "loss": 4.98, "step": 21280 }, { "epoch": 1.7034725556088974, "grad_norm": 5.218230724334717, "learning_rate": 2.167727442905279e-05, "loss": 5.0837, "step": 21290 }, { "epoch": 1.7042726836293807, 
"grad_norm": 4.287439823150635, "learning_rate": 2.1663903299994654e-05, "loss": 4.9261, "step": 21300 }, { "epoch": 1.7050728116498641, "grad_norm": 2.7261388301849365, "learning_rate": 2.1650532170936517e-05, "loss": 4.9462, "step": 21310 }, { "epoch": 1.7058729396703471, "grad_norm": 3.0269289016723633, "learning_rate": 2.1637161041878376e-05, "loss": 4.9972, "step": 21320 }, { "epoch": 1.7066730676908306, "grad_norm": 3.629207134246826, "learning_rate": 2.162378991282024e-05, "loss": 5.0556, "step": 21330 }, { "epoch": 1.7074731957113138, "grad_norm": 4.204544544219971, "learning_rate": 2.16104187837621e-05, "loss": 5.0391, "step": 21340 }, { "epoch": 1.708273323731797, "grad_norm": 3.9278762340545654, "learning_rate": 2.1597047654703964e-05, "loss": 5.0107, "step": 21350 }, { "epoch": 1.7090734517522803, "grad_norm": 3.9020843505859375, "learning_rate": 2.1583676525645827e-05, "loss": 4.9798, "step": 21360 }, { "epoch": 1.7098735797727636, "grad_norm": 3.871673107147217, "learning_rate": 2.157030539658769e-05, "loss": 5.0305, "step": 21370 }, { "epoch": 1.710673707793247, "grad_norm": 3.754272937774658, "learning_rate": 2.1556934267529552e-05, "loss": 5.1621, "step": 21380 }, { "epoch": 1.7114738358137302, "grad_norm": 3.5809175968170166, "learning_rate": 2.1543563138471415e-05, "loss": 5.0017, "step": 21390 }, { "epoch": 1.7122739638342135, "grad_norm": 3.7547807693481445, "learning_rate": 2.1530192009413274e-05, "loss": 5.0926, "step": 21400 }, { "epoch": 1.7130740918546967, "grad_norm": 3.9002151489257812, "learning_rate": 2.1516820880355137e-05, "loss": 4.9253, "step": 21410 }, { "epoch": 1.71387421987518, "grad_norm": 3.080944299697876, "learning_rate": 2.1503449751297e-05, "loss": 4.963, "step": 21420 }, { "epoch": 1.7146743478956634, "grad_norm": 4.989534378051758, "learning_rate": 2.1490078622238862e-05, "loss": 5.149, "step": 21430 }, { "epoch": 1.7154744759161464, "grad_norm": 3.4675042629241943, "learning_rate": 2.1476707493180725e-05, "loss": 
4.9175, "step": 21440 }, { "epoch": 1.71627460393663, "grad_norm": 2.9454565048217773, "learning_rate": 2.1463336364122588e-05, "loss": 5.0187, "step": 21450 }, { "epoch": 1.7170747319571131, "grad_norm": 3.1735284328460693, "learning_rate": 2.144996523506445e-05, "loss": 4.932, "step": 21460 }, { "epoch": 1.7178748599775964, "grad_norm": 4.5251054763793945, "learning_rate": 2.1436594106006313e-05, "loss": 5.0622, "step": 21470 }, { "epoch": 1.7186749879980798, "grad_norm": 3.7694296836853027, "learning_rate": 2.1423222976948173e-05, "loss": 5.071, "step": 21480 }, { "epoch": 1.7194751160185628, "grad_norm": 3.9317219257354736, "learning_rate": 2.1409851847890035e-05, "loss": 4.9159, "step": 21490 }, { "epoch": 1.7202752440390463, "grad_norm": 3.568376064300537, "learning_rate": 2.1396480718831898e-05, "loss": 5.1121, "step": 21500 }, { "epoch": 1.7210753720595295, "grad_norm": 3.1742336750030518, "learning_rate": 2.138310958977376e-05, "loss": 5.0329, "step": 21510 }, { "epoch": 1.7218755000800128, "grad_norm": 4.287962913513184, "learning_rate": 2.1369738460715623e-05, "loss": 4.9761, "step": 21520 }, { "epoch": 1.7226756281004962, "grad_norm": 3.787036180496216, "learning_rate": 2.1356367331657486e-05, "loss": 4.9856, "step": 21530 }, { "epoch": 1.7234757561209793, "grad_norm": 3.4677791595458984, "learning_rate": 2.134299620259935e-05, "loss": 5.0777, "step": 21540 }, { "epoch": 1.7242758841414627, "grad_norm": 3.511162757873535, "learning_rate": 2.1329625073541208e-05, "loss": 5.0944, "step": 21550 }, { "epoch": 1.725076012161946, "grad_norm": 3.1692442893981934, "learning_rate": 2.131625394448307e-05, "loss": 4.936, "step": 21560 }, { "epoch": 1.7258761401824292, "grad_norm": 3.253077507019043, "learning_rate": 2.1302882815424933e-05, "loss": 5.1004, "step": 21570 }, { "epoch": 1.7266762682029124, "grad_norm": 4.503719806671143, "learning_rate": 2.1289511686366796e-05, "loss": 5.0816, "step": 21580 }, { "epoch": 1.7274763962233957, "grad_norm": 
3.9515204429626465, "learning_rate": 2.127614055730866e-05, "loss": 5.0213, "step": 21590 }, { "epoch": 1.7282765242438791, "grad_norm": 3.2743678092956543, "learning_rate": 2.126276942825052e-05, "loss": 4.945, "step": 21600 }, { "epoch": 1.7290766522643621, "grad_norm": 3.2138924598693848, "learning_rate": 2.1249398299192384e-05, "loss": 4.9907, "step": 21610 }, { "epoch": 1.7298767802848456, "grad_norm": 2.7698373794555664, "learning_rate": 2.1236027170134247e-05, "loss": 5.0068, "step": 21620 }, { "epoch": 1.7306769083053288, "grad_norm": 3.4661061763763428, "learning_rate": 2.122265604107611e-05, "loss": 4.7865, "step": 21630 }, { "epoch": 1.731477036325812, "grad_norm": 4.473834991455078, "learning_rate": 2.1209284912017972e-05, "loss": 5.0263, "step": 21640 }, { "epoch": 1.7322771643462955, "grad_norm": 4.1776275634765625, "learning_rate": 2.1195913782959835e-05, "loss": 4.8809, "step": 21650 }, { "epoch": 1.7330772923667785, "grad_norm": 3.335580348968506, "learning_rate": 2.1182542653901698e-05, "loss": 5.0557, "step": 21660 }, { "epoch": 1.733877420387262, "grad_norm": 4.314727783203125, "learning_rate": 2.116917152484356e-05, "loss": 4.8628, "step": 21670 }, { "epoch": 1.7346775484077452, "grad_norm": 3.1467959880828857, "learning_rate": 2.1155800395785423e-05, "loss": 5.0191, "step": 21680 }, { "epoch": 1.7354776764282285, "grad_norm": 3.9090895652770996, "learning_rate": 2.1142429266727286e-05, "loss": 4.8245, "step": 21690 }, { "epoch": 1.736277804448712, "grad_norm": 2.6189544200897217, "learning_rate": 2.112905813766915e-05, "loss": 4.8945, "step": 21700 }, { "epoch": 1.737077932469195, "grad_norm": 3.156756639480591, "learning_rate": 2.1115687008611008e-05, "loss": 5.0731, "step": 21710 }, { "epoch": 1.7378780604896784, "grad_norm": 3.3744938373565674, "learning_rate": 2.110231587955287e-05, "loss": 5.202, "step": 21720 }, { "epoch": 1.7386781885101616, "grad_norm": 3.677507162094116, "learning_rate": 2.1088944750494733e-05, "loss": 5.0053, "step": 
21730 }, { "epoch": 1.7394783165306449, "grad_norm": 2.5175113677978516, "learning_rate": 2.1075573621436596e-05, "loss": 4.8895, "step": 21740 }, { "epoch": 1.7402784445511283, "grad_norm": 3.9189088344573975, "learning_rate": 2.106220249237846e-05, "loss": 4.9188, "step": 21750 }, { "epoch": 1.7410785725716114, "grad_norm": 4.0822625160217285, "learning_rate": 2.104883136332032e-05, "loss": 4.9906, "step": 21760 }, { "epoch": 1.7418787005920948, "grad_norm": 3.3237664699554443, "learning_rate": 2.1035460234262184e-05, "loss": 5.0885, "step": 21770 }, { "epoch": 1.742678828612578, "grad_norm": 4.276275634765625, "learning_rate": 2.1022089105204043e-05, "loss": 4.9108, "step": 21780 }, { "epoch": 1.7434789566330613, "grad_norm": 4.213038444519043, "learning_rate": 2.1008717976145906e-05, "loss": 4.9961, "step": 21790 }, { "epoch": 1.7442790846535445, "grad_norm": 3.0935757160186768, "learning_rate": 2.099534684708777e-05, "loss": 5.0978, "step": 21800 }, { "epoch": 1.7450792126740278, "grad_norm": 3.3736915588378906, "learning_rate": 2.098197571802963e-05, "loss": 4.8611, "step": 21810 }, { "epoch": 1.7458793406945112, "grad_norm": 3.6856744289398193, "learning_rate": 2.0968604588971494e-05, "loss": 4.9403, "step": 21820 }, { "epoch": 1.7466794687149942, "grad_norm": 3.847534656524658, "learning_rate": 2.0955233459913357e-05, "loss": 5.0023, "step": 21830 }, { "epoch": 1.7474795967354777, "grad_norm": 4.812027931213379, "learning_rate": 2.094186233085522e-05, "loss": 4.9796, "step": 21840 }, { "epoch": 1.748279724755961, "grad_norm": 2.484204053878784, "learning_rate": 2.0928491201797082e-05, "loss": 4.9755, "step": 21850 }, { "epoch": 1.7490798527764442, "grad_norm": 3.052938938140869, "learning_rate": 2.091512007273894e-05, "loss": 5.013, "step": 21860 }, { "epoch": 1.7498799807969276, "grad_norm": 3.8069536685943604, "learning_rate": 2.0901748943680804e-05, "loss": 5.1675, "step": 21870 }, { "epoch": 1.7506801088174107, "grad_norm": 4.167869567871094, 
"learning_rate": 2.0888377814622667e-05, "loss": 5.1041, "step": 21880 }, { "epoch": 1.7514802368378941, "grad_norm": 3.1281728744506836, "learning_rate": 2.087500668556453e-05, "loss": 5.0955, "step": 21890 }, { "epoch": 1.7522803648583773, "grad_norm": 3.4163386821746826, "learning_rate": 2.0861635556506392e-05, "loss": 4.976, "step": 21900 }, { "epoch": 1.7530804928788606, "grad_norm": 2.665518045425415, "learning_rate": 2.0848264427448255e-05, "loss": 4.9827, "step": 21910 }, { "epoch": 1.753880620899344, "grad_norm": 3.978625535964966, "learning_rate": 2.0834893298390117e-05, "loss": 5.125, "step": 21920 }, { "epoch": 1.754680748919827, "grad_norm": 2.702791452407837, "learning_rate": 2.082152216933198e-05, "loss": 4.9508, "step": 21930 }, { "epoch": 1.7554808769403105, "grad_norm": 2.9301092624664307, "learning_rate": 2.080815104027384e-05, "loss": 5.1346, "step": 21940 }, { "epoch": 1.7562810049607938, "grad_norm": 3.169142007827759, "learning_rate": 2.0794779911215702e-05, "loss": 4.899, "step": 21950 }, { "epoch": 1.757081132981277, "grad_norm": 4.339301109313965, "learning_rate": 2.0781408782157565e-05, "loss": 4.9364, "step": 21960 }, { "epoch": 1.7578812610017602, "grad_norm": 3.94887113571167, "learning_rate": 2.0768037653099427e-05, "loss": 4.9057, "step": 21970 }, { "epoch": 1.7586813890222435, "grad_norm": 2.7177371978759766, "learning_rate": 2.075466652404129e-05, "loss": 5.0119, "step": 21980 }, { "epoch": 1.759481517042727, "grad_norm": 4.602911472320557, "learning_rate": 2.0741295394983153e-05, "loss": 4.8852, "step": 21990 }, { "epoch": 1.7602816450632102, "grad_norm": 3.233092784881592, "learning_rate": 2.0727924265925015e-05, "loss": 5.0399, "step": 22000 }, { "epoch": 1.7610817730836934, "grad_norm": 3.1726016998291016, "learning_rate": 2.0714553136866878e-05, "loss": 4.9855, "step": 22010 }, { "epoch": 1.7618819011041766, "grad_norm": 8.455097198486328, "learning_rate": 2.070118200780874e-05, "loss": 4.8389, "step": 22020 }, { "epoch": 
1.7626820291246599, "grad_norm": 3.6071341037750244, "learning_rate": 2.0687810878750604e-05, "loss": 5.059, "step": 22030 }, { "epoch": 1.7634821571451433, "grad_norm": 3.182056188583374, "learning_rate": 2.0674439749692466e-05, "loss": 5.0172, "step": 22040 }, { "epoch": 1.7642822851656264, "grad_norm": 3.3609728813171387, "learning_rate": 2.066106862063433e-05, "loss": 4.8298, "step": 22050 }, { "epoch": 1.7650824131861098, "grad_norm": 4.301130771636963, "learning_rate": 2.064769749157619e-05, "loss": 5.0654, "step": 22060 }, { "epoch": 1.765882541206593, "grad_norm": 3.5085902214050293, "learning_rate": 2.0634326362518054e-05, "loss": 5.035, "step": 22070 }, { "epoch": 1.7666826692270763, "grad_norm": 3.628776788711548, "learning_rate": 2.0620955233459917e-05, "loss": 4.846, "step": 22080 }, { "epoch": 1.7674827972475597, "grad_norm": 2.593838691711426, "learning_rate": 2.0607584104401776e-05, "loss": 4.9209, "step": 22090 }, { "epoch": 1.7682829252680428, "grad_norm": 4.2121477127075195, "learning_rate": 2.059421297534364e-05, "loss": 5.0184, "step": 22100 }, { "epoch": 1.7690830532885262, "grad_norm": 2.8298094272613525, "learning_rate": 2.05808418462855e-05, "loss": 4.9215, "step": 22110 }, { "epoch": 1.7698831813090095, "grad_norm": 4.237027168273926, "learning_rate": 2.0567470717227364e-05, "loss": 5.0723, "step": 22120 }, { "epoch": 1.7706833093294927, "grad_norm": 3.460895299911499, "learning_rate": 2.0554099588169227e-05, "loss": 5.0134, "step": 22130 }, { "epoch": 1.7714834373499762, "grad_norm": 2.9256222248077393, "learning_rate": 2.054072845911109e-05, "loss": 4.8817, "step": 22140 }, { "epoch": 1.7722835653704592, "grad_norm": 3.658893585205078, "learning_rate": 2.0527357330052952e-05, "loss": 4.906, "step": 22150 }, { "epoch": 1.7730836933909426, "grad_norm": 3.463165521621704, "learning_rate": 2.0513986200994815e-05, "loss": 5.0674, "step": 22160 }, { "epoch": 1.7738838214114259, "grad_norm": 2.669952154159546, "learning_rate": 
2.0500615071936674e-05, "loss": 4.9548, "step": 22170 }, { "epoch": 1.774683949431909, "grad_norm": 3.712484836578369, "learning_rate": 2.0487243942878537e-05, "loss": 5.0874, "step": 22180 }, { "epoch": 1.7754840774523923, "grad_norm": 3.4459969997406006, "learning_rate": 2.04738728138204e-05, "loss": 4.9384, "step": 22190 }, { "epoch": 1.7762842054728756, "grad_norm": 4.165179252624512, "learning_rate": 2.0460501684762263e-05, "loss": 5.0259, "step": 22200 }, { "epoch": 1.777084333493359, "grad_norm": 3.050100326538086, "learning_rate": 2.0447130555704125e-05, "loss": 4.9838, "step": 22210 }, { "epoch": 1.777884461513842, "grad_norm": 3.6571977138519287, "learning_rate": 2.0433759426645988e-05, "loss": 4.8971, "step": 22220 }, { "epoch": 1.7786845895343255, "grad_norm": 3.6707186698913574, "learning_rate": 2.042038829758785e-05, "loss": 5.0419, "step": 22230 }, { "epoch": 1.7794847175548087, "grad_norm": 3.24741792678833, "learning_rate": 2.0407017168529713e-05, "loss": 5.1247, "step": 22240 }, { "epoch": 1.780284845575292, "grad_norm": 4.1845703125, "learning_rate": 2.0393646039471573e-05, "loss": 5.1055, "step": 22250 }, { "epoch": 1.7810849735957754, "grad_norm": 2.6318318843841553, "learning_rate": 2.0380274910413435e-05, "loss": 4.9075, "step": 22260 }, { "epoch": 1.7818851016162585, "grad_norm": 3.2743167877197266, "learning_rate": 2.0366903781355298e-05, "loss": 5.0425, "step": 22270 }, { "epoch": 1.782685229636742, "grad_norm": 2.7158398628234863, "learning_rate": 2.035353265229716e-05, "loss": 4.913, "step": 22280 }, { "epoch": 1.7834853576572252, "grad_norm": 4.428764343261719, "learning_rate": 2.0340161523239023e-05, "loss": 4.9971, "step": 22290 }, { "epoch": 1.7842854856777084, "grad_norm": 3.811169385910034, "learning_rate": 2.0326790394180886e-05, "loss": 4.9423, "step": 22300 }, { "epoch": 1.7850856136981919, "grad_norm": 3.651449680328369, "learning_rate": 2.031341926512275e-05, "loss": 5.2026, "step": 22310 }, { "epoch": 1.7858857417186749, 
"grad_norm": 3.165903329849243, "learning_rate": 2.030004813606461e-05, "loss": 5.1141, "step": 22320 }, { "epoch": 1.7866858697391583, "grad_norm": 2.8548033237457275, "learning_rate": 2.028667700700647e-05, "loss": 5.046, "step": 22330 }, { "epoch": 1.7874859977596416, "grad_norm": 3.2436907291412354, "learning_rate": 2.0273305877948333e-05, "loss": 5.1067, "step": 22340 }, { "epoch": 1.7882861257801248, "grad_norm": 3.599581241607666, "learning_rate": 2.0259934748890196e-05, "loss": 5.0487, "step": 22350 }, { "epoch": 1.7890862538006083, "grad_norm": 3.722228765487671, "learning_rate": 2.024656361983206e-05, "loss": 5.0016, "step": 22360 }, { "epoch": 1.7898863818210913, "grad_norm": 3.774880886077881, "learning_rate": 2.023319249077392e-05, "loss": 5.0675, "step": 22370 }, { "epoch": 1.7906865098415747, "grad_norm": 4.3687968254089355, "learning_rate": 2.0219821361715784e-05, "loss": 5.0152, "step": 22380 }, { "epoch": 1.791486637862058, "grad_norm": 3.880603790283203, "learning_rate": 2.0206450232657647e-05, "loss": 4.9069, "step": 22390 }, { "epoch": 1.7922867658825412, "grad_norm": 6.448166847229004, "learning_rate": 2.0193079103599506e-05, "loss": 5.0175, "step": 22400 }, { "epoch": 1.7930868939030244, "grad_norm": 3.8508150577545166, "learning_rate": 2.017970797454137e-05, "loss": 5.1245, "step": 22410 }, { "epoch": 1.7938870219235077, "grad_norm": 2.9482107162475586, "learning_rate": 2.016633684548323e-05, "loss": 4.9807, "step": 22420 }, { "epoch": 1.7946871499439911, "grad_norm": 3.987626314163208, "learning_rate": 2.0152965716425094e-05, "loss": 5.1364, "step": 22430 }, { "epoch": 1.7954872779644742, "grad_norm": 3.995668649673462, "learning_rate": 2.0139594587366957e-05, "loss": 5.0172, "step": 22440 }, { "epoch": 1.7962874059849576, "grad_norm": 3.471242904663086, "learning_rate": 2.012622345830882e-05, "loss": 5.0859, "step": 22450 }, { "epoch": 1.7970875340054409, "grad_norm": 4.026039123535156, "learning_rate": 2.0112852329250682e-05, "loss": 
5.1441, "step": 22460 }, { "epoch": 1.797887662025924, "grad_norm": 5.313076019287109, "learning_rate": 2.0099481200192545e-05, "loss": 4.9975, "step": 22470 }, { "epoch": 1.7986877900464076, "grad_norm": 3.445133924484253, "learning_rate": 2.0086110071134408e-05, "loss": 5.0569, "step": 22480 }, { "epoch": 1.7994879180668906, "grad_norm": 3.028310775756836, "learning_rate": 2.007273894207627e-05, "loss": 4.9515, "step": 22490 }, { "epoch": 1.800288046087374, "grad_norm": 3.2729883193969727, "learning_rate": 2.0059367813018133e-05, "loss": 4.9365, "step": 22500 }, { "epoch": 1.8010881741078573, "grad_norm": 2.9513375759124756, "learning_rate": 2.0045996683959996e-05, "loss": 4.8909, "step": 22510 }, { "epoch": 1.8018883021283405, "grad_norm": 3.4466264247894287, "learning_rate": 2.003262555490186e-05, "loss": 4.8748, "step": 22520 }, { "epoch": 1.802688430148824, "grad_norm": 3.1651034355163574, "learning_rate": 2.001925442584372e-05, "loss": 4.8218, "step": 22530 }, { "epoch": 1.803488558169307, "grad_norm": 2.885279893875122, "learning_rate": 2.0005883296785584e-05, "loss": 4.8861, "step": 22540 }, { "epoch": 1.8042886861897904, "grad_norm": 3.265558958053589, "learning_rate": 1.9992512167727447e-05, "loss": 5.1021, "step": 22550 }, { "epoch": 1.8050888142102737, "grad_norm": 3.3836588859558105, "learning_rate": 1.9979141038669306e-05, "loss": 4.9152, "step": 22560 }, { "epoch": 1.805888942230757, "grad_norm": 4.1803693771362305, "learning_rate": 1.996576990961117e-05, "loss": 4.9414, "step": 22570 }, { "epoch": 1.8066890702512401, "grad_norm": 3.471590995788574, "learning_rate": 1.995239878055303e-05, "loss": 4.8972, "step": 22580 }, { "epoch": 1.8074891982717234, "grad_norm": 4.42878532409668, "learning_rate": 1.9939027651494894e-05, "loss": 5.0472, "step": 22590 }, { "epoch": 1.8082893262922068, "grad_norm": 3.460707187652588, "learning_rate": 1.9925656522436757e-05, "loss": 5.1091, "step": 22600 }, { "epoch": 1.80908945431269, "grad_norm": 2.901390790939331, 
"learning_rate": 1.991228539337862e-05, "loss": 4.9029, "step": 22610 }, { "epoch": 1.8098895823331733, "grad_norm": 3.826354503631592, "learning_rate": 1.9898914264320482e-05, "loss": 5.0852, "step": 22620 }, { "epoch": 1.8106897103536566, "grad_norm": 3.9841110706329346, "learning_rate": 1.988554313526234e-05, "loss": 4.9441, "step": 22630 }, { "epoch": 1.8114898383741398, "grad_norm": 3.2412731647491455, "learning_rate": 1.9872172006204204e-05, "loss": 4.9013, "step": 22640 }, { "epoch": 1.8122899663946233, "grad_norm": 3.6046488285064697, "learning_rate": 1.9858800877146067e-05, "loss": 5.0176, "step": 22650 }, { "epoch": 1.8130900944151063, "grad_norm": 4.1147260665893555, "learning_rate": 1.984542974808793e-05, "loss": 4.8024, "step": 22660 }, { "epoch": 1.8138902224355897, "grad_norm": 3.180675983428955, "learning_rate": 1.9832058619029792e-05, "loss": 4.9099, "step": 22670 }, { "epoch": 1.814690350456073, "grad_norm": 3.9428789615631104, "learning_rate": 1.9818687489971655e-05, "loss": 5.003, "step": 22680 }, { "epoch": 1.8154904784765562, "grad_norm": 2.911067247390747, "learning_rate": 1.9805316360913517e-05, "loss": 4.9009, "step": 22690 }, { "epoch": 1.8162906064970397, "grad_norm": 3.8873329162597656, "learning_rate": 1.979194523185538e-05, "loss": 5.1122, "step": 22700 }, { "epoch": 1.8170907345175227, "grad_norm": 3.5868935585021973, "learning_rate": 1.977857410279724e-05, "loss": 4.9064, "step": 22710 }, { "epoch": 1.8178908625380061, "grad_norm": 3.8088326454162598, "learning_rate": 1.9765202973739102e-05, "loss": 5.0972, "step": 22720 }, { "epoch": 1.8186909905584894, "grad_norm": 3.4376535415649414, "learning_rate": 1.9751831844680965e-05, "loss": 4.9626, "step": 22730 }, { "epoch": 1.8194911185789726, "grad_norm": 3.151939630508423, "learning_rate": 1.9738460715622827e-05, "loss": 4.9541, "step": 22740 }, { "epoch": 1.820291246599456, "grad_norm": 4.372435569763184, "learning_rate": 1.972508958656469e-05, "loss": 4.9463, "step": 22750 }, { 
"epoch": 1.821091374619939, "grad_norm": 2.9375088214874268, "learning_rate": 1.9711718457506553e-05, "loss": 5.019, "step": 22760 }, { "epoch": 1.8218915026404225, "grad_norm": 3.458109140396118, "learning_rate": 1.9698347328448416e-05, "loss": 5.0701, "step": 22770 }, { "epoch": 1.8226916306609058, "grad_norm": 3.90632700920105, "learning_rate": 1.9684976199390278e-05, "loss": 4.9235, "step": 22780 }, { "epoch": 1.823491758681389, "grad_norm": 3.9509658813476562, "learning_rate": 1.9671605070332138e-05, "loss": 4.892, "step": 22790 }, { "epoch": 1.8242918867018723, "grad_norm": 3.8089678287506104, "learning_rate": 1.9658233941274e-05, "loss": 5.0283, "step": 22800 }, { "epoch": 1.8250920147223555, "grad_norm": 3.4844071865081787, "learning_rate": 1.9644862812215863e-05, "loss": 4.9961, "step": 22810 }, { "epoch": 1.825892142742839, "grad_norm": 2.961293935775757, "learning_rate": 1.9631491683157726e-05, "loss": 5.0996, "step": 22820 }, { "epoch": 1.826692270763322, "grad_norm": 2.8211441040039062, "learning_rate": 1.9618120554099588e-05, "loss": 4.9355, "step": 22830 }, { "epoch": 1.8274923987838054, "grad_norm": 4.970695972442627, "learning_rate": 1.960474942504145e-05, "loss": 5.0109, "step": 22840 }, { "epoch": 1.8282925268042887, "grad_norm": 3.2211780548095703, "learning_rate": 1.9591378295983314e-05, "loss": 4.9364, "step": 22850 }, { "epoch": 1.829092654824772, "grad_norm": 2.114428758621216, "learning_rate": 1.9578007166925176e-05, "loss": 4.9879, "step": 22860 }, { "epoch": 1.8298927828452554, "grad_norm": 5.383838176727295, "learning_rate": 1.956463603786704e-05, "loss": 5.0305, "step": 22870 }, { "epoch": 1.8306929108657384, "grad_norm": 3.4552454948425293, "learning_rate": 1.9551264908808902e-05, "loss": 4.946, "step": 22880 }, { "epoch": 1.8314930388862218, "grad_norm": 3.4518730640411377, "learning_rate": 1.9537893779750764e-05, "loss": 4.9089, "step": 22890 }, { "epoch": 1.832293166906705, "grad_norm": 8.194537162780762, "learning_rate": 
1.9524522650692627e-05, "loss": 4.8648, "step": 22900 }, { "epoch": 1.8330932949271883, "grad_norm": 3.472346782684326, "learning_rate": 1.951115152163449e-05, "loss": 5.0056, "step": 22910 }, { "epoch": 1.8338934229476718, "grad_norm": 2.3425405025482178, "learning_rate": 1.9497780392576352e-05, "loss": 5.0722, "step": 22920 }, { "epoch": 1.8346935509681548, "grad_norm": 4.770868301391602, "learning_rate": 1.9484409263518215e-05, "loss": 4.8792, "step": 22930 }, { "epoch": 1.8354936789886382, "grad_norm": 3.50227689743042, "learning_rate": 1.9471038134460074e-05, "loss": 4.9434, "step": 22940 }, { "epoch": 1.8362938070091215, "grad_norm": 3.5927786827087402, "learning_rate": 1.9457667005401937e-05, "loss": 4.9179, "step": 22950 }, { "epoch": 1.8370939350296047, "grad_norm": 4.16779088973999, "learning_rate": 1.94442958763438e-05, "loss": 5.1915, "step": 22960 }, { "epoch": 1.8378940630500882, "grad_norm": 2.5966103076934814, "learning_rate": 1.9430924747285663e-05, "loss": 4.9658, "step": 22970 }, { "epoch": 1.8386941910705712, "grad_norm": 2.5739660263061523, "learning_rate": 1.9417553618227525e-05, "loss": 4.9782, "step": 22980 }, { "epoch": 1.8394943190910547, "grad_norm": 2.8742406368255615, "learning_rate": 1.9404182489169388e-05, "loss": 5.0417, "step": 22990 }, { "epoch": 1.840294447111538, "grad_norm": 4.191195964813232, "learning_rate": 1.939081136011125e-05, "loss": 4.8885, "step": 23000 }, { "epoch": 1.8410945751320211, "grad_norm": 3.3833866119384766, "learning_rate": 1.9377440231053113e-05, "loss": 5.1906, "step": 23010 }, { "epoch": 1.8418947031525044, "grad_norm": 3.468492031097412, "learning_rate": 1.9364069101994973e-05, "loss": 5.0216, "step": 23020 }, { "epoch": 1.8426948311729876, "grad_norm": 3.2246339321136475, "learning_rate": 1.9350697972936835e-05, "loss": 5.0528, "step": 23030 }, { "epoch": 1.843494959193471, "grad_norm": 3.780441999435425, "learning_rate": 1.9337326843878698e-05, "loss": 4.9197, "step": 23040 }, { "epoch": 
1.844295087213954, "grad_norm": 3.0459189414978027, "learning_rate": 1.932395571482056e-05, "loss": 4.9636, "step": 23050 }, { "epoch": 1.8450952152344375, "grad_norm": 2.835489511489868, "learning_rate": 1.9310584585762423e-05, "loss": 5.044, "step": 23060 }, { "epoch": 1.8458953432549208, "grad_norm": 2.615612506866455, "learning_rate": 1.9297213456704286e-05, "loss": 4.9497, "step": 23070 }, { "epoch": 1.846695471275404, "grad_norm": 3.1220602989196777, "learning_rate": 1.928384232764615e-05, "loss": 4.8735, "step": 23080 }, { "epoch": 1.8474955992958875, "grad_norm": 4.260631561279297, "learning_rate": 1.9271808311493824e-05, "loss": 5.0581, "step": 23090 }, { "epoch": 1.8482957273163705, "grad_norm": 2.928492546081543, "learning_rate": 1.9258437182435684e-05, "loss": 4.9898, "step": 23100 }, { "epoch": 1.849095855336854, "grad_norm": 3.321458101272583, "learning_rate": 1.9245066053377546e-05, "loss": 4.9585, "step": 23110 }, { "epoch": 1.8498959833573372, "grad_norm": 3.4391822814941406, "learning_rate": 1.923169492431941e-05, "loss": 5.0762, "step": 23120 }, { "epoch": 1.8506961113778204, "grad_norm": 2.8308773040771484, "learning_rate": 1.9218323795261272e-05, "loss": 4.9746, "step": 23130 }, { "epoch": 1.8514962393983039, "grad_norm": 3.0411789417266846, "learning_rate": 1.9204952666203135e-05, "loss": 4.7202, "step": 23140 }, { "epoch": 1.852296367418787, "grad_norm": 4.386777400970459, "learning_rate": 1.9191581537144997e-05, "loss": 5.0321, "step": 23150 }, { "epoch": 1.8530964954392704, "grad_norm": 3.381636381149292, "learning_rate": 1.917821040808686e-05, "loss": 4.9549, "step": 23160 }, { "epoch": 1.8538966234597536, "grad_norm": 2.740494728088379, "learning_rate": 1.9164839279028723e-05, "loss": 5.1105, "step": 23170 }, { "epoch": 1.8546967514802368, "grad_norm": 2.8650572299957275, "learning_rate": 1.9151468149970582e-05, "loss": 4.9549, "step": 23180 }, { "epoch": 1.85549687950072, "grad_norm": 2.8108630180358887, "learning_rate": 
1.9138097020912445e-05, "loss": 4.8654, "step": 23190 }, { "epoch": 1.8562970075212033, "grad_norm": 4.914087772369385, "learning_rate": 1.9124725891854307e-05, "loss": 4.9375, "step": 23200 }, { "epoch": 1.8570971355416868, "grad_norm": 3.421011447906494, "learning_rate": 1.911135476279617e-05, "loss": 5.0395, "step": 23210 }, { "epoch": 1.85789726356217, "grad_norm": 5.41282844543457, "learning_rate": 1.9097983633738033e-05, "loss": 4.9296, "step": 23220 }, { "epoch": 1.8586973915826532, "grad_norm": 5.05392599105835, "learning_rate": 1.9084612504679895e-05, "loss": 5.1467, "step": 23230 }, { "epoch": 1.8594975196031365, "grad_norm": 2.838409185409546, "learning_rate": 1.9071241375621758e-05, "loss": 4.7497, "step": 23240 }, { "epoch": 1.8602976476236197, "grad_norm": 3.0737788677215576, "learning_rate": 1.905787024656362e-05, "loss": 4.9982, "step": 23250 }, { "epoch": 1.8610977756441032, "grad_norm": 2.8832297325134277, "learning_rate": 1.9044499117505483e-05, "loss": 5.0327, "step": 23260 }, { "epoch": 1.8618979036645862, "grad_norm": 3.0281708240509033, "learning_rate": 1.9031127988447346e-05, "loss": 5.0735, "step": 23270 }, { "epoch": 1.8626980316850696, "grad_norm": 2.8153724670410156, "learning_rate": 1.901775685938921e-05, "loss": 4.97, "step": 23280 }, { "epoch": 1.8634981597055529, "grad_norm": 3.8780734539031982, "learning_rate": 1.900438573033107e-05, "loss": 5.1535, "step": 23290 }, { "epoch": 1.8642982877260361, "grad_norm": 3.875718832015991, "learning_rate": 1.8991014601272934e-05, "loss": 5.0427, "step": 23300 }, { "epoch": 1.8650984157465196, "grad_norm": 5.313499927520752, "learning_rate": 1.8977643472214797e-05, "loss": 5.0147, "step": 23310 }, { "epoch": 1.8658985437670026, "grad_norm": 3.9303500652313232, "learning_rate": 1.896427234315666e-05, "loss": 4.9529, "step": 23320 }, { "epoch": 1.866698671787486, "grad_norm": 5.089270114898682, "learning_rate": 1.895090121409852e-05, "loss": 4.7841, "step": 23330 }, { "epoch": 1.8674987998079693, 
"grad_norm": 4.434947490692139, "learning_rate": 1.893753008504038e-05, "loss": 4.9213, "step": 23340 }, { "epoch": 1.8682989278284525, "grad_norm": 3.5199437141418457, "learning_rate": 1.8924158955982244e-05, "loss": 4.9473, "step": 23350 }, { "epoch": 1.869099055848936, "grad_norm": 2.435863494873047, "learning_rate": 1.8910787826924107e-05, "loss": 4.9539, "step": 23360 }, { "epoch": 1.869899183869419, "grad_norm": 3.309080123901367, "learning_rate": 1.889741669786597e-05, "loss": 5.1178, "step": 23370 }, { "epoch": 1.8706993118899025, "grad_norm": 3.9439151287078857, "learning_rate": 1.8884045568807832e-05, "loss": 4.8227, "step": 23380 }, { "epoch": 1.8714994399103857, "grad_norm": 4.685158729553223, "learning_rate": 1.8870674439749695e-05, "loss": 4.9227, "step": 23390 }, { "epoch": 1.872299567930869, "grad_norm": 3.316544532775879, "learning_rate": 1.8857303310691558e-05, "loss": 4.9866, "step": 23400 }, { "epoch": 1.8730996959513522, "grad_norm": 2.861067056655884, "learning_rate": 1.8843932181633417e-05, "loss": 4.9267, "step": 23410 }, { "epoch": 1.8738998239718354, "grad_norm": 3.334657907485962, "learning_rate": 1.883056105257528e-05, "loss": 5.0008, "step": 23420 }, { "epoch": 1.8746999519923189, "grad_norm": 3.105860948562622, "learning_rate": 1.8817189923517142e-05, "loss": 4.9593, "step": 23430 }, { "epoch": 1.8755000800128019, "grad_norm": 4.091304779052734, "learning_rate": 1.8803818794459005e-05, "loss": 4.9262, "step": 23440 }, { "epoch": 1.8763002080332853, "grad_norm": 2.4841034412384033, "learning_rate": 1.8790447665400868e-05, "loss": 4.9754, "step": 23450 }, { "epoch": 1.8771003360537686, "grad_norm": 3.671037435531616, "learning_rate": 1.877707653634273e-05, "loss": 4.9462, "step": 23460 }, { "epoch": 1.8779004640742518, "grad_norm": 3.1614885330200195, "learning_rate": 1.8763705407284593e-05, "loss": 5.0635, "step": 23470 }, { "epoch": 1.8787005920947353, "grad_norm": 3.2696330547332764, "learning_rate": 1.8750334278226456e-05, "loss": 
4.9501, "step": 23480 }, { "epoch": 1.8795007201152183, "grad_norm": 4.031539440155029, "learning_rate": 1.8736963149168315e-05, "loss": 4.9381, "step": 23490 }, { "epoch": 1.8803008481357018, "grad_norm": 3.9160051345825195, "learning_rate": 1.8723592020110178e-05, "loss": 5.0243, "step": 23500 }, { "epoch": 1.881100976156185, "grad_norm": 4.479321479797363, "learning_rate": 1.871022089105204e-05, "loss": 4.9724, "step": 23510 }, { "epoch": 1.8819011041766682, "grad_norm": 3.626620054244995, "learning_rate": 1.8696849761993903e-05, "loss": 4.9047, "step": 23520 }, { "epoch": 1.8827012321971517, "grad_norm": 3.6235501766204834, "learning_rate": 1.8683478632935766e-05, "loss": 4.9176, "step": 23530 }, { "epoch": 1.8835013602176347, "grad_norm": 3.4153671264648438, "learning_rate": 1.867010750387763e-05, "loss": 5.1204, "step": 23540 }, { "epoch": 1.8843014882381182, "grad_norm": 2.90492844581604, "learning_rate": 1.865673637481949e-05, "loss": 4.9843, "step": 23550 }, { "epoch": 1.8851016162586014, "grad_norm": 4.847935199737549, "learning_rate": 1.864336524576135e-05, "loss": 4.9955, "step": 23560 }, { "epoch": 1.8859017442790846, "grad_norm": 3.3774425983428955, "learning_rate": 1.8629994116703213e-05, "loss": 5.0434, "step": 23570 }, { "epoch": 1.886701872299568, "grad_norm": 4.005410671234131, "learning_rate": 1.8616622987645076e-05, "loss": 4.943, "step": 23580 }, { "epoch": 1.887502000320051, "grad_norm": 3.4165306091308594, "learning_rate": 1.860325185858694e-05, "loss": 4.7749, "step": 23590 }, { "epoch": 1.8883021283405346, "grad_norm": 4.466346263885498, "learning_rate": 1.85898807295288e-05, "loss": 4.953, "step": 23600 }, { "epoch": 1.8891022563610178, "grad_norm": 3.477444887161255, "learning_rate": 1.8576509600470664e-05, "loss": 5.0208, "step": 23610 }, { "epoch": 1.889902384381501, "grad_norm": 3.3577370643615723, "learning_rate": 1.8563138471412527e-05, "loss": 4.9861, "step": 23620 }, { "epoch": 1.8907025124019843, "grad_norm": 5.305810451507568, 
"learning_rate": 1.854976734235439e-05, "loss": 5.2137, "step": 23630 }, { "epoch": 1.8915026404224675, "grad_norm": 2.8031234741210938, "learning_rate": 1.853639621329625e-05, "loss": 5.0322, "step": 23640 }, { "epoch": 1.892302768442951, "grad_norm": 2.6856045722961426, "learning_rate": 1.852302508423811e-05, "loss": 4.7417, "step": 23650 }, { "epoch": 1.893102896463434, "grad_norm": 3.518064498901367, "learning_rate": 1.8509653955179974e-05, "loss": 4.9908, "step": 23660 }, { "epoch": 1.8939030244839175, "grad_norm": 4.442662239074707, "learning_rate": 1.8496282826121837e-05, "loss": 5.0058, "step": 23670 }, { "epoch": 1.8947031525044007, "grad_norm": 3.661250352859497, "learning_rate": 1.84829116970637e-05, "loss": 4.9123, "step": 23680 }, { "epoch": 1.895503280524884, "grad_norm": 2.6517558097839355, "learning_rate": 1.8469540568005562e-05, "loss": 4.9644, "step": 23690 }, { "epoch": 1.8963034085453674, "grad_norm": 2.9907848834991455, "learning_rate": 1.8456169438947425e-05, "loss": 4.8955, "step": 23700 }, { "epoch": 1.8971035365658504, "grad_norm": 3.4989070892333984, "learning_rate": 1.8442798309889288e-05, "loss": 4.9782, "step": 23710 }, { "epoch": 1.8979036645863339, "grad_norm": 3.2629289627075195, "learning_rate": 1.842942718083115e-05, "loss": 5.073, "step": 23720 }, { "epoch": 1.898703792606817, "grad_norm": 2.9857161045074463, "learning_rate": 1.8416056051773013e-05, "loss": 5.022, "step": 23730 }, { "epoch": 1.8995039206273003, "grad_norm": 3.6350457668304443, "learning_rate": 1.8402684922714876e-05, "loss": 5.0102, "step": 23740 }, { "epoch": 1.9003040486477838, "grad_norm": 3.987959146499634, "learning_rate": 1.838931379365674e-05, "loss": 4.8519, "step": 23750 }, { "epoch": 1.9011041766682668, "grad_norm": 2.2430574893951416, "learning_rate": 1.83759426645986e-05, "loss": 4.9962, "step": 23760 }, { "epoch": 1.9019043046887503, "grad_norm": 2.8868260383605957, "learning_rate": 1.8362571535540464e-05, "loss": 5.116, "step": 23770 }, { "epoch": 
1.9027044327092335, "grad_norm": 3.6265523433685303, "learning_rate": 1.8349200406482326e-05, "loss": 4.9706, "step": 23780 }, { "epoch": 1.9035045607297167, "grad_norm": 4.022701740264893, "learning_rate": 1.8335829277424186e-05, "loss": 4.9531, "step": 23790 }, { "epoch": 1.9043046887502, "grad_norm": 3.127108573913574, "learning_rate": 1.832245814836605e-05, "loss": 5.0577, "step": 23800 }, { "epoch": 1.9051048167706832, "grad_norm": 6.9320502281188965, "learning_rate": 1.830908701930791e-05, "loss": 4.8681, "step": 23810 }, { "epoch": 1.9059049447911667, "grad_norm": 4.856078624725342, "learning_rate": 1.8295715890249774e-05, "loss": 5.0294, "step": 23820 }, { "epoch": 1.90670507281165, "grad_norm": 3.244516372680664, "learning_rate": 1.8282344761191636e-05, "loss": 4.9191, "step": 23830 }, { "epoch": 1.9075052008321332, "grad_norm": 3.1606297492980957, "learning_rate": 1.82689736321335e-05, "loss": 5.0632, "step": 23840 }, { "epoch": 1.9083053288526164, "grad_norm": 4.155543804168701, "learning_rate": 1.8255602503075362e-05, "loss": 5.1398, "step": 23850 }, { "epoch": 1.9091054568730996, "grad_norm": 2.770430088043213, "learning_rate": 1.8242231374017225e-05, "loss": 5.0403, "step": 23860 }, { "epoch": 1.909905584893583, "grad_norm": 3.0609514713287354, "learning_rate": 1.8228860244959084e-05, "loss": 4.8498, "step": 23870 }, { "epoch": 1.910705712914066, "grad_norm": 3.4689247608184814, "learning_rate": 1.8215489115900947e-05, "loss": 4.9944, "step": 23880 }, { "epoch": 1.9115058409345496, "grad_norm": 3.3922059535980225, "learning_rate": 1.820211798684281e-05, "loss": 4.7982, "step": 23890 }, { "epoch": 1.9123059689550328, "grad_norm": 3.3366963863372803, "learning_rate": 1.8188746857784672e-05, "loss": 4.9941, "step": 23900 }, { "epoch": 1.913106096975516, "grad_norm": 2.8962085247039795, "learning_rate": 1.8175375728726535e-05, "loss": 4.7124, "step": 23910 }, { "epoch": 1.9139062249959995, "grad_norm": 6.0768141746521, "learning_rate": 
1.8162004599668397e-05, "loss": 4.9149, "step": 23920 }, { "epoch": 1.9147063530164825, "grad_norm": 3.3171348571777344, "learning_rate": 1.814863347061026e-05, "loss": 4.9691, "step": 23930 }, { "epoch": 1.915506481036966, "grad_norm": 4.5769453048706055, "learning_rate": 1.8135262341552123e-05, "loss": 4.8855, "step": 23940 }, { "epoch": 1.9163066090574492, "grad_norm": 3.704608917236328, "learning_rate": 1.8121891212493982e-05, "loss": 5.0219, "step": 23950 }, { "epoch": 1.9171067370779324, "grad_norm": 4.647409439086914, "learning_rate": 1.8108520083435845e-05, "loss": 4.9864, "step": 23960 }, { "epoch": 1.917906865098416, "grad_norm": 3.0767157077789307, "learning_rate": 1.8095148954377707e-05, "loss": 4.9873, "step": 23970 }, { "epoch": 1.918706993118899, "grad_norm": 3.1951663494110107, "learning_rate": 1.808177782531957e-05, "loss": 4.9565, "step": 23980 }, { "epoch": 1.9195071211393824, "grad_norm": 3.0078985691070557, "learning_rate": 1.8068406696261433e-05, "loss": 5.0643, "step": 23990 }, { "epoch": 1.9203072491598656, "grad_norm": 2.8039395809173584, "learning_rate": 1.8055035567203295e-05, "loss": 4.968, "step": 24000 }, { "epoch": 1.9211073771803489, "grad_norm": 2.70816969871521, "learning_rate": 1.8041664438145158e-05, "loss": 5.0067, "step": 24010 }, { "epoch": 1.921907505200832, "grad_norm": 3.6637299060821533, "learning_rate": 1.802829330908702e-05, "loss": 5.0332, "step": 24020 }, { "epoch": 1.9227076332213153, "grad_norm": 2.8274829387664795, "learning_rate": 1.801492218002888e-05, "loss": 4.8953, "step": 24030 }, { "epoch": 1.9235077612417988, "grad_norm": 3.530215263366699, "learning_rate": 1.8001551050970743e-05, "loss": 4.9168, "step": 24040 }, { "epoch": 1.9243078892622818, "grad_norm": 2.753448724746704, "learning_rate": 1.7988179921912605e-05, "loss": 5.016, "step": 24050 }, { "epoch": 1.9251080172827653, "grad_norm": 3.4737961292266846, "learning_rate": 1.7974808792854468e-05, "loss": 4.9456, "step": 24060 }, { "epoch": 
1.9259081453032485, "grad_norm": 3.040010452270508, "learning_rate": 1.796143766379633e-05, "loss": 4.903, "step": 24070 }, { "epoch": 1.9267082733237317, "grad_norm": 2.961254835128784, "learning_rate": 1.7948066534738194e-05, "loss": 5.0527, "step": 24080 }, { "epoch": 1.9275084013442152, "grad_norm": 2.690537929534912, "learning_rate": 1.7934695405680056e-05, "loss": 4.8845, "step": 24090 }, { "epoch": 1.9283085293646982, "grad_norm": 3.9988303184509277, "learning_rate": 1.792132427662192e-05, "loss": 5.0803, "step": 24100 }, { "epoch": 1.9291086573851817, "grad_norm": 2.8897204399108887, "learning_rate": 1.790795314756378e-05, "loss": 4.9407, "step": 24110 }, { "epoch": 1.929908785405665, "grad_norm": 4.699467182159424, "learning_rate": 1.7894582018505644e-05, "loss": 4.9493, "step": 24120 }, { "epoch": 1.9307089134261481, "grad_norm": 4.01737117767334, "learning_rate": 1.7881210889447507e-05, "loss": 4.9465, "step": 24130 }, { "epoch": 1.9315090414466316, "grad_norm": 2.703599214553833, "learning_rate": 1.786783976038937e-05, "loss": 4.9857, "step": 24140 }, { "epoch": 1.9323091694671146, "grad_norm": 2.5545291900634766, "learning_rate": 1.7854468631331232e-05, "loss": 5.0033, "step": 24150 }, { "epoch": 1.933109297487598, "grad_norm": 3.572033166885376, "learning_rate": 1.7841097502273095e-05, "loss": 4.9381, "step": 24160 }, { "epoch": 1.9339094255080813, "grad_norm": 4.481420516967773, "learning_rate": 1.7827726373214958e-05, "loss": 4.976, "step": 24170 }, { "epoch": 1.9347095535285646, "grad_norm": 8.394909858703613, "learning_rate": 1.7814355244156817e-05, "loss": 4.871, "step": 24180 }, { "epoch": 1.935509681549048, "grad_norm": 3.418012857437134, "learning_rate": 1.780098411509868e-05, "loss": 4.8412, "step": 24190 }, { "epoch": 1.936309809569531, "grad_norm": 4.226028919219971, "learning_rate": 1.7787612986040542e-05, "loss": 4.9214, "step": 24200 }, { "epoch": 1.9371099375900145, "grad_norm": 3.3171331882476807, "learning_rate": 
1.7774241856982405e-05, "loss": 4.8912, "step": 24210 }, { "epoch": 1.9379100656104977, "grad_norm": 2.7133543491363525, "learning_rate": 1.7760870727924268e-05, "loss": 5.067, "step": 24220 }, { "epoch": 1.938710193630981, "grad_norm": 3.8669393062591553, "learning_rate": 1.774749959886613e-05, "loss": 4.9646, "step": 24230 }, { "epoch": 1.9395103216514642, "grad_norm": 3.915174722671509, "learning_rate": 1.7734128469807993e-05, "loss": 4.9525, "step": 24240 }, { "epoch": 1.9403104496719474, "grad_norm": 4.278127193450928, "learning_rate": 1.7720757340749856e-05, "loss": 4.9979, "step": 24250 }, { "epoch": 1.941110577692431, "grad_norm": 4.306387901306152, "learning_rate": 1.7707386211691715e-05, "loss": 4.8818, "step": 24260 }, { "epoch": 1.941910705712914, "grad_norm": 3.726982831954956, "learning_rate": 1.7694015082633578e-05, "loss": 5.0455, "step": 24270 }, { "epoch": 1.9427108337333974, "grad_norm": 4.664205551147461, "learning_rate": 1.768064395357544e-05, "loss": 5.043, "step": 24280 }, { "epoch": 1.9435109617538806, "grad_norm": 3.069760799407959, "learning_rate": 1.7667272824517303e-05, "loss": 4.9116, "step": 24290 }, { "epoch": 1.9443110897743638, "grad_norm": 3.658348798751831, "learning_rate": 1.7653901695459166e-05, "loss": 4.858, "step": 24300 }, { "epoch": 1.9451112177948473, "grad_norm": 2.518824338912964, "learning_rate": 1.764053056640103e-05, "loss": 4.8353, "step": 24310 }, { "epoch": 1.9459113458153303, "grad_norm": 3.897017478942871, "learning_rate": 1.762715943734289e-05, "loss": 4.9834, "step": 24320 }, { "epoch": 1.9467114738358138, "grad_norm": 3.2303273677825928, "learning_rate": 1.7613788308284754e-05, "loss": 5.0684, "step": 24330 }, { "epoch": 1.947511601856297, "grad_norm": 3.314673662185669, "learning_rate": 1.7600417179226613e-05, "loss": 4.9663, "step": 24340 }, { "epoch": 1.9483117298767803, "grad_norm": 2.891897201538086, "learning_rate": 1.7587046050168476e-05, "loss": 4.8986, "step": 24350 }, { "epoch": 1.9491118578972637, 
"grad_norm": 2.976445436477661, "learning_rate": 1.757367492111034e-05, "loss": 5.0788, "step": 24360 }, { "epoch": 1.9499119859177467, "grad_norm": 2.883258819580078, "learning_rate": 1.75603037920522e-05, "loss": 5.03, "step": 24370 }, { "epoch": 1.9507121139382302, "grad_norm": 3.258368968963623, "learning_rate": 1.7546932662994064e-05, "loss": 4.9691, "step": 24380 }, { "epoch": 1.9515122419587134, "grad_norm": 3.8020787239074707, "learning_rate": 1.7533561533935927e-05, "loss": 4.9491, "step": 24390 }, { "epoch": 1.9523123699791967, "grad_norm": 3.3602609634399414, "learning_rate": 1.752019040487779e-05, "loss": 4.8515, "step": 24400 }, { "epoch": 1.95311249799968, "grad_norm": 3.7602756023406982, "learning_rate": 1.750681927581965e-05, "loss": 4.9146, "step": 24410 }, { "epoch": 1.9539126260201631, "grad_norm": 3.219118356704712, "learning_rate": 1.749344814676151e-05, "loss": 4.9882, "step": 24420 }, { "epoch": 1.9547127540406466, "grad_norm": 3.1614081859588623, "learning_rate": 1.7480077017703374e-05, "loss": 4.9193, "step": 24430 }, { "epoch": 1.9555128820611298, "grad_norm": 3.2397539615631104, "learning_rate": 1.7466705888645237e-05, "loss": 5.0602, "step": 24440 }, { "epoch": 1.956313010081613, "grad_norm": 2.708376169204712, "learning_rate": 1.74533347595871e-05, "loss": 4.8835, "step": 24450 }, { "epoch": 1.9571131381020963, "grad_norm": 2.535634994506836, "learning_rate": 1.7439963630528962e-05, "loss": 4.9571, "step": 24460 }, { "epoch": 1.9579132661225795, "grad_norm": 3.6022346019744873, "learning_rate": 1.7426592501470825e-05, "loss": 4.8711, "step": 24470 }, { "epoch": 1.958713394143063, "grad_norm": 3.2954986095428467, "learning_rate": 1.7413221372412688e-05, "loss": 4.8985, "step": 24480 }, { "epoch": 1.959513522163546, "grad_norm": 4.4332122802734375, "learning_rate": 1.7399850243354547e-05, "loss": 4.9637, "step": 24490 }, { "epoch": 1.9603136501840295, "grad_norm": 3.2086029052734375, "learning_rate": 1.738647911429641e-05, "loss": 4.9817, 
"step": 24500 }, { "epoch": 1.9611137782045127, "grad_norm": 3.447162389755249, "learning_rate": 1.7373107985238272e-05, "loss": 4.8359, "step": 24510 }, { "epoch": 1.961913906224996, "grad_norm": 3.8578414916992188, "learning_rate": 1.7359736856180135e-05, "loss": 4.9379, "step": 24520 }, { "epoch": 1.9627140342454794, "grad_norm": 3.037017822265625, "learning_rate": 1.7346365727121998e-05, "loss": 5.1682, "step": 24530 }, { "epoch": 1.9635141622659624, "grad_norm": 2.9559504985809326, "learning_rate": 1.733299459806386e-05, "loss": 4.8014, "step": 24540 }, { "epoch": 1.9643142902864459, "grad_norm": 4.131083011627197, "learning_rate": 1.7319623469005723e-05, "loss": 4.9633, "step": 24550 }, { "epoch": 1.9651144183069291, "grad_norm": 3.3252639770507812, "learning_rate": 1.7306252339947586e-05, "loss": 5.0054, "step": 24560 }, { "epoch": 1.9659145463274124, "grad_norm": 3.904116153717041, "learning_rate": 1.729288121088945e-05, "loss": 4.971, "step": 24570 }, { "epoch": 1.9667146743478958, "grad_norm": 3.152641773223877, "learning_rate": 1.727951008183131e-05, "loss": 5.1016, "step": 24580 }, { "epoch": 1.9675148023683788, "grad_norm": 3.344860315322876, "learning_rate": 1.7266138952773174e-05, "loss": 4.9248, "step": 24590 }, { "epoch": 1.9683149303888623, "grad_norm": 2.9340274333953857, "learning_rate": 1.7252767823715036e-05, "loss": 4.987, "step": 24600 }, { "epoch": 1.9691150584093455, "grad_norm": 2.496817111968994, "learning_rate": 1.72393966946569e-05, "loss": 5.031, "step": 24610 }, { "epoch": 1.9699151864298288, "grad_norm": 3.044074296951294, "learning_rate": 1.7226025565598762e-05, "loss": 4.9457, "step": 24620 }, { "epoch": 1.970715314450312, "grad_norm": 4.735547065734863, "learning_rate": 1.7212654436540625e-05, "loss": 4.8686, "step": 24630 }, { "epoch": 1.9715154424707952, "grad_norm": 3.4390697479248047, "learning_rate": 1.7199283307482484e-05, "loss": 4.9262, "step": 24640 }, { "epoch": 1.9723155704912787, "grad_norm": 3.429409980773926, 
"learning_rate": 1.7185912178424347e-05, "loss": 5.0453, "step": 24650 }, { "epoch": 1.9731156985117617, "grad_norm": 5.1396942138671875, "learning_rate": 1.717254104936621e-05, "loss": 4.8638, "step": 24660 }, { "epoch": 1.9739158265322452, "grad_norm": 3.099233388900757, "learning_rate": 1.7159169920308072e-05, "loss": 5.0777, "step": 24670 }, { "epoch": 1.9747159545527284, "grad_norm": 6.756525993347168, "learning_rate": 1.7145798791249935e-05, "loss": 4.9537, "step": 24680 }, { "epoch": 1.9755160825732117, "grad_norm": 2.5224523544311523, "learning_rate": 1.7132427662191797e-05, "loss": 5.0601, "step": 24690 }, { "epoch": 1.976316210593695, "grad_norm": 2.7391388416290283, "learning_rate": 1.711905653313366e-05, "loss": 4.8926, "step": 24700 }, { "epoch": 1.9771163386141781, "grad_norm": 3.1466548442840576, "learning_rate": 1.7105685404075523e-05, "loss": 4.9819, "step": 24710 }, { "epoch": 1.9779164666346616, "grad_norm": 3.471501588821411, "learning_rate": 1.7092314275017382e-05, "loss": 5.0699, "step": 24720 }, { "epoch": 1.9787165946551448, "grad_norm": 4.275609970092773, "learning_rate": 1.7078943145959245e-05, "loss": 4.7845, "step": 24730 }, { "epoch": 1.979516722675628, "grad_norm": 3.1587014198303223, "learning_rate": 1.7065572016901107e-05, "loss": 5.0001, "step": 24740 }, { "epoch": 1.9803168506961115, "grad_norm": 5.504805088043213, "learning_rate": 1.705220088784297e-05, "loss": 4.8489, "step": 24750 }, { "epoch": 1.9811169787165945, "grad_norm": 4.1532206535339355, "learning_rate": 1.7038829758784833e-05, "loss": 4.9888, "step": 24760 }, { "epoch": 1.981917106737078, "grad_norm": 3.094951629638672, "learning_rate": 1.7025458629726695e-05, "loss": 4.9468, "step": 24770 }, { "epoch": 1.9827172347575612, "grad_norm": 5.671091556549072, "learning_rate": 1.7012087500668558e-05, "loss": 5.0144, "step": 24780 }, { "epoch": 1.9835173627780445, "grad_norm": 4.130462646484375, "learning_rate": 1.699871637161042e-05, "loss": 4.9417, "step": 24790 }, { 
"epoch": 1.984317490798528, "grad_norm": 6.027336597442627, "learning_rate": 1.698534524255228e-05, "loss": 4.8084, "step": 24800 }, { "epoch": 1.985117618819011, "grad_norm": 2.759535312652588, "learning_rate": 1.6971974113494143e-05, "loss": 4.962, "step": 24810 }, { "epoch": 1.9859177468394944, "grad_norm": 2.916520833969116, "learning_rate": 1.6958602984436006e-05, "loss": 4.9052, "step": 24820 }, { "epoch": 1.9867178748599776, "grad_norm": 3.448692560195923, "learning_rate": 1.6945231855377868e-05, "loss": 4.8362, "step": 24830 }, { "epoch": 1.9875180028804609, "grad_norm": 2.9183404445648193, "learning_rate": 1.693186072631973e-05, "loss": 4.9522, "step": 24840 }, { "epoch": 1.9883181309009441, "grad_norm": 2.444122076034546, "learning_rate": 1.6918489597261594e-05, "loss": 5.0336, "step": 24850 }, { "epoch": 1.9891182589214274, "grad_norm": 5.753563404083252, "learning_rate": 1.6905118468203456e-05, "loss": 4.9037, "step": 24860 }, { "epoch": 1.9899183869419108, "grad_norm": 2.641191244125366, "learning_rate": 1.689174733914532e-05, "loss": 4.795, "step": 24870 }, { "epoch": 1.9907185149623938, "grad_norm": 2.918086290359497, "learning_rate": 1.6878376210087178e-05, "loss": 4.901, "step": 24880 }, { "epoch": 1.9915186429828773, "grad_norm": 3.7362258434295654, "learning_rate": 1.686500508102904e-05, "loss": 4.9758, "step": 24890 }, { "epoch": 1.9923187710033605, "grad_norm": 3.5491786003112793, "learning_rate": 1.6851633951970904e-05, "loss": 5.126, "step": 24900 }, { "epoch": 1.9931188990238438, "grad_norm": 2.8364782333374023, "learning_rate": 1.6838262822912766e-05, "loss": 5.0067, "step": 24910 }, { "epoch": 1.9939190270443272, "grad_norm": 4.282176971435547, "learning_rate": 1.682489169385463e-05, "loss": 4.8921, "step": 24920 }, { "epoch": 1.9947191550648102, "grad_norm": 4.440709590911865, "learning_rate": 1.6811520564796492e-05, "loss": 5.0284, "step": 24930 }, { "epoch": 1.9955192830852937, "grad_norm": 3.450624704360962, "learning_rate": 
1.6798149435738354e-05, "loss": 5.0158, "step": 24940 }, { "epoch": 1.996319411105777, "grad_norm": 3.278850793838501, "learning_rate": 1.6784778306680217e-05, "loss": 5.0562, "step": 24950 }, { "epoch": 1.9971195391262602, "grad_norm": 3.931243658065796, "learning_rate": 1.677140717762208e-05, "loss": 4.966, "step": 24960 }, { "epoch": 1.9979196671467436, "grad_norm": 4.377293586730957, "learning_rate": 1.6758036048563942e-05, "loss": 4.8362, "step": 24970 }, { "epoch": 1.9987197951672266, "grad_norm": 3.4111695289611816, "learning_rate": 1.6744664919505805e-05, "loss": 5.1388, "step": 24980 }, { "epoch": 1.99951992318771, "grad_norm": 3.131072521209717, "learning_rate": 1.6731293790447668e-05, "loss": 4.8877, "step": 24990 }, { "epoch": 2.000320051208193, "grad_norm": 3.912254571914673, "learning_rate": 1.671792266138953e-05, "loss": 4.8321, "step": 25000 }, { "epoch": 2.0011201792286766, "grad_norm": 3.7595908641815186, "learning_rate": 1.6704551532331393e-05, "loss": 4.338, "step": 25010 }, { "epoch": 2.00192030724916, "grad_norm": 5.618625640869141, "learning_rate": 1.6691180403273256e-05, "loss": 4.3945, "step": 25020 }, { "epoch": 2.002720435269643, "grad_norm": 5.2287468910217285, "learning_rate": 1.6677809274215115e-05, "loss": 4.2806, "step": 25030 }, { "epoch": 2.0035205632901265, "grad_norm": 3.5416719913482666, "learning_rate": 1.6664438145156978e-05, "loss": 4.3418, "step": 25040 }, { "epoch": 2.0043206913106095, "grad_norm": 4.5456109046936035, "learning_rate": 1.665106701609884e-05, "loss": 4.2207, "step": 25050 }, { "epoch": 2.005120819331093, "grad_norm": 4.272172927856445, "learning_rate": 1.6637695887040703e-05, "loss": 4.2139, "step": 25060 }, { "epoch": 2.0059209473515764, "grad_norm": 4.0079121589660645, "learning_rate": 1.6624324757982566e-05, "loss": 4.4569, "step": 25070 }, { "epoch": 2.0067210753720595, "grad_norm": 5.050939083099365, "learning_rate": 1.661095362892443e-05, "loss": 4.4224, "step": 25080 }, { "epoch": 2.007521203392543, 
"grad_norm": 3.5307321548461914, "learning_rate": 1.659758249986629e-05, "loss": 4.307, "step": 25090 }, { "epoch": 2.008321331413026, "grad_norm": 4.467302322387695, "learning_rate": 1.6584211370808154e-05, "loss": 4.304, "step": 25100 }, { "epoch": 2.0091214594335094, "grad_norm": 5.906796455383301, "learning_rate": 1.6570840241750013e-05, "loss": 4.2925, "step": 25110 }, { "epoch": 2.009921587453993, "grad_norm": 4.336019039154053, "learning_rate": 1.6557469112691876e-05, "loss": 4.1098, "step": 25120 }, { "epoch": 2.010721715474476, "grad_norm": 6.186092376708984, "learning_rate": 1.654409798363374e-05, "loss": 4.384, "step": 25130 }, { "epoch": 2.0115218434949593, "grad_norm": 6.390085220336914, "learning_rate": 1.65307268545756e-05, "loss": 4.1767, "step": 25140 }, { "epoch": 2.0123219715154423, "grad_norm": 5.528529167175293, "learning_rate": 1.6517355725517464e-05, "loss": 4.2967, "step": 25150 }, { "epoch": 2.013122099535926, "grad_norm": 4.196529865264893, "learning_rate": 1.6503984596459327e-05, "loss": 4.3129, "step": 25160 }, { "epoch": 2.013922227556409, "grad_norm": 4.266825199127197, "learning_rate": 1.649061346740119e-05, "loss": 4.1665, "step": 25170 }, { "epoch": 2.0147223555768923, "grad_norm": 3.9628491401672363, "learning_rate": 1.647724233834305e-05, "loss": 4.2982, "step": 25180 }, { "epoch": 2.0155224835973757, "grad_norm": 5.902552604675293, "learning_rate": 1.646387120928491e-05, "loss": 4.2788, "step": 25190 }, { "epoch": 2.0163226116178588, "grad_norm": 4.918710708618164, "learning_rate": 1.6450500080226774e-05, "loss": 4.4017, "step": 25200 }, { "epoch": 2.017122739638342, "grad_norm": 5.401872158050537, "learning_rate": 1.6437128951168637e-05, "loss": 4.3384, "step": 25210 }, { "epoch": 2.0179228676588252, "grad_norm": 4.330984592437744, "learning_rate": 1.64237578221105e-05, "loss": 4.2743, "step": 25220 }, { "epoch": 2.0187229956793087, "grad_norm": 4.99106502532959, "learning_rate": 1.6410386693052362e-05, "loss": 4.3562, "step": 
25230 }, { "epoch": 2.019523123699792, "grad_norm": 5.204268455505371, "learning_rate": 1.6397015563994225e-05, "loss": 4.325, "step": 25240 }, { "epoch": 2.020323251720275, "grad_norm": 4.139738082885742, "learning_rate": 1.6383644434936088e-05, "loss": 4.3247, "step": 25250 }, { "epoch": 2.0211233797407586, "grad_norm": 4.763280868530273, "learning_rate": 1.6370273305877947e-05, "loss": 4.2754, "step": 25260 }, { "epoch": 2.0219235077612416, "grad_norm": 4.396899223327637, "learning_rate": 1.635690217681981e-05, "loss": 4.4056, "step": 25270 }, { "epoch": 2.022723635781725, "grad_norm": 5.1739702224731445, "learning_rate": 1.6343531047761672e-05, "loss": 4.2894, "step": 25280 }, { "epoch": 2.0235237638022086, "grad_norm": 3.864838123321533, "learning_rate": 1.6330159918703535e-05, "loss": 4.4537, "step": 25290 }, { "epoch": 2.0243238918226916, "grad_norm": 5.35354471206665, "learning_rate": 1.6316788789645398e-05, "loss": 4.2356, "step": 25300 }, { "epoch": 2.025124019843175, "grad_norm": 7.268294811248779, "learning_rate": 1.630341766058726e-05, "loss": 4.3507, "step": 25310 }, { "epoch": 2.025924147863658, "grad_norm": 3.9637956619262695, "learning_rate": 1.6290046531529123e-05, "loss": 4.3009, "step": 25320 }, { "epoch": 2.0267242758841415, "grad_norm": 4.4494476318359375, "learning_rate": 1.6276675402470986e-05, "loss": 4.1836, "step": 25330 }, { "epoch": 2.027524403904625, "grad_norm": 4.388992786407471, "learning_rate": 1.6263304273412845e-05, "loss": 4.3256, "step": 25340 }, { "epoch": 2.028324531925108, "grad_norm": 6.115511417388916, "learning_rate": 1.6249933144354708e-05, "loss": 4.3195, "step": 25350 }, { "epoch": 2.0291246599455914, "grad_norm": 5.367356300354004, "learning_rate": 1.623656201529657e-05, "loss": 4.333, "step": 25360 }, { "epoch": 2.0299247879660745, "grad_norm": 6.696788787841797, "learning_rate": 1.6223190886238433e-05, "loss": 4.1472, "step": 25370 }, { "epoch": 2.030724915986558, "grad_norm": 5.665947914123535, "learning_rate": 
1.6209819757180296e-05, "loss": 4.4066, "step": 25380 }, { "epoch": 2.031525044007041, "grad_norm": 5.398012638092041, "learning_rate": 1.619644862812216e-05, "loss": 4.2199, "step": 25390 }, { "epoch": 2.0323251720275244, "grad_norm": 6.494447708129883, "learning_rate": 1.618307749906402e-05, "loss": 4.1575, "step": 25400 }, { "epoch": 2.033125300048008, "grad_norm": 4.272920608520508, "learning_rate": 1.6169706370005884e-05, "loss": 4.2159, "step": 25410 }, { "epoch": 2.033925428068491, "grad_norm": 5.143256187438965, "learning_rate": 1.6156335240947747e-05, "loss": 4.1872, "step": 25420 }, { "epoch": 2.0347255560889743, "grad_norm": 6.310397624969482, "learning_rate": 1.614296411188961e-05, "loss": 4.3852, "step": 25430 }, { "epoch": 2.0355256841094573, "grad_norm": 5.364697456359863, "learning_rate": 1.6129592982831472e-05, "loss": 4.3888, "step": 25440 }, { "epoch": 2.036325812129941, "grad_norm": 5.257568836212158, "learning_rate": 1.6116221853773335e-05, "loss": 4.3191, "step": 25450 }, { "epoch": 2.0371259401504243, "grad_norm": 5.236495494842529, "learning_rate": 1.6102850724715197e-05, "loss": 4.3468, "step": 25460 }, { "epoch": 2.0379260681709073, "grad_norm": 5.136487007141113, "learning_rate": 1.608947959565706e-05, "loss": 4.3643, "step": 25470 }, { "epoch": 2.0387261961913907, "grad_norm": 3.6016764640808105, "learning_rate": 1.6076108466598923e-05, "loss": 4.3133, "step": 25480 }, { "epoch": 2.0395263242118737, "grad_norm": 4.143081188201904, "learning_rate": 1.6062737337540782e-05, "loss": 4.3392, "step": 25490 }, { "epoch": 2.040326452232357, "grad_norm": 3.9485390186309814, "learning_rate": 1.6049366208482645e-05, "loss": 4.3585, "step": 25500 }, { "epoch": 2.0411265802528407, "grad_norm": 4.107684135437012, "learning_rate": 1.6035995079424507e-05, "loss": 4.1508, "step": 25510 }, { "epoch": 2.0419267082733237, "grad_norm": 3.8906235694885254, "learning_rate": 1.602262395036637e-05, "loss": 4.144, "step": 25520 }, { "epoch": 2.042726836293807, 
"grad_norm": 8.17663860321045, "learning_rate": 1.6009252821308233e-05, "loss": 4.2195, "step": 25530 }, { "epoch": 2.04352696431429, "grad_norm": 4.338802337646484, "learning_rate": 1.5995881692250095e-05, "loss": 4.2329, "step": 25540 }, { "epoch": 2.0443270923347736, "grad_norm": 4.052145481109619, "learning_rate": 1.5982510563191958e-05, "loss": 4.3109, "step": 25550 }, { "epoch": 2.0451272203552566, "grad_norm": 3.725970506668091, "learning_rate": 1.596913943413382e-05, "loss": 4.2197, "step": 25560 }, { "epoch": 2.04592734837574, "grad_norm": 5.759598731994629, "learning_rate": 1.595576830507568e-05, "loss": 4.2164, "step": 25570 }, { "epoch": 2.0467274763962235, "grad_norm": 5.7644195556640625, "learning_rate": 1.5942397176017543e-05, "loss": 4.4158, "step": 25580 }, { "epoch": 2.0475276044167066, "grad_norm": 4.668453693389893, "learning_rate": 1.5929026046959406e-05, "loss": 4.2267, "step": 25590 }, { "epoch": 2.04832773243719, "grad_norm": 11.40067195892334, "learning_rate": 1.5915654917901268e-05, "loss": 4.4523, "step": 25600 }, { "epoch": 2.049127860457673, "grad_norm": 5.45416784286499, "learning_rate": 1.590228378884313e-05, "loss": 4.1557, "step": 25610 }, { "epoch": 2.0499279884781565, "grad_norm": 5.021010875701904, "learning_rate": 1.5888912659784994e-05, "loss": 4.3983, "step": 25620 }, { "epoch": 2.05072811649864, "grad_norm": 5.092507362365723, "learning_rate": 1.5875541530726856e-05, "loss": 4.2009, "step": 25630 }, { "epoch": 2.051528244519123, "grad_norm": 4.925893306732178, "learning_rate": 1.586217040166872e-05, "loss": 4.1259, "step": 25640 }, { "epoch": 2.0523283725396064, "grad_norm": 5.131783485412598, "learning_rate": 1.5848799272610578e-05, "loss": 4.1493, "step": 25650 }, { "epoch": 2.0531285005600894, "grad_norm": 5.167172431945801, "learning_rate": 1.583542814355244e-05, "loss": 4.3849, "step": 25660 }, { "epoch": 2.053928628580573, "grad_norm": 6.500699043273926, "learning_rate": 1.5822057014494304e-05, "loss": 4.1778, "step": 
25670 }, { "epoch": 2.0547287566010564, "grad_norm": 4.588797569274902, "learning_rate": 1.5808685885436166e-05, "loss": 4.3639, "step": 25680 }, { "epoch": 2.0555288846215394, "grad_norm": 4.2404608726501465, "learning_rate": 1.579531475637803e-05, "loss": 4.2917, "step": 25690 }, { "epoch": 2.056329012642023, "grad_norm": 4.761908054351807, "learning_rate": 1.5781943627319892e-05, "loss": 4.312, "step": 25700 }, { "epoch": 2.057129140662506, "grad_norm": 5.473426818847656, "learning_rate": 1.5768572498261754e-05, "loss": 4.172, "step": 25710 }, { "epoch": 2.0579292686829893, "grad_norm": 4.701712608337402, "learning_rate": 1.5755201369203617e-05, "loss": 4.3831, "step": 25720 }, { "epoch": 2.0587293967034728, "grad_norm": 9.382383346557617, "learning_rate": 1.5741830240145476e-05, "loss": 4.182, "step": 25730 }, { "epoch": 2.059529524723956, "grad_norm": 4.000535011291504, "learning_rate": 1.572845911108734e-05, "loss": 4.2129, "step": 25740 }, { "epoch": 2.0603296527444392, "grad_norm": 4.71032190322876, "learning_rate": 1.5715087982029202e-05, "loss": 4.2976, "step": 25750 }, { "epoch": 2.0611297807649223, "grad_norm": 4.48236083984375, "learning_rate": 1.5701716852971064e-05, "loss": 4.342, "step": 25760 }, { "epoch": 2.0619299087854057, "grad_norm": 4.62367057800293, "learning_rate": 1.5688345723912927e-05, "loss": 4.256, "step": 25770 }, { "epoch": 2.0627300368058887, "grad_norm": 3.801255464553833, "learning_rate": 1.567497459485479e-05, "loss": 4.4418, "step": 25780 }, { "epoch": 2.063530164826372, "grad_norm": 3.6294422149658203, "learning_rate": 1.5661603465796653e-05, "loss": 4.2888, "step": 25790 }, { "epoch": 2.0643302928468557, "grad_norm": 4.3946146965026855, "learning_rate": 1.5648232336738515e-05, "loss": 4.2986, "step": 25800 }, { "epoch": 2.0651304208673387, "grad_norm": 4.36204195022583, "learning_rate": 1.5634861207680378e-05, "loss": 4.2825, "step": 25810 }, { "epoch": 2.065930548887822, "grad_norm": 4.553360462188721, "learning_rate": 
1.562149007862224e-05, "loss": 4.3057, "step": 25820 }, { "epoch": 2.066730676908305, "grad_norm": 5.6123127937316895, "learning_rate": 1.5608118949564103e-05, "loss": 4.4922, "step": 25830 }, { "epoch": 2.0675308049287886, "grad_norm": 4.838746070861816, "learning_rate": 1.5594747820505966e-05, "loss": 4.4108, "step": 25840 }, { "epoch": 2.068330932949272, "grad_norm": 5.99254035949707, "learning_rate": 1.558137669144783e-05, "loss": 4.3313, "step": 25850 }, { "epoch": 2.069131060969755, "grad_norm": 5.4797282218933105, "learning_rate": 1.556800556238969e-05, "loss": 4.2516, "step": 25860 }, { "epoch": 2.0699311889902385, "grad_norm": 5.283844947814941, "learning_rate": 1.5554634433331554e-05, "loss": 4.4497, "step": 25870 }, { "epoch": 2.0707313170107216, "grad_norm": 4.948397636413574, "learning_rate": 1.5541263304273413e-05, "loss": 4.0877, "step": 25880 }, { "epoch": 2.071531445031205, "grad_norm": 7.391055107116699, "learning_rate": 1.5527892175215276e-05, "loss": 4.3646, "step": 25890 }, { "epoch": 2.0723315730516885, "grad_norm": 6.029944896697998, "learning_rate": 1.551452104615714e-05, "loss": 4.3319, "step": 25900 }, { "epoch": 2.0731317010721715, "grad_norm": 5.694515228271484, "learning_rate": 1.5501149917099e-05, "loss": 4.3011, "step": 25910 }, { "epoch": 2.073931829092655, "grad_norm": 5.351522922515869, "learning_rate": 1.5487778788040864e-05, "loss": 4.3508, "step": 25920 }, { "epoch": 2.074731957113138, "grad_norm": 5.074141025543213, "learning_rate": 1.5474407658982727e-05, "loss": 4.3033, "step": 25930 }, { "epoch": 2.0755320851336214, "grad_norm": 4.946030139923096, "learning_rate": 1.546103652992459e-05, "loss": 4.4583, "step": 25940 }, { "epoch": 2.076332213154105, "grad_norm": 4.760500431060791, "learning_rate": 1.5447665400866452e-05, "loss": 4.2749, "step": 25950 }, { "epoch": 2.077132341174588, "grad_norm": 5.357514381408691, "learning_rate": 1.543429427180831e-05, "loss": 4.3748, "step": 25960 }, { "epoch": 2.0779324691950714, 
"grad_norm": 4.862049579620361, "learning_rate": 1.5420923142750174e-05, "loss": 4.2431, "step": 25970 }, { "epoch": 2.0787325972155544, "grad_norm": 4.293335914611816, "learning_rate": 1.5407552013692037e-05, "loss": 4.2634, "step": 25980 }, { "epoch": 2.079532725236038, "grad_norm": 4.8585615158081055, "learning_rate": 1.53941808846339e-05, "loss": 4.3626, "step": 25990 }, { "epoch": 2.080332853256521, "grad_norm": 5.967334270477295, "learning_rate": 1.5380809755575762e-05, "loss": 4.2984, "step": 26000 }, { "epoch": 2.0811329812770043, "grad_norm": 3.997610569000244, "learning_rate": 1.5367438626517625e-05, "loss": 4.2662, "step": 26010 }, { "epoch": 2.0819331092974878, "grad_norm": 5.521271228790283, "learning_rate": 1.5354067497459488e-05, "loss": 4.3059, "step": 26020 }, { "epoch": 2.0827332373179708, "grad_norm": 5.361555576324463, "learning_rate": 1.5340696368401347e-05, "loss": 4.2308, "step": 26030 }, { "epoch": 2.0835333653384542, "grad_norm": 4.794193267822266, "learning_rate": 1.532732523934321e-05, "loss": 4.4665, "step": 26040 }, { "epoch": 2.0843334933589373, "grad_norm": 4.499879360198975, "learning_rate": 1.5313954110285072e-05, "loss": 4.2128, "step": 26050 }, { "epoch": 2.0851336213794207, "grad_norm": 3.574612855911255, "learning_rate": 1.5300582981226935e-05, "loss": 4.3812, "step": 26060 }, { "epoch": 2.085933749399904, "grad_norm": 5.214561939239502, "learning_rate": 1.5287211852168798e-05, "loss": 4.5072, "step": 26070 }, { "epoch": 2.086733877420387, "grad_norm": 6.188739776611328, "learning_rate": 1.527384072311066e-05, "loss": 4.4135, "step": 26080 }, { "epoch": 2.0875340054408706, "grad_norm": 6.990460395812988, "learning_rate": 1.5260469594052523e-05, "loss": 4.3509, "step": 26090 }, { "epoch": 2.0883341334613537, "grad_norm": 4.942378520965576, "learning_rate": 1.5247098464994386e-05, "loss": 4.3602, "step": 26100 }, { "epoch": 2.089134261481837, "grad_norm": 6.065731048583984, "learning_rate": 1.5233727335936245e-05, "loss": 4.1068, 
"step": 26110 }, { "epoch": 2.0899343895023206, "grad_norm": 4.916247844696045, "learning_rate": 1.5220356206878108e-05, "loss": 4.3243, "step": 26120 }, { "epoch": 2.0907345175228036, "grad_norm": 6.229096412658691, "learning_rate": 1.520698507781997e-05, "loss": 4.2303, "step": 26130 }, { "epoch": 2.091534645543287, "grad_norm": 5.115452289581299, "learning_rate": 1.5193613948761833e-05, "loss": 4.3319, "step": 26140 }, { "epoch": 2.09233477356377, "grad_norm": 6.276767253875732, "learning_rate": 1.5180242819703696e-05, "loss": 4.3341, "step": 26150 }, { "epoch": 2.0931349015842535, "grad_norm": 5.382528781890869, "learning_rate": 1.5166871690645559e-05, "loss": 4.3723, "step": 26160 }, { "epoch": 2.0939350296047365, "grad_norm": 8.183712005615234, "learning_rate": 1.5153500561587421e-05, "loss": 4.2733, "step": 26170 }, { "epoch": 2.09473515762522, "grad_norm": 4.3973188400268555, "learning_rate": 1.5140129432529284e-05, "loss": 4.1724, "step": 26180 }, { "epoch": 2.0955352856457035, "grad_norm": 11.1422758102417, "learning_rate": 1.5126758303471145e-05, "loss": 4.2879, "step": 26190 }, { "epoch": 2.0963354136661865, "grad_norm": 3.9524261951446533, "learning_rate": 1.5113387174413008e-05, "loss": 4.1583, "step": 26200 }, { "epoch": 2.09713554168667, "grad_norm": 5.151445388793945, "learning_rate": 1.510001604535487e-05, "loss": 4.3369, "step": 26210 }, { "epoch": 2.097935669707153, "grad_norm": 4.194479465484619, "learning_rate": 1.5086644916296733e-05, "loss": 4.2749, "step": 26220 }, { "epoch": 2.0987357977276364, "grad_norm": 3.648646831512451, "learning_rate": 1.5073273787238596e-05, "loss": 4.1944, "step": 26230 }, { "epoch": 2.09953592574812, "grad_norm": 4.388803005218506, "learning_rate": 1.5059902658180458e-05, "loss": 4.336, "step": 26240 }, { "epoch": 2.100336053768603, "grad_norm": 8.618120193481445, "learning_rate": 1.5046531529122321e-05, "loss": 4.2613, "step": 26250 }, { "epoch": 2.1011361817890863, "grad_norm": 4.906309604644775, 
"learning_rate": 1.5033160400064184e-05, "loss": 4.3029, "step": 26260 }, { "epoch": 2.1019363098095694, "grad_norm": 4.446979522705078, "learning_rate": 1.5019789271006043e-05, "loss": 4.3164, "step": 26270 }, { "epoch": 2.102736437830053, "grad_norm": 4.349876403808594, "learning_rate": 1.5006418141947906e-05, "loss": 4.3869, "step": 26280 }, { "epoch": 2.1035365658505363, "grad_norm": 5.089364528656006, "learning_rate": 1.4993047012889768e-05, "loss": 4.4643, "step": 26290 }, { "epoch": 2.1043366938710193, "grad_norm": 5.051955699920654, "learning_rate": 1.4979675883831631e-05, "loss": 4.4284, "step": 26300 }, { "epoch": 2.1051368218915028, "grad_norm": 6.135965824127197, "learning_rate": 1.4966304754773494e-05, "loss": 4.2545, "step": 26310 }, { "epoch": 2.1059369499119858, "grad_norm": 5.15330696105957, "learning_rate": 1.4952933625715357e-05, "loss": 4.0856, "step": 26320 }, { "epoch": 2.1067370779324692, "grad_norm": 5.134254455566406, "learning_rate": 1.493956249665722e-05, "loss": 4.2384, "step": 26330 }, { "epoch": 2.1075372059529527, "grad_norm": 4.56715726852417, "learning_rate": 1.492619136759908e-05, "loss": 4.2916, "step": 26340 }, { "epoch": 2.1083373339734357, "grad_norm": 5.437539100646973, "learning_rate": 1.4912820238540943e-05, "loss": 4.1559, "step": 26350 }, { "epoch": 2.109137461993919, "grad_norm": 4.780087471008301, "learning_rate": 1.4900786222388619e-05, "loss": 4.1329, "step": 26360 }, { "epoch": 2.109937590014402, "grad_norm": 4.702531337738037, "learning_rate": 1.4887415093330481e-05, "loss": 4.3365, "step": 26370 }, { "epoch": 2.1107377180348856, "grad_norm": 4.454277515411377, "learning_rate": 1.4874043964272344e-05, "loss": 4.2059, "step": 26380 }, { "epoch": 2.1115378460553687, "grad_norm": 4.2128119468688965, "learning_rate": 1.4860672835214207e-05, "loss": 4.0916, "step": 26390 }, { "epoch": 2.112337974075852, "grad_norm": 4.9969635009765625, "learning_rate": 1.484730170615607e-05, "loss": 4.3129, "step": 26400 }, { "epoch": 
2.1131381020963356, "grad_norm": 5.207634449005127, "learning_rate": 1.4833930577097932e-05, "loss": 4.2242, "step": 26410 }, { "epoch": 2.1139382301168186, "grad_norm": 5.739307403564453, "learning_rate": 1.4820559448039791e-05, "loss": 4.3439, "step": 26420 }, { "epoch": 2.114738358137302, "grad_norm": 4.130275726318359, "learning_rate": 1.4807188318981654e-05, "loss": 4.3128, "step": 26430 }, { "epoch": 2.115538486157785, "grad_norm": 4.299551486968994, "learning_rate": 1.4793817189923517e-05, "loss": 4.3764, "step": 26440 }, { "epoch": 2.1163386141782685, "grad_norm": 5.259521484375, "learning_rate": 1.478044606086538e-05, "loss": 4.2741, "step": 26450 }, { "epoch": 2.117138742198752, "grad_norm": 5.917020797729492, "learning_rate": 1.4767074931807242e-05, "loss": 4.2622, "step": 26460 }, { "epoch": 2.117938870219235, "grad_norm": 5.523458957672119, "learning_rate": 1.4753703802749105e-05, "loss": 4.2634, "step": 26470 }, { "epoch": 2.1187389982397185, "grad_norm": 5.1210432052612305, "learning_rate": 1.4740332673690968e-05, "loss": 4.3752, "step": 26480 }, { "epoch": 2.1195391262602015, "grad_norm": 5.014133930206299, "learning_rate": 1.472696154463283e-05, "loss": 4.3112, "step": 26490 }, { "epoch": 2.120339254280685, "grad_norm": 4.23324728012085, "learning_rate": 1.4713590415574691e-05, "loss": 4.3547, "step": 26500 }, { "epoch": 2.1211393823011684, "grad_norm": 4.404748439788818, "learning_rate": 1.4700219286516554e-05, "loss": 4.4991, "step": 26510 }, { "epoch": 2.1219395103216514, "grad_norm": 7.926307201385498, "learning_rate": 1.4686848157458417e-05, "loss": 4.2694, "step": 26520 }, { "epoch": 2.122739638342135, "grad_norm": 4.4926347732543945, "learning_rate": 1.467347702840028e-05, "loss": 4.2697, "step": 26530 }, { "epoch": 2.123539766362618, "grad_norm": 5.678550720214844, "learning_rate": 1.4660105899342142e-05, "loss": 4.3183, "step": 26540 }, { "epoch": 2.1243398943831013, "grad_norm": 5.447049617767334, "learning_rate": 1.4646734770284005e-05, 
"loss": 4.2823, "step": 26550 }, { "epoch": 2.125140022403585, "grad_norm": 3.123251438140869, "learning_rate": 1.4633363641225867e-05, "loss": 4.2201, "step": 26560 }, { "epoch": 2.125940150424068, "grad_norm": 3.6877589225769043, "learning_rate": 1.461999251216773e-05, "loss": 4.289, "step": 26570 }, { "epoch": 2.1267402784445513, "grad_norm": 5.2759785652160645, "learning_rate": 1.460662138310959e-05, "loss": 4.4292, "step": 26580 }, { "epoch": 2.1275404064650343, "grad_norm": 4.95992374420166, "learning_rate": 1.4593250254051452e-05, "loss": 4.3569, "step": 26590 }, { "epoch": 2.1283405344855177, "grad_norm": 4.725872993469238, "learning_rate": 1.4579879124993315e-05, "loss": 4.37, "step": 26600 }, { "epoch": 2.1291406625060008, "grad_norm": 5.375117301940918, "learning_rate": 1.4566507995935177e-05, "loss": 4.2949, "step": 26610 }, { "epoch": 2.129940790526484, "grad_norm": 5.185641765594482, "learning_rate": 1.455313686687704e-05, "loss": 4.2391, "step": 26620 }, { "epoch": 2.1307409185469677, "grad_norm": 3.6917166709899902, "learning_rate": 1.4539765737818903e-05, "loss": 4.1506, "step": 26630 }, { "epoch": 2.1315410465674507, "grad_norm": 5.494895935058594, "learning_rate": 1.4526394608760765e-05, "loss": 4.4124, "step": 26640 }, { "epoch": 2.132341174587934, "grad_norm": 5.010682106018066, "learning_rate": 1.4513023479702625e-05, "loss": 4.3642, "step": 26650 }, { "epoch": 2.133141302608417, "grad_norm": 4.535843372344971, "learning_rate": 1.4499652350644487e-05, "loss": 4.2799, "step": 26660 }, { "epoch": 2.1339414306289006, "grad_norm": 3.600165367126465, "learning_rate": 1.448628122158635e-05, "loss": 4.2767, "step": 26670 }, { "epoch": 2.134741558649384, "grad_norm": 5.112745761871338, "learning_rate": 1.4472910092528213e-05, "loss": 4.3118, "step": 26680 }, { "epoch": 2.135541686669867, "grad_norm": 7.475447654724121, "learning_rate": 1.4459538963470076e-05, "loss": 4.3916, "step": 26690 }, { "epoch": 2.1363418146903506, "grad_norm": 
5.247001647949219, "learning_rate": 1.4446167834411938e-05, "loss": 4.2402, "step": 26700 }, { "epoch": 2.1371419427108336, "grad_norm": 7.609944820404053, "learning_rate": 1.4432796705353801e-05, "loss": 4.2037, "step": 26710 }, { "epoch": 2.137942070731317, "grad_norm": 4.795637130737305, "learning_rate": 1.4419425576295664e-05, "loss": 4.3401, "step": 26720 }, { "epoch": 2.1387421987518005, "grad_norm": 5.049854755401611, "learning_rate": 1.4406054447237525e-05, "loss": 4.3105, "step": 26730 }, { "epoch": 2.1395423267722835, "grad_norm": 4.4695143699646, "learning_rate": 1.4392683318179387e-05, "loss": 4.3286, "step": 26740 }, { "epoch": 2.140342454792767, "grad_norm": 5.833003997802734, "learning_rate": 1.437931218912125e-05, "loss": 4.386, "step": 26750 }, { "epoch": 2.14114258281325, "grad_norm": 5.397075653076172, "learning_rate": 1.4365941060063113e-05, "loss": 4.3105, "step": 26760 }, { "epoch": 2.1419427108337334, "grad_norm": 4.232978820800781, "learning_rate": 1.4352569931004975e-05, "loss": 4.2622, "step": 26770 }, { "epoch": 2.1427428388542165, "grad_norm": 6.420408725738525, "learning_rate": 1.4339198801946838e-05, "loss": 4.2835, "step": 26780 }, { "epoch": 2.1435429668747, "grad_norm": 6.694440841674805, "learning_rate": 1.43258276728887e-05, "loss": 4.345, "step": 26790 }, { "epoch": 2.1443430948951834, "grad_norm": 6.900730133056641, "learning_rate": 1.4312456543830563e-05, "loss": 4.1415, "step": 26800 }, { "epoch": 2.1451432229156664, "grad_norm": 6.370269775390625, "learning_rate": 1.4299085414772423e-05, "loss": 4.3092, "step": 26810 }, { "epoch": 2.14594335093615, "grad_norm": 5.234679222106934, "learning_rate": 1.4285714285714285e-05, "loss": 4.456, "step": 26820 }, { "epoch": 2.146743478956633, "grad_norm": 4.147063732147217, "learning_rate": 1.4272343156656148e-05, "loss": 4.1961, "step": 26830 }, { "epoch": 2.1475436069771163, "grad_norm": 4.461815357208252, "learning_rate": 1.425897202759801e-05, "loss": 4.158, "step": 26840 }, { 
"epoch": 2.1483437349976, "grad_norm": 7.251457691192627, "learning_rate": 1.4245600898539874e-05, "loss": 4.3905, "step": 26850 }, { "epoch": 2.149143863018083, "grad_norm": 4.718042850494385, "learning_rate": 1.4232229769481736e-05, "loss": 4.3418, "step": 26860 }, { "epoch": 2.1499439910385663, "grad_norm": 4.637687683105469, "learning_rate": 1.4218858640423599e-05, "loss": 4.3702, "step": 26870 }, { "epoch": 2.1507441190590493, "grad_norm": 6.43583869934082, "learning_rate": 1.4205487511365462e-05, "loss": 4.3164, "step": 26880 }, { "epoch": 2.1515442470795327, "grad_norm": 5.462271213531494, "learning_rate": 1.4192116382307321e-05, "loss": 4.3193, "step": 26890 }, { "epoch": 2.152344375100016, "grad_norm": 4.406450271606445, "learning_rate": 1.4178745253249184e-05, "loss": 4.3865, "step": 26900 }, { "epoch": 2.153144503120499, "grad_norm": 6.0672383308410645, "learning_rate": 1.4165374124191046e-05, "loss": 4.2768, "step": 26910 }, { "epoch": 2.1539446311409827, "grad_norm": 4.165776252746582, "learning_rate": 1.4152002995132909e-05, "loss": 4.1918, "step": 26920 }, { "epoch": 2.1547447591614657, "grad_norm": 15.013633728027344, "learning_rate": 1.4138631866074772e-05, "loss": 4.2488, "step": 26930 }, { "epoch": 2.155544887181949, "grad_norm": 5.367081165313721, "learning_rate": 1.4125260737016634e-05, "loss": 4.2885, "step": 26940 }, { "epoch": 2.156345015202432, "grad_norm": 3.9611103534698486, "learning_rate": 1.4111889607958497e-05, "loss": 4.2249, "step": 26950 }, { "epoch": 2.1571451432229156, "grad_norm": 4.482510089874268, "learning_rate": 1.4098518478900358e-05, "loss": 4.3155, "step": 26960 }, { "epoch": 2.157945271243399, "grad_norm": 4.127138137817383, "learning_rate": 1.408514734984222e-05, "loss": 4.3791, "step": 26970 }, { "epoch": 2.158745399263882, "grad_norm": 3.858367919921875, "learning_rate": 1.4071776220784083e-05, "loss": 4.3788, "step": 26980 }, { "epoch": 2.1595455272843656, "grad_norm": 5.6368184089660645, "learning_rate": 
1.4058405091725946e-05, "loss": 4.2714, "step": 26990 }, { "epoch": 2.160345655304849, "grad_norm": 5.943724632263184, "learning_rate": 1.4045033962667809e-05, "loss": 4.3902, "step": 27000 }, { "epoch": 2.161145783325332, "grad_norm": 5.1518683433532715, "learning_rate": 1.4031662833609671e-05, "loss": 4.2325, "step": 27010 }, { "epoch": 2.1619459113458155, "grad_norm": 3.714491605758667, "learning_rate": 1.4018291704551534e-05, "loss": 4.3165, "step": 27020 }, { "epoch": 2.1627460393662985, "grad_norm": 4.722376823425293, "learning_rate": 1.4004920575493397e-05, "loss": 4.3397, "step": 27030 }, { "epoch": 2.163546167386782, "grad_norm": 4.137203693389893, "learning_rate": 1.3991549446435256e-05, "loss": 4.0377, "step": 27040 }, { "epoch": 2.164346295407265, "grad_norm": 6.193617820739746, "learning_rate": 1.3978178317377119e-05, "loss": 4.2832, "step": 27050 }, { "epoch": 2.1651464234277484, "grad_norm": 4.346227169036865, "learning_rate": 1.3964807188318982e-05, "loss": 4.2071, "step": 27060 }, { "epoch": 2.165946551448232, "grad_norm": 5.1168928146362305, "learning_rate": 1.3951436059260844e-05, "loss": 4.4359, "step": 27070 }, { "epoch": 2.166746679468715, "grad_norm": 6.076730728149414, "learning_rate": 1.3938064930202707e-05, "loss": 4.2271, "step": 27080 }, { "epoch": 2.1675468074891984, "grad_norm": 4.987486839294434, "learning_rate": 1.392469380114457e-05, "loss": 4.1962, "step": 27090 }, { "epoch": 2.1683469355096814, "grad_norm": 4.582950592041016, "learning_rate": 1.3911322672086432e-05, "loss": 4.1916, "step": 27100 }, { "epoch": 2.169147063530165, "grad_norm": 4.27584981918335, "learning_rate": 1.3897951543028295e-05, "loss": 4.3356, "step": 27110 }, { "epoch": 2.1699471915506483, "grad_norm": 4.555540084838867, "learning_rate": 1.3884580413970156e-05, "loss": 4.3487, "step": 27120 }, { "epoch": 2.1707473195711313, "grad_norm": 7.551029205322266, "learning_rate": 1.3871209284912019e-05, "loss": 4.3706, "step": 27130 }, { "epoch": 2.1715474475916148, 
"grad_norm": 6.390832901000977, "learning_rate": 1.3857838155853881e-05, "loss": 4.2307, "step": 27140 }, { "epoch": 2.172347575612098, "grad_norm": 5.295320510864258, "learning_rate": 1.3844467026795744e-05, "loss": 4.3262, "step": 27150 }, { "epoch": 2.1731477036325813, "grad_norm": 4.816108703613281, "learning_rate": 1.3831095897737607e-05, "loss": 4.4674, "step": 27160 }, { "epoch": 2.1739478316530647, "grad_norm": 7.950437545776367, "learning_rate": 1.381772476867947e-05, "loss": 4.1512, "step": 27170 }, { "epoch": 2.1747479596735477, "grad_norm": 6.867645740509033, "learning_rate": 1.3804353639621332e-05, "loss": 4.3138, "step": 27180 }, { "epoch": 2.175548087694031, "grad_norm": 5.51462459564209, "learning_rate": 1.3790982510563191e-05, "loss": 4.1856, "step": 27190 }, { "epoch": 2.176348215714514, "grad_norm": 11.651870727539062, "learning_rate": 1.3777611381505054e-05, "loss": 4.3696, "step": 27200 }, { "epoch": 2.1771483437349977, "grad_norm": 5.805605411529541, "learning_rate": 1.3764240252446917e-05, "loss": 4.376, "step": 27210 }, { "epoch": 2.1779484717554807, "grad_norm": 4.33229398727417, "learning_rate": 1.375086912338878e-05, "loss": 4.1416, "step": 27220 }, { "epoch": 2.178748599775964, "grad_norm": 5.466543674468994, "learning_rate": 1.3737497994330642e-05, "loss": 4.416, "step": 27230 }, { "epoch": 2.1795487277964476, "grad_norm": 4.547317028045654, "learning_rate": 1.3724126865272505e-05, "loss": 4.4502, "step": 27240 }, { "epoch": 2.1803488558169306, "grad_norm": 6.405552864074707, "learning_rate": 1.3710755736214368e-05, "loss": 4.2897, "step": 27250 }, { "epoch": 2.181148983837414, "grad_norm": 4.219295024871826, "learning_rate": 1.369738460715623e-05, "loss": 4.1738, "step": 27260 }, { "epoch": 2.181949111857897, "grad_norm": 3.7601661682128906, "learning_rate": 1.368401347809809e-05, "loss": 4.3072, "step": 27270 }, { "epoch": 2.1827492398783805, "grad_norm": 6.388236045837402, "learning_rate": 1.3670642349039952e-05, "loss": 4.2357, 
"step": 27280 }, { "epoch": 2.183549367898864, "grad_norm": 4.893552780151367, "learning_rate": 1.3657271219981815e-05, "loss": 4.2591, "step": 27290 }, { "epoch": 2.184349495919347, "grad_norm": 9.936212539672852, "learning_rate": 1.3643900090923678e-05, "loss": 4.3157, "step": 27300 }, { "epoch": 2.1851496239398305, "grad_norm": 4.956419944763184, "learning_rate": 1.363052896186554e-05, "loss": 4.2788, "step": 27310 }, { "epoch": 2.1859497519603135, "grad_norm": 6.442319869995117, "learning_rate": 1.3617157832807403e-05, "loss": 4.3374, "step": 27320 }, { "epoch": 2.186749879980797, "grad_norm": 5.18787956237793, "learning_rate": 1.3603786703749266e-05, "loss": 4.2136, "step": 27330 }, { "epoch": 2.1875500080012804, "grad_norm": 4.710720062255859, "learning_rate": 1.3590415574691128e-05, "loss": 4.3571, "step": 27340 }, { "epoch": 2.1883501360217634, "grad_norm": 7.41901159286499, "learning_rate": 1.357704444563299e-05, "loss": 4.2718, "step": 27350 }, { "epoch": 2.189150264042247, "grad_norm": 4.362005710601807, "learning_rate": 1.3563673316574852e-05, "loss": 4.3566, "step": 27360 }, { "epoch": 2.18995039206273, "grad_norm": 4.403487205505371, "learning_rate": 1.3550302187516715e-05, "loss": 4.3139, "step": 27370 }, { "epoch": 2.1907505200832134, "grad_norm": 8.87827205657959, "learning_rate": 1.3536931058458577e-05, "loss": 4.099, "step": 27380 }, { "epoch": 2.1915506481036964, "grad_norm": 4.843017578125, "learning_rate": 1.352355992940044e-05, "loss": 4.2981, "step": 27390 }, { "epoch": 2.19235077612418, "grad_norm": 8.040793418884277, "learning_rate": 1.3510188800342303e-05, "loss": 4.3916, "step": 27400 }, { "epoch": 2.1931509041446633, "grad_norm": 5.31903076171875, "learning_rate": 1.3496817671284166e-05, "loss": 4.3502, "step": 27410 }, { "epoch": 2.1939510321651463, "grad_norm": 5.527568817138672, "learning_rate": 1.3483446542226028e-05, "loss": 4.3331, "step": 27420 }, { "epoch": 2.1947511601856298, "grad_norm": 4.029179573059082, "learning_rate": 
1.3470075413167888e-05, "loss": 4.2993, "step": 27430 }, { "epoch": 2.195551288206113, "grad_norm": 5.164677619934082, "learning_rate": 1.345670428410975e-05, "loss": 4.3953, "step": 27440 }, { "epoch": 2.1963514162265962, "grad_norm": 5.0720720291137695, "learning_rate": 1.3443333155051613e-05, "loss": 4.4189, "step": 27450 }, { "epoch": 2.1971515442470797, "grad_norm": 4.484434127807617, "learning_rate": 1.3429962025993476e-05, "loss": 4.3077, "step": 27460 }, { "epoch": 2.1979516722675627, "grad_norm": 4.3660969734191895, "learning_rate": 1.3416590896935338e-05, "loss": 4.1555, "step": 27470 }, { "epoch": 2.198751800288046, "grad_norm": 4.788058757781982, "learning_rate": 1.3403219767877201e-05, "loss": 4.2902, "step": 27480 }, { "epoch": 2.199551928308529, "grad_norm": 6.046142578125, "learning_rate": 1.3389848638819064e-05, "loss": 4.3144, "step": 27490 }, { "epoch": 2.2003520563290127, "grad_norm": 4.384228229522705, "learning_rate": 1.3376477509760923e-05, "loss": 4.4575, "step": 27500 }, { "epoch": 2.201152184349496, "grad_norm": 6.044991493225098, "learning_rate": 1.3363106380702786e-05, "loss": 4.2463, "step": 27510 }, { "epoch": 2.201952312369979, "grad_norm": 5.55415678024292, "learning_rate": 1.3349735251644648e-05, "loss": 4.2905, "step": 27520 }, { "epoch": 2.2027524403904626, "grad_norm": 6.360270977020264, "learning_rate": 1.3336364122586511e-05, "loss": 4.3698, "step": 27530 }, { "epoch": 2.2035525684109456, "grad_norm": 5.214306354522705, "learning_rate": 1.3322992993528374e-05, "loss": 4.2608, "step": 27540 }, { "epoch": 2.204352696431429, "grad_norm": 5.412326335906982, "learning_rate": 1.3309621864470236e-05, "loss": 4.4204, "step": 27550 }, { "epoch": 2.205152824451912, "grad_norm": 4.402900218963623, "learning_rate": 1.3296250735412099e-05, "loss": 4.3345, "step": 27560 }, { "epoch": 2.2059529524723955, "grad_norm": 4.055781364440918, "learning_rate": 1.3282879606353962e-05, "loss": 4.2738, "step": 27570 }, { "epoch": 2.206753080492879, 
"grad_norm": 4.639005661010742, "learning_rate": 1.3269508477295823e-05, "loss": 4.3226, "step": 27580 }, { "epoch": 2.207553208513362, "grad_norm": 6.268497467041016, "learning_rate": 1.3256137348237685e-05, "loss": 4.2283, "step": 27590 }, { "epoch": 2.2083533365338455, "grad_norm": 5.508733749389648, "learning_rate": 1.3242766219179548e-05, "loss": 4.3479, "step": 27600 }, { "epoch": 2.209153464554329, "grad_norm": 6.806684970855713, "learning_rate": 1.322939509012141e-05, "loss": 4.2675, "step": 27610 }, { "epoch": 2.209953592574812, "grad_norm": 7.854143142700195, "learning_rate": 1.3216023961063274e-05, "loss": 4.289, "step": 27620 }, { "epoch": 2.2107537205952954, "grad_norm": 4.771146297454834, "learning_rate": 1.3202652832005136e-05, "loss": 4.3741, "step": 27630 }, { "epoch": 2.2115538486157784, "grad_norm": 4.5379838943481445, "learning_rate": 1.3189281702946999e-05, "loss": 4.1667, "step": 27640 }, { "epoch": 2.212353976636262, "grad_norm": 8.35084056854248, "learning_rate": 1.3175910573888862e-05, "loss": 4.3594, "step": 27650 }, { "epoch": 2.213154104656745, "grad_norm": 5.7083306312561035, "learning_rate": 1.3162539444830721e-05, "loss": 4.3591, "step": 27660 }, { "epoch": 2.2139542326772284, "grad_norm": 5.427682399749756, "learning_rate": 1.3149168315772584e-05, "loss": 4.2273, "step": 27670 }, { "epoch": 2.214754360697712, "grad_norm": 3.5514488220214844, "learning_rate": 1.3135797186714446e-05, "loss": 4.1346, "step": 27680 }, { "epoch": 2.215554488718195, "grad_norm": 6.326035022735596, "learning_rate": 1.3122426057656309e-05, "loss": 4.3059, "step": 27690 }, { "epoch": 2.2163546167386783, "grad_norm": 3.6098668575286865, "learning_rate": 1.3109054928598172e-05, "loss": 4.318, "step": 27700 }, { "epoch": 2.2171547447591613, "grad_norm": 5.207823276519775, "learning_rate": 1.3095683799540034e-05, "loss": 4.3012, "step": 27710 }, { "epoch": 2.2179548727796448, "grad_norm": 3.878981590270996, "learning_rate": 1.3082312670481897e-05, "loss": 4.208, 
"step": 27720 }, { "epoch": 2.218755000800128, "grad_norm": 4.142792701721191, "learning_rate": 1.306894154142376e-05, "loss": 4.2057, "step": 27730 }, { "epoch": 2.2195551288206112, "grad_norm": 4.735614776611328, "learning_rate": 1.3055570412365619e-05, "loss": 4.3577, "step": 27740 }, { "epoch": 2.2203552568410947, "grad_norm": 4.719352722167969, "learning_rate": 1.3042199283307482e-05, "loss": 4.2049, "step": 27750 }, { "epoch": 2.2211553848615777, "grad_norm": 5.648311138153076, "learning_rate": 1.3028828154249344e-05, "loss": 4.2577, "step": 27760 }, { "epoch": 2.221955512882061, "grad_norm": 4.707512855529785, "learning_rate": 1.3015457025191207e-05, "loss": 4.2289, "step": 27770 }, { "epoch": 2.2227556409025446, "grad_norm": 6.615943431854248, "learning_rate": 1.300208589613307e-05, "loss": 4.3439, "step": 27780 }, { "epoch": 2.2235557689230276, "grad_norm": 11.488637924194336, "learning_rate": 1.2988714767074932e-05, "loss": 4.4672, "step": 27790 }, { "epoch": 2.224355896943511, "grad_norm": 4.287919044494629, "learning_rate": 1.2975343638016795e-05, "loss": 4.6242, "step": 27800 }, { "epoch": 2.225156024963994, "grad_norm": 4.993651866912842, "learning_rate": 1.2961972508958656e-05, "loss": 4.3635, "step": 27810 }, { "epoch": 2.2259561529844776, "grad_norm": 7.000411510467529, "learning_rate": 1.2948601379900519e-05, "loss": 4.092, "step": 27820 }, { "epoch": 2.2267562810049606, "grad_norm": 13.637682914733887, "learning_rate": 1.2935230250842382e-05, "loss": 4.4024, "step": 27830 }, { "epoch": 2.227556409025444, "grad_norm": 5.699049472808838, "learning_rate": 1.2921859121784244e-05, "loss": 4.2697, "step": 27840 }, { "epoch": 2.2283565370459275, "grad_norm": 5.741790771484375, "learning_rate": 1.2908487992726107e-05, "loss": 4.2566, "step": 27850 }, { "epoch": 2.2291566650664105, "grad_norm": 8.050752639770508, "learning_rate": 1.289511686366797e-05, "loss": 4.3565, "step": 27860 }, { "epoch": 2.229956793086894, "grad_norm": 4.524864196777344, 
"learning_rate": 1.2881745734609832e-05, "loss": 4.3523, "step": 27870 }, { "epoch": 2.230756921107377, "grad_norm": 5.576345920562744, "learning_rate": 1.2868374605551695e-05, "loss": 4.2655, "step": 27880 }, { "epoch": 2.2315570491278605, "grad_norm": 7.234230041503906, "learning_rate": 1.2855003476493554e-05, "loss": 4.224, "step": 27890 }, { "epoch": 2.232357177148344, "grad_norm": 3.625093698501587, "learning_rate": 1.2841632347435417e-05, "loss": 4.314, "step": 27900 }, { "epoch": 2.233157305168827, "grad_norm": 5.142899990081787, "learning_rate": 1.282826121837728e-05, "loss": 4.2523, "step": 27910 }, { "epoch": 2.2339574331893104, "grad_norm": 5.890714168548584, "learning_rate": 1.2814890089319142e-05, "loss": 4.408, "step": 27920 }, { "epoch": 2.2347575612097934, "grad_norm": 6.055865287780762, "learning_rate": 1.2801518960261005e-05, "loss": 4.2961, "step": 27930 }, { "epoch": 2.235557689230277, "grad_norm": 12.97319507598877, "learning_rate": 1.2788147831202868e-05, "loss": 4.2993, "step": 27940 }, { "epoch": 2.2363578172507603, "grad_norm": 5.342799186706543, "learning_rate": 1.277477670214473e-05, "loss": 4.224, "step": 27950 }, { "epoch": 2.2371579452712433, "grad_norm": 3.9940850734710693, "learning_rate": 1.2761405573086593e-05, "loss": 4.2813, "step": 27960 }, { "epoch": 2.237958073291727, "grad_norm": 3.819955587387085, "learning_rate": 1.2748034444028452e-05, "loss": 4.2159, "step": 27970 }, { "epoch": 2.23875820131221, "grad_norm": 6.087779998779297, "learning_rate": 1.2734663314970315e-05, "loss": 4.2637, "step": 27980 }, { "epoch": 2.2395583293326933, "grad_norm": 5.827731132507324, "learning_rate": 1.2721292185912178e-05, "loss": 4.391, "step": 27990 }, { "epoch": 2.2403584573531763, "grad_norm": 5.9532389640808105, "learning_rate": 1.270792105685404e-05, "loss": 4.2314, "step": 28000 }, { "epoch": 2.2403584573531763, "eval_loss": 5.650319576263428, "eval_runtime": 17.4181, "eval_samples_per_second": 2.296, "eval_steps_per_second": 0.287, 
"step": 28000 }, { "epoch": 2.2411585853736598, "grad_norm": 4.932950973510742, "learning_rate": 1.2694549927795903e-05, "loss": 4.3077, "step": 28010 }, { "epoch": 2.241958713394143, "grad_norm": 6.454247951507568, "learning_rate": 1.2681178798737766e-05, "loss": 4.2879, "step": 28020 }, { "epoch": 2.2427588414146262, "grad_norm": 6.555384159088135, "learning_rate": 1.2667807669679629e-05, "loss": 4.1309, "step": 28030 }, { "epoch": 2.2435589694351097, "grad_norm": 5.856935501098633, "learning_rate": 1.265443654062149e-05, "loss": 4.1817, "step": 28040 }, { "epoch": 2.2443590974555927, "grad_norm": 6.825374603271484, "learning_rate": 1.2641065411563352e-05, "loss": 4.2748, "step": 28050 }, { "epoch": 2.245159225476076, "grad_norm": 4.445517063140869, "learning_rate": 1.2627694282505215e-05, "loss": 4.1561, "step": 28060 }, { "epoch": 2.2459593534965596, "grad_norm": 5.488250255584717, "learning_rate": 1.2614323153447078e-05, "loss": 4.1993, "step": 28070 }, { "epoch": 2.2467594815170426, "grad_norm": 8.348297119140625, "learning_rate": 1.260095202438894e-05, "loss": 4.1527, "step": 28080 }, { "epoch": 2.247559609537526, "grad_norm": 4.551492214202881, "learning_rate": 1.2587580895330803e-05, "loss": 4.2166, "step": 28090 }, { "epoch": 2.248359737558009, "grad_norm": 4.065390110015869, "learning_rate": 1.2574209766272666e-05, "loss": 4.3189, "step": 28100 }, { "epoch": 2.2491598655784926, "grad_norm": 4.235226154327393, "learning_rate": 1.2560838637214528e-05, "loss": 4.2793, "step": 28110 }, { "epoch": 2.249959993598976, "grad_norm": 4.71818733215332, "learning_rate": 1.2547467508156388e-05, "loss": 4.3372, "step": 28120 }, { "epoch": 2.250760121619459, "grad_norm": 5.000930309295654, "learning_rate": 1.253409637909825e-05, "loss": 4.227, "step": 28130 }, { "epoch": 2.2515602496399425, "grad_norm": 4.730818748474121, "learning_rate": 1.2520725250040113e-05, "loss": 4.3551, "step": 28140 }, { "epoch": 2.2523603776604255, "grad_norm": 5.300826549530029, 
"learning_rate": 1.2507354120981976e-05, "loss": 4.2116, "step": 28150 }, { "epoch": 2.253160505680909, "grad_norm": 4.391541957855225, "learning_rate": 1.2493982991923838e-05, "loss": 4.3452, "step": 28160 }, { "epoch": 2.253960633701392, "grad_norm": 7.184095859527588, "learning_rate": 1.2480611862865701e-05, "loss": 4.0692, "step": 28170 }, { "epoch": 2.2547607617218755, "grad_norm": 6.555243968963623, "learning_rate": 1.2467240733807562e-05, "loss": 4.2418, "step": 28180 }, { "epoch": 2.255560889742359, "grad_norm": 5.414023399353027, "learning_rate": 1.2453869604749425e-05, "loss": 4.2476, "step": 28190 }, { "epoch": 2.256361017762842, "grad_norm": 5.8872599601745605, "learning_rate": 1.2440498475691288e-05, "loss": 4.3209, "step": 28200 }, { "epoch": 2.2571611457833254, "grad_norm": 5.530123233795166, "learning_rate": 1.242712734663315e-05, "loss": 4.2192, "step": 28210 }, { "epoch": 2.257961273803809, "grad_norm": 4.799288749694824, "learning_rate": 1.2413756217575013e-05, "loss": 4.1957, "step": 28220 }, { "epoch": 2.258761401824292, "grad_norm": 5.001644134521484, "learning_rate": 1.2400385088516876e-05, "loss": 4.3231, "step": 28230 }, { "epoch": 2.2595615298447753, "grad_norm": 6.378586292266846, "learning_rate": 1.2387013959458738e-05, "loss": 4.3512, "step": 28240 }, { "epoch": 2.2603616578652583, "grad_norm": 4.091723918914795, "learning_rate": 1.2373642830400601e-05, "loss": 4.4076, "step": 28250 }, { "epoch": 2.261161785885742, "grad_norm": 4.692946434020996, "learning_rate": 1.2360271701342462e-05, "loss": 4.34, "step": 28260 }, { "epoch": 2.261961913906225, "grad_norm": 4.702794075012207, "learning_rate": 1.2346900572284325e-05, "loss": 4.3236, "step": 28270 }, { "epoch": 2.2627620419267083, "grad_norm": 6.910373210906982, "learning_rate": 1.2333529443226187e-05, "loss": 4.2764, "step": 28280 }, { "epoch": 2.2635621699471917, "grad_norm": 4.597626209259033, "learning_rate": 1.232015831416805e-05, "loss": 4.3461, "step": 28290 }, { "epoch": 
2.2643622979676747, "grad_norm": 6.760199546813965, "learning_rate": 1.2306787185109911e-05, "loss": 4.2325, "step": 28300 }, { "epoch": 2.265162425988158, "grad_norm": 6.140738010406494, "learning_rate": 1.2293416056051774e-05, "loss": 4.2288, "step": 28310 }, { "epoch": 2.265962554008641, "grad_norm": 5.178011894226074, "learning_rate": 1.2280044926993636e-05, "loss": 4.364, "step": 28320 }, { "epoch": 2.2667626820291247, "grad_norm": 6.677469253540039, "learning_rate": 1.2266673797935497e-05, "loss": 4.3654, "step": 28330 }, { "epoch": 2.2675628100496077, "grad_norm": 7.555559158325195, "learning_rate": 1.225330266887736e-05, "loss": 4.1391, "step": 28340 }, { "epoch": 2.268362938070091, "grad_norm": 5.3249406814575195, "learning_rate": 1.2239931539819223e-05, "loss": 4.2865, "step": 28350 }, { "epoch": 2.2691630660905746, "grad_norm": 6.221409320831299, "learning_rate": 1.2226560410761085e-05, "loss": 4.4498, "step": 28360 }, { "epoch": 2.2699631941110576, "grad_norm": 6.600496292114258, "learning_rate": 1.2213189281702946e-05, "loss": 4.2898, "step": 28370 }, { "epoch": 2.270763322131541, "grad_norm": 6.635727405548096, "learning_rate": 1.219981815264481e-05, "loss": 4.1712, "step": 28380 }, { "epoch": 2.2715634501520245, "grad_norm": 6.427636623382568, "learning_rate": 1.2186447023586672e-05, "loss": 4.316, "step": 28390 }, { "epoch": 2.2723635781725076, "grad_norm": 5.909119606018066, "learning_rate": 1.2173075894528535e-05, "loss": 4.3188, "step": 28400 }, { "epoch": 2.273163706192991, "grad_norm": 4.973991870880127, "learning_rate": 1.2159704765470396e-05, "loss": 4.1295, "step": 28410 }, { "epoch": 2.273963834213474, "grad_norm": 5.875889301300049, "learning_rate": 1.2146333636412258e-05, "loss": 4.1011, "step": 28420 }, { "epoch": 2.2747639622339575, "grad_norm": 6.347286224365234, "learning_rate": 1.2132962507354121e-05, "loss": 4.3025, "step": 28430 }, { "epoch": 2.2755640902544405, "grad_norm": 9.685184478759766, "learning_rate": 
1.2119591378295984e-05, "loss": 4.2504, "step": 28440 }, { "epoch": 2.276364218274924, "grad_norm": 3.5592947006225586, "learning_rate": 1.2106220249237846e-05, "loss": 4.0199, "step": 28450 }, { "epoch": 2.2771643462954074, "grad_norm": 5.19354248046875, "learning_rate": 1.2092849120179709e-05, "loss": 4.3501, "step": 28460 }, { "epoch": 2.2779644743158904, "grad_norm": 5.163251876831055, "learning_rate": 1.2079477991121572e-05, "loss": 4.2562, "step": 28470 }, { "epoch": 2.278764602336374, "grad_norm": 3.630056381225586, "learning_rate": 1.2067443974969247e-05, "loss": 4.3743, "step": 28480 }, { "epoch": 2.279564730356857, "grad_norm": 3.2357378005981445, "learning_rate": 1.2054072845911108e-05, "loss": 4.333, "step": 28490 }, { "epoch": 2.2803648583773404, "grad_norm": 5.15466833114624, "learning_rate": 1.2040701716852971e-05, "loss": 4.295, "step": 28500 }, { "epoch": 2.281164986397824, "grad_norm": 6.575782775878906, "learning_rate": 1.2027330587794834e-05, "loss": 4.1019, "step": 28510 }, { "epoch": 2.281965114418307, "grad_norm": 4.305944442749023, "learning_rate": 1.2013959458736697e-05, "loss": 4.2613, "step": 28520 }, { "epoch": 2.2827652424387903, "grad_norm": 6.759955406188965, "learning_rate": 1.2000588329678558e-05, "loss": 4.324, "step": 28530 }, { "epoch": 2.2835653704592733, "grad_norm": 4.603118419647217, "learning_rate": 1.198721720062042e-05, "loss": 4.2917, "step": 28540 }, { "epoch": 2.284365498479757, "grad_norm": 5.116035461425781, "learning_rate": 1.1973846071562283e-05, "loss": 4.0603, "step": 28550 }, { "epoch": 2.2851656265002402, "grad_norm": 4.8761796951293945, "learning_rate": 1.1960474942504146e-05, "loss": 4.2702, "step": 28560 }, { "epoch": 2.2859657545207233, "grad_norm": 7.16318941116333, "learning_rate": 1.1947103813446008e-05, "loss": 4.2756, "step": 28570 }, { "epoch": 2.2867658825412067, "grad_norm": 8.105825424194336, "learning_rate": 1.1933732684387871e-05, "loss": 4.2905, "step": 28580 }, { "epoch": 2.2875660105616897, 
"grad_norm": 6.250253200531006, "learning_rate": 1.1920361555329734e-05, "loss": 4.2202, "step": 28590 }, { "epoch": 2.288366138582173, "grad_norm": 4.617362022399902, "learning_rate": 1.1906990426271596e-05, "loss": 4.0861, "step": 28600 }, { "epoch": 2.289166266602656, "grad_norm": 5.895457744598389, "learning_rate": 1.1893619297213457e-05, "loss": 4.0952, "step": 28610 }, { "epoch": 2.2899663946231397, "grad_norm": 4.059090614318848, "learning_rate": 1.188024816815532e-05, "loss": 4.1865, "step": 28620 }, { "epoch": 2.290766522643623, "grad_norm": 8.624724388122559, "learning_rate": 1.1866877039097183e-05, "loss": 4.2946, "step": 28630 }, { "epoch": 2.291566650664106, "grad_norm": 6.121615409851074, "learning_rate": 1.1853505910039044e-05, "loss": 4.1917, "step": 28640 }, { "epoch": 2.2923667786845896, "grad_norm": 6.071402549743652, "learning_rate": 1.1840134780980906e-05, "loss": 4.3638, "step": 28650 }, { "epoch": 2.293166906705073, "grad_norm": 4.712641716003418, "learning_rate": 1.1826763651922769e-05, "loss": 4.1598, "step": 28660 }, { "epoch": 2.293967034725556, "grad_norm": 5.36449670791626, "learning_rate": 1.1813392522864632e-05, "loss": 4.2292, "step": 28670 }, { "epoch": 2.2947671627460395, "grad_norm": 5.73878812789917, "learning_rate": 1.1800021393806493e-05, "loss": 4.362, "step": 28680 }, { "epoch": 2.2955672907665226, "grad_norm": 4.448238372802734, "learning_rate": 1.1786650264748355e-05, "loss": 4.1564, "step": 28690 }, { "epoch": 2.296367418787006, "grad_norm": 5.033682823181152, "learning_rate": 1.1773279135690218e-05, "loss": 4.147, "step": 28700 }, { "epoch": 2.297167546807489, "grad_norm": 4.061682224273682, "learning_rate": 1.175990800663208e-05, "loss": 4.4025, "step": 28710 }, { "epoch": 2.2979676748279725, "grad_norm": 5.760485649108887, "learning_rate": 1.1746536877573942e-05, "loss": 4.3773, "step": 28720 }, { "epoch": 2.298767802848456, "grad_norm": 8.28410816192627, "learning_rate": 1.1733165748515805e-05, "loss": 4.1615, "step": 
28730 }, { "epoch": 2.299567930868939, "grad_norm": 4.579817771911621, "learning_rate": 1.1719794619457667e-05, "loss": 4.365, "step": 28740 }, { "epoch": 2.3003680588894224, "grad_norm": 5.556467533111572, "learning_rate": 1.170642349039953e-05, "loss": 4.36, "step": 28750 }, { "epoch": 2.3011681869099054, "grad_norm": 4.739517688751221, "learning_rate": 1.1693052361341391e-05, "loss": 4.3933, "step": 28760 }, { "epoch": 2.301968314930389, "grad_norm": 4.348276138305664, "learning_rate": 1.1679681232283254e-05, "loss": 4.0874, "step": 28770 }, { "epoch": 2.302768442950872, "grad_norm": 4.935428142547607, "learning_rate": 1.1666310103225116e-05, "loss": 4.3363, "step": 28780 }, { "epoch": 2.3035685709713554, "grad_norm": 3.923457145690918, "learning_rate": 1.1652938974166979e-05, "loss": 4.206, "step": 28790 }, { "epoch": 2.304368698991839, "grad_norm": 4.47625732421875, "learning_rate": 1.1639567845108842e-05, "loss": 4.3173, "step": 28800 }, { "epoch": 2.305168827012322, "grad_norm": 5.407947540283203, "learning_rate": 1.1626196716050704e-05, "loss": 4.3247, "step": 28810 }, { "epoch": 2.3059689550328053, "grad_norm": 4.727847099304199, "learning_rate": 1.1612825586992567e-05, "loss": 4.366, "step": 28820 }, { "epoch": 2.3067690830532888, "grad_norm": 5.98025369644165, "learning_rate": 1.159945445793443e-05, "loss": 4.2372, "step": 28830 }, { "epoch": 2.3075692110737718, "grad_norm": 4.812393665313721, "learning_rate": 1.158608332887629e-05, "loss": 4.3197, "step": 28840 }, { "epoch": 2.3083693390942552, "grad_norm": 6.968140125274658, "learning_rate": 1.1572712199818153e-05, "loss": 4.4518, "step": 28850 }, { "epoch": 2.3091694671147383, "grad_norm": 5.6051740646362305, "learning_rate": 1.1559341070760016e-05, "loss": 4.1207, "step": 28860 }, { "epoch": 2.3099695951352217, "grad_norm": 4.6692705154418945, "learning_rate": 1.1545969941701879e-05, "loss": 4.1852, "step": 28870 }, { "epoch": 2.3107697231557047, "grad_norm": 5.183732032775879, "learning_rate": 
1.153259881264374e-05, "loss": 4.167, "step": 28880 }, { "epoch": 2.311569851176188, "grad_norm": 4.867135524749756, "learning_rate": 1.1519227683585602e-05, "loss": 4.148, "step": 28890 }, { "epoch": 2.3123699791966716, "grad_norm": 4.534615993499756, "learning_rate": 1.1505856554527465e-05, "loss": 4.2532, "step": 28900 }, { "epoch": 2.3131701072171547, "grad_norm": 5.32534122467041, "learning_rate": 1.1492485425469326e-05, "loss": 4.4451, "step": 28910 }, { "epoch": 2.313970235237638, "grad_norm": 6.864644527435303, "learning_rate": 1.1479114296411189e-05, "loss": 4.345, "step": 28920 }, { "epoch": 2.314770363258121, "grad_norm": 4.056292533874512, "learning_rate": 1.1465743167353052e-05, "loss": 4.1855, "step": 28930 }, { "epoch": 2.3155704912786046, "grad_norm": 4.13967227935791, "learning_rate": 1.1452372038294914e-05, "loss": 4.2302, "step": 28940 }, { "epoch": 2.3163706192990876, "grad_norm": 4.996280670166016, "learning_rate": 1.1439000909236775e-05, "loss": 4.3012, "step": 28950 }, { "epoch": 2.317170747319571, "grad_norm": 5.1281914710998535, "learning_rate": 1.1425629780178638e-05, "loss": 4.2522, "step": 28960 }, { "epoch": 2.3179708753400545, "grad_norm": 8.366037368774414, "learning_rate": 1.14122586511205e-05, "loss": 4.4064, "step": 28970 }, { "epoch": 2.3187710033605375, "grad_norm": 4.212780475616455, "learning_rate": 1.1398887522062363e-05, "loss": 4.1981, "step": 28980 }, { "epoch": 2.319571131381021, "grad_norm": 7.184803485870361, "learning_rate": 1.1385516393004226e-05, "loss": 4.3587, "step": 28990 }, { "epoch": 2.3203712594015045, "grad_norm": 5.358102321624756, "learning_rate": 1.1372145263946089e-05, "loss": 4.2411, "step": 29000 }, { "epoch": 2.3211713874219875, "grad_norm": 4.48073673248291, "learning_rate": 1.1358774134887951e-05, "loss": 4.2713, "step": 29010 }, { "epoch": 2.321971515442471, "grad_norm": 4.86694860458374, "learning_rate": 1.1345403005829814e-05, "loss": 4.2218, "step": 29020 }, { "epoch": 2.322771643462954, 
"grad_norm": 5.006281852722168, "learning_rate": 1.1332031876771675e-05, "loss": 4.1286, "step": 29030 }, { "epoch": 2.3235717714834374, "grad_norm": 4.683011531829834, "learning_rate": 1.1318660747713538e-05, "loss": 4.1421, "step": 29040 }, { "epoch": 2.3243718995039204, "grad_norm": 9.335220336914062, "learning_rate": 1.13052896186554e-05, "loss": 4.0164, "step": 29050 }, { "epoch": 2.325172027524404, "grad_norm": 5.779821872711182, "learning_rate": 1.1291918489597263e-05, "loss": 4.1787, "step": 29060 }, { "epoch": 2.3259721555448873, "grad_norm": 4.714453220367432, "learning_rate": 1.1278547360539124e-05, "loss": 4.2722, "step": 29070 }, { "epoch": 2.3267722835653704, "grad_norm": 8.025455474853516, "learning_rate": 1.1265176231480987e-05, "loss": 4.4028, "step": 29080 }, { "epoch": 2.327572411585854, "grad_norm": 5.3493194580078125, "learning_rate": 1.125180510242285e-05, "loss": 4.2032, "step": 29090 }, { "epoch": 2.328372539606337, "grad_norm": 4.934419631958008, "learning_rate": 1.1238433973364712e-05, "loss": 4.2815, "step": 29100 }, { "epoch": 2.3291726676268203, "grad_norm": 5.229598045349121, "learning_rate": 1.1225062844306573e-05, "loss": 4.3688, "step": 29110 }, { "epoch": 2.3299727956473038, "grad_norm": 6.556520938873291, "learning_rate": 1.1211691715248436e-05, "loss": 4.2369, "step": 29120 }, { "epoch": 2.3307729236677868, "grad_norm": 4.490777015686035, "learning_rate": 1.1198320586190299e-05, "loss": 4.2573, "step": 29130 }, { "epoch": 2.3315730516882702, "grad_norm": 5.96286678314209, "learning_rate": 1.1184949457132161e-05, "loss": 4.1455, "step": 29140 }, { "epoch": 2.3323731797087532, "grad_norm": 6.92281436920166, "learning_rate": 1.1171578328074022e-05, "loss": 4.185, "step": 29150 }, { "epoch": 2.3331733077292367, "grad_norm": 5.416003704071045, "learning_rate": 1.1158207199015885e-05, "loss": 4.2704, "step": 29160 }, { "epoch": 2.33397343574972, "grad_norm": 3.974001407623291, "learning_rate": 1.1144836069957748e-05, "loss": 4.2986, 
"step": 29170 }, { "epoch": 2.334773563770203, "grad_norm": 5.721689701080322, "learning_rate": 1.113146494089961e-05, "loss": 4.1773, "step": 29180 }, { "epoch": 2.3355736917906866, "grad_norm": 4.181761741638184, "learning_rate": 1.1118093811841471e-05, "loss": 4.3206, "step": 29190 }, { "epoch": 2.3363738198111696, "grad_norm": 5.180452823638916, "learning_rate": 1.1104722682783334e-05, "loss": 4.3047, "step": 29200 }, { "epoch": 2.337173947831653, "grad_norm": 3.443024158477783, "learning_rate": 1.1091351553725197e-05, "loss": 4.3859, "step": 29210 }, { "epoch": 2.337974075852136, "grad_norm": 5.066608905792236, "learning_rate": 1.107798042466706e-05, "loss": 4.2045, "step": 29220 }, { "epoch": 2.3387742038726196, "grad_norm": 4.342000484466553, "learning_rate": 1.1064609295608922e-05, "loss": 4.1469, "step": 29230 }, { "epoch": 2.339574331893103, "grad_norm": 8.228534698486328, "learning_rate": 1.1051238166550785e-05, "loss": 4.3781, "step": 29240 }, { "epoch": 2.340374459913586, "grad_norm": 4.797886371612549, "learning_rate": 1.1037867037492647e-05, "loss": 4.489, "step": 29250 }, { "epoch": 2.3411745879340695, "grad_norm": 5.288513660430908, "learning_rate": 1.1024495908434508e-05, "loss": 4.4058, "step": 29260 }, { "epoch": 2.341974715954553, "grad_norm": 4.347620010375977, "learning_rate": 1.1011124779376371e-05, "loss": 4.2193, "step": 29270 }, { "epoch": 2.342774843975036, "grad_norm": 6.2110466957092285, "learning_rate": 1.0997753650318234e-05, "loss": 4.1746, "step": 29280 }, { "epoch": 2.3435749719955195, "grad_norm": 6.015201568603516, "learning_rate": 1.0984382521260097e-05, "loss": 4.3359, "step": 29290 }, { "epoch": 2.3443751000160025, "grad_norm": 4.607378005981445, "learning_rate": 1.0971011392201958e-05, "loss": 4.3533, "step": 29300 }, { "epoch": 2.345175228036486, "grad_norm": 4.885862827301025, "learning_rate": 1.095764026314382e-05, "loss": 4.2704, "step": 29310 }, { "epoch": 2.345975356056969, "grad_norm": 3.8569679260253906, 
"learning_rate": 1.0944269134085683e-05, "loss": 4.3441, "step": 29320 }, { "epoch": 2.3467754840774524, "grad_norm": 3.915543794631958, "learning_rate": 1.0930898005027546e-05, "loss": 4.3674, "step": 29330 }, { "epoch": 2.347575612097936, "grad_norm": 5.9416375160217285, "learning_rate": 1.0917526875969407e-05, "loss": 4.2987, "step": 29340 }, { "epoch": 2.348375740118419, "grad_norm": 4.978701114654541, "learning_rate": 1.090415574691127e-05, "loss": 4.1645, "step": 29350 }, { "epoch": 2.3491758681389023, "grad_norm": 4.31766939163208, "learning_rate": 1.0890784617853132e-05, "loss": 4.4458, "step": 29360 }, { "epoch": 2.3499759961593853, "grad_norm": 7.840646266937256, "learning_rate": 1.0877413488794995e-05, "loss": 4.2837, "step": 29370 }, { "epoch": 2.350776124179869, "grad_norm": 6.027830600738525, "learning_rate": 1.0864042359736856e-05, "loss": 4.0439, "step": 29380 }, { "epoch": 2.351576252200352, "grad_norm": 5.188173294067383, "learning_rate": 1.0850671230678718e-05, "loss": 4.2065, "step": 29390 }, { "epoch": 2.3523763802208353, "grad_norm": 4.097430229187012, "learning_rate": 1.0837300101620581e-05, "loss": 4.344, "step": 29400 }, { "epoch": 2.3531765082413187, "grad_norm": 9.700491905212402, "learning_rate": 1.0823928972562444e-05, "loss": 4.162, "step": 29410 }, { "epoch": 2.3539766362618018, "grad_norm": 4.775502681732178, "learning_rate": 1.0810557843504306e-05, "loss": 4.142, "step": 29420 }, { "epoch": 2.354776764282285, "grad_norm": 4.824921131134033, "learning_rate": 1.0797186714446169e-05, "loss": 4.3454, "step": 29430 }, { "epoch": 2.3555768923027687, "grad_norm": 6.839511394500732, "learning_rate": 1.0783815585388032e-05, "loss": 4.3381, "step": 29440 }, { "epoch": 2.3563770203232517, "grad_norm": 6.512791156768799, "learning_rate": 1.0770444456329895e-05, "loss": 4.0973, "step": 29450 }, { "epoch": 2.357177148343735, "grad_norm": 5.187317848205566, "learning_rate": 1.0757073327271755e-05, "loss": 4.1051, "step": 29460 }, { "epoch": 
2.357977276364218, "grad_norm": 4.162054061889648, "learning_rate": 1.0743702198213618e-05, "loss": 4.2611, "step": 29470 }, { "epoch": 2.3587774043847016, "grad_norm": 5.449061870574951, "learning_rate": 1.0730331069155481e-05, "loss": 4.1903, "step": 29480 }, { "epoch": 2.3595775324051846, "grad_norm": 4.118699550628662, "learning_rate": 1.0716959940097342e-05, "loss": 4.2631, "step": 29490 }, { "epoch": 2.360377660425668, "grad_norm": 7.31171178817749, "learning_rate": 1.0703588811039205e-05, "loss": 4.2629, "step": 29500 }, { "epoch": 2.3611777884461516, "grad_norm": 5.051353931427002, "learning_rate": 1.0690217681981067e-05, "loss": 4.1623, "step": 29510 }, { "epoch": 2.3619779164666346, "grad_norm": 6.965731620788574, "learning_rate": 1.067684655292293e-05, "loss": 4.295, "step": 29520 }, { "epoch": 2.362778044487118, "grad_norm": 5.027985095977783, "learning_rate": 1.0663475423864791e-05, "loss": 4.1316, "step": 29530 }, { "epoch": 2.363578172507601, "grad_norm": 4.405726432800293, "learning_rate": 1.0650104294806654e-05, "loss": 4.3583, "step": 29540 }, { "epoch": 2.3643783005280845, "grad_norm": 5.971738815307617, "learning_rate": 1.0636733165748516e-05, "loss": 4.2283, "step": 29550 }, { "epoch": 2.3651784285485675, "grad_norm": 6.3857269287109375, "learning_rate": 1.0623362036690379e-05, "loss": 4.2043, "step": 29560 }, { "epoch": 2.365978556569051, "grad_norm": 7.309497833251953, "learning_rate": 1.060999090763224e-05, "loss": 4.274, "step": 29570 }, { "epoch": 2.3667786845895344, "grad_norm": 5.590941905975342, "learning_rate": 1.0596619778574103e-05, "loss": 4.3008, "step": 29580 }, { "epoch": 2.3675788126100175, "grad_norm": 8.809772491455078, "learning_rate": 1.0583248649515965e-05, "loss": 4.1339, "step": 29590 }, { "epoch": 2.368378940630501, "grad_norm": 3.935534954071045, "learning_rate": 1.0569877520457828e-05, "loss": 4.3561, "step": 29600 }, { "epoch": 2.3691790686509844, "grad_norm": 6.036637783050537, "learning_rate": 
1.0556506391399689e-05, "loss": 4.1984, "step": 29610 }, { "epoch": 2.3699791966714674, "grad_norm": 5.820141315460205, "learning_rate": 1.0543135262341552e-05, "loss": 4.211, "step": 29620 }, { "epoch": 2.370779324691951, "grad_norm": 6.568235397338867, "learning_rate": 1.0529764133283414e-05, "loss": 4.3565, "step": 29630 }, { "epoch": 2.371579452712434, "grad_norm": 4.8180060386657715, "learning_rate": 1.0516393004225277e-05, "loss": 4.3178, "step": 29640 }, { "epoch": 2.3723795807329173, "grad_norm": 6.591854572296143, "learning_rate": 1.050302187516714e-05, "loss": 4.3688, "step": 29650 }, { "epoch": 2.3731797087534003, "grad_norm": 4.703768730163574, "learning_rate": 1.0489650746109003e-05, "loss": 4.3742, "step": 29660 }, { "epoch": 2.373979836773884, "grad_norm": 5.566431045532227, "learning_rate": 1.0476279617050865e-05, "loss": 4.2257, "step": 29670 }, { "epoch": 2.3747799647943673, "grad_norm": 4.521890640258789, "learning_rate": 1.0462908487992728e-05, "loss": 4.4083, "step": 29680 }, { "epoch": 2.3755800928148503, "grad_norm": 4.3846025466918945, "learning_rate": 1.0449537358934589e-05, "loss": 4.1587, "step": 29690 }, { "epoch": 2.3763802208353337, "grad_norm": 4.914963245391846, "learning_rate": 1.0436166229876452e-05, "loss": 4.2169, "step": 29700 }, { "epoch": 2.3771803488558167, "grad_norm": 4.025776386260986, "learning_rate": 1.0422795100818314e-05, "loss": 4.3142, "step": 29710 }, { "epoch": 2.3779804768763, "grad_norm": 4.935368537902832, "learning_rate": 1.0409423971760177e-05, "loss": 4.328, "step": 29720 }, { "epoch": 2.3787806048967837, "grad_norm": 4.549187660217285, "learning_rate": 1.0396052842702038e-05, "loss": 4.2335, "step": 29730 }, { "epoch": 2.3795807329172667, "grad_norm": 4.583531856536865, "learning_rate": 1.03826817136439e-05, "loss": 4.2718, "step": 29740 }, { "epoch": 2.38038086093775, "grad_norm": 5.972599983215332, "learning_rate": 1.0369310584585763e-05, "loss": 4.279, "step": 29750 }, { "epoch": 2.381180988958233, 
"grad_norm": 4.461510181427002, "learning_rate": 1.0355939455527624e-05, "loss": 4.3608, "step": 29760 }, { "epoch": 2.3819811169787166, "grad_norm": 5.027605056762695, "learning_rate": 1.0342568326469487e-05, "loss": 4.3147, "step": 29770 }, { "epoch": 2.3827812449992, "grad_norm": 5.157301425933838, "learning_rate": 1.032919719741135e-05, "loss": 4.3418, "step": 29780 }, { "epoch": 2.383581373019683, "grad_norm": 6.807774543762207, "learning_rate": 1.0315826068353212e-05, "loss": 4.1424, "step": 29790 }, { "epoch": 2.3843815010401666, "grad_norm": 4.789431095123291, "learning_rate": 1.0302454939295073e-05, "loss": 4.2169, "step": 29800 }, { "epoch": 2.3851816290606496, "grad_norm": 4.129378318786621, "learning_rate": 1.0289083810236936e-05, "loss": 4.1229, "step": 29810 }, { "epoch": 2.385981757081133, "grad_norm": 5.426341533660889, "learning_rate": 1.0275712681178799e-05, "loss": 4.2576, "step": 29820 }, { "epoch": 2.386781885101616, "grad_norm": 4.135334491729736, "learning_rate": 1.0262341552120661e-05, "loss": 4.2543, "step": 29830 }, { "epoch": 2.3875820131220995, "grad_norm": 5.468907833099365, "learning_rate": 1.0248970423062522e-05, "loss": 4.4257, "step": 29840 }, { "epoch": 2.388382141142583, "grad_norm": 4.926033020019531, "learning_rate": 1.0235599294004385e-05, "loss": 4.3505, "step": 29850 }, { "epoch": 2.389182269163066, "grad_norm": 6.9714155197143555, "learning_rate": 1.0222228164946248e-05, "loss": 4.2705, "step": 29860 }, { "epoch": 2.3899823971835494, "grad_norm": 5.102124214172363, "learning_rate": 1.020885703588811e-05, "loss": 4.2783, "step": 29870 }, { "epoch": 2.390782525204033, "grad_norm": 4.479608535766602, "learning_rate": 1.0195485906829973e-05, "loss": 4.3488, "step": 29880 }, { "epoch": 2.391582653224516, "grad_norm": 8.478421211242676, "learning_rate": 1.0182114777771836e-05, "loss": 4.1135, "step": 29890 }, { "epoch": 2.3923827812449994, "grad_norm": 8.210994720458984, "learning_rate": 1.0168743648713699e-05, "loss": 4.2423, 
"step": 29900 }, { "epoch": 2.3931829092654824, "grad_norm": 5.55629825592041, "learning_rate": 1.0155372519655561e-05, "loss": 4.3846, "step": 29910 }, { "epoch": 2.393983037285966, "grad_norm": 4.0907487869262695, "learning_rate": 1.0142001390597422e-05, "loss": 4.117, "step": 29920 }, { "epoch": 2.394783165306449, "grad_norm": 4.413903713226318, "learning_rate": 1.0128630261539285e-05, "loss": 4.4148, "step": 29930 }, { "epoch": 2.3955832933269323, "grad_norm": 5.763768672943115, "learning_rate": 1.0115259132481148e-05, "loss": 4.3248, "step": 29940 }, { "epoch": 2.3963834213474158, "grad_norm": 4.842255115509033, "learning_rate": 1.010188800342301e-05, "loss": 4.1066, "step": 29950 }, { "epoch": 2.397183549367899, "grad_norm": 5.915595054626465, "learning_rate": 1.0088516874364871e-05, "loss": 4.2386, "step": 29960 }, { "epoch": 2.3979836773883823, "grad_norm": 6.0812201499938965, "learning_rate": 1.0075145745306734e-05, "loss": 4.2461, "step": 29970 }, { "epoch": 2.3987838054088653, "grad_norm": 5.185386657714844, "learning_rate": 1.0061774616248597e-05, "loss": 4.0738, "step": 29980 }, { "epoch": 2.3995839334293487, "grad_norm": 6.472476482391357, "learning_rate": 1.004840348719046e-05, "loss": 4.3379, "step": 29990 }, { "epoch": 2.4003840614498317, "grad_norm": 5.108096599578857, "learning_rate": 1.003503235813232e-05, "loss": 4.1048, "step": 30000 }, { "epoch": 2.401184189470315, "grad_norm": 6.5041890144348145, "learning_rate": 1.0021661229074183e-05, "loss": 4.2413, "step": 30010 }, { "epoch": 2.4019843174907987, "grad_norm": 3.8431293964385986, "learning_rate": 1.0008290100016046e-05, "loss": 4.1759, "step": 30020 }, { "epoch": 2.4027844455112817, "grad_norm": 4.535530090332031, "learning_rate": 9.994918970957909e-06, "loss": 4.1232, "step": 30030 }, { "epoch": 2.403584573531765, "grad_norm": 7.020683288574219, "learning_rate": 9.98154784189977e-06, "loss": 4.1428, "step": 30040 }, { "epoch": 2.4043847015522486, "grad_norm": 5.312134742736816, 
"learning_rate": 9.968176712841632e-06, "loss": 4.2044, "step": 30050 }, { "epoch": 2.4051848295727316, "grad_norm": 5.777432918548584, "learning_rate": 9.954805583783495e-06, "loss": 4.3428, "step": 30060 }, { "epoch": 2.405984957593215, "grad_norm": 6.370398044586182, "learning_rate": 9.941434454725358e-06, "loss": 4.2981, "step": 30070 }, { "epoch": 2.406785085613698, "grad_norm": 4.892792701721191, "learning_rate": 9.92806332566722e-06, "loss": 4.3618, "step": 30080 }, { "epoch": 2.4075852136341815, "grad_norm": 5.481409072875977, "learning_rate": 9.914692196609083e-06, "loss": 4.2313, "step": 30090 }, { "epoch": 2.4083853416546646, "grad_norm": 7.531737804412842, "learning_rate": 9.901321067550946e-06, "loss": 4.1667, "step": 30100 }, { "epoch": 2.409185469675148, "grad_norm": 4.554537296295166, "learning_rate": 9.887949938492807e-06, "loss": 4.328, "step": 30110 }, { "epoch": 2.4099855976956315, "grad_norm": 7.280032157897949, "learning_rate": 9.87457880943467e-06, "loss": 4.3348, "step": 30120 }, { "epoch": 2.4107857257161145, "grad_norm": 5.1274824142456055, "learning_rate": 9.861207680376532e-06, "loss": 4.2058, "step": 30130 }, { "epoch": 2.411585853736598, "grad_norm": 5.370729923248291, "learning_rate": 9.847836551318395e-06, "loss": 4.1759, "step": 30140 }, { "epoch": 2.412385981757081, "grad_norm": 5.877347946166992, "learning_rate": 9.834465422260256e-06, "loss": 4.3684, "step": 30150 }, { "epoch": 2.4131861097775644, "grad_norm": 3.648756742477417, "learning_rate": 9.821094293202118e-06, "loss": 4.0863, "step": 30160 }, { "epoch": 2.4139862377980474, "grad_norm": 6.443056106567383, "learning_rate": 9.807723164143981e-06, "loss": 4.2268, "step": 30170 }, { "epoch": 2.414786365818531, "grad_norm": 5.324306488037109, "learning_rate": 9.794352035085844e-06, "loss": 4.1204, "step": 30180 }, { "epoch": 2.4155864938390144, "grad_norm": 6.442959308624268, "learning_rate": 9.780980906027705e-06, "loss": 4.1933, "step": 30190 }, { "epoch": 2.4163866218594974, 
"grad_norm": 5.393689155578613, "learning_rate": 9.767609776969567e-06, "loss": 4.2285, "step": 30200 }, { "epoch": 2.417186749879981, "grad_norm": 5.3242573738098145, "learning_rate": 9.75423864791143e-06, "loss": 4.24, "step": 30210 }, { "epoch": 2.4179868779004643, "grad_norm": 4.855564594268799, "learning_rate": 9.740867518853293e-06, "loss": 4.2122, "step": 30220 }, { "epoch": 2.4187870059209473, "grad_norm": 7.359550952911377, "learning_rate": 9.727496389795154e-06, "loss": 4.2153, "step": 30230 }, { "epoch": 2.4195871339414308, "grad_norm": 6.255536079406738, "learning_rate": 9.714125260737017e-06, "loss": 4.243, "step": 30240 }, { "epoch": 2.420387261961914, "grad_norm": 11.542984962463379, "learning_rate": 9.70075413167888e-06, "loss": 4.1648, "step": 30250 }, { "epoch": 2.4211873899823972, "grad_norm": 3.8510329723358154, "learning_rate": 9.687383002620742e-06, "loss": 4.2724, "step": 30260 }, { "epoch": 2.4219875180028803, "grad_norm": 6.371697425842285, "learning_rate": 9.674011873562603e-06, "loss": 4.3379, "step": 30270 }, { "epoch": 2.4227876460233637, "grad_norm": 5.638033866882324, "learning_rate": 9.660640744504466e-06, "loss": 4.243, "step": 30280 }, { "epoch": 2.423587774043847, "grad_norm": 4.4373698234558105, "learning_rate": 9.647269615446328e-06, "loss": 4.0961, "step": 30290 }, { "epoch": 2.42438790206433, "grad_norm": 5.150527000427246, "learning_rate": 9.633898486388191e-06, "loss": 4.4039, "step": 30300 }, { "epoch": 2.4251880300848137, "grad_norm": 3.810324192047119, "learning_rate": 9.620527357330054e-06, "loss": 4.4131, "step": 30310 }, { "epoch": 2.4259881581052967, "grad_norm": 4.635262489318848, "learning_rate": 9.607156228271916e-06, "loss": 4.3309, "step": 30320 }, { "epoch": 2.42678828612578, "grad_norm": 4.7242021560668945, "learning_rate": 9.593785099213779e-06, "loss": 4.4903, "step": 30330 }, { "epoch": 2.4275884141462636, "grad_norm": 6.102695941925049, "learning_rate": 9.58041397015564e-06, "loss": 4.1923, "step": 30340 }, 
{ "epoch": 2.4283885421667466, "grad_norm": 7.34291934967041, "learning_rate": 9.567042841097503e-06, "loss": 4.2971, "step": 30350 }, { "epoch": 2.42918867018723, "grad_norm": 5.011170864105225, "learning_rate": 9.553671712039365e-06, "loss": 4.3888, "step": 30360 }, { "epoch": 2.429988798207713, "grad_norm": 5.15313720703125, "learning_rate": 9.540300582981228e-06, "loss": 4.0506, "step": 30370 }, { "epoch": 2.4307889262281965, "grad_norm": 3.9554715156555176, "learning_rate": 9.526929453923089e-06, "loss": 4.3425, "step": 30380 }, { "epoch": 2.43158905424868, "grad_norm": 7.2298479080200195, "learning_rate": 9.513558324864952e-06, "loss": 4.4475, "step": 30390 }, { "epoch": 2.432389182269163, "grad_norm": 3.8345696926116943, "learning_rate": 9.500187195806814e-06, "loss": 4.0772, "step": 30400 }, { "epoch": 2.4331893102896465, "grad_norm": 5.277257442474365, "learning_rate": 9.486816066748677e-06, "loss": 4.214, "step": 30410 }, { "epoch": 2.4339894383101295, "grad_norm": 4.5606513023376465, "learning_rate": 9.473444937690538e-06, "loss": 4.2904, "step": 30420 }, { "epoch": 2.434789566330613, "grad_norm": 6.693448543548584, "learning_rate": 9.460073808632401e-06, "loss": 4.3922, "step": 30430 }, { "epoch": 2.435589694351096, "grad_norm": 6.090394973754883, "learning_rate": 9.446702679574264e-06, "loss": 4.1605, "step": 30440 }, { "epoch": 2.4363898223715794, "grad_norm": 5.3264479637146, "learning_rate": 9.433331550516126e-06, "loss": 4.3312, "step": 30450 }, { "epoch": 2.437189950392063, "grad_norm": 5.852473258972168, "learning_rate": 9.419960421457987e-06, "loss": 4.1236, "step": 30460 }, { "epoch": 2.437990078412546, "grad_norm": 7.659939765930176, "learning_rate": 9.40658929239985e-06, "loss": 4.5643, "step": 30470 }, { "epoch": 2.4387902064330294, "grad_norm": 3.983794927597046, "learning_rate": 9.393218163341713e-06, "loss": 4.2261, "step": 30480 }, { "epoch": 2.439590334453513, "grad_norm": 5.556244373321533, "learning_rate": 9.379847034283575e-06, 
"loss": 4.3164, "step": 30490 }, { "epoch": 2.440390462473996, "grad_norm": 4.4613213539123535, "learning_rate": 9.366475905225438e-06, "loss": 4.2227, "step": 30500 }, { "epoch": 2.4411905904944793, "grad_norm": 4.627000331878662, "learning_rate": 9.3531047761673e-06, "loss": 4.442, "step": 30510 }, { "epoch": 2.4419907185149623, "grad_norm": 8.667407989501953, "learning_rate": 9.339733647109163e-06, "loss": 4.1635, "step": 30520 }, { "epoch": 2.4427908465354458, "grad_norm": 10.679652214050293, "learning_rate": 9.326362518051026e-06, "loss": 4.3699, "step": 30530 }, { "epoch": 2.4435909745559288, "grad_norm": 7.4805006980896, "learning_rate": 9.312991388992887e-06, "loss": 4.2092, "step": 30540 }, { "epoch": 2.4443911025764122, "grad_norm": 4.780892372131348, "learning_rate": 9.29962025993475e-06, "loss": 4.3487, "step": 30550 }, { "epoch": 2.4451912305968957, "grad_norm": 5.3561930656433105, "learning_rate": 9.286249130876612e-06, "loss": 4.1617, "step": 30560 }, { "epoch": 2.4459913586173787, "grad_norm": 4.493958950042725, "learning_rate": 9.272878001818475e-06, "loss": 4.2219, "step": 30570 }, { "epoch": 2.446791486637862, "grad_norm": 4.278980255126953, "learning_rate": 9.259506872760336e-06, "loss": 4.1333, "step": 30580 }, { "epoch": 2.447591614658345, "grad_norm": 4.631113052368164, "learning_rate": 9.246135743702199e-06, "loss": 4.2754, "step": 30590 }, { "epoch": 2.4483917426788286, "grad_norm": 5.248873233795166, "learning_rate": 9.232764614644062e-06, "loss": 4.2683, "step": 30600 }, { "epoch": 2.4491918706993117, "grad_norm": 3.8190808296203613, "learning_rate": 9.219393485585923e-06, "loss": 4.3059, "step": 30610 }, { "epoch": 2.449991998719795, "grad_norm": 6.609463214874268, "learning_rate": 9.206022356527785e-06, "loss": 4.3925, "step": 30620 }, { "epoch": 2.4507921267402786, "grad_norm": 5.2939581871032715, "learning_rate": 9.192651227469648e-06, "loss": 4.3248, "step": 30630 }, { "epoch": 2.4515922547607616, "grad_norm": 8.84293270111084, 
"learning_rate": 9.17928009841151e-06, "loss": 4.2664, "step": 30640 }, { "epoch": 2.452392382781245, "grad_norm": 6.027024269104004, "learning_rate": 9.165908969353372e-06, "loss": 4.2791, "step": 30650 }, { "epoch": 2.4531925108017285, "grad_norm": 6.528242588043213, "learning_rate": 9.152537840295234e-06, "loss": 4.1843, "step": 30660 }, { "epoch": 2.4539926388222115, "grad_norm": 6.670463562011719, "learning_rate": 9.139166711237097e-06, "loss": 4.0791, "step": 30670 }, { "epoch": 2.454792766842695, "grad_norm": 4.609775066375732, "learning_rate": 9.12579558217896e-06, "loss": 4.1602, "step": 30680 }, { "epoch": 2.455592894863178, "grad_norm": 5.768373012542725, "learning_rate": 9.11242445312082e-06, "loss": 4.1335, "step": 30690 }, { "epoch": 2.4563930228836615, "grad_norm": 5.554232120513916, "learning_rate": 9.099053324062683e-06, "loss": 4.1415, "step": 30700 }, { "epoch": 2.4571931509041445, "grad_norm": 5.334545612335205, "learning_rate": 9.085682195004546e-06, "loss": 4.2514, "step": 30710 }, { "epoch": 2.457993278924628, "grad_norm": 11.706725120544434, "learning_rate": 9.072311065946409e-06, "loss": 4.3661, "step": 30720 }, { "epoch": 2.4587934069451114, "grad_norm": 9.615589141845703, "learning_rate": 9.058939936888271e-06, "loss": 4.1575, "step": 30730 }, { "epoch": 2.4595935349655944, "grad_norm": 8.476457595825195, "learning_rate": 9.045568807830134e-06, "loss": 4.3892, "step": 30740 }, { "epoch": 2.460393662986078, "grad_norm": 8.876557350158691, "learning_rate": 9.032197678771997e-06, "loss": 4.3411, "step": 30750 }, { "epoch": 2.461193791006561, "grad_norm": 4.643357276916504, "learning_rate": 9.01882654971386e-06, "loss": 4.344, "step": 30760 }, { "epoch": 2.4619939190270443, "grad_norm": 4.706933975219727, "learning_rate": 9.00545542065572e-06, "loss": 4.3931, "step": 30770 }, { "epoch": 2.4627940470475274, "grad_norm": 3.6565020084381104, "learning_rate": 8.992084291597583e-06, "loss": 4.2277, "step": 30780 }, { "epoch": 2.463594175068011, 
"grad_norm": 6.164667129516602, "learning_rate": 8.978713162539446e-06, "loss": 4.217, "step": 30790 }, { "epoch": 2.4643943030884943, "grad_norm": 5.391159534454346, "learning_rate": 8.965342033481309e-06, "loss": 4.3037, "step": 30800 }, { "epoch": 2.4651944311089773, "grad_norm": 4.463956832885742, "learning_rate": 8.95197090442317e-06, "loss": 4.174, "step": 30810 }, { "epoch": 2.4659945591294608, "grad_norm": 5.862019062042236, "learning_rate": 8.938599775365032e-06, "loss": 4.152, "step": 30820 }, { "epoch": 2.466794687149944, "grad_norm": 6.314411163330078, "learning_rate": 8.925228646306895e-06, "loss": 4.1766, "step": 30830 }, { "epoch": 2.4675948151704272, "grad_norm": 5.068113803863525, "learning_rate": 8.911857517248758e-06, "loss": 4.3446, "step": 30840 }, { "epoch": 2.4683949431909107, "grad_norm": 4.690631866455078, "learning_rate": 8.898486388190619e-06, "loss": 4.3606, "step": 30850 }, { "epoch": 2.4691950712113937, "grad_norm": 10.336904525756836, "learning_rate": 8.885115259132481e-06, "loss": 4.2696, "step": 30860 }, { "epoch": 2.469995199231877, "grad_norm": 4.266592502593994, "learning_rate": 8.871744130074344e-06, "loss": 4.3406, "step": 30870 }, { "epoch": 2.47079532725236, "grad_norm": 6.125695705413818, "learning_rate": 8.858373001016205e-06, "loss": 4.2, "step": 30880 }, { "epoch": 2.4715954552728436, "grad_norm": 8.705606460571289, "learning_rate": 8.845001871958068e-06, "loss": 4.3633, "step": 30890 }, { "epoch": 2.472395583293327, "grad_norm": 3.629751205444336, "learning_rate": 8.83163074289993e-06, "loss": 4.2029, "step": 30900 }, { "epoch": 2.47319571131381, "grad_norm": 5.424594879150391, "learning_rate": 8.818259613841793e-06, "loss": 4.3171, "step": 30910 }, { "epoch": 2.4739958393342936, "grad_norm": 5.205273628234863, "learning_rate": 8.804888484783656e-06, "loss": 4.2902, "step": 30920 }, { "epoch": 2.4747959673547766, "grad_norm": 5.293666839599609, "learning_rate": 8.791517355725518e-06, "loss": 4.2665, "step": 30930 }, { 
"epoch": 2.47559609537526, "grad_norm": 7.430743217468262, "learning_rate": 8.778146226667381e-06, "loss": 4.227, "step": 30940 }, { "epoch": 2.4763962233957435, "grad_norm": 7.585148811340332, "learning_rate": 8.764775097609244e-06, "loss": 4.1007, "step": 30950 }, { "epoch": 2.4771963514162265, "grad_norm": 3.8104374408721924, "learning_rate": 8.751403968551105e-06, "loss": 4.2632, "step": 30960 }, { "epoch": 2.47799647943671, "grad_norm": 4.126226902008057, "learning_rate": 8.738032839492967e-06, "loss": 4.2082, "step": 30970 }, { "epoch": 2.478796607457193, "grad_norm": 4.376924514770508, "learning_rate": 8.72466171043483e-06, "loss": 4.2911, "step": 30980 }, { "epoch": 2.4795967354776765, "grad_norm": 5.41331148147583, "learning_rate": 8.711290581376693e-06, "loss": 4.1285, "step": 30990 }, { "epoch": 2.48039686349816, "grad_norm": 5.507528305053711, "learning_rate": 8.697919452318554e-06, "loss": 4.3046, "step": 31000 }, { "epoch": 2.481196991518643, "grad_norm": 5.433472156524658, "learning_rate": 8.684548323260417e-06, "loss": 4.2791, "step": 31010 }, { "epoch": 2.4819971195391264, "grad_norm": 5.576103210449219, "learning_rate": 8.67117719420228e-06, "loss": 4.3817, "step": 31020 }, { "epoch": 2.4827972475596094, "grad_norm": 8.479930877685547, "learning_rate": 8.659143178049955e-06, "loss": 4.4041, "step": 31030 }, { "epoch": 2.483597375580093, "grad_norm": 10.624326705932617, "learning_rate": 8.645772048991816e-06, "loss": 4.2272, "step": 31040 }, { "epoch": 2.484397503600576, "grad_norm": 9.005964279174805, "learning_rate": 8.632400919933679e-06, "loss": 4.2243, "step": 31050 }, { "epoch": 2.4851976316210593, "grad_norm": 5.739156723022461, "learning_rate": 8.619029790875541e-06, "loss": 4.2756, "step": 31060 }, { "epoch": 2.485997759641543, "grad_norm": 4.491320610046387, "learning_rate": 8.605658661817404e-06, "loss": 4.307, "step": 31070 }, { "epoch": 2.486797887662026, "grad_norm": 5.620110988616943, "learning_rate": 8.592287532759267e-06, "loss": 
4.1882, "step": 31080 }, { "epoch": 2.4875980156825093, "grad_norm": 7.674624443054199, "learning_rate": 8.57891640370113e-06, "loss": 4.2893, "step": 31090 }, { "epoch": 2.4883981437029927, "grad_norm": 7.919490337371826, "learning_rate": 8.565545274642992e-06, "loss": 4.2795, "step": 31100 }, { "epoch": 2.4891982717234757, "grad_norm": 5.725222110748291, "learning_rate": 8.552174145584855e-06, "loss": 4.2809, "step": 31110 }, { "epoch": 2.489998399743959, "grad_norm": 5.071161270141602, "learning_rate": 8.538803016526716e-06, "loss": 4.0492, "step": 31120 }, { "epoch": 2.490798527764442, "grad_norm": 6.060359954833984, "learning_rate": 8.525431887468579e-06, "loss": 4.2231, "step": 31130 }, { "epoch": 2.4915986557849257, "grad_norm": 5.7602081298828125, "learning_rate": 8.512060758410441e-06, "loss": 4.0903, "step": 31140 }, { "epoch": 2.4923987838054087, "grad_norm": 6.505451679229736, "learning_rate": 8.498689629352304e-06, "loss": 4.3469, "step": 31150 }, { "epoch": 2.493198911825892, "grad_norm": 4.702768802642822, "learning_rate": 8.485318500294165e-06, "loss": 4.2681, "step": 31160 }, { "epoch": 2.4939990398463756, "grad_norm": 5.579276084899902, "learning_rate": 8.471947371236028e-06, "loss": 4.1579, "step": 31170 }, { "epoch": 2.4947991678668586, "grad_norm": 4.7463788986206055, "learning_rate": 8.45857624217789e-06, "loss": 4.3411, "step": 31180 }, { "epoch": 2.495599295887342, "grad_norm": 5.336600303649902, "learning_rate": 8.445205113119751e-06, "loss": 4.3368, "step": 31190 }, { "epoch": 2.496399423907825, "grad_norm": 5.681230545043945, "learning_rate": 8.431833984061614e-06, "loss": 4.2342, "step": 31200 }, { "epoch": 2.4971995519283086, "grad_norm": 5.4162139892578125, "learning_rate": 8.418462855003477e-06, "loss": 4.2021, "step": 31210 }, { "epoch": 2.4979996799487916, "grad_norm": 4.396571636199951, "learning_rate": 8.40509172594534e-06, "loss": 4.1137, "step": 31220 }, { "epoch": 2.498799807969275, "grad_norm": 5.792074203491211, 
"learning_rate": 8.3917205968872e-06, "loss": 4.1966, "step": 31230 }, { "epoch": 2.4995999359897585, "grad_norm": 4.996471881866455, "learning_rate": 8.378349467829063e-06, "loss": 4.2879, "step": 31240 }, { "epoch": 2.5004000640102415, "grad_norm": 5.074433326721191, "learning_rate": 8.364978338770926e-06, "loss": 4.1484, "step": 31250 }, { "epoch": 2.501200192030725, "grad_norm": 4.65183162689209, "learning_rate": 8.351607209712788e-06, "loss": 4.3771, "step": 31260 }, { "epoch": 2.5020003200512084, "grad_norm": 5.808924198150635, "learning_rate": 8.338236080654651e-06, "loss": 4.205, "step": 31270 }, { "epoch": 2.5028004480716914, "grad_norm": 7.044647216796875, "learning_rate": 8.324864951596514e-06, "loss": 4.2596, "step": 31280 }, { "epoch": 2.503600576092175, "grad_norm": 7.707614898681641, "learning_rate": 8.311493822538376e-06, "loss": 4.1315, "step": 31290 }, { "epoch": 2.504400704112658, "grad_norm": 4.458408355712891, "learning_rate": 8.29812269348024e-06, "loss": 4.3516, "step": 31300 }, { "epoch": 2.5052008321331414, "grad_norm": 5.028105735778809, "learning_rate": 8.2847515644221e-06, "loss": 4.2033, "step": 31310 }, { "epoch": 2.5060009601536244, "grad_norm": 5.130579471588135, "learning_rate": 8.271380435363963e-06, "loss": 4.2853, "step": 31320 }, { "epoch": 2.506801088174108, "grad_norm": 7.856816291809082, "learning_rate": 8.258009306305826e-06, "loss": 4.0916, "step": 31330 }, { "epoch": 2.5076012161945913, "grad_norm": 5.169328212738037, "learning_rate": 8.244638177247688e-06, "loss": 4.2018, "step": 31340 }, { "epoch": 2.5084013442150743, "grad_norm": 6.61760139465332, "learning_rate": 8.23126704818955e-06, "loss": 4.1828, "step": 31350 }, { "epoch": 2.509201472235558, "grad_norm": 4.724680423736572, "learning_rate": 8.217895919131412e-06, "loss": 4.0726, "step": 31360 }, { "epoch": 2.5100016002560412, "grad_norm": 5.668628215789795, "learning_rate": 8.204524790073275e-06, "loss": 4.2675, "step": 31370 }, { "epoch": 2.5108017282765243, 
"grad_norm": 4.8627142906188965, "learning_rate": 8.191153661015137e-06, "loss": 4.2985, "step": 31380 }, { "epoch": 2.5116018562970073, "grad_norm": 5.198115825653076, "learning_rate": 8.177782531956998e-06, "loss": 4.3581, "step": 31390 }, { "epoch": 2.5124019843174907, "grad_norm": 4.514540195465088, "learning_rate": 8.164411402898861e-06, "loss": 4.3062, "step": 31400 }, { "epoch": 2.513202112337974, "grad_norm": 7.25886344909668, "learning_rate": 8.151040273840724e-06, "loss": 4.2991, "step": 31410 }, { "epoch": 2.514002240358457, "grad_norm": 5.974603176116943, "learning_rate": 8.137669144782586e-06, "loss": 4.1022, "step": 31420 }, { "epoch": 2.5148023683789407, "grad_norm": 5.448121547698975, "learning_rate": 8.124298015724447e-06, "loss": 4.1075, "step": 31430 }, { "epoch": 2.515602496399424, "grad_norm": 5.210214614868164, "learning_rate": 8.11092688666631e-06, "loss": 4.2649, "step": 31440 }, { "epoch": 2.516402624419907, "grad_norm": 11.993659019470215, "learning_rate": 8.097555757608173e-06, "loss": 4.2198, "step": 31450 }, { "epoch": 2.5172027524403906, "grad_norm": 4.927745819091797, "learning_rate": 8.084184628550035e-06, "loss": 4.0223, "step": 31460 }, { "epoch": 2.5180028804608736, "grad_norm": 4.729024887084961, "learning_rate": 8.070813499491896e-06, "loss": 4.2895, "step": 31470 }, { "epoch": 2.518803008481357, "grad_norm": 5.384552955627441, "learning_rate": 8.057442370433759e-06, "loss": 4.3527, "step": 31480 }, { "epoch": 2.51960313650184, "grad_norm": 8.20038890838623, "learning_rate": 8.044071241375622e-06, "loss": 4.2892, "step": 31490 }, { "epoch": 2.5204032645223235, "grad_norm": 4.21329927444458, "learning_rate": 8.030700112317484e-06, "loss": 4.4118, "step": 31500 }, { "epoch": 2.521203392542807, "grad_norm": 5.070785999298096, "learning_rate": 8.017328983259347e-06, "loss": 4.4495, "step": 31510 }, { "epoch": 2.52200352056329, "grad_norm": 5.975208759307861, "learning_rate": 8.00395785420121e-06, "loss": 4.2316, "step": 31520 }, { 
"epoch": 2.5228036485837735, "grad_norm": 4.3103718757629395, "learning_rate": 7.990586725143073e-06, "loss": 4.3446, "step": 31530 }, { "epoch": 2.523603776604257, "grad_norm": 4.407105922698975, "learning_rate": 7.977215596084934e-06, "loss": 4.387, "step": 31540 }, { "epoch": 2.52440390462474, "grad_norm": 6.030669212341309, "learning_rate": 7.963844467026796e-06, "loss": 4.4376, "step": 31550 }, { "epoch": 2.525204032645223, "grad_norm": 5.338545799255371, "learning_rate": 7.950473337968659e-06, "loss": 4.2574, "step": 31560 }, { "epoch": 2.5260041606657064, "grad_norm": 7.339935779571533, "learning_rate": 7.937102208910522e-06, "loss": 4.186, "step": 31570 }, { "epoch": 2.52680428868619, "grad_norm": 5.970847129821777, "learning_rate": 7.923731079852383e-06, "loss": 4.3115, "step": 31580 }, { "epoch": 2.527604416706673, "grad_norm": 4.5242228507995605, "learning_rate": 7.910359950794245e-06, "loss": 4.4381, "step": 31590 }, { "epoch": 2.5284045447271564, "grad_norm": 7.288176536560059, "learning_rate": 7.896988821736108e-06, "loss": 4.1498, "step": 31600 }, { "epoch": 2.52920467274764, "grad_norm": 4.609799861907959, "learning_rate": 7.88361769267797e-06, "loss": 4.3975, "step": 31610 }, { "epoch": 2.530004800768123, "grad_norm": 6.066311359405518, "learning_rate": 7.870246563619832e-06, "loss": 4.2106, "step": 31620 }, { "epoch": 2.5308049287886063, "grad_norm": 4.771033763885498, "learning_rate": 7.856875434561694e-06, "loss": 4.4468, "step": 31630 }, { "epoch": 2.5316050568090893, "grad_norm": 5.995556831359863, "learning_rate": 7.843504305503557e-06, "loss": 4.2434, "step": 31640 }, { "epoch": 2.5324051848295728, "grad_norm": 6.122812747955322, "learning_rate": 7.83013317644542e-06, "loss": 4.2819, "step": 31650 }, { "epoch": 2.533205312850056, "grad_norm": 6.691094875335693, "learning_rate": 7.81676204738728e-06, "loss": 4.3078, "step": 31660 }, { "epoch": 2.5340054408705392, "grad_norm": 5.067676544189453, "learning_rate": 7.803390918329143e-06, "loss": 
4.1885, "step": 31670 }, { "epoch": 2.5348055688910227, "grad_norm": 5.308253288269043, "learning_rate": 7.790019789271006e-06, "loss": 4.1887, "step": 31680 }, { "epoch": 2.5356056969115057, "grad_norm": 5.868967533111572, "learning_rate": 7.776648660212869e-06, "loss": 4.1599, "step": 31690 }, { "epoch": 2.536405824931989, "grad_norm": 5.12509298324585, "learning_rate": 7.76327753115473e-06, "loss": 4.1866, "step": 31700 }, { "epoch": 2.5372059529524726, "grad_norm": 7.411617279052734, "learning_rate": 7.749906402096593e-06, "loss": 4.3881, "step": 31710 }, { "epoch": 2.5380060809729557, "grad_norm": 5.138408660888672, "learning_rate": 7.736535273038455e-06, "loss": 4.2416, "step": 31720 }, { "epoch": 2.5388062089934387, "grad_norm": 4.619532108306885, "learning_rate": 7.723164143980318e-06, "loss": 4.361, "step": 31730 }, { "epoch": 2.539606337013922, "grad_norm": 4.554988861083984, "learning_rate": 7.70979301492218e-06, "loss": 4.2662, "step": 31740 }, { "epoch": 2.5404064650344056, "grad_norm": 5.840234279632568, "learning_rate": 7.696421885864043e-06, "loss": 4.1966, "step": 31750 }, { "epoch": 2.5412065930548886, "grad_norm": 4.803789138793945, "learning_rate": 7.683050756805906e-06, "loss": 4.1967, "step": 31760 }, { "epoch": 2.542006721075372, "grad_norm": 7.9466142654418945, "learning_rate": 7.669679627747767e-06, "loss": 4.3066, "step": 31770 }, { "epoch": 2.5428068490958555, "grad_norm": 5.5534844398498535, "learning_rate": 7.65630849868963e-06, "loss": 4.2559, "step": 31780 }, { "epoch": 2.5436069771163385, "grad_norm": 4.615208148956299, "learning_rate": 7.642937369631492e-06, "loss": 4.0991, "step": 31790 }, { "epoch": 2.544407105136822, "grad_norm": 5.248322486877441, "learning_rate": 7.629566240573355e-06, "loss": 4.2412, "step": 31800 }, { "epoch": 2.545207233157305, "grad_norm": 5.644094467163086, "learning_rate": 7.616195111515216e-06, "loss": 4.1779, "step": 31810 }, { "epoch": 2.5460073611777885, "grad_norm": 5.409252643585205, 
"learning_rate": 7.602823982457079e-06, "loss": 4.4094, "step": 31820 }, { "epoch": 2.5468074891982715, "grad_norm": 6.40919303894043, "learning_rate": 7.589452853398941e-06, "loss": 4.1346, "step": 31830 }, { "epoch": 2.547607617218755, "grad_norm": 4.60876989364624, "learning_rate": 7.576081724340804e-06, "loss": 4.2401, "step": 31840 }, { "epoch": 2.5484077452392384, "grad_norm": 4.963344097137451, "learning_rate": 7.562710595282665e-06, "loss": 4.396, "step": 31850 }, { "epoch": 2.5492078732597214, "grad_norm": 8.271472930908203, "learning_rate": 7.549339466224528e-06, "loss": 4.3553, "step": 31860 }, { "epoch": 2.550008001280205, "grad_norm": 5.118110179901123, "learning_rate": 7.5359683371663905e-06, "loss": 4.1426, "step": 31870 }, { "epoch": 2.5508081293006883, "grad_norm": 6.315212249755859, "learning_rate": 7.522597208108253e-06, "loss": 4.132, "step": 31880 }, { "epoch": 2.5516082573211714, "grad_norm": 6.116096019744873, "learning_rate": 7.509226079050115e-06, "loss": 4.1163, "step": 31890 }, { "epoch": 2.552408385341655, "grad_norm": 7.4716877937316895, "learning_rate": 7.495854949991978e-06, "loss": 4.3314, "step": 31900 }, { "epoch": 2.553208513362138, "grad_norm": 6.010276794433594, "learning_rate": 7.48248382093384e-06, "loss": 4.3779, "step": 31910 }, { "epoch": 2.5540086413826213, "grad_norm": 5.290060043334961, "learning_rate": 7.469112691875703e-06, "loss": 4.3454, "step": 31920 }, { "epoch": 2.5548087694031043, "grad_norm": 4.201476097106934, "learning_rate": 7.455741562817564e-06, "loss": 4.3305, "step": 31930 }, { "epoch": 2.5556088974235878, "grad_norm": 4.023552894592285, "learning_rate": 7.442370433759427e-06, "loss": 4.2372, "step": 31940 }, { "epoch": 2.5564090254440712, "grad_norm": 6.457052707672119, "learning_rate": 7.4289993047012894e-06, "loss": 4.3504, "step": 31950 }, { "epoch": 2.5572091534645542, "grad_norm": 5.600893497467041, "learning_rate": 7.415628175643152e-06, "loss": 4.2232, "step": 31960 }, { "epoch": 
2.5580092814850377, "grad_norm": 7.067571640014648, "learning_rate": 7.402257046585014e-06, "loss": 4.3225, "step": 31970 }, { "epoch": 2.558809409505521, "grad_norm": 5.7097249031066895, "learning_rate": 7.388885917526877e-06, "loss": 4.3295, "step": 31980 }, { "epoch": 2.559609537526004, "grad_norm": 4.277981758117676, "learning_rate": 7.375514788468739e-06, "loss": 4.1644, "step": 31990 }, { "epoch": 2.560409665546487, "grad_norm": 5.122984886169434, "learning_rate": 7.362143659410602e-06, "loss": 4.0891, "step": 32000 }, { "epoch": 2.5612097935669706, "grad_norm": 4.904813289642334, "learning_rate": 7.348772530352463e-06, "loss": 4.3021, "step": 32010 }, { "epoch": 2.562009921587454, "grad_norm": 5.248583793640137, "learning_rate": 7.335401401294326e-06, "loss": 4.2706, "step": 32020 }, { "epoch": 2.562810049607937, "grad_norm": 4.642624855041504, "learning_rate": 7.3220302722361884e-06, "loss": 4.3708, "step": 32030 }, { "epoch": 2.5636101776284206, "grad_norm": 4.165192127227783, "learning_rate": 7.3086591431780494e-06, "loss": 4.2576, "step": 32040 }, { "epoch": 2.564410305648904, "grad_norm": 4.46279239654541, "learning_rate": 7.295288014119912e-06, "loss": 4.3745, "step": 32050 }, { "epoch": 2.565210433669387, "grad_norm": 6.376815319061279, "learning_rate": 7.281916885061775e-06, "loss": 4.2703, "step": 32060 }, { "epoch": 2.5660105616898705, "grad_norm": 5.364983081817627, "learning_rate": 7.2685457560036375e-06, "loss": 4.2359, "step": 32070 }, { "epoch": 2.5668106897103535, "grad_norm": 5.545655727386475, "learning_rate": 7.255174626945499e-06, "loss": 4.3481, "step": 32080 }, { "epoch": 2.567610817730837, "grad_norm": 6.107922077178955, "learning_rate": 7.241803497887362e-06, "loss": 4.2213, "step": 32090 }, { "epoch": 2.56841094575132, "grad_norm": 8.90227222442627, "learning_rate": 7.228432368829225e-06, "loss": 4.0856, "step": 32100 }, { "epoch": 2.5692110737718035, "grad_norm": 4.109879493713379, "learning_rate": 7.215061239771087e-06, "loss": 
4.232, "step": 32110 }, { "epoch": 2.570011201792287, "grad_norm": 4.951588153839111, "learning_rate": 7.201690110712948e-06, "loss": 4.2644, "step": 32120 }, { "epoch": 2.57081132981277, "grad_norm": 6.807476043701172, "learning_rate": 7.188318981654811e-06, "loss": 4.2228, "step": 32130 }, { "epoch": 2.5716114578332534, "grad_norm": 4.3869547843933105, "learning_rate": 7.174947852596674e-06, "loss": 4.2833, "step": 32140 }, { "epoch": 2.572411585853737, "grad_norm": 7.943798542022705, "learning_rate": 7.1615767235385365e-06, "loss": 4.1522, "step": 32150 }, { "epoch": 2.57321171387422, "grad_norm": 5.016458034515381, "learning_rate": 7.1482055944803975e-06, "loss": 4.276, "step": 32160 }, { "epoch": 2.574011841894703, "grad_norm": 5.562156677246094, "learning_rate": 7.13483446542226e-06, "loss": 4.2845, "step": 32170 }, { "epoch": 2.5748119699151863, "grad_norm": 4.856637477874756, "learning_rate": 7.121463336364123e-06, "loss": 4.2354, "step": 32180 }, { "epoch": 2.57561209793567, "grad_norm": 4.221564769744873, "learning_rate": 7.1080922073059855e-06, "loss": 4.4084, "step": 32190 }, { "epoch": 2.576412225956153, "grad_norm": 5.096288681030273, "learning_rate": 7.094721078247847e-06, "loss": 4.2814, "step": 32200 }, { "epoch": 2.5772123539766363, "grad_norm": 6.132599353790283, "learning_rate": 7.08134994918971e-06, "loss": 4.1574, "step": 32210 }, { "epoch": 2.5780124819971197, "grad_norm": 5.005707740783691, "learning_rate": 7.067978820131573e-06, "loss": 4.1658, "step": 32220 }, { "epoch": 2.5788126100176028, "grad_norm": 4.944260597229004, "learning_rate": 7.0546076910734355e-06, "loss": 4.3081, "step": 32230 }, { "epoch": 2.579612738038086, "grad_norm": 8.648292541503906, "learning_rate": 7.0412365620152964e-06, "loss": 4.2806, "step": 32240 }, { "epoch": 2.5804128660585692, "grad_norm": 6.548149108886719, "learning_rate": 7.027865432957159e-06, "loss": 4.2142, "step": 32250 }, { "epoch": 2.5812129940790527, "grad_norm": 4.361201286315918, "learning_rate": 
7.014494303899022e-06, "loss": 4.1742, "step": 32260 }, { "epoch": 2.5820131220995357, "grad_norm": 5.768430709838867, "learning_rate": 7.0011231748408845e-06, "loss": 4.2943, "step": 32270 }, { "epoch": 2.582813250120019, "grad_norm": 6.053584575653076, "learning_rate": 6.9877520457827455e-06, "loss": 4.2239, "step": 32280 }, { "epoch": 2.5836133781405026, "grad_norm": 7.139733791351318, "learning_rate": 6.974380916724608e-06, "loss": 4.4575, "step": 32290 }, { "epoch": 2.5844135061609856, "grad_norm": 4.825145721435547, "learning_rate": 6.961009787666471e-06, "loss": 4.2602, "step": 32300 }, { "epoch": 2.585213634181469, "grad_norm": 4.957446575164795, "learning_rate": 6.947638658608334e-06, "loss": 4.2541, "step": 32310 }, { "epoch": 2.5860137622019526, "grad_norm": 4.746156215667725, "learning_rate": 6.9342675295501954e-06, "loss": 4.1876, "step": 32320 }, { "epoch": 2.5868138902224356, "grad_norm": 6.32666015625, "learning_rate": 6.920896400492058e-06, "loss": 4.2707, "step": 32330 }, { "epoch": 2.5876140182429186, "grad_norm": 7.385421276092529, "learning_rate": 6.907525271433921e-06, "loss": 4.3383, "step": 32340 }, { "epoch": 2.588414146263402, "grad_norm": 5.54195499420166, "learning_rate": 6.894154142375782e-06, "loss": 4.2667, "step": 32350 }, { "epoch": 2.5892142742838855, "grad_norm": 5.738757133483887, "learning_rate": 6.8807830133176445e-06, "loss": 4.2666, "step": 32360 }, { "epoch": 2.5900144023043685, "grad_norm": 4.932640552520752, "learning_rate": 6.867411884259507e-06, "loss": 4.0864, "step": 32370 }, { "epoch": 2.590814530324852, "grad_norm": 6.3347320556640625, "learning_rate": 6.85404075520137e-06, "loss": 4.2319, "step": 32380 }, { "epoch": 2.5916146583453354, "grad_norm": 6.726009368896484, "learning_rate": 6.840669626143231e-06, "loss": 4.2479, "step": 32390 }, { "epoch": 2.5924147863658185, "grad_norm": 5.636048316955566, "learning_rate": 6.8272984970850936e-06, "loss": 4.2169, "step": 32400 }, { "epoch": 2.593214914386302, "grad_norm": 
5.036827564239502, "learning_rate": 6.813927368026956e-06, "loss": 4.1135, "step": 32410 }, { "epoch": 2.594015042406785, "grad_norm": 9.839925765991211, "learning_rate": 6.800556238968819e-06, "loss": 4.5051, "step": 32420 }, { "epoch": 2.5948151704272684, "grad_norm": 6.498042583465576, "learning_rate": 6.787185109910681e-06, "loss": 4.3001, "step": 32430 }, { "epoch": 2.5956152984477514, "grad_norm": 5.901638031005859, "learning_rate": 6.7738139808525435e-06, "loss": 4.2694, "step": 32440 }, { "epoch": 2.596415426468235, "grad_norm": 4.632312774658203, "learning_rate": 6.760442851794406e-06, "loss": 4.2341, "step": 32450 }, { "epoch": 2.5972155544887183, "grad_norm": 6.943007469177246, "learning_rate": 6.747071722736269e-06, "loss": 4.2814, "step": 32460 }, { "epoch": 2.5980156825092013, "grad_norm": 5.386791229248047, "learning_rate": 6.73370059367813e-06, "loss": 4.2518, "step": 32470 }, { "epoch": 2.598815810529685, "grad_norm": 5.245943069458008, "learning_rate": 6.7203294646199925e-06, "loss": 4.3079, "step": 32480 }, { "epoch": 2.5996159385501683, "grad_norm": 4.878678798675537, "learning_rate": 6.706958335561855e-06, "loss": 4.3576, "step": 32490 }, { "epoch": 2.6004160665706513, "grad_norm": 6.896281719207764, "learning_rate": 6.693587206503718e-06, "loss": 3.9788, "step": 32500 }, { "epoch": 2.6012161945911347, "grad_norm": 11.621658325195312, "learning_rate": 6.68021607744558e-06, "loss": 4.3443, "step": 32510 }, { "epoch": 2.6020163226116177, "grad_norm": 5.696441173553467, "learning_rate": 6.6668449483874425e-06, "loss": 4.2309, "step": 32520 }, { "epoch": 2.602816450632101, "grad_norm": 6.503956317901611, "learning_rate": 6.653473819329305e-06, "loss": 4.195, "step": 32530 }, { "epoch": 2.6036165786525842, "grad_norm": 5.1271209716796875, "learning_rate": 6.640102690271168e-06, "loss": 4.1775, "step": 32540 }, { "epoch": 2.6044167066730677, "grad_norm": 7.013203144073486, "learning_rate": 6.626731561213029e-06, "loss": 4.2677, "step": 32550 }, { 
"epoch": 2.605216834693551, "grad_norm": 5.267161846160889, "learning_rate": 6.6133604321548915e-06, "loss": 4.1407, "step": 32560 }, { "epoch": 2.606016962714034, "grad_norm": 5.86568546295166, "learning_rate": 6.599989303096754e-06, "loss": 4.3156, "step": 32570 }, { "epoch": 2.6068170907345176, "grad_norm": 12.335358619689941, "learning_rate": 6.586618174038617e-06, "loss": 4.2735, "step": 32580 }, { "epoch": 2.607617218755001, "grad_norm": 6.064377784729004, "learning_rate": 6.573247044980478e-06, "loss": 4.2603, "step": 32590 }, { "epoch": 2.608417346775484, "grad_norm": 4.735530376434326, "learning_rate": 6.559875915922341e-06, "loss": 4.2768, "step": 32600 }, { "epoch": 2.609217474795967, "grad_norm": 5.105945587158203, "learning_rate": 6.546504786864203e-06, "loss": 4.2271, "step": 32610 }, { "epoch": 2.6100176028164506, "grad_norm": 4.72859001159668, "learning_rate": 6.533133657806065e-06, "loss": 4.4265, "step": 32620 }, { "epoch": 2.610817730836934, "grad_norm": 5.76402473449707, "learning_rate": 6.519762528747928e-06, "loss": 4.2542, "step": 32630 }, { "epoch": 2.611617858857417, "grad_norm": 4.846867084503174, "learning_rate": 6.5063913996897905e-06, "loss": 4.1611, "step": 32640 }, { "epoch": 2.6124179868779005, "grad_norm": 3.9136717319488525, "learning_rate": 6.493020270631653e-06, "loss": 4.1242, "step": 32650 }, { "epoch": 2.613218114898384, "grad_norm": 5.419270992279053, "learning_rate": 6.479649141573514e-06, "loss": 4.1072, "step": 32660 }, { "epoch": 2.614018242918867, "grad_norm": 4.859950542449951, "learning_rate": 6.466278012515377e-06, "loss": 4.3008, "step": 32670 }, { "epoch": 2.6148183709393504, "grad_norm": 4.817934989929199, "learning_rate": 6.4529068834572396e-06, "loss": 4.2979, "step": 32680 }, { "epoch": 2.6156184989598334, "grad_norm": 6.053598403930664, "learning_rate": 6.439535754399102e-06, "loss": 4.0841, "step": 32690 }, { "epoch": 2.616418626980317, "grad_norm": 6.537989139556885, "learning_rate": 6.426164625340963e-06, 
"loss": 4.2682, "step": 32700 }, { "epoch": 2.6172187550008, "grad_norm": 6.115545749664307, "learning_rate": 6.412793496282826e-06, "loss": 4.3778, "step": 32710 }, { "epoch": 2.6180188830212834, "grad_norm": 9.145062446594238, "learning_rate": 6.399422367224689e-06, "loss": 4.3598, "step": 32720 }, { "epoch": 2.618819011041767, "grad_norm": 4.901673316955566, "learning_rate": 6.386051238166551e-06, "loss": 4.3805, "step": 32730 }, { "epoch": 2.61961913906225, "grad_norm": 5.381241798400879, "learning_rate": 6.372680109108413e-06, "loss": 4.2268, "step": 32740 }, { "epoch": 2.6204192670827333, "grad_norm": 4.830419063568115, "learning_rate": 6.359308980050276e-06, "loss": 4.2856, "step": 32750 }, { "epoch": 2.6212193951032168, "grad_norm": 5.5708794593811035, "learning_rate": 6.3459378509921385e-06, "loss": 4.2111, "step": 32760 }, { "epoch": 2.6220195231237, "grad_norm": 5.884067535400391, "learning_rate": 6.332566721934001e-06, "loss": 4.3577, "step": 32770 }, { "epoch": 2.622819651144183, "grad_norm": 5.066050052642822, "learning_rate": 6.319195592875862e-06, "loss": 4.2514, "step": 32780 }, { "epoch": 2.6236197791646663, "grad_norm": 4.707382678985596, "learning_rate": 6.305824463817725e-06, "loss": 4.1726, "step": 32790 }, { "epoch": 2.6244199071851497, "grad_norm": 5.376534938812256, "learning_rate": 6.292453334759588e-06, "loss": 4.2471, "step": 32800 }, { "epoch": 2.6252200352056327, "grad_norm": 4.097368240356445, "learning_rate": 6.27908220570145e-06, "loss": 4.1882, "step": 32810 }, { "epoch": 2.626020163226116, "grad_norm": 8.935079574584961, "learning_rate": 6.265711076643311e-06, "loss": 4.3797, "step": 32820 }, { "epoch": 2.6268202912465997, "grad_norm": 7.076402187347412, "learning_rate": 6.252339947585174e-06, "loss": 4.1752, "step": 32830 }, { "epoch": 2.6276204192670827, "grad_norm": 6.128974914550781, "learning_rate": 6.238968818527037e-06, "loss": 4.2087, "step": 32840 }, { "epoch": 2.628420547287566, "grad_norm": 5.931004047393799, 
"learning_rate": 6.225597689468899e-06, "loss": 4.2863, "step": 32850 }, { "epoch": 2.629220675308049, "grad_norm": 5.642322540283203, "learning_rate": 6.212226560410762e-06, "loss": 4.3657, "step": 32860 }, { "epoch": 2.6300208033285326, "grad_norm": 5.371090888977051, "learning_rate": 6.198855431352624e-06, "loss": 3.9877, "step": 32870 }, { "epoch": 2.6308209313490156, "grad_norm": 10.01162052154541, "learning_rate": 6.185484302294487e-06, "loss": 4.2464, "step": 32880 }, { "epoch": 2.631621059369499, "grad_norm": 5.889012336730957, "learning_rate": 6.1721131732363484e-06, "loss": 4.3251, "step": 32890 }, { "epoch": 2.6324211873899825, "grad_norm": 5.746575832366943, "learning_rate": 6.15874204417821e-06, "loss": 4.2681, "step": 32900 }, { "epoch": 2.6332213154104656, "grad_norm": 5.833990573883057, "learning_rate": 6.145370915120073e-06, "loss": 4.2323, "step": 32910 }, { "epoch": 2.634021443430949, "grad_norm": 6.475826740264893, "learning_rate": 6.131999786061935e-06, "loss": 4.177, "step": 32920 }, { "epoch": 2.6348215714514325, "grad_norm": 6.790261268615723, "learning_rate": 6.1186286570037975e-06, "loss": 4.1992, "step": 32930 }, { "epoch": 2.6356216994719155, "grad_norm": 6.625553131103516, "learning_rate": 6.105257527945659e-06, "loss": 4.2821, "step": 32940 }, { "epoch": 2.6364218274923985, "grad_norm": 4.441702365875244, "learning_rate": 6.091886398887522e-06, "loss": 4.3954, "step": 32950 }, { "epoch": 2.637221955512882, "grad_norm": 8.516060829162598, "learning_rate": 6.078515269829385e-06, "loss": 4.4075, "step": 32960 }, { "epoch": 2.6380220835333654, "grad_norm": 7.188780784606934, "learning_rate": 6.065144140771247e-06, "loss": 4.1722, "step": 32970 }, { "epoch": 2.6388222115538484, "grad_norm": 4.861379146575928, "learning_rate": 6.051773011713109e-06, "loss": 4.3142, "step": 32980 }, { "epoch": 2.639622339574332, "grad_norm": 8.562806129455566, "learning_rate": 6.038401882654972e-06, "loss": 4.4365, "step": 32990 }, { "epoch": 
2.6404224675948154, "grad_norm": 4.433754920959473, "learning_rate": 6.025030753596834e-06, "loss": 4.2044, "step": 33000 }, { "epoch": 2.6412225956152984, "grad_norm": 5.652956485748291, "learning_rate": 6.0116596245386965e-06, "loss": 4.2038, "step": 33010 }, { "epoch": 2.642022723635782, "grad_norm": 5.2881693840026855, "learning_rate": 5.998288495480558e-06, "loss": 4.2475, "step": 33020 }, { "epoch": 2.642822851656265, "grad_norm": 6.064772129058838, "learning_rate": 5.984917366422421e-06, "loss": 4.3771, "step": 33030 }, { "epoch": 2.6436229796767483, "grad_norm": 4.478077411651611, "learning_rate": 5.971546237364283e-06, "loss": 4.2836, "step": 33040 }, { "epoch": 2.6444231076972313, "grad_norm": 6.518775939941406, "learning_rate": 5.9581751083061455e-06, "loss": 4.2132, "step": 33050 }, { "epoch": 2.645223235717715, "grad_norm": 5.864172458648682, "learning_rate": 5.944803979248008e-06, "loss": 4.1413, "step": 33060 }, { "epoch": 2.6460233637381982, "grad_norm": 6.359694480895996, "learning_rate": 5.931432850189871e-06, "loss": 4.012, "step": 33070 }, { "epoch": 2.6468234917586813, "grad_norm": 4.7483439445495605, "learning_rate": 5.918061721131733e-06, "loss": 4.2365, "step": 33080 }, { "epoch": 2.6476236197791647, "grad_norm": 6.524518013000488, "learning_rate": 5.9046905920735955e-06, "loss": 4.2251, "step": 33090 }, { "epoch": 2.648423747799648, "grad_norm": 4.645505428314209, "learning_rate": 5.891319463015457e-06, "loss": 4.3661, "step": 33100 }, { "epoch": 2.649223875820131, "grad_norm": 4.526218414306641, "learning_rate": 5.87794833395732e-06, "loss": 4.157, "step": 33110 }, { "epoch": 2.6500240038406147, "grad_norm": 6.154730796813965, "learning_rate": 5.864577204899182e-06, "loss": 4.1405, "step": 33120 }, { "epoch": 2.6508241318610977, "grad_norm": 4.572940349578857, "learning_rate": 5.8512060758410445e-06, "loss": 4.3686, "step": 33130 }, { "epoch": 2.651624259881581, "grad_norm": 4.475552558898926, "learning_rate": 5.837834946782906e-06, 
"loss": 4.1899, "step": 33140 }, { "epoch": 2.652424387902064, "grad_norm": 5.439560890197754, "learning_rate": 5.824463817724769e-06, "loss": 4.278, "step": 33150 }, { "epoch": 2.6532245159225476, "grad_norm": 5.904727458953857, "learning_rate": 5.811092688666631e-06, "loss": 4.1599, "step": 33160 }, { "epoch": 2.654024643943031, "grad_norm": 7.036167621612549, "learning_rate": 5.797721559608494e-06, "loss": 4.2771, "step": 33170 }, { "epoch": 2.654824771963514, "grad_norm": 5.601536273956299, "learning_rate": 5.785687543456169e-06, "loss": 4.316, "step": 33180 }, { "epoch": 2.6556248999839975, "grad_norm": 5.479313373565674, "learning_rate": 5.772316414398032e-06, "loss": 4.2582, "step": 33190 }, { "epoch": 2.656425028004481, "grad_norm": 7.038112640380859, "learning_rate": 5.758945285339895e-06, "loss": 4.3412, "step": 33200 }, { "epoch": 2.657225156024964, "grad_norm": 6.468174934387207, "learning_rate": 5.7455741562817566e-06, "loss": 4.3545, "step": 33210 }, { "epoch": 2.658025284045447, "grad_norm": 7.697492599487305, "learning_rate": 5.732203027223619e-06, "loss": 4.4066, "step": 33220 }, { "epoch": 2.6588254120659305, "grad_norm": 4.525124549865723, "learning_rate": 5.718831898165481e-06, "loss": 4.3206, "step": 33230 }, { "epoch": 2.659625540086414, "grad_norm": 3.5019888877868652, "learning_rate": 5.705460769107344e-06, "loss": 4.1181, "step": 33240 }, { "epoch": 2.660425668106897, "grad_norm": 5.208031177520752, "learning_rate": 5.692089640049206e-06, "loss": 4.3428, "step": 33250 }, { "epoch": 2.6612257961273804, "grad_norm": 4.360513687133789, "learning_rate": 5.678718510991068e-06, "loss": 4.2993, "step": 33260 }, { "epoch": 2.662025924147864, "grad_norm": 5.4428911209106445, "learning_rate": 5.66534738193293e-06, "loss": 4.3044, "step": 33270 }, { "epoch": 2.662826052168347, "grad_norm": 5.437986373901367, "learning_rate": 5.651976252874793e-06, "loss": 4.1053, "step": 33280 }, { "epoch": 2.6636261801888304, "grad_norm": 6.016399383544922, 
"learning_rate": 5.6386051238166555e-06, "loss": 4.0533, "step": 33290 }, { "epoch": 2.6644263082093134, "grad_norm": 4.479848861694336, "learning_rate": 5.625233994758518e-06, "loss": 4.278, "step": 33300 }, { "epoch": 2.665226436229797, "grad_norm": 3.8998403549194336, "learning_rate": 5.61186286570038e-06, "loss": 4.1033, "step": 33310 }, { "epoch": 2.66602656425028, "grad_norm": 5.68388032913208, "learning_rate": 5.598491736642243e-06, "loss": 4.3475, "step": 33320 }, { "epoch": 2.6668266922707633, "grad_norm": 5.672200679779053, "learning_rate": 5.585120607584105e-06, "loss": 4.2638, "step": 33330 }, { "epoch": 2.6676268202912468, "grad_norm": 5.235283851623535, "learning_rate": 5.571749478525967e-06, "loss": 4.1448, "step": 33340 }, { "epoch": 2.6684269483117298, "grad_norm": 5.4079108238220215, "learning_rate": 5.558378349467829e-06, "loss": 3.9993, "step": 33350 }, { "epoch": 2.6692270763322132, "grad_norm": 9.00804328918457, "learning_rate": 5.545007220409692e-06, "loss": 4.296, "step": 33360 }, { "epoch": 2.6700272043526967, "grad_norm": 4.167604923248291, "learning_rate": 5.531636091351554e-06, "loss": 4.1637, "step": 33370 }, { "epoch": 2.6708273323731797, "grad_norm": 4.981602668762207, "learning_rate": 5.518264962293416e-06, "loss": 4.3314, "step": 33380 }, { "epoch": 2.6716274603936627, "grad_norm": 4.367321968078613, "learning_rate": 5.504893833235278e-06, "loss": 4.2848, "step": 33390 }, { "epoch": 2.672427588414146, "grad_norm": 5.457118988037109, "learning_rate": 5.491522704177141e-06, "loss": 4.2957, "step": 33400 }, { "epoch": 2.6732277164346296, "grad_norm": 5.582805156707764, "learning_rate": 5.478151575119004e-06, "loss": 4.2366, "step": 33410 }, { "epoch": 2.6740278444551127, "grad_norm": 6.88631010055542, "learning_rate": 5.464780446060866e-06, "loss": 4.1834, "step": 33420 }, { "epoch": 2.674827972475596, "grad_norm": 6.5617218017578125, "learning_rate": 5.451409317002728e-06, "loss": 4.3329, "step": 33430 }, { "epoch": 
2.6756281004960796, "grad_norm": 5.843389511108398, "learning_rate": 5.438038187944591e-06, "loss": 4.3819, "step": 33440 }, { "epoch": 2.6764282285165626, "grad_norm": 5.388167381286621, "learning_rate": 5.424667058886453e-06, "loss": 4.2512, "step": 33450 }, { "epoch": 2.677228356537046, "grad_norm": 8.47705078125, "learning_rate": 5.411295929828315e-06, "loss": 4.1728, "step": 33460 }, { "epoch": 2.678028484557529, "grad_norm": 17.4747371673584, "learning_rate": 5.397924800770177e-06, "loss": 4.4086, "step": 33470 }, { "epoch": 2.6788286125780125, "grad_norm": 5.968992710113525, "learning_rate": 5.384553671712039e-06, "loss": 4.2446, "step": 33480 }, { "epoch": 2.6796287405984955, "grad_norm": 5.716801166534424, "learning_rate": 5.371182542653902e-06, "loss": 4.3397, "step": 33490 }, { "epoch": 2.680428868618979, "grad_norm": 5.442111968994141, "learning_rate": 5.3578114135957636e-06, "loss": 4.2471, "step": 33500 }, { "epoch": 2.6812289966394625, "grad_norm": 6.218289852142334, "learning_rate": 5.344440284537626e-06, "loss": 4.3326, "step": 33510 }, { "epoch": 2.6820291246599455, "grad_norm": 5.563192367553711, "learning_rate": 5.331069155479489e-06, "loss": 4.1598, "step": 33520 }, { "epoch": 2.682829252680429, "grad_norm": 6.493220329284668, "learning_rate": 5.317698026421352e-06, "loss": 4.0064, "step": 33530 }, { "epoch": 2.6836293807009124, "grad_norm": 6.705641269683838, "learning_rate": 5.3043268973632135e-06, "loss": 4.286, "step": 33540 }, { "epoch": 2.6844295087213954, "grad_norm": 8.996630668640137, "learning_rate": 5.290955768305076e-06, "loss": 4.0485, "step": 33550 }, { "epoch": 2.685229636741879, "grad_norm": 5.042446136474609, "learning_rate": 5.277584639246938e-06, "loss": 4.4114, "step": 33560 }, { "epoch": 2.686029764762362, "grad_norm": 5.418910026550293, "learning_rate": 5.264213510188801e-06, "loss": 4.1548, "step": 33570 }, { "epoch": 2.6868298927828453, "grad_norm": 4.982604026794434, "learning_rate": 5.2508423811306625e-06, "loss": 
4.1292, "step": 33580 }, { "epoch": 2.6876300208033284, "grad_norm": 5.434889316558838, "learning_rate": 5.237471252072525e-06, "loss": 4.4752, "step": 33590 }, { "epoch": 2.688430148823812, "grad_norm": 5.765676021575928, "learning_rate": 5.224100123014387e-06, "loss": 4.2641, "step": 33600 }, { "epoch": 2.6892302768442953, "grad_norm": 5.155886650085449, "learning_rate": 5.21072899395625e-06, "loss": 4.0127, "step": 33610 }, { "epoch": 2.6900304048647783, "grad_norm": 8.582798957824707, "learning_rate": 5.1973578648981125e-06, "loss": 4.1382, "step": 33620 }, { "epoch": 2.6908305328852617, "grad_norm": 7.404249668121338, "learning_rate": 5.183986735839975e-06, "loss": 4.2623, "step": 33630 }, { "epoch": 2.6916306609057448, "grad_norm": 9.338781356811523, "learning_rate": 5.170615606781837e-06, "loss": 4.1845, "step": 33640 }, { "epoch": 2.6924307889262282, "grad_norm": 6.720228672027588, "learning_rate": 5.1572444777237e-06, "loss": 4.2549, "step": 33650 }, { "epoch": 2.6932309169467112, "grad_norm": 6.17422342300415, "learning_rate": 5.1438733486655615e-06, "loss": 4.3159, "step": 33660 }, { "epoch": 2.6940310449671947, "grad_norm": 5.542844772338867, "learning_rate": 5.130502219607424e-06, "loss": 4.3158, "step": 33670 }, { "epoch": 2.694831172987678, "grad_norm": 4.788525104522705, "learning_rate": 5.117131090549286e-06, "loss": 4.3005, "step": 33680 }, { "epoch": 2.695631301008161, "grad_norm": 5.047336578369141, "learning_rate": 5.103759961491149e-06, "loss": 4.0872, "step": 33690 }, { "epoch": 2.6964314290286446, "grad_norm": 9.078147888183594, "learning_rate": 5.090388832433011e-06, "loss": 4.3044, "step": 33700 }, { "epoch": 2.697231557049128, "grad_norm": 6.686065673828125, "learning_rate": 5.077017703374873e-06, "loss": 4.1941, "step": 33710 }, { "epoch": 2.698031685069611, "grad_norm": 5.865580081939697, "learning_rate": 5.063646574316735e-06, "loss": 4.2317, "step": 33720 }, { "epoch": 2.6988318130900946, "grad_norm": 4.9534173011779785, 
"learning_rate": 5.050275445258598e-06, "loss": 4.2982, "step": 33730 }, { "epoch": 2.6996319411105776, "grad_norm": 4.907919406890869, "learning_rate": 5.0369043162004605e-06, "loss": 4.4078, "step": 33740 }, { "epoch": 2.700432069131061, "grad_norm": 6.2328267097473145, "learning_rate": 5.023533187142323e-06, "loss": 4.1612, "step": 33750 }, { "epoch": 2.701232197151544, "grad_norm": 5.902498722076416, "learning_rate": 5.010162058084185e-06, "loss": 4.0897, "step": 33760 }, { "epoch": 2.7020323251720275, "grad_norm": 4.614875793457031, "learning_rate": 4.996790929026047e-06, "loss": 4.1132, "step": 33770 }, { "epoch": 2.702832453192511, "grad_norm": 8.119933128356934, "learning_rate": 4.9834197999679096e-06, "loss": 3.9534, "step": 33780 }, { "epoch": 2.703632581212994, "grad_norm": 6.0485148429870605, "learning_rate": 4.970048670909771e-06, "loss": 4.3585, "step": 33790 }, { "epoch": 2.7044327092334774, "grad_norm": 6.55008602142334, "learning_rate": 4.956677541851634e-06, "loss": 4.2445, "step": 33800 }, { "epoch": 2.705232837253961, "grad_norm": 4.953001976013184, "learning_rate": 4.943306412793496e-06, "loss": 4.2733, "step": 33810 }, { "epoch": 2.706032965274444, "grad_norm": 6.0347490310668945, "learning_rate": 4.929935283735359e-06, "loss": 4.2542, "step": 33820 }, { "epoch": 2.706833093294927, "grad_norm": 4.666191101074219, "learning_rate": 4.916564154677221e-06, "loss": 4.2326, "step": 33830 }, { "epoch": 2.7076332213154104, "grad_norm": 4.969473361968994, "learning_rate": 4.903193025619084e-06, "loss": 4.0745, "step": 33840 }, { "epoch": 2.708433349335894, "grad_norm": 5.747929096221924, "learning_rate": 4.889821896560946e-06, "loss": 4.1663, "step": 33850 }, { "epoch": 2.709233477356377, "grad_norm": 4.519825458526611, "learning_rate": 4.8764507675028085e-06, "loss": 4.1346, "step": 33860 }, { "epoch": 2.7100336053768603, "grad_norm": 5.24179220199585, "learning_rate": 4.86307963844467e-06, "loss": 4.165, "step": 33870 }, { "epoch": 2.710833733397344, 
"grad_norm": 5.994459629058838, "learning_rate": 4.849708509386533e-06, "loss": 4.2889, "step": 33880 }, { "epoch": 2.711633861417827, "grad_norm": 5.778345584869385, "learning_rate": 4.836337380328395e-06, "loss": 4.1544, "step": 33890 }, { "epoch": 2.7124339894383103, "grad_norm": 7.161036968231201, "learning_rate": 4.822966251270258e-06, "loss": 4.1862, "step": 33900 }, { "epoch": 2.7132341174587933, "grad_norm": 7.405507564544678, "learning_rate": 4.8095951222121195e-06, "loss": 4.1942, "step": 33910 }, { "epoch": 2.7140342454792767, "grad_norm": 5.971241474151611, "learning_rate": 4.796223993153982e-06, "loss": 4.2161, "step": 33920 }, { "epoch": 2.7148343734997598, "grad_norm": 5.452059268951416, "learning_rate": 4.782852864095844e-06, "loss": 4.1921, "step": 33930 }, { "epoch": 2.715634501520243, "grad_norm": 4.6873345375061035, "learning_rate": 4.769481735037707e-06, "loss": 4.2156, "step": 33940 }, { "epoch": 2.7164346295407267, "grad_norm": 4.376823902130127, "learning_rate": 4.756110605979569e-06, "loss": 4.1594, "step": 33950 }, { "epoch": 2.7172347575612097, "grad_norm": 5.478845119476318, "learning_rate": 4.742739476921432e-06, "loss": 4.2715, "step": 33960 }, { "epoch": 2.718034885581693, "grad_norm": 8.683806419372559, "learning_rate": 4.729368347863294e-06, "loss": 4.2901, "step": 33970 }, { "epoch": 2.7188350136021766, "grad_norm": 5.288990497589111, "learning_rate": 4.715997218805157e-06, "loss": 4.2991, "step": 33980 }, { "epoch": 2.7196351416226596, "grad_norm": 6.266578197479248, "learning_rate": 4.7026260897470184e-06, "loss": 4.255, "step": 33990 }, { "epoch": 2.7204352696431426, "grad_norm": 6.196168422698975, "learning_rate": 4.689254960688881e-06, "loss": 4.3106, "step": 34000 }, { "epoch": 2.721235397663626, "grad_norm": 5.192313194274902, "learning_rate": 4.675883831630743e-06, "loss": 4.2563, "step": 34010 }, { "epoch": 2.7220355256841096, "grad_norm": 5.5003886222839355, "learning_rate": 4.662512702572606e-06, "loss": 4.4209, "step": 
34020 }, { "epoch": 2.7228356537045926, "grad_norm": 5.434267997741699, "learning_rate": 4.6491415735144675e-06, "loss": 4.2888, "step": 34030 }, { "epoch": 2.723635781725076, "grad_norm": 8.187822341918945, "learning_rate": 4.635770444456329e-06, "loss": 4.2338, "step": 34040 }, { "epoch": 2.7244359097455595, "grad_norm": 5.527400970458984, "learning_rate": 4.622399315398192e-06, "loss": 4.0227, "step": 34050 }, { "epoch": 2.7252360377660425, "grad_norm": 7.845839977264404, "learning_rate": 4.609028186340055e-06, "loss": 4.2575, "step": 34060 }, { "epoch": 2.726036165786526, "grad_norm": 4.837810039520264, "learning_rate": 4.595657057281917e-06, "loss": 4.1485, "step": 34070 }, { "epoch": 2.726836293807009, "grad_norm": 7.224013328552246, "learning_rate": 4.582285928223779e-06, "loss": 4.2924, "step": 34080 }, { "epoch": 2.7276364218274924, "grad_norm": 6.373143196105957, "learning_rate": 4.568914799165642e-06, "loss": 4.3085, "step": 34090 }, { "epoch": 2.7284365498479755, "grad_norm": 4.7438764572143555, "learning_rate": 4.555543670107504e-06, "loss": 4.1749, "step": 34100 }, { "epoch": 2.729236677868459, "grad_norm": 4.517533779144287, "learning_rate": 4.5421725410493665e-06, "loss": 4.3599, "step": 34110 }, { "epoch": 2.7300368058889424, "grad_norm": 6.462946891784668, "learning_rate": 4.528801411991228e-06, "loss": 4.2767, "step": 34120 }, { "epoch": 2.7308369339094254, "grad_norm": 4.755046367645264, "learning_rate": 4.515430282933091e-06, "loss": 4.2932, "step": 34130 }, { "epoch": 2.731637061929909, "grad_norm": 5.173582077026367, "learning_rate": 4.502059153874953e-06, "loss": 4.4344, "step": 34140 }, { "epoch": 2.7324371899503923, "grad_norm": 7.195948600769043, "learning_rate": 4.4886880248168156e-06, "loss": 4.1337, "step": 34150 }, { "epoch": 2.7332373179708753, "grad_norm": 4.898958206176758, "learning_rate": 4.475316895758678e-06, "loss": 4.2497, "step": 34160 }, { "epoch": 2.734037445991359, "grad_norm": 6.266502857208252, "learning_rate": 
4.461945766700541e-06, "loss": 4.0755, "step": 34170 }, { "epoch": 2.734837574011842, "grad_norm": 12.440402030944824, "learning_rate": 4.448574637642403e-06, "loss": 4.2299, "step": 34180 }, { "epoch": 2.7356377020323253, "grad_norm": 5.204728126525879, "learning_rate": 4.4352035085842655e-06, "loss": 4.2285, "step": 34190 }, { "epoch": 2.7364378300528083, "grad_norm": 4.905936241149902, "learning_rate": 4.421832379526127e-06, "loss": 4.2483, "step": 34200 }, { "epoch": 2.7372379580732917, "grad_norm": 4.211105823516846, "learning_rate": 4.40846125046799e-06, "loss": 4.0633, "step": 34210 }, { "epoch": 2.738038086093775, "grad_norm": 4.4559807777404785, "learning_rate": 4.395090121409852e-06, "loss": 4.1973, "step": 34220 }, { "epoch": 2.738838214114258, "grad_norm": 7.232518196105957, "learning_rate": 4.3817189923517145e-06, "loss": 4.0555, "step": 34230 }, { "epoch": 2.7396383421347417, "grad_norm": 4.614950656890869, "learning_rate": 4.368347863293576e-06, "loss": 4.2235, "step": 34240 }, { "epoch": 2.7404384701552247, "grad_norm": 6.457540035247803, "learning_rate": 4.354976734235439e-06, "loss": 4.238, "step": 34250 }, { "epoch": 2.741238598175708, "grad_norm": 6.911721706390381, "learning_rate": 4.341605605177301e-06, "loss": 4.1752, "step": 34260 }, { "epoch": 2.742038726196191, "grad_norm": 5.3381876945495605, "learning_rate": 4.328234476119164e-06, "loss": 4.3043, "step": 34270 }, { "epoch": 2.7428388542166746, "grad_norm": 4.276921272277832, "learning_rate": 4.314863347061026e-06, "loss": 4.2093, "step": 34280 }, { "epoch": 2.743638982237158, "grad_norm": 6.417922496795654, "learning_rate": 4.301492218002889e-06, "loss": 4.2126, "step": 34290 }, { "epoch": 2.744439110257641, "grad_norm": 6.303336143493652, "learning_rate": 4.288121088944751e-06, "loss": 4.2688, "step": 34300 }, { "epoch": 2.7452392382781245, "grad_norm": 6.443734645843506, "learning_rate": 4.2747499598866135e-06, "loss": 4.3859, "step": 34310 }, { "epoch": 2.746039366298608, "grad_norm": 
5.473753452301025, "learning_rate": 4.261378830828475e-06, "loss": 4.2519, "step": 34320 }, { "epoch": 2.746839494319091, "grad_norm": 5.459244728088379, "learning_rate": 4.248007701770337e-06, "loss": 4.3614, "step": 34330 }, { "epoch": 2.7476396223395745, "grad_norm": 6.8252434730529785, "learning_rate": 4.2346365727122e-06, "loss": 4.1491, "step": 34340 }, { "epoch": 2.7484397503600575, "grad_norm": 4.675537586212158, "learning_rate": 4.221265443654062e-06, "loss": 4.3317, "step": 34350 }, { "epoch": 2.749239878380541, "grad_norm": 4.900448322296143, "learning_rate": 4.207894314595924e-06, "loss": 4.2671, "step": 34360 }, { "epoch": 2.750040006401024, "grad_norm": 4.971939563751221, "learning_rate": 4.194523185537787e-06, "loss": 4.1103, "step": 34370 }, { "epoch": 2.7508401344215074, "grad_norm": 4.37131929397583, "learning_rate": 4.18115205647965e-06, "loss": 4.2015, "step": 34380 }, { "epoch": 2.751640262441991, "grad_norm": 4.4726786613464355, "learning_rate": 4.167780927421512e-06, "loss": 4.3048, "step": 34390 }, { "epoch": 2.752440390462474, "grad_norm": 4.819699764251709, "learning_rate": 4.154409798363374e-06, "loss": 4.0875, "step": 34400 }, { "epoch": 2.7532405184829574, "grad_norm": 4.992520332336426, "learning_rate": 4.141038669305236e-06, "loss": 4.22, "step": 34410 }, { "epoch": 2.754040646503441, "grad_norm": 7.037467002868652, "learning_rate": 4.127667540247099e-06, "loss": 4.0536, "step": 34420 }, { "epoch": 2.754840774523924, "grad_norm": 4.956068515777588, "learning_rate": 4.114296411188961e-06, "loss": 4.2732, "step": 34430 }, { "epoch": 2.755640902544407, "grad_norm": 5.513150691986084, "learning_rate": 4.100925282130823e-06, "loss": 4.2847, "step": 34440 }, { "epoch": 2.7564410305648903, "grad_norm": 7.297191143035889, "learning_rate": 4.087554153072685e-06, "loss": 4.2398, "step": 34450 }, { "epoch": 2.7572411585853738, "grad_norm": 4.674797058105469, "learning_rate": 4.074183024014548e-06, "loss": 4.1731, "step": 34460 }, { "epoch": 
2.758041286605857, "grad_norm": 4.875251293182373, "learning_rate": 4.06081189495641e-06, "loss": 4.2261, "step": 34470 }, { "epoch": 2.7588414146263402, "grad_norm": 6.054131507873535, "learning_rate": 4.0474407658982725e-06, "loss": 4.1148, "step": 34480 }, { "epoch": 2.7596415426468237, "grad_norm": 6.8574910163879395, "learning_rate": 4.034069636840135e-06, "loss": 4.3257, "step": 34490 }, { "epoch": 2.7604416706673067, "grad_norm": 3.8812949657440186, "learning_rate": 4.020698507781998e-06, "loss": 4.3107, "step": 34500 }, { "epoch": 2.76124179868779, "grad_norm": 4.484575271606445, "learning_rate": 4.00732737872386e-06, "loss": 4.1445, "step": 34510 }, { "epoch": 2.762041926708273, "grad_norm": 4.373636722564697, "learning_rate": 3.993956249665722e-06, "loss": 4.1372, "step": 34520 }, { "epoch": 2.7628420547287567, "grad_norm": 5.4754509925842285, "learning_rate": 3.980585120607584e-06, "loss": 4.1681, "step": 34530 }, { "epoch": 2.7636421827492397, "grad_norm": 16.14682960510254, "learning_rate": 3.967213991549447e-06, "loss": 4.1908, "step": 34540 }, { "epoch": 2.764442310769723, "grad_norm": 6.3839592933654785, "learning_rate": 3.953842862491309e-06, "loss": 4.1729, "step": 34550 }, { "epoch": 2.7652424387902066, "grad_norm": 4.453866004943848, "learning_rate": 3.9404717334331714e-06, "loss": 4.149, "step": 34560 }, { "epoch": 2.7660425668106896, "grad_norm": 6.7042951583862305, "learning_rate": 3.927100604375033e-06, "loss": 4.1094, "step": 34570 }, { "epoch": 2.766842694831173, "grad_norm": 5.805509567260742, "learning_rate": 3.913729475316896e-06, "loss": 4.1464, "step": 34580 }, { "epoch": 2.7676428228516565, "grad_norm": 6.5062360763549805, "learning_rate": 3.900358346258759e-06, "loss": 4.1876, "step": 34590 }, { "epoch": 2.7684429508721395, "grad_norm": 6.451037406921387, "learning_rate": 3.886987217200621e-06, "loss": 4.2076, "step": 34600 }, { "epoch": 2.7692430788926226, "grad_norm": 4.453220844268799, "learning_rate": 3.873616088142483e-06, 
"loss": 4.0763, "step": 34610 }, { "epoch": 2.770043206913106, "grad_norm": 5.057280540466309, "learning_rate": 3.860244959084345e-06, "loss": 4.2001, "step": 34620 }, { "epoch": 2.7708433349335895, "grad_norm": 4.674801349639893, "learning_rate": 3.846873830026208e-06, "loss": 4.4211, "step": 34630 }, { "epoch": 2.7716434629540725, "grad_norm": 4.858605861663818, "learning_rate": 3.83350270096807e-06, "loss": 4.1275, "step": 34640 }, { "epoch": 2.772443590974556, "grad_norm": 6.631062030792236, "learning_rate": 3.820131571909932e-06, "loss": 4.342, "step": 34650 }, { "epoch": 2.7732437189950394, "grad_norm": 7.900754928588867, "learning_rate": 3.806760442851794e-06, "loss": 4.1309, "step": 34660 }, { "epoch": 2.7740438470155224, "grad_norm": 6.944363117218018, "learning_rate": 3.793389313793657e-06, "loss": 4.3148, "step": 34670 }, { "epoch": 2.774843975036006, "grad_norm": 9.841028213500977, "learning_rate": 3.780018184735519e-06, "loss": 4.2127, "step": 34680 }, { "epoch": 2.775644103056489, "grad_norm": 5.778743743896484, "learning_rate": 3.7666470556773818e-06, "loss": 4.2897, "step": 34690 }, { "epoch": 2.7764442310769724, "grad_norm": 7.016035079956055, "learning_rate": 3.7532759266192436e-06, "loss": 4.0521, "step": 34700 }, { "epoch": 2.7772443590974554, "grad_norm": 5.496110916137695, "learning_rate": 3.7399047975611063e-06, "loss": 4.378, "step": 34710 }, { "epoch": 2.778044487117939, "grad_norm": 4.283048629760742, "learning_rate": 3.7265336685029686e-06, "loss": 4.2144, "step": 34720 }, { "epoch": 2.7788446151384223, "grad_norm": 4.840237617492676, "learning_rate": 3.7131625394448312e-06, "loss": 4.0939, "step": 34730 }, { "epoch": 2.7796447431589053, "grad_norm": 6.175449371337891, "learning_rate": 3.699791410386693e-06, "loss": 4.1317, "step": 34740 }, { "epoch": 2.7804448711793888, "grad_norm": 7.649020195007324, "learning_rate": 3.6864202813285558e-06, "loss": 4.0876, "step": 34750 }, { "epoch": 2.7812449991998722, "grad_norm": 4.315440654754639, 
"learning_rate": 3.6730491522704176e-06, "loss": 4.3091, "step": 34760 }, { "epoch": 2.7820451272203552, "grad_norm": 7.573521137237549, "learning_rate": 3.6596780232122803e-06, "loss": 4.2258, "step": 34770 }, { "epoch": 2.7828452552408387, "grad_norm": 5.323004722595215, "learning_rate": 3.6463068941541426e-06, "loss": 4.3041, "step": 34780 }, { "epoch": 2.7836453832613217, "grad_norm": 5.814512729644775, "learning_rate": 3.6329357650960053e-06, "loss": 4.121, "step": 34790 }, { "epoch": 2.784445511281805, "grad_norm": 5.472853183746338, "learning_rate": 3.619564636037867e-06, "loss": 4.2736, "step": 34800 }, { "epoch": 2.785245639302288, "grad_norm": 6.29842472076416, "learning_rate": 3.60619350697973e-06, "loss": 4.1963, "step": 34810 }, { "epoch": 2.7860457673227716, "grad_norm": 4.491445064544678, "learning_rate": 3.5928223779215916e-06, "loss": 4.2659, "step": 34820 }, { "epoch": 2.786845895343255, "grad_norm": 7.308801651000977, "learning_rate": 3.5794512488634543e-06, "loss": 4.2923, "step": 34830 }, { "epoch": 2.787646023363738, "grad_norm": 5.32773494720459, "learning_rate": 3.5660801198053166e-06, "loss": 4.2347, "step": 34840 }, { "epoch": 2.7884461513842216, "grad_norm": 5.688913822174072, "learning_rate": 3.5527089907471793e-06, "loss": 4.1971, "step": 34850 }, { "epoch": 2.7892462794047046, "grad_norm": 5.6290740966796875, "learning_rate": 3.539337861689041e-06, "loss": 4.1935, "step": 34860 }, { "epoch": 2.790046407425188, "grad_norm": 4.931374549865723, "learning_rate": 3.525966732630904e-06, "loss": 4.1234, "step": 34870 }, { "epoch": 2.790846535445671, "grad_norm": 6.678748607635498, "learning_rate": 3.5125956035727657e-06, "loss": 4.2878, "step": 34880 }, { "epoch": 2.7916466634661545, "grad_norm": 4.9022626876831055, "learning_rate": 3.499224474514628e-06, "loss": 4.1224, "step": 34890 }, { "epoch": 2.792446791486638, "grad_norm": 6.170747756958008, "learning_rate": 3.4858533454564906e-06, "loss": 4.2608, "step": 34900 }, { "epoch": 
2.793246919507121, "grad_norm": 4.918048858642578, "learning_rate": 3.4724822163983525e-06, "loss": 4.1567, "step": 34910 }, { "epoch": 2.7940470475276045, "grad_norm": 7.252740383148193, "learning_rate": 3.459111087340215e-06, "loss": 4.1902, "step": 34920 }, { "epoch": 2.794847175548088, "grad_norm": 5.8041157722473145, "learning_rate": 3.445739958282077e-06, "loss": 4.2221, "step": 34930 }, { "epoch": 2.795647303568571, "grad_norm": 6.8014020919799805, "learning_rate": 3.4323688292239397e-06, "loss": 4.118, "step": 34940 }, { "epoch": 2.7964474315890544, "grad_norm": 6.862493991851807, "learning_rate": 3.418997700165802e-06, "loss": 4.2246, "step": 34950 }, { "epoch": 2.7972475596095374, "grad_norm": 3.999680995941162, "learning_rate": 3.4056265711076646e-06, "loss": 4.2414, "step": 34960 }, { "epoch": 2.798047687630021, "grad_norm": 5.764650821685791, "learning_rate": 3.3922554420495265e-06, "loss": 4.1001, "step": 34970 }, { "epoch": 2.798847815650504, "grad_norm": 4.690990447998047, "learning_rate": 3.378884312991389e-06, "loss": 4.2748, "step": 34980 }, { "epoch": 2.7996479436709873, "grad_norm": 5.3460259437561035, "learning_rate": 3.3655131839332514e-06, "loss": 4.245, "step": 34990 }, { "epoch": 2.800448071691471, "grad_norm": 3.6493239402770996, "learning_rate": 3.352142054875114e-06, "loss": 4.2913, "step": 35000 }, { "epoch": 2.800448071691471, "eval_loss": 5.665900230407715, "eval_runtime": 17.279, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.289, "step": 35000 }, { "epoch": 2.801248199711954, "grad_norm": 8.602275848388672, "learning_rate": 3.338770925816976e-06, "loss": 4.2604, "step": 35010 }, { "epoch": 2.8020483277324373, "grad_norm": 5.3672356605529785, "learning_rate": 3.3253997967588387e-06, "loss": 4.1385, "step": 35020 }, { "epoch": 2.8028484557529207, "grad_norm": 9.254061698913574, "learning_rate": 3.3120286677007005e-06, "loss": 4.3666, "step": 35030 }, { "epoch": 2.8036485837734038, "grad_norm": 5.195075988769531, 
"learning_rate": 3.298657538642563e-06, "loss": 4.0343, "step": 35040 }, { "epoch": 2.8044487117938868, "grad_norm": 5.531332969665527, "learning_rate": 3.2852864095844255e-06, "loss": 4.4317, "step": 35050 }, { "epoch": 2.8052488398143702, "grad_norm": 5.089931964874268, "learning_rate": 3.271915280526288e-06, "loss": 4.2116, "step": 35060 }, { "epoch": 2.8060489678348537, "grad_norm": 7.625751495361328, "learning_rate": 3.25854415146815e-06, "loss": 4.0876, "step": 35070 }, { "epoch": 2.8068490958553367, "grad_norm": 4.376953601837158, "learning_rate": 3.2451730224100127e-06, "loss": 4.2479, "step": 35080 }, { "epoch": 2.80764922387582, "grad_norm": 4.922122001647949, "learning_rate": 3.2318018933518745e-06, "loss": 4.3492, "step": 35090 }, { "epoch": 2.8084493518963036, "grad_norm": 5.147998332977295, "learning_rate": 3.2184307642937372e-06, "loss": 4.0972, "step": 35100 }, { "epoch": 2.8092494799167866, "grad_norm": 5.979716777801514, "learning_rate": 3.2050596352355995e-06, "loss": 4.4772, "step": 35110 }, { "epoch": 2.81004960793727, "grad_norm": 5.627559185028076, "learning_rate": 3.191688506177462e-06, "loss": 4.3933, "step": 35120 }, { "epoch": 2.810849735957753, "grad_norm": 4.930499076843262, "learning_rate": 3.178317377119324e-06, "loss": 4.2395, "step": 35130 }, { "epoch": 2.8116498639782366, "grad_norm": 9.172618865966797, "learning_rate": 3.1649462480611867e-06, "loss": 4.1436, "step": 35140 }, { "epoch": 2.8124499919987196, "grad_norm": 5.223548412322998, "learning_rate": 3.1515751190030486e-06, "loss": 4.3852, "step": 35150 }, { "epoch": 2.813250120019203, "grad_norm": 5.304259300231934, "learning_rate": 3.1382039899449113e-06, "loss": 4.297, "step": 35160 }, { "epoch": 2.8140502480396865, "grad_norm": 5.6694655418396, "learning_rate": 3.1248328608867735e-06, "loss": 4.3311, "step": 35170 }, { "epoch": 2.8148503760601695, "grad_norm": 4.797468185424805, "learning_rate": 3.1114617318286358e-06, "loss": 4.3337, "step": 35180 }, { "epoch": 
2.815650504080653, "grad_norm": 5.934649467468262, "learning_rate": 3.098090602770498e-06, "loss": 4.3562, "step": 35190 }, { "epoch": 2.8164506321011364, "grad_norm": 7.17952823638916, "learning_rate": 3.0847194737123603e-06, "loss": 4.2319, "step": 35200 }, { "epoch": 2.8172507601216195, "grad_norm": 5.0281081199646, "learning_rate": 3.071348344654223e-06, "loss": 4.2155, "step": 35210 }, { "epoch": 2.8180508881421025, "grad_norm": 4.19122314453125, "learning_rate": 3.0579772155960853e-06, "loss": 4.2386, "step": 35220 }, { "epoch": 2.818851016162586, "grad_norm": 4.930305480957031, "learning_rate": 3.0446060865379475e-06, "loss": 4.3804, "step": 35230 }, { "epoch": 2.8196511441830694, "grad_norm": 5.851728916168213, "learning_rate": 3.03123495747981e-06, "loss": 4.2732, "step": 35240 }, { "epoch": 2.8204512722035524, "grad_norm": 6.313448905944824, "learning_rate": 3.017863828421672e-06, "loss": 4.2819, "step": 35250 }, { "epoch": 2.821251400224036, "grad_norm": 6.813796520233154, "learning_rate": 3.0044926993635343e-06, "loss": 4.078, "step": 35260 }, { "epoch": 2.8220515282445193, "grad_norm": 11.20564079284668, "learning_rate": 2.991121570305397e-06, "loss": 3.9932, "step": 35270 }, { "epoch": 2.8228516562650023, "grad_norm": 5.3791351318359375, "learning_rate": 2.9777504412472593e-06, "loss": 4.2421, "step": 35280 }, { "epoch": 2.823651784285486, "grad_norm": 5.504123210906982, "learning_rate": 2.9643793121891216e-06, "loss": 4.1563, "step": 35290 }, { "epoch": 2.824451912305969, "grad_norm": 5.196000099182129, "learning_rate": 2.951008183130984e-06, "loss": 4.3874, "step": 35300 }, { "epoch": 2.8252520403264523, "grad_norm": 6.91581916809082, "learning_rate": 2.9376370540728457e-06, "loss": 4.246, "step": 35310 }, { "epoch": 2.8260521683469353, "grad_norm": 5.90737247467041, "learning_rate": 2.9242659250147084e-06, "loss": 4.2119, "step": 35320 }, { "epoch": 2.8268522963674187, "grad_norm": 6.170504093170166, "learning_rate": 2.9108947959565706e-06, "loss": 
4.252, "step": 35330 }, { "epoch": 2.827652424387902, "grad_norm": 5.4018354415893555, "learning_rate": 2.897523666898433e-06, "loss": 3.8861, "step": 35340 }, { "epoch": 2.8284525524083852, "grad_norm": 6.109228134155273, "learning_rate": 2.884152537840295e-06, "loss": 4.347, "step": 35350 }, { "epoch": 2.8292526804288687, "grad_norm": 5.722598075866699, "learning_rate": 2.8707814087821574e-06, "loss": 4.1786, "step": 35360 }, { "epoch": 2.830052808449352, "grad_norm": 4.858404636383057, "learning_rate": 2.85741027972402e-06, "loss": 4.2734, "step": 35370 }, { "epoch": 2.830852936469835, "grad_norm": 4.960545063018799, "learning_rate": 2.8440391506658824e-06, "loss": 4.3257, "step": 35380 }, { "epoch": 2.8316530644903186, "grad_norm": 7.317862033843994, "learning_rate": 2.8306680216077447e-06, "loss": 4.1959, "step": 35390 }, { "epoch": 2.8324531925108016, "grad_norm": 4.587612152099609, "learning_rate": 2.817296892549607e-06, "loss": 4.0904, "step": 35400 }, { "epoch": 2.833253320531285, "grad_norm": 5.755588531494141, "learning_rate": 2.803925763491469e-06, "loss": 4.3367, "step": 35410 }, { "epoch": 2.834053448551768, "grad_norm": 4.245760917663574, "learning_rate": 2.7905546344333315e-06, "loss": 4.4096, "step": 35420 }, { "epoch": 2.8348535765722516, "grad_norm": 4.4361748695373535, "learning_rate": 2.777183505375194e-06, "loss": 4.1928, "step": 35430 }, { "epoch": 2.835653704592735, "grad_norm": 4.350141525268555, "learning_rate": 2.7638123763170564e-06, "loss": 4.2633, "step": 35440 }, { "epoch": 2.836453832613218, "grad_norm": 7.448699951171875, "learning_rate": 2.7504412472589187e-06, "loss": 4.2588, "step": 35450 }, { "epoch": 2.8372539606337015, "grad_norm": 3.8748276233673096, "learning_rate": 2.737070118200781e-06, "loss": 4.1945, "step": 35460 }, { "epoch": 2.8380540886541845, "grad_norm": 4.915309906005859, "learning_rate": 2.723698989142643e-06, "loss": 4.2373, "step": 35470 }, { "epoch": 2.838854216674668, "grad_norm": 7.0673980712890625, 
"learning_rate": 2.710327860084506e-06, "loss": 4.2509, "step": 35480 }, { "epoch": 2.839654344695151, "grad_norm": 4.1713547706604, "learning_rate": 2.696956731026368e-06, "loss": 4.2477, "step": 35490 }, { "epoch": 2.8404544727156344, "grad_norm": 5.206265926361084, "learning_rate": 2.6835856019682304e-06, "loss": 4.2303, "step": 35500 }, { "epoch": 2.841254600736118, "grad_norm": 8.962867736816406, "learning_rate": 2.6702144729100927e-06, "loss": 4.3039, "step": 35510 }, { "epoch": 2.842054728756601, "grad_norm": 7.5982985496521, "learning_rate": 2.656843343851955e-06, "loss": 4.1855, "step": 35520 }, { "epoch": 2.8428548567770844, "grad_norm": 3.994152069091797, "learning_rate": 2.6434722147938172e-06, "loss": 4.1927, "step": 35530 }, { "epoch": 2.843654984797568, "grad_norm": null, "learning_rate": 2.6314381986414934e-06, "loss": 4.3871, "step": 35540 }, { "epoch": 2.844455112818051, "grad_norm": 5.288627624511719, "learning_rate": 2.6180670695833557e-06, "loss": 4.0285, "step": 35550 }, { "epoch": 2.8452552408385343, "grad_norm": 4.253042221069336, "learning_rate": 2.6046959405252184e-06, "loss": 4.2831, "step": 35560 }, { "epoch": 2.8460553688590173, "grad_norm": 4.621375560760498, "learning_rate": 2.5913248114670806e-06, "loss": 4.2246, "step": 35570 }, { "epoch": 2.846855496879501, "grad_norm": 5.700193881988525, "learning_rate": 2.577953682408943e-06, "loss": 4.0086, "step": 35580 }, { "epoch": 2.847655624899984, "grad_norm": 4.042226791381836, "learning_rate": 2.564582553350805e-06, "loss": 4.1566, "step": 35590 }, { "epoch": 2.8484557529204673, "grad_norm": 5.7738776206970215, "learning_rate": 2.5512114242926674e-06, "loss": 4.1277, "step": 35600 }, { "epoch": 2.8492558809409507, "grad_norm": 5.566997528076172, "learning_rate": 2.5378402952345297e-06, "loss": 4.2208, "step": 35610 }, { "epoch": 2.8500560089614337, "grad_norm": 6.773410320281982, "learning_rate": 2.524469166176392e-06, "loss": 4.1155, "step": 35620 }, { "epoch": 2.850856136981917,
"grad_norm": 4.425107002258301, "learning_rate": 2.5110980371182542e-06, "loss": 4.2836, "step": 35630 }, { "epoch": 2.8516562650024007, "grad_norm": 6.652309417724609, "learning_rate": 2.4977269080601165e-06, "loss": 4.2207, "step": 35640 }, { "epoch": 2.8524563930228837, "grad_norm": 10.535252571105957, "learning_rate": 2.4843557790019788e-06, "loss": 4.2426, "step": 35650 }, { "epoch": 2.8532565210433667, "grad_norm": 7.071555137634277, "learning_rate": 2.4709846499438414e-06, "loss": 4.1919, "step": 35660 }, { "epoch": 2.85405664906385, "grad_norm": 6.044496059417725, "learning_rate": 2.4576135208857037e-06, "loss": 4.1179, "step": 35670 }, { "epoch": 2.8548567770843336, "grad_norm": 4.740421772003174, "learning_rate": 2.444242391827566e-06, "loss": 4.3368, "step": 35680 }, { "epoch": 2.8556569051048166, "grad_norm": 4.672318935394287, "learning_rate": 2.4308712627694282e-06, "loss": 4.2928, "step": 35690 }, { "epoch": 2.8564570331253, "grad_norm": 4.998258590698242, "learning_rate": 2.4175001337112905e-06, "loss": 4.1935, "step": 35700 }, { "epoch": 2.8572571611457835, "grad_norm": 6.342236042022705, "learning_rate": 2.4041290046531528e-06, "loss": 4.2918, "step": 35710 }, { "epoch": 2.8580572891662666, "grad_norm": 6.538265705108643, "learning_rate": 2.3907578755950155e-06, "loss": 4.2393, "step": 35720 }, { "epoch": 2.85885741718675, "grad_norm": 5.26876974105835, "learning_rate": 2.3773867465368777e-06, "loss": 4.1355, "step": 35730 }, { "epoch": 2.859657545207233, "grad_norm": 6.272813320159912, "learning_rate": 2.36401561747874e-06, "loss": 4.277, "step": 35740 }, { "epoch": 2.8604576732277165, "grad_norm": 4.440347671508789, "learning_rate": 2.3506444884206023e-06, "loss": 4.2423, "step": 35750 }, { "epoch": 2.8612578012481995, "grad_norm": 4.575409889221191, "learning_rate": 2.3372733593624645e-06, "loss": 4.2464, "step": 35760 }, { "epoch": 2.862057929268683, "grad_norm": 6.975805759429932, "learning_rate": 2.3239022303043272e-06, "loss": 4.3602, 
"step": 35770 }, { "epoch": 2.8628580572891664, "grad_norm": 4.794578552246094, "learning_rate": 2.3105311012461895e-06, "loss": 4.2069, "step": 35780 }, { "epoch": 2.8636581853096494, "grad_norm": 4.724869251251221, "learning_rate": 2.2971599721880518e-06, "loss": 4.319, "step": 35790 }, { "epoch": 2.864458313330133, "grad_norm": 5.100827217102051, "learning_rate": 2.283788843129914e-06, "loss": 4.1811, "step": 35800 }, { "epoch": 2.8652584413506164, "grad_norm": 5.946887016296387, "learning_rate": 2.2704177140717763e-06, "loss": 4.3413, "step": 35810 }, { "epoch": 2.8660585693710994, "grad_norm": 4.958827018737793, "learning_rate": 2.2570465850136386e-06, "loss": 4.2534, "step": 35820 }, { "epoch": 2.8668586973915824, "grad_norm": 4.055940628051758, "learning_rate": 2.2436754559555012e-06, "loss": 4.3207, "step": 35830 }, { "epoch": 2.867658825412066, "grad_norm": 5.415339946746826, "learning_rate": 2.2303043268973635e-06, "loss": 4.0927, "step": 35840 }, { "epoch": 2.8684589534325493, "grad_norm": 11.86763858795166, "learning_rate": 2.2169331978392258e-06, "loss": 4.2369, "step": 35850 }, { "epoch": 2.8692590814530323, "grad_norm": 5.227077960968018, "learning_rate": 2.203562068781088e-06, "loss": 4.0475, "step": 35860 }, { "epoch": 2.870059209473516, "grad_norm": 4.520332336425781, "learning_rate": 2.1901909397229503e-06, "loss": 4.248, "step": 35870 }, { "epoch": 2.8708593374939992, "grad_norm": 4.784921169281006, "learning_rate": 2.1768198106648126e-06, "loss": 4.061, "step": 35880 }, { "epoch": 2.8716594655144823, "grad_norm": 4.605025768280029, "learning_rate": 2.1634486816066753e-06, "loss": 4.3272, "step": 35890 }, { "epoch": 2.8724595935349657, "grad_norm": 4.685732364654541, "learning_rate": 2.1500775525485375e-06, "loss": 4.2693, "step": 35900 }, { "epoch": 2.8732597215554487, "grad_norm": 5.242833614349365, "learning_rate": 2.1367064234903994e-06, "loss": 4.2223, "step": 35910 }, { "epoch": 2.874059849575932, "grad_norm": 7.820953369140625, 
"learning_rate": 2.1233352944322616e-06, "loss": 3.8063, "step": 35920 }, { "epoch": 2.874859977596415, "grad_norm": 6.070064067840576, "learning_rate": 2.1099641653741243e-06, "loss": 4.1858, "step": 35930 }, { "epoch": 2.8756601056168987, "grad_norm": 10.925037384033203, "learning_rate": 2.0965930363159866e-06, "loss": 4.2107, "step": 35940 }, { "epoch": 2.876460233637382, "grad_norm": 7.051306247711182, "learning_rate": 2.083221907257849e-06, "loss": 4.1936, "step": 35950 }, { "epoch": 2.877260361657865, "grad_norm": 5.999988079071045, "learning_rate": 2.069850778199711e-06, "loss": 4.1867, "step": 35960 }, { "epoch": 2.8780604896783486, "grad_norm": 5.072703838348389, "learning_rate": 2.0564796491415734e-06, "loss": 4.0283, "step": 35970 }, { "epoch": 2.878860617698832, "grad_norm": 6.015980243682861, "learning_rate": 2.0431085200834357e-06, "loss": 4.2977, "step": 35980 }, { "epoch": 2.879660745719315, "grad_norm": 5.586410999298096, "learning_rate": 2.0297373910252984e-06, "loss": 4.0267, "step": 35990 }, { "epoch": 2.8804608737397985, "grad_norm": 4.132725238800049, "learning_rate": 2.0163662619671606e-06, "loss": 4.3056, "step": 36000 }, { "epoch": 2.8812610017602815, "grad_norm": 5.6204023361206055, "learning_rate": 2.002995132909023e-06, "loss": 4.1451, "step": 36010 }, { "epoch": 2.882061129780765, "grad_norm": 5.820326805114746, "learning_rate": 1.989624003850885e-06, "loss": 4.2092, "step": 36020 }, { "epoch": 2.882861257801248, "grad_norm": 7.098259449005127, "learning_rate": 1.9762528747927474e-06, "loss": 4.2706, "step": 36030 }, { "epoch": 2.8836613858217315, "grad_norm": 6.010718822479248, "learning_rate": 1.96288174573461e-06, "loss": 4.3539, "step": 36040 }, { "epoch": 2.884461513842215, "grad_norm": 6.73037052154541, "learning_rate": 1.9495106166764724e-06, "loss": 4.3154, "step": 36050 }, { "epoch": 2.885261641862698, "grad_norm": 5.936001777648926, "learning_rate": 1.9361394876183347e-06, "loss": 4.1449, "step": 36060 }, { "epoch": 
2.8860617698831814, "grad_norm": 5.640296936035156, "learning_rate": 1.922768358560197e-06, "loss": 4.1601, "step": 36070 }, { "epoch": 2.8868618979036644, "grad_norm": 6.558215618133545, "learning_rate": 1.909397229502059e-06, "loss": 4.1861, "step": 36080 }, { "epoch": 2.887662025924148, "grad_norm": 4.897027015686035, "learning_rate": 1.8960261004439217e-06, "loss": 4.2773, "step": 36090 }, { "epoch": 2.888462153944631, "grad_norm": 6.5792436599731445, "learning_rate": 1.882654971385784e-06, "loss": 4.0926, "step": 36100 }, { "epoch": 2.8892622819651144, "grad_norm": 5.062023639678955, "learning_rate": 1.8692838423276464e-06, "loss": 4.2352, "step": 36110 }, { "epoch": 2.890062409985598, "grad_norm": 4.872011661529541, "learning_rate": 1.8559127132695087e-06, "loss": 4.1284, "step": 36120 }, { "epoch": 2.890862538006081, "grad_norm": 4.747717380523682, "learning_rate": 1.842541584211371e-06, "loss": 4.2554, "step": 36130 }, { "epoch": 2.8916626660265643, "grad_norm": 7.434582710266113, "learning_rate": 1.8291704551532334e-06, "loss": 4.288, "step": 36140 }, { "epoch": 2.8924627940470478, "grad_norm": 4.58209228515625, "learning_rate": 1.8157993260950957e-06, "loss": 4.2023, "step": 36150 }, { "epoch": 2.8932629220675308, "grad_norm": 5.5834455490112305, "learning_rate": 1.802428197036958e-06, "loss": 4.4362, "step": 36160 }, { "epoch": 2.8940630500880142, "grad_norm": 7.5915422439575195, "learning_rate": 1.7890570679788204e-06, "loss": 4.1086, "step": 36170 }, { "epoch": 2.8948631781084972, "grad_norm": 4.346075534820557, "learning_rate": 1.7756859389206827e-06, "loss": 4.2259, "step": 36180 }, { "epoch": 2.8956633061289807, "grad_norm": 5.605753421783447, "learning_rate": 1.7623148098625448e-06, "loss": 4.3252, "step": 36190 }, { "epoch": 2.8964634341494637, "grad_norm": 5.893536567687988, "learning_rate": 1.748943680804407e-06, "loss": 4.1954, "step": 36200 }, { "epoch": 2.897263562169947, "grad_norm": 4.114583492279053, "learning_rate": 
1.7355725517462695e-06, "loss": 4.1583, "step": 36210 }, { "epoch": 2.8980636901904306, "grad_norm": 5.583434581756592, "learning_rate": 1.7222014226881318e-06, "loss": 4.2023, "step": 36220 }, { "epoch": 2.8988638182109137, "grad_norm": 7.7079973220825195, "learning_rate": 1.708830293629994e-06, "loss": 4.335, "step": 36230 }, { "epoch": 2.899663946231397, "grad_norm": 6.053271770477295, "learning_rate": 1.6954591645718565e-06, "loss": 4.1763, "step": 36240 }, { "epoch": 2.9004640742518806, "grad_norm": 6.019364356994629, "learning_rate": 1.6820880355137188e-06, "loss": 4.3155, "step": 36250 }, { "epoch": 2.9012642022723636, "grad_norm": 4.506904125213623, "learning_rate": 1.668716906455581e-06, "loss": 4.191, "step": 36260 }, { "epoch": 2.9020643302928466, "grad_norm": 4.384410381317139, "learning_rate": 1.6553457773974435e-06, "loss": 4.2903, "step": 36270 }, { "epoch": 2.90286445831333, "grad_norm": 5.195708751678467, "learning_rate": 1.6419746483393058e-06, "loss": 4.3516, "step": 36280 }, { "epoch": 2.9036645863338135, "grad_norm": 5.231687068939209, "learning_rate": 1.628603519281168e-06, "loss": 4.1651, "step": 36290 }, { "epoch": 2.9044647143542965, "grad_norm": 5.189396381378174, "learning_rate": 1.6152323902230305e-06, "loss": 3.9818, "step": 36300 }, { "epoch": 2.90526484237478, "grad_norm": 7.152802467346191, "learning_rate": 1.6018612611648928e-06, "loss": 4.1943, "step": 36310 }, { "epoch": 2.9060649703952635, "grad_norm": 3.657484769821167, "learning_rate": 1.588490132106755e-06, "loss": 4.1238, "step": 36320 }, { "epoch": 2.9068650984157465, "grad_norm": 6.751424789428711, "learning_rate": 1.5751190030486175e-06, "loss": 4.3217, "step": 36330 }, { "epoch": 2.90766522643623, "grad_norm": 6.126858711242676, "learning_rate": 1.5617478739904798e-06, "loss": 4.2568, "step": 36340 }, { "epoch": 2.908465354456713, "grad_norm": 7.0460638999938965, "learning_rate": 1.5483767449323423e-06, "loss": 4.5145, "step": 36350 }, { "epoch": 2.9092654824771964, 
"grad_norm": 5.848211765289307, "learning_rate": 1.5350056158742046e-06, "loss": 4.3336, "step": 36360 }, { "epoch": 2.9100656104976794, "grad_norm": 3.3985276222229004, "learning_rate": 1.5216344868160668e-06, "loss": 4.1685, "step": 36370 }, { "epoch": 2.910865738518163, "grad_norm": 4.115097522735596, "learning_rate": 1.5082633577579293e-06, "loss": 4.1931, "step": 36380 }, { "epoch": 2.9116658665386463, "grad_norm": 4.848307132720947, "learning_rate": 1.4948922286997916e-06, "loss": 4.239, "step": 36390 }, { "epoch": 2.9124659945591294, "grad_norm": 7.926689147949219, "learning_rate": 1.4815210996416538e-06, "loss": 4.1758, "step": 36400 }, { "epoch": 2.913266122579613, "grad_norm": 4.529385089874268, "learning_rate": 1.468149970583516e-06, "loss": 4.3033, "step": 36410 }, { "epoch": 2.9140662506000963, "grad_norm": 6.482027530670166, "learning_rate": 1.4547788415253784e-06, "loss": 4.228, "step": 36420 }, { "epoch": 2.9148663786205793, "grad_norm": 8.809609413146973, "learning_rate": 1.4414077124672408e-06, "loss": 4.2323, "step": 36430 }, { "epoch": 2.9156665066410623, "grad_norm": 6.071745872497559, "learning_rate": 1.4280365834091031e-06, "loss": 4.1434, "step": 36440 }, { "epoch": 2.9164666346615458, "grad_norm": 10.558405876159668, "learning_rate": 1.4146654543509654e-06, "loss": 4.1691, "step": 36450 }, { "epoch": 2.9172667626820292, "grad_norm": 5.350115776062012, "learning_rate": 1.4012943252928279e-06, "loss": 4.1962, "step": 36460 }, { "epoch": 2.9180668907025122, "grad_norm": 5.859897136688232, "learning_rate": 1.3879231962346901e-06, "loss": 4.4039, "step": 36470 }, { "epoch": 2.9188670187229957, "grad_norm": 5.368648052215576, "learning_rate": 1.3745520671765524e-06, "loss": 4.0889, "step": 36480 }, { "epoch": 2.919667146743479, "grad_norm": 6.981392860412598, "learning_rate": 1.3611809381184149e-06, "loss": 4.0967, "step": 36490 }, { "epoch": 2.920467274763962, "grad_norm": 10.068036079406738, "learning_rate": 1.3478098090602771e-06, "loss": 
4.2863, "step": 36500 }, { "epoch": 2.9212674027844456, "grad_norm": 6.100005149841309, "learning_rate": 1.3344386800021394e-06, "loss": 4.3266, "step": 36510 }, { "epoch": 2.9220675308049286, "grad_norm": 4.14323616027832, "learning_rate": 1.3210675509440019e-06, "loss": 4.319, "step": 36520 }, { "epoch": 2.922867658825412, "grad_norm": 5.502248287200928, "learning_rate": 1.3076964218858641e-06, "loss": 4.272, "step": 36530 }, { "epoch": 2.923667786845895, "grad_norm": 9.553668022155762, "learning_rate": 1.2943252928277264e-06, "loss": 4.2033, "step": 36540 }, { "epoch": 2.9244679148663786, "grad_norm": 6.439156532287598, "learning_rate": 1.2809541637695887e-06, "loss": 4.1459, "step": 36550 }, { "epoch": 2.925268042886862, "grad_norm": 6.06801700592041, "learning_rate": 1.267583034711451e-06, "loss": 4.1379, "step": 36560 }, { "epoch": 2.926068170907345, "grad_norm": 6.407751083374023, "learning_rate": 1.2542119056533134e-06, "loss": 4.2743, "step": 36570 }, { "epoch": 2.9268682989278285, "grad_norm": 5.329061985015869, "learning_rate": 1.2408407765951757e-06, "loss": 4.2889, "step": 36580 }, { "epoch": 2.927668426948312, "grad_norm": 5.764720916748047, "learning_rate": 1.2274696475370382e-06, "loss": 4.411, "step": 36590 }, { "epoch": 2.928468554968795, "grad_norm": 5.452577590942383, "learning_rate": 1.2140985184789004e-06, "loss": 4.2659, "step": 36600 }, { "epoch": 2.9292686829892784, "grad_norm": 5.00178337097168, "learning_rate": 1.2007273894207627e-06, "loss": 4.4003, "step": 36610 }, { "epoch": 2.9300688110097615, "grad_norm": 4.074296474456787, "learning_rate": 1.1873562603626252e-06, "loss": 4.2294, "step": 36620 }, { "epoch": 2.930868939030245, "grad_norm": 4.745507717132568, "learning_rate": 1.1739851313044874e-06, "loss": 4.1708, "step": 36630 }, { "epoch": 2.931669067050728, "grad_norm": 5.222048282623291, "learning_rate": 1.1606140022463497e-06, "loss": 4.0252, "step": 36640 }, { "epoch": 2.9324691950712114, "grad_norm": 5.4653639793396, 
"learning_rate": 1.1472428731882122e-06, "loss": 4.0802, "step": 36650 }, { "epoch": 2.933269323091695, "grad_norm": 5.444248676300049, "learning_rate": 1.1338717441300745e-06, "loss": 4.194, "step": 36660 }, { "epoch": 2.934069451112178, "grad_norm": 4.424251079559326, "learning_rate": 1.1205006150719367e-06, "loss": 4.121, "step": 36670 }, { "epoch": 2.9348695791326613, "grad_norm": 5.136073112487793, "learning_rate": 1.107129486013799e-06, "loss": 4.1506, "step": 36680 }, { "epoch": 2.9356697071531443, "grad_norm": 6.3223876953125, "learning_rate": 1.0937583569556613e-06, "loss": 4.2885, "step": 36690 }, { "epoch": 2.936469835173628, "grad_norm": 8.229373931884766, "learning_rate": 1.0803872278975237e-06, "loss": 4.2838, "step": 36700 }, { "epoch": 2.937269963194111, "grad_norm": 4.667328834533691, "learning_rate": 1.067016098839386e-06, "loss": 4.1594, "step": 36710 }, { "epoch": 2.9380700912145943, "grad_norm": 11.4064302444458, "learning_rate": 1.0536449697812483e-06, "loss": 4.1362, "step": 36720 }, { "epoch": 2.9388702192350777, "grad_norm": 11.03482437133789, "learning_rate": 1.0402738407231107e-06, "loss": 4.1872, "step": 36730 }, { "epoch": 2.9396703472555608, "grad_norm": 5.653985977172852, "learning_rate": 1.026902711664973e-06, "loss": 4.221, "step": 36740 }, { "epoch": 2.940470475276044, "grad_norm": 7.984028339385986, "learning_rate": 1.0135315826068353e-06, "loss": 4.1823, "step": 36750 }, { "epoch": 2.9412706032965277, "grad_norm": 7.034099578857422, "learning_rate": 1.0001604535486978e-06, "loss": 4.2426, "step": 36760 }, { "epoch": 2.9420707313170107, "grad_norm": 6.773263931274414, "learning_rate": 9.8678932449056e-07, "loss": 4.2967, "step": 36770 }, { "epoch": 2.942870859337494, "grad_norm": 4.996553421020508, "learning_rate": 9.734181954324225e-07, "loss": 4.2479, "step": 36780 }, { "epoch": 2.943670987357977, "grad_norm": 12.89632797241211, "learning_rate": 9.600470663742848e-07, "loss": 4.1419, "step": 36790 }, { "epoch": 
2.9444711153784606, "grad_norm": 4.559993743896484, "learning_rate": 9.46675937316147e-07, "loss": 4.2202, "step": 36800 }, { "epoch": 2.9452712433989436, "grad_norm": 5.565128803253174, "learning_rate": 9.333048082580094e-07, "loss": 4.3414, "step": 36810 }, { "epoch": 2.946071371419427, "grad_norm": 4.633853912353516, "learning_rate": 9.199336791998716e-07, "loss": 4.3058, "step": 36820 }, { "epoch": 2.9468714994399106, "grad_norm": 6.64238166809082, "learning_rate": 9.065625501417339e-07, "loss": 4.1815, "step": 36830 }, { "epoch": 2.9476716274603936, "grad_norm": 4.056352615356445, "learning_rate": 8.931914210835963e-07, "loss": 4.3363, "step": 36840 }, { "epoch": 2.948471755480877, "grad_norm": 5.0011491775512695, "learning_rate": 8.798202920254586e-07, "loss": 4.2357, "step": 36850 }, { "epoch": 2.9492718835013605, "grad_norm": 6.450883865356445, "learning_rate": 8.66449162967321e-07, "loss": 4.2437, "step": 36860 }, { "epoch": 2.9500720115218435, "grad_norm": 6.694143295288086, "learning_rate": 8.530780339091833e-07, "loss": 4.2564, "step": 36870 }, { "epoch": 2.9508721395423265, "grad_norm": 4.477694511413574, "learning_rate": 8.397069048510457e-07, "loss": 4.2768, "step": 36880 }, { "epoch": 2.95167226756281, "grad_norm": 5.470662593841553, "learning_rate": 8.26335775792908e-07, "loss": 4.3017, "step": 36890 }, { "epoch": 2.9524723955832934, "grad_norm": 5.221342086791992, "learning_rate": 8.129646467347703e-07, "loss": 4.0872, "step": 36900 }, { "epoch": 2.9532725236037765, "grad_norm": 6.306137561798096, "learning_rate": 7.995935176766327e-07, "loss": 4.2408, "step": 36910 }, { "epoch": 2.95407265162426, "grad_norm": 6.915722370147705, "learning_rate": 7.86222388618495e-07, "loss": 4.3012, "step": 36920 }, { "epoch": 2.9548727796447434, "grad_norm": 4.694388389587402, "learning_rate": 7.728512595603572e-07, "loss": 4.2886, "step": 36930 }, { "epoch": 2.9556729076652264, "grad_norm": 4.7695136070251465, "learning_rate": 7.594801305022196e-07, "loss": 
4.1293, "step": 36940 }, { "epoch": 2.95647303568571, "grad_norm": 8.320331573486328, "learning_rate": 7.46109001444082e-07, "loss": 4.2172, "step": 36950 }, { "epoch": 2.957273163706193, "grad_norm": 6.202345371246338, "learning_rate": 7.327378723859443e-07, "loss": 4.0253, "step": 36960 }, { "epoch": 2.9580732917266763, "grad_norm": 8.366705894470215, "learning_rate": 7.193667433278066e-07, "loss": 4.1124, "step": 36970 }, { "epoch": 2.9588734197471593, "grad_norm": 4.636302471160889, "learning_rate": 7.05995614269669e-07, "loss": 4.3183, "step": 36980 }, { "epoch": 2.959673547767643, "grad_norm": 5.838320732116699, "learning_rate": 6.926244852115314e-07, "loss": 4.2729, "step": 36990 }, { "epoch": 2.9604736757881263, "grad_norm": 5.318964004516602, "learning_rate": 6.792533561533936e-07, "loss": 4.2532, "step": 37000 }, { "epoch": 2.9612738038086093, "grad_norm": 8.026263236999512, "learning_rate": 6.658822270952559e-07, "loss": 4.2332, "step": 37010 }, { "epoch": 2.9620739318290927, "grad_norm": 5.9107747077941895, "learning_rate": 6.525110980371183e-07, "loss": 4.1986, "step": 37020 }, { "epoch": 2.962874059849576, "grad_norm": 5.215660095214844, "learning_rate": 6.391399689789807e-07, "loss": 4.0658, "step": 37030 }, { "epoch": 2.963674187870059, "grad_norm": 7.460134506225586, "learning_rate": 6.257688399208429e-07, "loss": 4.264, "step": 37040 }, { "epoch": 2.964474315890542, "grad_norm": 7.57894229888916, "learning_rate": 6.123977108627053e-07, "loss": 4.0961, "step": 37050 }, { "epoch": 2.9652744439110257, "grad_norm": 6.080920219421387, "learning_rate": 5.990265818045677e-07, "loss": 4.2651, "step": 37060 }, { "epoch": 2.966074571931509, "grad_norm": 8.295559883117676, "learning_rate": 5.856554527464299e-07, "loss": 4.2018, "step": 37070 }, { "epoch": 2.966874699951992, "grad_norm": 8.348221778869629, "learning_rate": 5.722843236882922e-07, "loss": 4.3621, "step": 37080 }, { "epoch": 2.9676748279724756, "grad_norm": 4.557129383087158, "learning_rate": 
5.589131946301546e-07, "loss": 4.1681, "step": 37090 }, { "epoch": 2.968474955992959, "grad_norm": 4.582211017608643, "learning_rate": 5.455420655720169e-07, "loss": 4.1408, "step": 37100 }, { "epoch": 2.969275084013442, "grad_norm": 4.796433925628662, "learning_rate": 5.321709365138793e-07, "loss": 4.2535, "step": 37110 }, { "epoch": 2.9700752120339255, "grad_norm": 7.60089111328125, "learning_rate": 5.187998074557416e-07, "loss": 4.1283, "step": 37120 }, { "epoch": 2.9708753400544086, "grad_norm": 7.079959392547607, "learning_rate": 5.05428678397604e-07, "loss": 4.1379, "step": 37130 }, { "epoch": 2.971675468074892, "grad_norm": 5.832749843597412, "learning_rate": 4.920575493394662e-07, "loss": 4.379, "step": 37140 }, { "epoch": 2.972475596095375, "grad_norm": 5.663512229919434, "learning_rate": 4.786864202813286e-07, "loss": 4.2176, "step": 37150 }, { "epoch": 2.9732757241158585, "grad_norm": 6.955112934112549, "learning_rate": 4.653152912231909e-07, "loss": 4.0786, "step": 37160 }, { "epoch": 2.974075852136342, "grad_norm": 5.792449951171875, "learning_rate": 4.5194416216505323e-07, "loss": 4.1782, "step": 37170 }, { "epoch": 2.974875980156825, "grad_norm": 4.567730903625488, "learning_rate": 4.385730331069156e-07, "loss": 4.0626, "step": 37180 }, { "epoch": 2.9756761081773084, "grad_norm": 5.905571937561035, "learning_rate": 4.252019040487779e-07, "loss": 4.3109, "step": 37190 }, { "epoch": 2.976476236197792, "grad_norm": 7.822843074798584, "learning_rate": 4.1183077499064024e-07, "loss": 4.1584, "step": 37200 }, { "epoch": 2.977276364218275, "grad_norm": 4.660794258117676, "learning_rate": 3.984596459325025e-07, "loss": 4.3121, "step": 37210 }, { "epoch": 2.9780764922387584, "grad_norm": 4.471508979797363, "learning_rate": 3.8508851687436493e-07, "loss": 4.1769, "step": 37220 }, { "epoch": 2.9788766202592414, "grad_norm": 6.368688106536865, "learning_rate": 3.717173878162272e-07, "loss": 4.2282, "step": 37230 }, { "epoch": 2.979676748279725, "grad_norm": 
31.647247314453125, "learning_rate": 3.5834625875808957e-07, "loss": 4.295, "step": 37240 }, { "epoch": 2.980476876300208, "grad_norm": 5.0117950439453125, "learning_rate": 3.449751296999519e-07, "loss": 4.2265, "step": 37250 }, { "epoch": 2.9812770043206913, "grad_norm": 7.659223556518555, "learning_rate": 3.316040006418142e-07, "loss": 4.2627, "step": 37260 }, { "epoch": 2.9820771323411748, "grad_norm": 5.416043281555176, "learning_rate": 3.1823287158367653e-07, "loss": 4.1306, "step": 37270 }, { "epoch": 2.982877260361658, "grad_norm": 5.165368556976318, "learning_rate": 3.048617425255389e-07, "loss": 4.3564, "step": 37280 }, { "epoch": 2.9836773883821412, "grad_norm": 4.954187870025635, "learning_rate": 2.914906134674012e-07, "loss": 4.1988, "step": 37290 }, { "epoch": 2.9844775164026243, "grad_norm": 4.00586462020874, "learning_rate": 2.7811948440926354e-07, "loss": 4.186, "step": 37300 }, { "epoch": 2.9852776444231077, "grad_norm": 4.78800630569458, "learning_rate": 2.6474835535112586e-07, "loss": 4.0983, "step": 37310 }, { "epoch": 2.9860777724435907, "grad_norm": 6.098124980926514, "learning_rate": 2.513772262929882e-07, "loss": 4.335, "step": 37320 }, { "epoch": 2.986877900464074, "grad_norm": 8.325749397277832, "learning_rate": 2.380060972348505e-07, "loss": 4.1324, "step": 37330 }, { "epoch": 2.9876780284845577, "grad_norm": 4.86344051361084, "learning_rate": 2.2463496817671285e-07, "loss": 4.2644, "step": 37340 }, { "epoch": 2.9884781565050407, "grad_norm": 6.780296325683594, "learning_rate": 2.112638391185752e-07, "loss": 4.3815, "step": 37350 }, { "epoch": 2.989278284525524, "grad_norm": 5.512997150421143, "learning_rate": 1.9789271006043754e-07, "loss": 4.1606, "step": 37360 }, { "epoch": 2.9900784125460076, "grad_norm": 5.321970462799072, "learning_rate": 1.8452158100229986e-07, "loss": 4.0704, "step": 37370 }, { "epoch": 2.9908785405664906, "grad_norm": 5.405054569244385, "learning_rate": 1.7115045194416218e-07, "loss": 4.215, "step": 37380 }, { 
"epoch": 2.991678668586974, "grad_norm": 7.2463274002075195, "learning_rate": 1.577793228860245e-07, "loss": 4.2474, "step": 37390 }, { "epoch": 2.992478796607457, "grad_norm": 5.009868621826172, "learning_rate": 1.4440819382788684e-07, "loss": 4.4354, "step": 37400 }, { "epoch": 2.9932789246279405, "grad_norm": 5.4555206298828125, "learning_rate": 1.3103706476974916e-07, "loss": 4.3358, "step": 37410 }, { "epoch": 2.9940790526484236, "grad_norm": 4.64158296585083, "learning_rate": 1.176659357116115e-07, "loss": 4.1267, "step": 37420 }, { "epoch": 2.994879180668907, "grad_norm": 5.860431671142578, "learning_rate": 1.0429480665347382e-07, "loss": 4.4098, "step": 37430 }, { "epoch": 2.9956793086893905, "grad_norm": 5.745077133178711, "learning_rate": 9.092367759533615e-08, "loss": 4.255, "step": 37440 }, { "epoch": 2.9964794367098735, "grad_norm": 5.062090873718262, "learning_rate": 7.755254853719848e-08, "loss": 4.2798, "step": 37450 }, { "epoch": 2.997279564730357, "grad_norm": 4.622069358825684, "learning_rate": 6.418141947906081e-08, "loss": 4.1738, "step": 37460 }, { "epoch": 2.9980796927508404, "grad_norm": 5.84435510635376, "learning_rate": 5.0810290420923146e-08, "loss": 4.2221, "step": 37470 }, { "epoch": 2.9988798207713234, "grad_norm": 4.550410747528076, "learning_rate": 3.743916136278548e-08, "loss": 4.3505, "step": 37480 }, { "epoch": 2.9996799487918064, "grad_norm": 4.685887813568115, "learning_rate": 2.4068032304647808e-08, "loss": 4.0673, "step": 37490 }, { "epoch": 3.0, "step": 37494, "total_flos": 0.0, "train_loss": 4.986996560858022, "train_runtime": 84970.2993, "train_samples_per_second": 7.06, "train_steps_per_second": 0.441 } ], "logging_steps": 10, "max_steps": 37494, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 7000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, 
"total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }