{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5600896143382941, "eval_steps": 7000, "global_step": 7000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0, "eval_loss": 9.306169509887695, "eval_runtime": 10.9126, "eval_samples_per_second": 3.665, "eval_steps_per_second": 0.458, "step": 0 }, { "epoch": 0.0008001280204832773, "grad_norm": 8.51533031463623, "learning_rate": 3.5000000000000004e-06, "loss": 8.786, "step": 10 }, { "epoch": 0.0016002560409665546, "grad_norm": 10.90935230255127, "learning_rate": 8.500000000000002e-06, "loss": 8.3433, "step": 20 }, { "epoch": 0.002400384061449832, "grad_norm": 7.269016265869141, "learning_rate": 1.3500000000000001e-05, "loss": 7.549, "step": 30 }, { "epoch": 0.003200512081933109, "grad_norm": 8.790578842163086, "learning_rate": 1.85e-05, "loss": 7.2574, "step": 40 }, { "epoch": 0.004000640102416387, "grad_norm": 6.52068567276001, "learning_rate": 2.35e-05, "loss": 7.0024, "step": 50 }, { "epoch": 0.004800768122899664, "grad_norm": 6.902959823608398, "learning_rate": 2.8499999999999998e-05, "loss": 6.9074, "step": 60 }, { "epoch": 0.005600896143382941, "grad_norm": 5.350945949554443, "learning_rate": 3.35e-05, "loss": 6.8765, "step": 70 }, { "epoch": 0.006401024163866218, "grad_norm": 5.928489685058594, "learning_rate": 3.85e-05, "loss": 6.5663, "step": 80 }, { "epoch": 0.007201152184349496, "grad_norm": 9.222543716430664, "learning_rate": 4.35e-05, "loss": 6.6131, "step": 90 }, { "epoch": 0.008001280204832774, "grad_norm": 6.57027006149292, "learning_rate": 4.85e-05, "loss": 6.5829, "step": 100 }, { "epoch": 0.00880140822531605, "grad_norm": 5.280848503112793, "learning_rate": 4.999064020965931e-05, "loss": 6.5996, "step": 110 }, { "epoch": 0.009601536245799328, "grad_norm": 5.950971603393555, "learning_rate": 4.997726908060117e-05, "loss": 6.6075, "step": 120 }, { "epoch": 0.010401664266282605, "grad_norm": 4.300549507141113, "learning_rate": 4.996389795154303e-05, "loss": 6.5074, "step": 130 }, { "epoch": 0.011201792286765882, "grad_norm": 4.824333190917969, "learning_rate": 4.9950526822484896e-05, "loss": 6.6072, "step": 140 }, { "epoch": 0.01200192030724916, "grad_norm": 5.4324116706848145, "learning_rate": 4.993715569342676e-05, "loss": 6.6183, "step": 150 }, { "epoch": 0.012802048327732437, "grad_norm": 4.087579250335693, "learning_rate": 4.992378456436862e-05, "loss": 6.4806, "step": 160 }, { "epoch": 0.013602176348215714, "grad_norm": 7.260207653045654, "learning_rate": 4.9910413435310484e-05, "loss": 6.3709, "step": 170 }, { "epoch": 0.014402304368698993, "grad_norm": 4.145061016082764, "learning_rate": 4.9897042306252346e-05, "loss": 6.2951, "step": 180 }, { "epoch": 0.01520243238918227, "grad_norm": 3.2026450634002686, "learning_rate": 4.98836711771942e-05, "loss": 6.3255, "step": 190 }, { "epoch": 0.016002560409665547, "grad_norm": 3.443145751953125, "learning_rate": 4.9870300048136065e-05, "loss": 6.4894, "step": 200 }, { "epoch": 0.016802688430148822, "grad_norm": 5.324231147766113, "learning_rate": 4.985692891907793e-05, "loss": 6.4312, "step": 210 }, { "epoch": 0.0176028164506321, "grad_norm": 3.2833452224731445, "learning_rate": 4.984355779001979e-05, "loss": 6.513, "step": 220 }, { "epoch": 0.018402944471115377, "grad_norm": 3.8984358310699463, "learning_rate": 4.983018666096165e-05, "loss": 6.1683, "step": 230 }, { "epoch": 0.019203072491598656, "grad_norm": 4.183676719665527, "learning_rate": 4.9816815531903516e-05, "loss": 6.329, "step": 240 }, { "epoch": 0.020003200512081935, "grad_norm": 3.136693239212036, "learning_rate": 4.980344440284538e-05, "loss": 6.466, "step": 250 }, { "epoch": 0.02080332853256521, "grad_norm": 4.185967445373535, "learning_rate": 4.979007327378724e-05, "loss": 6.4613, "step": 260 }, { "epoch": 0.02160345655304849, "grad_norm": 3.105653762817383, "learning_rate": 4.9776702144729104e-05, "loss": 6.3596, "step": 270 }, { "epoch": 0.022403584573531764, "grad_norm": 3.927561044692993, "learning_rate": 4.9763331015670967e-05, "loss": 6.2604, "step": 280 }, { "epoch": 0.023203712594015043, "grad_norm": 3.513439178466797, "learning_rate": 4.974995988661283e-05, "loss": 6.2747, "step": 290 }, { "epoch": 0.02400384061449832, "grad_norm": 3.07377290725708, "learning_rate": 4.973658875755469e-05, "loss": 6.202, "step": 300 }, { "epoch": 0.024803968634981598, "grad_norm": 3.045619249343872, "learning_rate": 4.9723217628496555e-05, "loss": 6.1022, "step": 310 }, { "epoch": 0.025604096655464873, "grad_norm": 3.330648183822632, "learning_rate": 4.970984649943842e-05, "loss": 6.1544, "step": 320 }, { "epoch": 0.026404224675948152, "grad_norm": 3.0299668312072754, "learning_rate": 4.969647537038028e-05, "loss": 6.3119, "step": 330 }, { "epoch": 0.027204352696431428, "grad_norm": 3.687938928604126, "learning_rate": 4.9683104241322136e-05, "loss": 6.333, "step": 340 }, { "epoch": 0.028004480716914706, "grad_norm": 4.0919413566589355, "learning_rate": 4.9669733112264e-05, "loss": 6.1711, "step": 350 }, { "epoch": 0.028804608737397985, "grad_norm": 3.1327242851257324, "learning_rate": 4.965636198320586e-05, "loss": 6.3365, "step": 360 }, { "epoch": 0.02960473675788126, "grad_norm": 4.531859874725342, "learning_rate": 4.9642990854147724e-05, "loss": 6.2121, "step": 370 }, { "epoch": 0.03040486477836454, "grad_norm": 2.522672414779663, "learning_rate": 4.962961972508959e-05, "loss": 6.2388, "step": 380 }, { "epoch": 0.031204992798847815, "grad_norm": 5.62153959274292, "learning_rate": 4.961624859603145e-05, "loss": 6.168, "step": 390 }, { "epoch": 0.032005120819331094, "grad_norm": 3.522804021835327, "learning_rate": 4.960287746697331e-05, "loss": 6.1207, "step": 400 }, { "epoch": 0.03280524883981437, "grad_norm": 7.260324478149414, "learning_rate": 4.9589506337915175e-05, "loss": 6.31, "step": 410 }, { "epoch": 0.033605376860297645, "grad_norm": 4.309441566467285, "learning_rate": 4.957613520885704e-05, "loss": 6.1107, "step": 420 }, { "epoch": 0.034405504880780924, "grad_norm": 3.2409913539886475, "learning_rate": 4.95627640797989e-05, "loss": 6.2082, "step": 430 }, { "epoch": 0.0352056329012642, "grad_norm": 3.9414610862731934, "learning_rate": 4.954939295074076e-05, "loss": 6.2102, "step": 440 }, { "epoch": 0.03600576092174748, "grad_norm": 2.441235303878784, "learning_rate": 4.9536021821682626e-05, "loss": 6.1023, "step": 450 }, { "epoch": 0.036805888942230754, "grad_norm": 2.997591972351074, "learning_rate": 4.952265069262449e-05, "loss": 6.1147, "step": 460 }, { "epoch": 0.03760601696271403, "grad_norm": 3.950436592102051, "learning_rate": 4.950927956356635e-05, "loss": 6.0725, "step": 470 }, { "epoch": 0.03840614498319731, "grad_norm": 3.4340896606445312, "learning_rate": 4.9495908434508214e-05, "loss": 6.1336, "step": 480 }, { "epoch": 0.03920627300368059, "grad_norm": 3.28839373588562, "learning_rate": 4.948253730545007e-05, "loss": 6.1709, "step": 490 }, { "epoch": 0.04000640102416387, "grad_norm": 2.976365566253662, "learning_rate": 4.946916617639193e-05, "loss": 6.2074, "step": 500 }, { "epoch": 0.04080652904464714, "grad_norm": 4.156027793884277, "learning_rate": 4.9455795047333795e-05, "loss": 6.1694, "step": 510 }, { "epoch": 0.04160665706513042, "grad_norm": 3.4855797290802, "learning_rate": 4.944242391827566e-05, "loss": 6.1218, "step": 520 }, { "epoch": 0.0424067850856137, "grad_norm": 4.489185333251953, "learning_rate": 4.942905278921752e-05, "loss": 6.1507, "step": 530 }, { "epoch": 0.04320691310609698, "grad_norm": 3.2751166820526123, "learning_rate": 4.941568166015938e-05, "loss": 6.1055, "step": 540 }, { "epoch": 0.04400704112658025, "grad_norm": 2.4234585762023926, "learning_rate": 4.9402310531101246e-05, "loss": 6.1755, "step": 550 }, { "epoch": 0.04480716914706353, "grad_norm": 3.4436991214752197, "learning_rate": 4.938893940204311e-05, "loss": 6.1882, "step": 560 }, { "epoch": 0.04560729716754681, "grad_norm": 3.3731908798217773, "learning_rate": 4.937556827298497e-05, "loss": 6.0648, "step": 570 }, { "epoch": 0.04640742518803009, "grad_norm": 3.8733670711517334, "learning_rate": 4.9362197143926834e-05, "loss": 6.0621, "step": 580 }, { "epoch": 0.04720755320851336, "grad_norm": 4.126636505126953, "learning_rate": 4.9348826014868696e-05, "loss": 6.122, "step": 590 }, { "epoch": 0.04800768122899664, "grad_norm": 3.8605775833129883, "learning_rate": 4.933545488581056e-05, "loss": 5.9788, "step": 600 }, { "epoch": 0.048807809249479917, "grad_norm": 2.9509966373443604, "learning_rate": 4.932208375675242e-05, "loss": 6.2045, "step": 610 }, { "epoch": 0.049607937269963195, "grad_norm": 4.4266510009765625, "learning_rate": 4.9308712627694285e-05, "loss": 5.9981, "step": 620 }, { "epoch": 0.050408065290446474, "grad_norm": 2.79042649269104, "learning_rate": 4.929534149863615e-05, "loss": 6.1882, "step": 630 }, { "epoch": 0.051208193310929746, "grad_norm": 2.8986568450927734, "learning_rate": 4.928197036957801e-05, "loss": 6.1739, "step": 640 }, { "epoch": 0.052008321331413025, "grad_norm": 4.294217586517334, "learning_rate": 4.926859924051987e-05, "loss": 6.0566, "step": 650 }, { "epoch": 0.052808449351896304, "grad_norm": 8.848836898803711, "learning_rate": 4.9255228111461735e-05, "loss": 6.2994, "step": 660 }, { "epoch": 0.05360857737237958, "grad_norm": 3.2204337120056152, "learning_rate": 4.92418569824036e-05, "loss": 6.0573, "step": 670 }, { "epoch": 0.054408705392862855, "grad_norm": 4.775251865386963, "learning_rate": 4.922848585334546e-05, "loss": 5.9764, "step": 680 }, { "epoch": 0.055208833413346134, "grad_norm": 3.5426905155181885, "learning_rate": 4.921511472428732e-05, "loss": 6.0402, "step": 690 }, { "epoch": 0.05600896143382941, "grad_norm": 10.72481632232666, "learning_rate": 4.9201743595229186e-05, "loss": 6.0024, "step": 700 }, { "epoch": 0.05680908945431269, "grad_norm": 2.441681385040283, "learning_rate": 4.918837246617105e-05, "loss": 6.1122, "step": 710 }, { "epoch": 0.05760921747479597, "grad_norm": 3.375319480895996, "learning_rate": 4.917500133711291e-05, "loss": 6.058, "step": 720 }, { "epoch": 0.05840934549527924, "grad_norm": 2.821507453918457, "learning_rate": 4.9161630208054774e-05, "loss": 6.0586, "step": 730 }, { "epoch": 0.05920947351576252, "grad_norm": 2.8658957481384277, "learning_rate": 4.914825907899664e-05, "loss": 6.0115, "step": 740 }, { "epoch": 0.0600096015362458, "grad_norm": 2.239774227142334, "learning_rate": 4.91348879499385e-05, "loss": 6.0669, "step": 750 }, { "epoch": 0.06080972955672908, "grad_norm": 3.5249900817871094, "learning_rate": 4.912151682088036e-05, "loss": 6.1013, "step": 760 }, { "epoch": 0.06160985757721235, "grad_norm": 2.790356159210205, "learning_rate": 4.9108145691822225e-05, "loss": 6.0099, "step": 770 }, { "epoch": 0.06240998559769563, "grad_norm": 3.0729963779449463, "learning_rate": 4.909477456276409e-05, "loss": 6.1376, "step": 780 }, { "epoch": 0.06321011361817891, "grad_norm": 2.9490275382995605, "learning_rate": 4.908140343370595e-05, "loss": 6.1457, "step": 790 }, { "epoch": 0.06401024163866219, "grad_norm": 2.7475438117980957, "learning_rate": 4.9068032304647806e-05, "loss": 6.0041, "step": 800 }, { "epoch": 0.06481036965914547, "grad_norm": 2.755703926086426, "learning_rate": 4.905466117558967e-05, "loss": 6.0242, "step": 810 }, { "epoch": 0.06561049767962875, "grad_norm": 2.724515676498413, "learning_rate": 4.904129004653153e-05, "loss": 6.1827, "step": 820 }, { "epoch": 0.06641062570011202, "grad_norm": 4.498260974884033, "learning_rate": 4.9027918917473394e-05, "loss": 6.0892, "step": 830 }, { "epoch": 0.06721075372059529, "grad_norm": 2.4399070739746094, "learning_rate": 4.901454778841526e-05, "loss": 6.0197, "step": 840 }, { "epoch": 0.06801088174107857, "grad_norm": 2.7584304809570312, "learning_rate": 4.900117665935712e-05, "loss": 5.9056, "step": 850 }, { "epoch": 0.06881100976156185, "grad_norm": 2.8177144527435303, "learning_rate": 4.898780553029898e-05, "loss": 6.1484, "step": 860 }, { "epoch": 0.06961113778204513, "grad_norm": 4.181133270263672, "learning_rate": 4.8974434401240845e-05, "loss": 5.9376, "step": 870 }, { "epoch": 0.0704112658025284, "grad_norm": 3.677849769592285, "learning_rate": 4.896106327218271e-05, "loss": 6.0403, "step": 880 }, { "epoch": 0.07121139382301168, "grad_norm": 3.1553192138671875, "learning_rate": 4.894769214312457e-05, "loss": 6.0488, "step": 890 }, { "epoch": 0.07201152184349496, "grad_norm": 3.2580947875976562, "learning_rate": 4.893432101406643e-05, "loss": 6.1002, "step": 900 }, { "epoch": 0.07281164986397824, "grad_norm": 6.328150749206543, "learning_rate": 4.8920949885008296e-05, "loss": 6.0225, "step": 910 }, { "epoch": 0.07361177788446151, "grad_norm": 2.7467615604400635, "learning_rate": 4.890757875595016e-05, "loss": 5.9622, "step": 920 }, { "epoch": 0.07441190590494479, "grad_norm": 2.86570405960083, "learning_rate": 4.889420762689202e-05, "loss": 5.9718, "step": 930 }, { "epoch": 0.07521203392542807, "grad_norm": 2.544917106628418, "learning_rate": 4.8880836497833884e-05, "loss": 5.8697, "step": 940 }, { "epoch": 0.07601216194591134, "grad_norm": 2.5245840549468994, "learning_rate": 4.8867465368775746e-05, "loss": 5.9973, "step": 950 }, { "epoch": 0.07681228996639462, "grad_norm": 3.6830902099609375, "learning_rate": 4.88540942397176e-05, "loss": 5.943, "step": 960 }, { "epoch": 0.0776124179868779, "grad_norm": 2.6643354892730713, "learning_rate": 4.8840723110659465e-05, "loss": 5.8958, "step": 970 }, { "epoch": 0.07841254600736118, "grad_norm": 6.4623565673828125, "learning_rate": 4.882735198160133e-05, "loss": 6.0236, "step": 980 }, { "epoch": 0.07921267402784446, "grad_norm": 2.186974048614502, "learning_rate": 4.881398085254319e-05, "loss": 6.0481, "step": 990 }, { "epoch": 0.08001280204832774, "grad_norm": 2.4983859062194824, "learning_rate": 4.880060972348505e-05, "loss": 6.075, "step": 1000 }, { "epoch": 0.080812930068811, "grad_norm": 2.778280258178711, "learning_rate": 4.8787238594426916e-05, "loss": 6.0757, "step": 1010 }, { "epoch": 0.08161305808929428, "grad_norm": 2.706965923309326, "learning_rate": 4.877386746536878e-05, "loss": 6.1504, "step": 1020 }, { "epoch": 0.08241318610977756, "grad_norm": 3.4069600105285645, "learning_rate": 4.876049633631064e-05, "loss": 6.0889, "step": 1030 }, { "epoch": 0.08321331413026084, "grad_norm": 3.179551124572754, "learning_rate": 4.8747125207252504e-05, "loss": 6.0057, "step": 1040 }, { "epoch": 0.08401344215074412, "grad_norm": 2.924018383026123, "learning_rate": 4.873375407819437e-05, "loss": 5.8406, "step": 1050 }, { "epoch": 0.0848135701712274, "grad_norm": 3.103912115097046, "learning_rate": 4.872038294913623e-05, "loss": 6.0351, "step": 1060 }, { "epoch": 0.08561369819171068, "grad_norm": 2.8037219047546387, "learning_rate": 4.870701182007809e-05, "loss": 6.0272, "step": 1070 }, { "epoch": 0.08641382621219396, "grad_norm": 2.477062940597534, "learning_rate": 4.8693640691019955e-05, "loss": 5.9269, "step": 1080 }, { "epoch": 0.08721395423267723, "grad_norm": 2.748488187789917, "learning_rate": 4.868026956196182e-05, "loss": 5.943, "step": 1090 }, { "epoch": 0.0880140822531605, "grad_norm": 3.3991920948028564, "learning_rate": 4.866689843290368e-05, "loss": 6.1455, "step": 1100 }, { "epoch": 0.08881421027364378, "grad_norm": 3.208509683609009, "learning_rate": 4.8653527303845536e-05, "loss": 5.9746, "step": 1110 }, { "epoch": 0.08961433829412706, "grad_norm": 3.3378469944000244, "learning_rate": 4.86401561747874e-05, "loss": 5.9185, "step": 1120 }, { "epoch": 0.09041446631461034, "grad_norm": 2.269606113433838, "learning_rate": 4.862678504572926e-05, "loss": 5.9369, "step": 1130 }, { "epoch": 0.09121459433509362, "grad_norm": 2.749335765838623, "learning_rate": 4.8613413916671124e-05, "loss": 6.0648, "step": 1140 }, { "epoch": 0.0920147223555769, "grad_norm": 2.821913480758667, "learning_rate": 4.860004278761299e-05, "loss": 5.952, "step": 1150 }, { "epoch": 0.09281485037606017, "grad_norm": 2.640990734100342, "learning_rate": 4.858667165855485e-05, "loss": 6.0537, "step": 1160 }, { "epoch": 0.09361497839654345, "grad_norm": 3.570896625518799, "learning_rate": 4.857330052949671e-05, "loss": 5.7721, "step": 1170 }, { "epoch": 0.09441510641702672, "grad_norm": 3.245318651199341, "learning_rate": 4.8559929400438575e-05, "loss": 5.7305, "step": 1180 }, { "epoch": 0.09521523443751, "grad_norm": 4.075076580047607, "learning_rate": 4.854655827138044e-05, "loss": 5.974, "step": 1190 }, { "epoch": 0.09601536245799328, "grad_norm": 2.429893732070923, "learning_rate": 4.85331871423223e-05, "loss": 5.7828, "step": 1200 }, { "epoch": 0.09681549047847655, "grad_norm": 2.7077040672302246, "learning_rate": 4.851981601326416e-05, "loss": 5.9143, "step": 1210 }, { "epoch": 0.09761561849895983, "grad_norm": 2.767918586730957, "learning_rate": 4.8506444884206026e-05, "loss": 5.9449, "step": 1220 }, { "epoch": 0.09841574651944311, "grad_norm": 2.4544034004211426, "learning_rate": 4.849307375514789e-05, "loss": 6.0034, "step": 1230 }, { "epoch": 0.09921587453992639, "grad_norm": 5.215607643127441, "learning_rate": 4.847970262608975e-05, "loss": 5.867, "step": 1240 }, { "epoch": 0.10001600256040967, "grad_norm": 2.7856080532073975, "learning_rate": 4.8466331497031614e-05, "loss": 6.0213, "step": 1250 }, { "epoch": 0.10081613058089295, "grad_norm": 2.5528719425201416, "learning_rate": 4.8452960367973476e-05, "loss": 5.9634, "step": 1260 }, { "epoch": 0.10161625860137621, "grad_norm": 2.4917409420013428, "learning_rate": 4.843958923891533e-05, "loss": 5.887, "step": 1270 }, { "epoch": 0.10241638662185949, "grad_norm": 6.125699520111084, "learning_rate": 4.8426218109857195e-05, "loss": 6.1189, "step": 1280 }, { "epoch": 0.10321651464234277, "grad_norm": 2.783156156539917, "learning_rate": 4.841284698079906e-05, "loss": 5.9064, "step": 1290 }, { "epoch": 0.10401664266282605, "grad_norm": 3.611070156097412, "learning_rate": 4.839947585174092e-05, "loss": 5.9405, "step": 1300 }, { "epoch": 0.10481677068330933, "grad_norm": 4.296909809112549, "learning_rate": 4.838610472268278e-05, "loss": 5.9067, "step": 1310 }, { "epoch": 0.10561689870379261, "grad_norm": 2.4273040294647217, "learning_rate": 4.8372733593624646e-05, "loss": 5.888, "step": 1320 }, { "epoch": 0.10641702672427589, "grad_norm": 2.6499924659729004, "learning_rate": 4.835936246456651e-05, "loss": 5.9683, "step": 1330 }, { "epoch": 0.10721715474475917, "grad_norm": 3.1474297046661377, "learning_rate": 4.834599133550837e-05, "loss": 5.8946, "step": 1340 }, { "epoch": 0.10801728276524244, "grad_norm": 3.5050199031829834, "learning_rate": 4.8332620206450234e-05, "loss": 5.9179, "step": 1350 }, { "epoch": 0.10881741078572571, "grad_norm": 2.693700075149536, "learning_rate": 4.8319249077392096e-05, "loss": 5.7965, "step": 1360 }, { "epoch": 0.10961753880620899, "grad_norm": 2.8202953338623047, "learning_rate": 4.830587794833396e-05, "loss": 5.9526, "step": 1370 }, { "epoch": 0.11041766682669227, "grad_norm": 2.514862060546875, "learning_rate": 4.829250681927582e-05, "loss": 5.936, "step": 1380 }, { "epoch": 0.11121779484717555, "grad_norm": 3.18804931640625, "learning_rate": 4.8279135690217685e-05, "loss": 5.9246, "step": 1390 }, { "epoch": 0.11201792286765883, "grad_norm": 2.77697491645813, "learning_rate": 4.826576456115955e-05, "loss": 5.9576, "step": 1400 }, { "epoch": 0.1128180508881421, "grad_norm": 2.762524127960205, "learning_rate": 4.825239343210141e-05, "loss": 5.9085, "step": 1410 }, { "epoch": 0.11361817890862538, "grad_norm": 2.4407670497894287, "learning_rate": 4.8239022303043266e-05, "loss": 5.9518, "step": 1420 }, { "epoch": 0.11441830692910866, "grad_norm": 3.1036713123321533, "learning_rate": 4.822565117398513e-05, "loss": 5.8412, "step": 1430 }, { "epoch": 0.11521843494959194, "grad_norm": 3.319058418273926, "learning_rate": 4.821228004492699e-05, "loss": 5.9733, "step": 1440 }, { "epoch": 0.1160185629700752, "grad_norm": 2.13468599319458, "learning_rate": 4.8198908915868854e-05, "loss": 5.9193, "step": 1450 }, { "epoch": 0.11681869099055849, "grad_norm": 2.6057028770446777, "learning_rate": 4.8185537786810717e-05, "loss": 5.9807, "step": 1460 }, { "epoch": 0.11761881901104176, "grad_norm": 2.7509753704071045, "learning_rate": 4.817216665775258e-05, "loss": 5.9534, "step": 1470 }, { "epoch": 0.11841894703152504, "grad_norm": 2.111055850982666, "learning_rate": 4.815879552869444e-05, "loss": 5.9207, "step": 1480 }, { "epoch": 0.11921907505200832, "grad_norm": 2.5271990299224854, "learning_rate": 4.8145424399636305e-05, "loss": 5.7148, "step": 1490 }, { "epoch": 0.1200192030724916, "grad_norm": 2.814138174057007, "learning_rate": 4.813205327057817e-05, "loss": 5.9498, "step": 1500 }, { "epoch": 0.12081933109297488, "grad_norm": 3.449355363845825, "learning_rate": 4.811868214152003e-05, "loss": 5.7814, "step": 1510 }, { "epoch": 0.12161945911345816, "grad_norm": 2.813746213912964, "learning_rate": 4.810531101246189e-05, "loss": 5.9517, "step": 1520 }, { "epoch": 0.12241958713394142, "grad_norm": 2.529242753982544, "learning_rate": 4.8091939883403755e-05, "loss": 5.8227, "step": 1530 }, { "epoch": 0.1232197151544247, "grad_norm": 2.2425034046173096, "learning_rate": 4.807856875434562e-05, "loss": 6.1064, "step": 1540 }, { "epoch": 0.12401984317490798, "grad_norm": 2.7732784748077393, "learning_rate": 4.806519762528748e-05, "loss": 5.8888, "step": 1550 }, { "epoch": 0.12481997119539126, "grad_norm": 2.5558009147644043, "learning_rate": 4.8051826496229343e-05, "loss": 5.8185, "step": 1560 }, { "epoch": 0.12562009921587455, "grad_norm": 2.884411096572876, "learning_rate": 4.8038455367171206e-05, "loss": 6.0534, "step": 1570 }, { "epoch": 0.12642022723635782, "grad_norm": 2.5747668743133545, "learning_rate": 4.802508423811307e-05, "loss": 5.8186, "step": 1580 }, { "epoch": 0.12722035525684108, "grad_norm": 2.324767827987671, "learning_rate": 4.801171310905493e-05, "loss": 5.8642, "step": 1590 }, { "epoch": 0.12802048327732438, "grad_norm": 2.2255160808563232, "learning_rate": 4.7998341979996794e-05, "loss": 5.8559, "step": 1600 }, { "epoch": 0.12882061129780764, "grad_norm": 2.97525954246521, "learning_rate": 4.798497085093866e-05, "loss": 5.8744, "step": 1610 }, { "epoch": 0.12962073931829093, "grad_norm": 2.23962664604187, "learning_rate": 4.797159972188052e-05, "loss": 5.7545, "step": 1620 }, { "epoch": 0.1304208673387742, "grad_norm": 3.6182124614715576, "learning_rate": 4.795822859282238e-05, "loss": 5.8872, "step": 1630 }, { "epoch": 0.1312209953592575, "grad_norm": 4.068545341491699, "learning_rate": 4.7944857463764245e-05, "loss": 5.9008, "step": 1640 }, { "epoch": 0.13202112337974076, "grad_norm": 3.627082109451294, "learning_rate": 4.793148633470611e-05, "loss": 5.8215, "step": 1650 }, { "epoch": 0.13282125140022405, "grad_norm": 3.0080721378326416, "learning_rate": 4.791811520564797e-05, "loss": 5.9086, "step": 1660 }, { "epoch": 0.13362137942070731, "grad_norm": 2.5463860034942627, "learning_rate": 4.790474407658983e-05, "loss": 5.776, "step": 1670 }, { "epoch": 0.13442150744119058, "grad_norm": 2.212488889694214, "learning_rate": 4.7891372947531696e-05, "loss": 6.006, "step": 1680 }, { "epoch": 0.13522163546167387, "grad_norm": 4.147563934326172, "learning_rate": 4.787800181847356e-05, "loss": 5.886, "step": 1690 }, { "epoch": 0.13602176348215714, "grad_norm": 2.6021018028259277, "learning_rate": 4.786463068941542e-05, "loss": 5.9182, "step": 1700 }, { "epoch": 0.13682189150264043, "grad_norm": 2.3109893798828125, "learning_rate": 4.7851259560357284e-05, "loss": 5.8084, "step": 1710 }, { "epoch": 0.1376220195231237, "grad_norm": 2.8678529262542725, "learning_rate": 4.7837888431299147e-05, "loss": 6.0363, "step": 1720 }, { "epoch": 0.138422147543607, "grad_norm": 2.1921958923339844, "learning_rate": 4.7824517302241e-05, "loss": 5.7667, "step": 1730 }, { "epoch": 0.13922227556409025, "grad_norm": 2.6883316040039062, "learning_rate": 4.7811146173182865e-05, "loss": 5.7906, "step": 1740 }, { "epoch": 0.14002240358457352, "grad_norm": 2.4079957008361816, "learning_rate": 4.779777504412473e-05, "loss": 5.7698, "step": 1750 }, { "epoch": 0.1408225316050568, "grad_norm": 4.29390287399292, "learning_rate": 4.778440391506659e-05, "loss": 5.9639, "step": 1760 }, { "epoch": 0.14162265962554008, "grad_norm": 4.133132457733154, "learning_rate": 4.777103278600845e-05, "loss": 6.0901, "step": 1770 }, { "epoch": 0.14242278764602337, "grad_norm": 3.871561288833618, "learning_rate": 4.7757661656950316e-05, "loss": 5.7455, "step": 1780 }, { "epoch": 0.14322291566650663, "grad_norm": 4.266111850738525, "learning_rate": 4.774429052789218e-05, "loss": 5.9971, "step": 1790 }, { "epoch": 0.14402304368698993, "grad_norm": 2.9000513553619385, "learning_rate": 4.773091939883404e-05, "loss": 5.9025, "step": 1800 }, { "epoch": 0.1448231717074732, "grad_norm": 2.549964189529419, "learning_rate": 4.7717548269775904e-05, "loss": 5.768, "step": 1810 }, { "epoch": 0.14562329972795648, "grad_norm": 2.2882704734802246, "learning_rate": 4.770417714071777e-05, "loss": 6.022, "step": 1820 }, { "epoch": 0.14642342774843975, "grad_norm": 2.6501784324645996, "learning_rate": 4.769080601165963e-05, "loss": 5.8539, "step": 1830 }, { "epoch": 0.14722355576892301, "grad_norm": 2.3417108058929443, "learning_rate": 4.767743488260149e-05, "loss": 5.7734, "step": 1840 }, { "epoch": 0.1480236837894063, "grad_norm": 2.2151668071746826, "learning_rate": 4.7664063753543355e-05, "loss": 5.84, "step": 1850 }, { "epoch": 0.14882381180988957, "grad_norm": 3.114260196685791, "learning_rate": 4.765069262448522e-05, "loss": 5.9409, "step": 1860 }, { "epoch": 0.14962393983037287, "grad_norm": 2.4931910037994385, "learning_rate": 4.763732149542708e-05, "loss": 5.9396, "step": 1870 }, { "epoch": 0.15042406785085613, "grad_norm": 3.736487865447998, "learning_rate": 4.7623950366368936e-05, "loss": 5.7427, "step": 1880 }, { "epoch": 0.15122419587133942, "grad_norm": 4.730785846710205, "learning_rate": 4.76105792373108e-05, "loss": 5.9181, "step": 1890 }, { "epoch": 0.1520243238918227, "grad_norm": 2.9264132976531982, "learning_rate": 4.759720810825266e-05, "loss": 5.8967, "step": 1900 }, { "epoch": 0.15282445191230598, "grad_norm": 3.2538132667541504, "learning_rate": 4.7583836979194524e-05, "loss": 5.8459, "step": 1910 }, { "epoch": 0.15362457993278925, "grad_norm": 2.7208549976348877, "learning_rate": 4.757046585013639e-05, "loss": 5.7038, "step": 1920 }, { "epoch": 0.1544247079532725, "grad_norm": 2.7510788440704346, "learning_rate": 4.755709472107825e-05, "loss": 5.8524, "step": 1930 }, { "epoch": 0.1552248359737558, "grad_norm": 2.6565892696380615, "learning_rate": 4.754372359202011e-05, "loss": 5.6324, "step": 1940 }, { "epoch": 0.15602496399423907, "grad_norm": 2.954798936843872, "learning_rate": 4.7530352462961975e-05, "loss": 5.8388, "step": 1950 }, { "epoch": 0.15682509201472236, "grad_norm": 2.291714668273926, "learning_rate": 4.751698133390384e-05, "loss": 5.7504, "step": 1960 }, { "epoch": 0.15762522003520563, "grad_norm": 2.1387598514556885, "learning_rate": 4.75036102048457e-05, "loss": 5.7556, "step": 1970 }, { "epoch": 0.15842534805568892, "grad_norm": 2.290407180786133, "learning_rate": 4.749023907578756e-05, "loss": 5.7089, "step": 1980 }, { "epoch": 0.15922547607617218, "grad_norm": 2.852696657180786, "learning_rate": 4.7476867946729426e-05, "loss": 5.8656, "step": 1990 }, { "epoch": 0.16002560409665548, "grad_norm": 2.8190526962280273, "learning_rate": 4.746349681767129e-05, "loss": 6.0134, "step": 2000 }, { "epoch": 0.16082573211713874, "grad_norm": 2.705008029937744, "learning_rate": 4.745012568861315e-05, "loss": 5.8713, "step": 2010 }, { "epoch": 0.161625860137622, "grad_norm": 3.571394205093384, "learning_rate": 4.7436754559555014e-05, "loss": 5.8329, "step": 2020 }, { "epoch": 0.1624259881581053, "grad_norm": 2.687455177307129, "learning_rate": 4.7423383430496876e-05, "loss": 5.8355, "step": 2030 }, { "epoch": 0.16322611617858857, "grad_norm": 2.6158690452575684, "learning_rate": 4.741001230143873e-05, "loss": 5.6938, "step": 2040 }, { "epoch": 0.16402624419907186, "grad_norm": 2.9657154083251953, "learning_rate": 4.7396641172380595e-05, "loss": 5.7514, "step": 2050 }, { "epoch": 0.16482637221955512, "grad_norm": 2.310607433319092, "learning_rate": 4.738327004332246e-05, "loss": 5.7397, "step": 2060 }, { "epoch": 0.16562650024003842, "grad_norm": 2.855271339416504, "learning_rate": 4.736989891426432e-05, "loss": 5.7645, "step": 2070 }, { "epoch": 0.16642662826052168, "grad_norm": 2.778768301010132, "learning_rate": 4.735652778520618e-05, "loss": 5.9582, "step": 2080 }, { "epoch": 0.16722675628100497, "grad_norm": 3.069973945617676, "learning_rate": 4.7343156656148046e-05, "loss": 5.8205, "step": 2090 }, { "epoch": 0.16802688430148824, "grad_norm": 3.5799551010131836, "learning_rate": 4.732978552708991e-05, "loss": 5.9001, "step": 2100 }, { "epoch": 0.1688270123219715, "grad_norm": 2.556668758392334, "learning_rate": 4.731641439803177e-05, "loss": 5.7258, "step": 2110 }, { "epoch": 0.1696271403424548, "grad_norm": 2.7847707271575928, "learning_rate": 4.7303043268973634e-05, "loss": 5.9007, "step": 2120 }, { "epoch": 0.17042726836293806, "grad_norm": 4.071508407592773, "learning_rate": 4.7289672139915496e-05, "loss": 5.7035, "step": 2130 }, { "epoch": 0.17122739638342135, "grad_norm": 2.6188418865203857, "learning_rate": 4.727630101085736e-05, "loss": 5.651, "step": 2140 }, { "epoch": 0.17202752440390462, "grad_norm": 1.952249526977539, "learning_rate": 4.726292988179922e-05, "loss": 6.1107, "step": 2150 }, { "epoch": 0.1728276524243879, "grad_norm": 2.299018144607544, "learning_rate": 4.7249558752741085e-05, "loss": 5.7609, "step": 2160 }, { "epoch": 0.17362778044487118, "grad_norm": 2.5578439235687256, "learning_rate": 4.723618762368295e-05, "loss": 5.792, "step": 2170 }, { "epoch": 0.17442790846535447, "grad_norm": 3.9921529293060303, "learning_rate": 4.722281649462481e-05, "loss": 5.7233, "step": 2180 }, { "epoch": 0.17522803648583773, "grad_norm": 2.5521302223205566, "learning_rate": 4.7209445365566666e-05, "loss": 5.807, "step": 2190 }, { "epoch": 0.176028164506321, "grad_norm": 2.71401047706604, "learning_rate": 4.719607423650853e-05, "loss": 5.6689, "step": 2200 }, { "epoch": 0.1768282925268043, "grad_norm": 3.782607316970825, "learning_rate": 4.718270310745039e-05, "loss": 5.734, "step": 2210 }, { "epoch": 0.17762842054728756, "grad_norm": 2.57356333732605, "learning_rate": 4.7169331978392254e-05, "loss": 5.8101, "step": 2220 }, { "epoch": 0.17842854856777085, "grad_norm": 2.7005815505981445, "learning_rate": 4.715596084933412e-05, "loss": 6.0603, "step": 2230 }, { "epoch": 0.17922867658825412, "grad_norm": 2.081550359725952, "learning_rate": 4.714258972027598e-05, "loss": 5.7677, "step": 2240 }, { "epoch": 0.1800288046087374, "grad_norm": 3.6565728187561035, "learning_rate": 4.712921859121784e-05, "loss": 5.9672, "step": 2250 }, { "epoch": 0.18082893262922067, "grad_norm": 2.4702320098876953, "learning_rate": 4.7115847462159705e-05, "loss": 5.8397, "step": 2260 }, { "epoch": 0.18162906064970397, "grad_norm": 3.335736036300659, "learning_rate": 4.710247633310157e-05, "loss": 5.7021, "step": 2270 }, { "epoch": 0.18242918867018723, "grad_norm": 3.3939075469970703, "learning_rate": 4.708910520404343e-05, "loss": 5.8464, "step": 2280 }, { "epoch": 0.1832293166906705, "grad_norm": 2.4869279861450195, "learning_rate": 4.707573407498529e-05, "loss": 5.6904, "step": 2290 }, { "epoch": 0.1840294447111538, "grad_norm": 2.4240360260009766, "learning_rate": 4.7062362945927155e-05, "loss": 5.7227, "step": 2300 }, { "epoch": 0.18482957273163705, "grad_norm": 2.428786039352417, "learning_rate": 4.704899181686902e-05, "loss": 5.8295, "step": 2310 }, { "epoch": 0.18562970075212035, "grad_norm": 3.3214187622070312, "learning_rate": 4.703562068781088e-05, "loss": 5.8341, "step": 2320 }, { "epoch": 0.1864298287726036, "grad_norm": 3.2146456241607666, "learning_rate": 4.7022249558752744e-05, "loss": 5.7217, "step": 2330 }, { "epoch": 0.1872299567930869, "grad_norm": 4.442914009094238, "learning_rate": 4.7008878429694606e-05, "loss": 5.9003, "step": 2340 }, { "epoch": 0.18803008481357017, "grad_norm": 1.9268267154693604, "learning_rate": 4.699550730063646e-05, "loss": 5.8292, "step": 2350 }, { "epoch": 0.18883021283405343, "grad_norm": 3.130021095275879, "learning_rate": 4.6982136171578325e-05, "loss": 5.6864, "step": 2360 }, { "epoch": 0.18963034085453673, "grad_norm": 2.8835690021514893, "learning_rate": 4.696876504252019e-05, "loss": 5.829, "step": 2370 }, { "epoch": 0.19043046887502, "grad_norm": 2.4171135425567627, "learning_rate": 4.695539391346205e-05, "loss": 5.7972, "step": 2380 }, { "epoch": 0.19123059689550329, "grad_norm": 3.782817840576172, "learning_rate": 4.694202278440391e-05, "loss": 5.8497, "step": 2390 }, { "epoch": 0.19203072491598655, "grad_norm": 2.475249767303467, "learning_rate": 4.6928651655345776e-05, "loss": 5.9237, "step": 2400 }, { "epoch": 0.19283085293646984, "grad_norm": 2.5809242725372314, "learning_rate": 4.691528052628764e-05, "loss": 5.7756, "step": 2410 }, { "epoch": 0.1936309809569531, "grad_norm": 2.6922059059143066, "learning_rate": 4.69019093972295e-05, "loss": 5.9326, "step": 2420 }, { "epoch": 0.1944311089774364, "grad_norm": 2.7542431354522705, "learning_rate": 4.6888538268171364e-05, "loss": 5.6279, "step": 2430 }, { "epoch": 0.19523123699791967, "grad_norm": 2.4063303470611572, "learning_rate": 4.6875167139113226e-05, "loss": 5.91, "step": 2440 }, { "epoch": 0.19603136501840293, "grad_norm": 4.855547904968262, "learning_rate": 4.686179601005509e-05, "loss": 5.7286, "step": 2450 }, { "epoch": 0.19683149303888622, "grad_norm": 2.9875595569610596, "learning_rate": 4.684842488099695e-05, "loss": 5.8299, "step": 2460 }, { "epoch": 0.1976316210593695, "grad_norm": 4.467639923095703, "learning_rate": 4.6835053751938814e-05, "loss": 5.8469, "step": 2470 }, { "epoch": 0.19843174907985278, "grad_norm": 2.2144124507904053, "learning_rate": 4.682168262288068e-05, "loss": 5.7871, "step": 2480 }, { "epoch": 0.19923187710033605, "grad_norm": 2.4507012367248535, "learning_rate": 4.680831149382254e-05, "loss": 5.7529, "step": 2490 }, { "epoch": 0.20003200512081934, "grad_norm": 2.208648681640625, "learning_rate": 4.67949403647644e-05, "loss": 5.7265, "step": 2500 }, { "epoch": 0.2008321331413026, "grad_norm": 2.560302257537842, "learning_rate": 4.6781569235706265e-05, "loss": 5.7842, "step": 2510 }, { "epoch": 0.2016322611617859, "grad_norm": 2.354292154312134, "learning_rate": 4.676819810664813e-05, "loss": 5.8468, "step": 2520 }, { "epoch": 0.20243238918226916, "grad_norm": 2.9559860229492188, "learning_rate": 4.675482697758999e-05, "loss": 5.7003, "step": 2530 }, { "epoch": 0.20323251720275243, "grad_norm": 3.251077651977539, "learning_rate": 4.674145584853185e-05, "loss": 5.8129, "step": 2540 }, { "epoch": 0.20403264522323572, "grad_norm": 2.7863471508026123, "learning_rate": 4.6728084719473716e-05, "loss": 5.6814, "step": 2550 }, { "epoch": 0.20483277324371899, "grad_norm": 2.9006989002227783, "learning_rate": 4.671471359041558e-05, "loss": 5.8292, "step": 2560 }, { "epoch": 0.20563290126420228, "grad_norm": 2.930689573287964, "learning_rate": 4.670134246135744e-05, "loss": 5.8825, "step": 2570 }, { "epoch": 0.20643302928468554, "grad_norm": 2.3105032444000244, "learning_rate": 4.6687971332299304e-05, "loss": 5.7039, "step": 2580 }, { "epoch": 0.20723315730516884, "grad_norm": 3.1141879558563232, "learning_rate": 4.667460020324117e-05, "loss": 5.8692, "step": 2590 }, { "epoch": 0.2080332853256521, "grad_norm": 3.5017199516296387, "learning_rate": 4.666122907418303e-05, "loss": 5.7922, "step": 2600 }, { "epoch": 0.2088334133461354, "grad_norm": 2.657975912094116, "learning_rate": 4.664785794512489e-05, "loss": 5.7736, "step": 2610 }, { "epoch": 0.20963354136661866, "grad_norm": 3.246952772140503, "learning_rate": 4.6634486816066755e-05, "loss": 5.768, "step": 2620 }, { "epoch": 0.21043366938710192, "grad_norm": 6.832335948944092, "learning_rate": 4.662111568700862e-05, "loss": 5.6752, "step": 2630 }, { "epoch": 0.21123379740758522, "grad_norm": 3.2479753494262695, "learning_rate": 4.660774455795048e-05, "loss": 5.8015, "step": 2640 }, { "epoch": 0.21203392542806848, "grad_norm": 2.809082508087158, "learning_rate": 4.659437342889234e-05, "loss": 5.8663, "step": 2650 }, { "epoch": 0.21283405344855177, "grad_norm": 3.7948036193847656, "learning_rate": 4.65810022998342e-05, "loss": 5.889, "step": 2660 }, { "epoch": 0.21363418146903504, "grad_norm": 2.836090564727783, "learning_rate": 4.656763117077606e-05, "loss": 5.7516, "step": 2670 }, { "epoch": 0.21443430948951833, "grad_norm": 3.0940232276916504, "learning_rate": 4.6554260041717924e-05, "loss": 5.7033, "step": 2680 }, { "epoch": 0.2152344375100016, "grad_norm": 2.436757802963257, "learning_rate": 4.654088891265979e-05, "loss": 5.746, "step": 2690 }, { "epoch": 0.2160345655304849, "grad_norm": 2.4339609146118164, "learning_rate": 4.652751778360165e-05, "loss": 5.828, "step": 2700 }, { "epoch": 0.21683469355096816, "grad_norm": 2.379366874694824, "learning_rate": 4.651414665454351e-05, "loss": 5.719, "step": 2710 }, { "epoch": 0.21763482157145142, "grad_norm": 2.1722371578216553, "learning_rate": 4.6500775525485375e-05, "loss": 5.7875, "step": 2720 }, { "epoch": 0.2184349495919347, "grad_norm": 3.633279800415039, "learning_rate": 4.648740439642724e-05, "loss": 5.802, "step": 2730 }, { "epoch": 0.21923507761241798, "grad_norm": 2.4091219902038574, "learning_rate": 4.64740332673691e-05, "loss": 5.8197, "step": 2740 }, { "epoch": 0.22003520563290127, "grad_norm": 2.7289021015167236, "learning_rate": 4.646066213831096e-05, "loss": 5.9445, "step": 2750 }, { "epoch": 0.22083533365338454, "grad_norm": 2.376481294631958, "learning_rate": 4.6447291009252826e-05, "loss": 5.9943, "step": 2760 }, { "epoch": 0.22163546167386783, "grad_norm": 2.6542563438415527, "learning_rate": 4.643391988019469e-05, "loss": 5.6049, "step": 2770 }, { "epoch": 0.2224355896943511, "grad_norm": 2.320472240447998, "learning_rate": 4.642054875113655e-05, "loss": 5.7637, "step": 2780 }, { "epoch": 0.2232357177148344, "grad_norm": 2.8923239707946777, "learning_rate": 4.6407177622078414e-05, "loss": 5.9666, "step": 2790 }, { "epoch": 0.22403584573531765, "grad_norm": 4.277271270751953, "learning_rate": 4.6393806493020276e-05, "loss": 5.8393, "step": 2800 }, { "epoch": 0.22483597375580092, "grad_norm": 2.797428607940674, "learning_rate": 4.638043536396213e-05, "loss": 5.759, "step": 2810 }, { "epoch": 0.2256361017762842, "grad_norm": 2.1849517822265625, "learning_rate": 4.6367064234903995e-05, "loss": 5.7514, "step": 2820 }, { "epoch": 0.22643622979676747, "grad_norm": 2.8607492446899414, "learning_rate": 4.635369310584586e-05, "loss": 5.7545, "step": 2830 }, { "epoch": 0.22723635781725077, "grad_norm": 3.722041130065918, "learning_rate": 4.634032197678772e-05, "loss": 5.8011, "step": 2840 }, { "epoch": 0.22803648583773403, "grad_norm": 2.8563833236694336, "learning_rate": 4.632695084772958e-05, "loss": 5.8569, "step": 2850 }, { "epoch": 0.22883661385821732, "grad_norm": 3.5724806785583496, "learning_rate": 4.6313579718671446e-05, "loss": 5.9649, "step": 2860 }, { "epoch": 0.2296367418787006, "grad_norm": 2.380469560623169, "learning_rate": 4.630020858961331e-05, "loss": 5.7467, "step": 2870 }, { "epoch": 0.23043686989918388, "grad_norm": 3.1629838943481445, "learning_rate": 4.628683746055517e-05, "loss": 5.642, "step": 2880 }, { "epoch": 0.23123699791966715, "grad_norm": 2.1239373683929443, "learning_rate": 4.6273466331497034e-05, "loss": 5.6483, "step": 2890 }, { "epoch": 0.2320371259401504, "grad_norm": 3.049079418182373, "learning_rate": 4.6260095202438897e-05, "loss": 5.9736, "step": 2900 }, { "epoch": 0.2328372539606337, "grad_norm": 2.556830406188965, "learning_rate": 4.624672407338076e-05, "loss": 5.6037, "step": 2910 }, { "epoch": 0.23363738198111697, "grad_norm": 2.8762035369873047, "learning_rate": 4.623335294432262e-05, "loss": 5.6345, "step": 2920 }, { "epoch": 0.23443751000160026, "grad_norm": 2.11167573928833, "learning_rate": 4.6219981815264485e-05, "loss": 5.7822, "step": 2930 }, { "epoch": 0.23523763802208353, "grad_norm": 4.623869895935059, "learning_rate": 4.620661068620635e-05, "loss": 5.7063, "step": 2940 }, { "epoch": 0.23603776604256682, "grad_norm": 2.4420578479766846, "learning_rate": 4.619323955714821e-05, "loss": 5.686, "step": 2950 }, { "epoch": 0.2368378940630501, "grad_norm": 2.6543869972229004, "learning_rate": 4.617986842809007e-05, "loss": 5.7802, "step": 2960 }, { "epoch": 0.23763802208353338, "grad_norm": 2.6264312267303467, "learning_rate": 4.616649729903193e-05, "loss": 5.6667, "step": 2970 }, { "epoch": 0.23843815010401664, "grad_norm": 2.4579195976257324, "learning_rate": 4.615312616997379e-05, "loss": 5.6738, "step": 2980 }, { "epoch": 0.2392382781244999, "grad_norm": 2.299448251724243, "learning_rate": 4.6139755040915654e-05, "loss": 5.8622, "step": 2990 }, { "epoch": 0.2400384061449832, "grad_norm": 3.6527328491210938, "learning_rate": 4.612638391185752e-05, "loss": 5.6346, "step": 3000 }, { "epoch": 0.24083853416546647, "grad_norm": 2.217876434326172, "learning_rate": 4.611301278279938e-05, "loss": 5.7892, "step": 3010 }, { "epoch": 0.24163866218594976, "grad_norm": 3.500544309616089, "learning_rate": 4.609964165374124e-05, "loss": 5.8026, "step": 3020 }, { "epoch": 0.24243879020643302, "grad_norm": 3.1694483757019043, "learning_rate": 4.6086270524683105e-05, "loss": 5.827, "step": 3030 }, { "epoch": 0.24323891822691632, "grad_norm": 2.899625778198242, "learning_rate": 4.607289939562497e-05, "loss": 5.7384, "step": 3040 }, { "epoch": 0.24403904624739958, "grad_norm": 2.8286776542663574, "learning_rate": 4.605952826656683e-05, "loss": 5.7629, "step": 3050 }, { "epoch": 0.24483917426788285, "grad_norm": 2.7585489749908447, "learning_rate": 4.604615713750869e-05, "loss": 5.7462, "step": 3060 }, { "epoch": 0.24563930228836614, "grad_norm": 2.2017667293548584, "learning_rate": 4.6032786008450555e-05, "loss": 5.844, "step": 3070 }, { "epoch": 0.2464394303088494, "grad_norm": 4.679725170135498, "learning_rate": 4.601941487939242e-05, "loss": 5.7254, "step": 3080 }, { "epoch": 0.2472395583293327, "grad_norm": 2.923884868621826, "learning_rate": 4.600604375033428e-05, "loss": 5.703, "step": 3090 }, { "epoch": 0.24803968634981596, "grad_norm": 2.2205090522766113, "learning_rate": 4.5992672621276144e-05, "loss": 5.7185, "step": 3100 }, { "epoch": 0.24883981437029926, "grad_norm": 2.852313280105591, "learning_rate": 4.5979301492218006e-05, "loss": 5.5653, "step": 3110 }, { "epoch": 0.24963994239078252, "grad_norm": 2.7683911323547363, "learning_rate": 4.596593036315986e-05, "loss": 5.7262, "step": 3120 }, { "epoch": 0.2504400704112658, "grad_norm": 3.1315665245056152, "learning_rate": 4.5952559234101725e-05, "loss": 5.7524, "step": 3130 }, { "epoch": 0.2512401984317491, "grad_norm": 2.5233592987060547, "learning_rate": 4.593918810504359e-05, "loss": 5.7443, "step": 3140 }, { "epoch": 0.25204032645223234, "grad_norm": 2.3802831172943115, "learning_rate": 4.592581697598545e-05, "loss": 5.8091, "step": 3150 }, { "epoch": 0.25284045447271564, "grad_norm": 2.378218412399292, "learning_rate": 4.591244584692731e-05, "loss": 5.7741, "step": 3160 }, { "epoch": 0.25364058249319893, "grad_norm": 4.712483882904053, "learning_rate": 4.5899074717869176e-05, "loss": 5.8643, "step": 3170 }, { "epoch": 0.25444071051368217, "grad_norm": 2.798752784729004, "learning_rate": 4.588570358881104e-05, "loss": 5.7984, "step": 3180 }, { "epoch": 0.25524083853416546, "grad_norm": 2.302037477493286, "learning_rate": 4.58723324597529e-05, "loss": 5.6548, "step": 3190 }, { "epoch": 0.25604096655464875, "grad_norm": 2.8621273040771484, "learning_rate": 4.5858961330694764e-05, "loss": 5.6875, "step": 3200 }, { "epoch": 0.25684109457513205, "grad_norm": 2.9079480171203613, "learning_rate": 4.5845590201636626e-05, "loss": 5.8801, "step": 3210 }, { "epoch": 0.2576412225956153, "grad_norm": 2.9576847553253174, "learning_rate": 4.583221907257849e-05, "loss": 5.6646, "step": 3220 }, { "epoch": 0.2584413506160986, "grad_norm": 4.085951805114746, "learning_rate": 4.581884794352035e-05, "loss": 5.9078, "step": 3230 }, { "epoch": 0.25924147863658187, "grad_norm": 2.622903347015381, "learning_rate": 4.5805476814462214e-05, "loss": 5.6821, "step": 3240 }, { "epoch": 0.2600416066570651, "grad_norm": 1.794255256652832, "learning_rate": 4.579210568540408e-05, "loss": 5.751, "step": 3250 }, { "epoch": 0.2608417346775484, "grad_norm": 3.074042558670044, "learning_rate": 4.577873455634594e-05, "loss": 5.7864, "step": 3260 }, { "epoch": 0.2616418626980317, "grad_norm": 2.3138844966888428, "learning_rate": 4.57653634272878e-05, "loss": 5.693, "step": 3270 }, { "epoch": 0.262441990718515, "grad_norm": 3.8877549171447754, "learning_rate": 4.5751992298229665e-05, "loss": 5.7154, "step": 3280 }, { "epoch": 0.2632421187389982, "grad_norm": 2.9623680114746094, "learning_rate": 4.573862116917153e-05, "loss": 5.7514, "step": 3290 }, { "epoch": 0.2640422467594815, "grad_norm": 2.840122938156128, "learning_rate": 4.572525004011339e-05, "loss": 5.7397, "step": 3300 }, { "epoch": 0.2648423747799648, "grad_norm": 2.9699277877807617, "learning_rate": 4.571187891105525e-05, "loss": 5.7626, "step": 3310 }, { "epoch": 0.2656425028004481, "grad_norm": 2.6493773460388184, "learning_rate": 4.5698507781997116e-05, "loss": 5.7619, "step": 3320 }, { "epoch": 0.26644263082093134, "grad_norm": 2.283259868621826, "learning_rate": 4.568513665293898e-05, "loss": 5.8409, "step": 3330 }, { "epoch": 0.26724275884141463, "grad_norm": 1.9254164695739746, "learning_rate": 4.567176552388084e-05, "loss": 5.8218, "step": 3340 }, { "epoch": 0.2680428868618979, "grad_norm": 2.382345676422119, "learning_rate": 4.5658394394822704e-05, "loss": 5.6865, "step": 3350 }, { "epoch": 0.26884301488238116, "grad_norm": 2.6039271354675293, "learning_rate": 4.564502326576457e-05, "loss": 5.7254, "step": 3360 }, { "epoch": 0.26964314290286445, "grad_norm": 2.0948996543884277, "learning_rate": 4.563165213670643e-05, "loss": 5.7589, "step": 3370 }, { "epoch": 0.27044327092334774, "grad_norm": 2.939955711364746, "learning_rate": 4.561828100764829e-05, "loss": 5.8298, "step": 3380 }, { "epoch": 0.27124339894383104, "grad_norm": 2.748307466506958, "learning_rate": 4.5604909878590155e-05, "loss": 5.8505, "step": 3390 }, { "epoch": 0.2720435269643143, "grad_norm": 2.7122459411621094, "learning_rate": 4.559153874953202e-05, "loss": 5.9027, "step": 3400 }, { "epoch": 0.27284365498479757, "grad_norm": 3.6053593158721924, "learning_rate": 4.557816762047388e-05, "loss": 5.6746, "step": 3410 }, { "epoch": 0.27364378300528086, "grad_norm": 4.433299541473389, "learning_rate": 4.556479649141574e-05, "loss": 5.7713, "step": 3420 }, { "epoch": 0.2744439110257641, "grad_norm": 2.5253539085388184, "learning_rate": 4.55514253623576e-05, "loss": 5.8219, "step": 3430 }, { "epoch": 0.2752440390462474, "grad_norm": 4.9358062744140625, "learning_rate": 4.553805423329946e-05, "loss": 5.7971, "step": 3440 }, { "epoch": 0.2760441670667307, "grad_norm": 2.6247594356536865, "learning_rate": 4.5524683104241324e-05, "loss": 5.1528, "step": 3450 }, { "epoch": 0.276844295087214, "grad_norm": 2.8152048587799072, "learning_rate": 4.551131197518319e-05, "loss": 5.7955, "step": 3460 }, { "epoch": 0.2776444231076972, "grad_norm": 2.143275499343872, "learning_rate": 4.549794084612505e-05, "loss": 5.6875, "step": 3470 }, { "epoch": 0.2784445511281805, "grad_norm": 2.9896023273468018, "learning_rate": 4.548456971706691e-05, "loss": 5.7981, "step": 3480 }, { "epoch": 0.2792446791486638, "grad_norm": 3.5231759548187256, "learning_rate": 4.5471198588008775e-05, "loss": 5.7343, "step": 3490 }, { "epoch": 0.28004480716914704, "grad_norm": 2.391721487045288, "learning_rate": 4.545782745895064e-05, "loss": 5.6821, "step": 3500 }, { "epoch": 0.28084493518963033, "grad_norm": 2.414992332458496, "learning_rate": 4.54444563298925e-05, "loss": 5.7357, "step": 3510 }, { "epoch": 0.2816450632101136, "grad_norm": 2.7502214908599854, "learning_rate": 4.543108520083436e-05, "loss": 5.6511, "step": 3520 }, { "epoch": 0.2824451912305969, "grad_norm": 2.1601436138153076, "learning_rate": 4.5417714071776226e-05, "loss": 5.6249, "step": 3530 }, { "epoch": 0.28324531925108015, "grad_norm": 2.89013671875, "learning_rate": 4.540434294271809e-05, "loss": 5.7583, "step": 3540 }, { "epoch": 0.28404544727156344, "grad_norm": 2.4915778636932373, "learning_rate": 4.539097181365995e-05, "loss": 5.6957, "step": 3550 }, { "epoch": 0.28484557529204674, "grad_norm": 5.053386688232422, "learning_rate": 4.5377600684601814e-05, "loss": 5.632, "step": 3560 }, { "epoch": 0.28564570331253003, "grad_norm": 2.6207687854766846, "learning_rate": 4.5364229555543676e-05, "loss": 5.8514, "step": 3570 }, { "epoch": 0.28644583133301327, "grad_norm": 4.157670497894287, "learning_rate": 4.535085842648553e-05, "loss": 5.7608, "step": 3580 }, { "epoch": 0.28724595935349656, "grad_norm": 3.4464797973632812, "learning_rate": 4.5337487297427395e-05, "loss": 5.6737, "step": 3590 }, { "epoch": 0.28804608737397985, "grad_norm": 4.255002498626709, "learning_rate": 4.532411616836926e-05, "loss": 5.7977, "step": 3600 }, { "epoch": 0.2888462153944631, "grad_norm": 2.7926547527313232, "learning_rate": 4.531074503931112e-05, "loss": 5.6891, "step": 3610 }, { "epoch": 0.2896463434149464, "grad_norm": 3.150400400161743, "learning_rate": 4.529737391025298e-05, "loss": 5.7931, "step": 3620 }, { "epoch": 0.2904464714354297, "grad_norm": 2.1223199367523193, "learning_rate": 4.5284002781194846e-05, "loss": 5.8646, "step": 3630 }, { "epoch": 0.29124659945591297, "grad_norm": 3.950665235519409, "learning_rate": 4.527063165213671e-05, "loss": 5.7008, "step": 3640 }, { "epoch": 0.2920467274763962, "grad_norm": 2.995692729949951, "learning_rate": 4.525726052307857e-05, "loss": 5.688, "step": 3650 }, { "epoch": 0.2928468554968795, "grad_norm": 2.041736125946045, "learning_rate": 4.5243889394020434e-05, "loss": 5.7301, "step": 3660 }, { "epoch": 0.2936469835173628, "grad_norm": 2.541757106781006, "learning_rate": 4.5230518264962297e-05, "loss": 5.5606, "step": 3670 }, { "epoch": 0.29444711153784603, "grad_norm": 2.140761613845825, "learning_rate": 4.521714713590416e-05, "loss": 5.7671, "step": 3680 }, { "epoch": 0.2952472395583293, "grad_norm": 2.6869146823883057, "learning_rate": 4.520377600684602e-05, "loss": 5.6452, "step": 3690 }, { "epoch": 0.2960473675788126, "grad_norm": 3.072376012802124, "learning_rate": 4.5190404877787885e-05, "loss": 5.6956, "step": 3700 }, { "epoch": 0.2968474955992959, "grad_norm": 2.5933837890625, "learning_rate": 4.517703374872975e-05, "loss": 5.6212, "step": 3710 }, { "epoch": 0.29764762361977914, "grad_norm": 3.0443103313446045, "learning_rate": 4.516366261967161e-05, "loss": 5.7849, "step": 3720 }, { "epoch": 0.29844775164026244, "grad_norm": 2.673583745956421, "learning_rate": 4.515029149061347e-05, "loss": 5.6186, "step": 3730 }, { "epoch": 0.29924787966074573, "grad_norm": 2.3276283740997314, "learning_rate": 4.513692036155533e-05, "loss": 5.9188, "step": 3740 }, { "epoch": 0.300048007681229, "grad_norm": 5.504491329193115, "learning_rate": 4.512354923249719e-05, "loss": 5.5676, "step": 3750 }, { "epoch": 0.30084813570171226, "grad_norm": 2.4181482791900635, "learning_rate": 4.5110178103439054e-05, "loss": 5.6852, "step": 3760 }, { "epoch": 0.30164826372219555, "grad_norm": 2.2489006519317627, "learning_rate": 4.509680697438092e-05, "loss": 5.7003, "step": 3770 }, { "epoch": 0.30244839174267885, "grad_norm": 2.6925253868103027, "learning_rate": 4.508343584532278e-05, "loss": 5.8176, "step": 3780 }, { "epoch": 0.3032485197631621, "grad_norm": 2.904318332672119, "learning_rate": 4.507006471626464e-05, "loss": 5.6912, "step": 3790 }, { "epoch": 0.3040486477836454, "grad_norm": 3.3189070224761963, "learning_rate": 4.5056693587206505e-05, "loss": 5.8706, "step": 3800 }, { "epoch": 0.30484877580412867, "grad_norm": 2.8324170112609863, "learning_rate": 4.504332245814837e-05, "loss": 5.8795, "step": 3810 }, { "epoch": 0.30564890382461196, "grad_norm": 3.113417148590088, "learning_rate": 4.502995132909023e-05, "loss": 5.8689, "step": 3820 }, { "epoch": 0.3064490318450952, "grad_norm": 2.469269275665283, "learning_rate": 4.501658020003209e-05, "loss": 5.7934, "step": 3830 }, { "epoch": 0.3072491598655785, "grad_norm": 2.778571128845215, "learning_rate": 4.5003209070973956e-05, "loss": 5.8577, "step": 3840 }, { "epoch": 0.3080492878860618, "grad_norm": 3.4269161224365234, "learning_rate": 4.498983794191582e-05, "loss": 5.8378, "step": 3850 }, { "epoch": 0.308849415906545, "grad_norm": 3.417850971221924, "learning_rate": 4.497646681285768e-05, "loss": 5.6532, "step": 3860 }, { "epoch": 0.3096495439270283, "grad_norm": 2.389784097671509, "learning_rate": 4.4963095683799544e-05, "loss": 5.5454, "step": 3870 }, { "epoch": 0.3104496719475116, "grad_norm": 2.384453296661377, "learning_rate": 4.4949724554741406e-05, "loss": 5.8014, "step": 3880 }, { "epoch": 0.3112497999679949, "grad_norm": 1.913668155670166, "learning_rate": 4.493635342568326e-05, "loss": 5.6033, "step": 3890 }, { "epoch": 0.31204992798847814, "grad_norm": 3.4930074214935303, "learning_rate": 4.4922982296625125e-05, "loss": 5.7649, "step": 3900 }, { "epoch": 0.31285005600896143, "grad_norm": 3.517458200454712, "learning_rate": 4.490961116756699e-05, "loss": 5.5635, "step": 3910 }, { "epoch": 0.3136501840294447, "grad_norm": 2.611274480819702, "learning_rate": 4.489624003850885e-05, "loss": 5.8121, "step": 3920 }, { "epoch": 0.314450312049928, "grad_norm": 2.373997926712036, "learning_rate": 4.488286890945071e-05, "loss": 5.6002, "step": 3930 }, { "epoch": 0.31525044007041125, "grad_norm": 2.554847002029419, "learning_rate": 4.4869497780392576e-05, "loss": 5.6432, "step": 3940 }, { "epoch": 0.31605056809089455, "grad_norm": 3.3720595836639404, "learning_rate": 4.485612665133444e-05, "loss": 5.5794, "step": 3950 }, { "epoch": 0.31685069611137784, "grad_norm": 2.2308788299560547, "learning_rate": 4.48427555222763e-05, "loss": 5.794, "step": 3960 }, { "epoch": 0.3176508241318611, "grad_norm": 2.0659661293029785, "learning_rate": 4.4829384393218164e-05, "loss": 5.5383, "step": 3970 }, { "epoch": 0.31845095215234437, "grad_norm": 3.2644894123077393, "learning_rate": 4.4816013264160026e-05, "loss": 5.6979, "step": 3980 }, { "epoch": 0.31925108017282766, "grad_norm": 2.3485729694366455, "learning_rate": 4.480264213510189e-05, "loss": 5.7214, "step": 3990 }, { "epoch": 0.32005120819331095, "grad_norm": 2.7470600605010986, "learning_rate": 4.478927100604375e-05, "loss": 5.6032, "step": 4000 }, { "epoch": 0.3208513362137942, "grad_norm": 2.1622989177703857, "learning_rate": 4.4775899876985614e-05, "loss": 5.7976, "step": 4010 }, { "epoch": 0.3216514642342775, "grad_norm": 2.7463905811309814, "learning_rate": 4.476252874792748e-05, "loss": 5.7181, "step": 4020 }, { "epoch": 0.3224515922547608, "grad_norm": 3.503662109375, "learning_rate": 4.474915761886934e-05, "loss": 5.8092, "step": 4030 }, { "epoch": 0.323251720275244, "grad_norm": 2.6073853969573975, "learning_rate": 4.47357864898112e-05, "loss": 5.7876, "step": 4040 }, { "epoch": 0.3240518482957273, "grad_norm": 3.354768991470337, "learning_rate": 4.472241536075306e-05, "loss": 5.7741, "step": 4050 }, { "epoch": 0.3248519763162106, "grad_norm": 2.648145914077759, "learning_rate": 4.470904423169492e-05, "loss": 5.7522, "step": 4060 }, { "epoch": 0.3256521043366939, "grad_norm": 3.086655378341675, "learning_rate": 4.4695673102636784e-05, "loss": 5.81, "step": 4070 }, { "epoch": 0.32645223235717713, "grad_norm": 2.230905771255493, "learning_rate": 4.4682301973578647e-05, "loss": 5.8839, "step": 4080 }, { "epoch": 0.3272523603776604, "grad_norm": 2.5391674041748047, "learning_rate": 4.466893084452051e-05, "loss": 5.5535, "step": 4090 }, { "epoch": 0.3280524883981437, "grad_norm": 2.7574117183685303, "learning_rate": 4.465555971546237e-05, "loss": 5.8275, "step": 4100 }, { "epoch": 0.32885261641862695, "grad_norm": 3.1114678382873535, "learning_rate": 4.4642188586404235e-05, "loss": 5.6876, "step": 4110 }, { "epoch": 0.32965274443911025, "grad_norm": 2.404892683029175, "learning_rate": 4.46288174573461e-05, "loss": 5.6876, "step": 4120 }, { "epoch": 0.33045287245959354, "grad_norm": 2.590759754180908, "learning_rate": 4.461544632828796e-05, "loss": 5.802, "step": 4130 }, { "epoch": 0.33125300048007683, "grad_norm": 2.4358649253845215, "learning_rate": 4.460207519922982e-05, "loss": 5.632, "step": 4140 }, { "epoch": 0.33205312850056007, "grad_norm": 3.9567458629608154, "learning_rate": 4.4588704070171685e-05, "loss": 5.8761, "step": 4150 }, { "epoch": 0.33285325652104336, "grad_norm": 2.3808743953704834, "learning_rate": 4.457533294111355e-05, "loss": 5.6815, "step": 4160 }, { "epoch": 0.33365338454152665, "grad_norm": 2.6527156829833984, "learning_rate": 4.456196181205541e-05, "loss": 5.805, "step": 4170 }, { "epoch": 0.33445351256200995, "grad_norm": 2.351062536239624, "learning_rate": 4.4548590682997273e-05, "loss": 5.6681, "step": 4180 }, { "epoch": 0.3352536405824932, "grad_norm": 2.3213460445404053, "learning_rate": 4.4535219553939136e-05, "loss": 5.6363, "step": 4190 }, { "epoch": 0.3360537686029765, "grad_norm": 1.9470767974853516, "learning_rate": 4.4521848424881e-05, "loss": 5.8772, "step": 4200 }, { "epoch": 0.33685389662345977, "grad_norm": 4.303500652313232, "learning_rate": 4.450847729582286e-05, "loss": 5.6185, "step": 4210 }, { "epoch": 0.337654024643943, "grad_norm": 2.713275909423828, "learning_rate": 4.4495106166764724e-05, "loss": 5.6754, "step": 4220 }, { "epoch": 0.3384541526644263, "grad_norm": 2.34993314743042, "learning_rate": 4.448173503770659e-05, "loss": 5.7003, "step": 4230 }, { "epoch": 0.3392542806849096, "grad_norm": 2.276228666305542, "learning_rate": 4.446836390864845e-05, "loss": 5.6, "step": 4240 }, { "epoch": 0.3400544087053929, "grad_norm": 2.3635685443878174, "learning_rate": 4.445499277959031e-05, "loss": 5.7373, "step": 4250 }, { "epoch": 0.3408545367258761, "grad_norm": 3.100604772567749, "learning_rate": 4.4441621650532175e-05, "loss": 5.7354, "step": 4260 }, { "epoch": 0.3416546647463594, "grad_norm": 2.6743876934051514, "learning_rate": 4.442825052147404e-05, "loss": 5.7544, "step": 4270 }, { "epoch": 0.3424547927668427, "grad_norm": 2.5783612728118896, "learning_rate": 4.44148793924159e-05, "loss": 5.8826, "step": 4280 }, { "epoch": 0.34325492078732595, "grad_norm": 2.8976659774780273, "learning_rate": 4.440150826335776e-05, "loss": 5.5418, "step": 4290 }, { "epoch": 0.34405504880780924, "grad_norm": 2.1061089038848877, "learning_rate": 4.4388137134299626e-05, "loss": 5.6406, "step": 4300 }, { "epoch": 0.34485517682829253, "grad_norm": 2.1303789615631104, "learning_rate": 4.437476600524149e-05, "loss": 5.6491, "step": 4310 }, { "epoch": 0.3456553048487758, "grad_norm": 2.6240499019622803, "learning_rate": 4.436139487618335e-05, "loss": 5.7161, "step": 4320 }, { "epoch": 0.34645543286925906, "grad_norm": 2.325155019760132, "learning_rate": 4.4348023747125214e-05, "loss": 5.6172, "step": 4330 }, { "epoch": 0.34725556088974235, "grad_norm": 2.8844404220581055, "learning_rate": 4.4334652618067076e-05, "loss": 5.7438, "step": 4340 }, { "epoch": 0.34805568891022565, "grad_norm": 2.375324249267578, "learning_rate": 4.432128148900894e-05, "loss": 5.8335, "step": 4350 }, { "epoch": 0.34885581693070894, "grad_norm": 2.1572377681732178, "learning_rate": 4.4307910359950795e-05, "loss": 5.706, "step": 4360 }, { "epoch": 0.3496559449511922, "grad_norm": 2.5218889713287354, "learning_rate": 4.429453923089266e-05, "loss": 5.7487, "step": 4370 }, { "epoch": 0.35045607297167547, "grad_norm": 2.636223554611206, "learning_rate": 4.428116810183452e-05, "loss": 5.8327, "step": 4380 }, { "epoch": 0.35125620099215876, "grad_norm": 2.436155080795288, "learning_rate": 4.426779697277638e-05, "loss": 5.6895, "step": 4390 }, { "epoch": 0.352056329012642, "grad_norm": 3.4435484409332275, "learning_rate": 4.4254425843718246e-05, "loss": 5.6171, "step": 4400 }, { "epoch": 0.3528564570331253, "grad_norm": 2.3990628719329834, "learning_rate": 4.424105471466011e-05, "loss": 5.7574, "step": 4410 }, { "epoch": 0.3536565850536086, "grad_norm": 2.544774293899536, "learning_rate": 4.422768358560197e-05, "loss": 5.558, "step": 4420 }, { "epoch": 0.3544567130740919, "grad_norm": 2.389491081237793, "learning_rate": 4.4214312456543834e-05, "loss": 5.6628, "step": 4430 }, { "epoch": 0.3552568410945751, "grad_norm": 5.203212261199951, "learning_rate": 4.4200941327485697e-05, "loss": 5.5403, "step": 4440 }, { "epoch": 0.3560569691150584, "grad_norm": 2.0861873626708984, "learning_rate": 4.418757019842756e-05, "loss": 5.625, "step": 4450 }, { "epoch": 0.3568570971355417, "grad_norm": 2.2355470657348633, "learning_rate": 4.417419906936942e-05, "loss": 5.614, "step": 4460 }, { "epoch": 0.35765722515602494, "grad_norm": 2.2239274978637695, "learning_rate": 4.4160827940311285e-05, "loss": 5.6885, "step": 4470 }, { "epoch": 0.35845735317650823, "grad_norm": 4.571592807769775, "learning_rate": 4.414745681125315e-05, "loss": 5.8495, "step": 4480 }, { "epoch": 0.3592574811969915, "grad_norm": 2.6501150131225586, "learning_rate": 4.413408568219501e-05, "loss": 5.6158, "step": 4490 }, { "epoch": 0.3600576092174748, "grad_norm": 2.8568902015686035, "learning_rate": 4.412071455313687e-05, "loss": 5.6403, "step": 4500 }, { "epoch": 0.36085773723795805, "grad_norm": 2.4179179668426514, "learning_rate": 4.410734342407873e-05, "loss": 5.749, "step": 4510 }, { "epoch": 0.36165786525844135, "grad_norm": 2.950491189956665, "learning_rate": 4.409397229502059e-05, "loss": 5.7128, "step": 4520 }, { "epoch": 0.36245799327892464, "grad_norm": 3.731049060821533, "learning_rate": 4.4080601165962454e-05, "loss": 5.6397, "step": 4530 }, { "epoch": 0.36325812129940793, "grad_norm": 2.255730390548706, "learning_rate": 4.406723003690432e-05, "loss": 5.626, "step": 4540 }, { "epoch": 0.36405824931989117, "grad_norm": 2.623455047607422, "learning_rate": 4.405385890784618e-05, "loss": 5.6792, "step": 4550 }, { "epoch": 0.36485837734037446, "grad_norm": 2.366481065750122, "learning_rate": 4.404048777878804e-05, "loss": 5.5455, "step": 4560 }, { "epoch": 0.36565850536085776, "grad_norm": 2.56351375579834, "learning_rate": 4.4027116649729905e-05, "loss": 5.7982, "step": 4570 }, { "epoch": 0.366458633381341, "grad_norm": 2.3203811645507812, "learning_rate": 4.401374552067177e-05, "loss": 5.7969, "step": 4580 }, { "epoch": 0.3672587614018243, "grad_norm": 2.3838179111480713, "learning_rate": 4.400037439161363e-05, "loss": 5.7484, "step": 4590 }, { "epoch": 0.3680588894223076, "grad_norm": 2.0725440979003906, "learning_rate": 4.398700326255549e-05, "loss": 5.8405, "step": 4600 }, { "epoch": 0.36885901744279087, "grad_norm": 3.49495005607605, "learning_rate": 4.3973632133497356e-05, "loss": 5.7151, "step": 4610 }, { "epoch": 0.3696591454632741, "grad_norm": 2.643007755279541, "learning_rate": 4.396026100443922e-05, "loss": 5.6374, "step": 4620 }, { "epoch": 0.3704592734837574, "grad_norm": 2.282304286956787, "learning_rate": 4.394688987538108e-05, "loss": 5.589, "step": 4630 }, { "epoch": 0.3712594015042407, "grad_norm": 2.244058609008789, "learning_rate": 4.3933518746322944e-05, "loss": 5.7516, "step": 4640 }, { "epoch": 0.37205952952472393, "grad_norm": 2.44496488571167, "learning_rate": 4.3920147617264806e-05, "loss": 5.8393, "step": 4650 }, { "epoch": 0.3728596575452072, "grad_norm": 2.6613078117370605, "learning_rate": 4.390677648820667e-05, "loss": 5.6764, "step": 4660 }, { "epoch": 0.3736597855656905, "grad_norm": 3.99092173576355, "learning_rate": 4.3893405359148525e-05, "loss": 5.8658, "step": 4670 }, { "epoch": 0.3744599135861738, "grad_norm": 1.6338485479354858, "learning_rate": 4.388003423009039e-05, "loss": 5.7527, "step": 4680 }, { "epoch": 0.37526004160665705, "grad_norm": 2.3723371028900146, "learning_rate": 4.386666310103225e-05, "loss": 5.7482, "step": 4690 }, { "epoch": 0.37606016962714034, "grad_norm": 2.630424976348877, "learning_rate": 4.385329197197411e-05, "loss": 5.7539, "step": 4700 }, { "epoch": 0.37686029764762363, "grad_norm": 2.3873038291931152, "learning_rate": 4.3839920842915976e-05, "loss": 5.6729, "step": 4710 }, { "epoch": 0.37766042566810687, "grad_norm": 1.9391748905181885, "learning_rate": 4.382654971385784e-05, "loss": 5.6794, "step": 4720 }, { "epoch": 0.37846055368859016, "grad_norm": 2.103975296020508, "learning_rate": 4.38131785847997e-05, "loss": 5.5104, "step": 4730 }, { "epoch": 0.37926068170907346, "grad_norm": 3.731184959411621, "learning_rate": 4.3799807455741564e-05, "loss": 5.6699, "step": 4740 }, { "epoch": 0.38006080972955675, "grad_norm": 2.881068468093872, "learning_rate": 4.3786436326683426e-05, "loss": 5.6394, "step": 4750 }, { "epoch": 0.38086093775004, "grad_norm": 2.5963799953460693, "learning_rate": 4.377306519762529e-05, "loss": 5.784, "step": 4760 }, { "epoch": 0.3816610657705233, "grad_norm": 1.9520230293273926, "learning_rate": 4.375969406856715e-05, "loss": 5.7608, "step": 4770 }, { "epoch": 0.38246119379100657, "grad_norm": 2.386702537536621, "learning_rate": 4.374766005241483e-05, "loss": 5.5725, "step": 4780 }, { "epoch": 0.38326132181148986, "grad_norm": 2.3830511569976807, "learning_rate": 4.3734288923356694e-05, "loss": 5.5584, "step": 4790 }, { "epoch": 0.3840614498319731, "grad_norm": 2.1514739990234375, "learning_rate": 4.3720917794298556e-05, "loss": 5.6621, "step": 4800 }, { "epoch": 0.3848615778524564, "grad_norm": 2.5376317501068115, "learning_rate": 4.370754666524042e-05, "loss": 5.4138, "step": 4810 }, { "epoch": 0.3856617058729397, "grad_norm": 3.425899028778076, "learning_rate": 4.3694175536182275e-05, "loss": 5.6478, "step": 4820 }, { "epoch": 0.3864618338934229, "grad_norm": 2.7518632411956787, "learning_rate": 4.368080440712414e-05, "loss": 5.6556, "step": 4830 }, { "epoch": 0.3872619619139062, "grad_norm": 3.119227647781372, "learning_rate": 4.3667433278066e-05, "loss": 5.7925, "step": 4840 }, { "epoch": 0.3880620899343895, "grad_norm": 3.2664616107940674, "learning_rate": 4.365406214900786e-05, "loss": 5.7176, "step": 4850 }, { "epoch": 0.3888622179548728, "grad_norm": 2.5125045776367188, "learning_rate": 4.3640691019949726e-05, "loss": 5.6511, "step": 4860 }, { "epoch": 0.38966234597535604, "grad_norm": 2.992112874984741, "learning_rate": 4.362731989089159e-05, "loss": 5.6426, "step": 4870 }, { "epoch": 0.39046247399583933, "grad_norm": 4.46783971786499, "learning_rate": 4.361394876183345e-05, "loss": 5.736, "step": 4880 }, { "epoch": 0.3912626020163226, "grad_norm": 1.8372838497161865, "learning_rate": 4.3600577632775314e-05, "loss": 5.7603, "step": 4890 }, { "epoch": 0.39206273003680586, "grad_norm": 2.1635375022888184, "learning_rate": 4.3587206503717176e-05, "loss": 5.6019, "step": 4900 }, { "epoch": 0.39286285805728915, "grad_norm": 2.2425310611724854, "learning_rate": 4.357383537465904e-05, "loss": 5.6829, "step": 4910 }, { "epoch": 0.39366298607777245, "grad_norm": 2.408907413482666, "learning_rate": 4.35604642456009e-05, "loss": 5.6821, "step": 4920 }, { "epoch": 0.39446311409825574, "grad_norm": 3.012258291244507, "learning_rate": 4.3547093116542765e-05, "loss": 5.7503, "step": 4930 }, { "epoch": 0.395263242118739, "grad_norm": 3.187053680419922, "learning_rate": 4.353372198748463e-05, "loss": 5.6459, "step": 4940 }, { "epoch": 0.39606337013922227, "grad_norm": 2.7528955936431885, "learning_rate": 4.352035085842649e-05, "loss": 5.6386, "step": 4950 }, { "epoch": 0.39686349815970556, "grad_norm": 2.9744699001312256, "learning_rate": 4.350697972936835e-05, "loss": 5.5938, "step": 4960 }, { "epoch": 0.39766362618018886, "grad_norm": 2.779604196548462, "learning_rate": 4.3493608600310215e-05, "loss": 5.5459, "step": 4970 }, { "epoch": 0.3984637542006721, "grad_norm": 2.9092133045196533, "learning_rate": 4.348023747125207e-05, "loss": 5.7695, "step": 4980 }, { "epoch": 0.3992638822211554, "grad_norm": 2.800872802734375, "learning_rate": 4.3466866342193934e-05, "loss": 5.6943, "step": 4990 }, { "epoch": 0.4000640102416387, "grad_norm": 3.299595832824707, "learning_rate": 4.3453495213135797e-05, "loss": 5.4432, "step": 5000 }, { "epoch": 0.4008641382621219, "grad_norm": 2.2425456047058105, "learning_rate": 4.344012408407766e-05, "loss": 5.6688, "step": 5010 }, { "epoch": 0.4016642662826052, "grad_norm": 2.269378423690796, "learning_rate": 4.342675295501952e-05, "loss": 5.7713, "step": 5020 }, { "epoch": 0.4024643943030885, "grad_norm": 2.3903868198394775, "learning_rate": 4.3413381825961385e-05, "loss": 5.5926, "step": 5030 }, { "epoch": 0.4032645223235718, "grad_norm": 3.267918109893799, "learning_rate": 4.340001069690325e-05, "loss": 5.6806, "step": 5040 }, { "epoch": 0.40406465034405503, "grad_norm": 3.2075066566467285, "learning_rate": 4.338663956784511e-05, "loss": 5.6582, "step": 5050 }, { "epoch": 0.4048647783645383, "grad_norm": 2.5458226203918457, "learning_rate": 4.337326843878697e-05, "loss": 5.6576, "step": 5060 }, { "epoch": 0.4056649063850216, "grad_norm": 2.0331077575683594, "learning_rate": 4.3359897309728835e-05, "loss": 5.6725, "step": 5070 }, { "epoch": 0.40646503440550485, "grad_norm": 2.406907796859741, "learning_rate": 4.33465261806707e-05, "loss": 5.5168, "step": 5080 }, { "epoch": 0.40726516242598815, "grad_norm": 2.661137580871582, "learning_rate": 4.333315505161256e-05, "loss": 5.5953, "step": 5090 }, { "epoch": 0.40806529044647144, "grad_norm": 2.857725143432617, "learning_rate": 4.3319783922554423e-05, "loss": 5.6702, "step": 5100 }, { "epoch": 0.40886541846695473, "grad_norm": 2.7894747257232666, "learning_rate": 4.3306412793496286e-05, "loss": 5.6228, "step": 5110 }, { "epoch": 0.40966554648743797, "grad_norm": 2.8865861892700195, "learning_rate": 4.329304166443815e-05, "loss": 5.6859, "step": 5120 }, { "epoch": 0.41046567450792126, "grad_norm": 2.1493608951568604, "learning_rate": 4.3279670535380005e-05, "loss": 5.5516, "step": 5130 }, { "epoch": 0.41126580252840456, "grad_norm": 3.112820863723755, "learning_rate": 4.326629940632187e-05, "loss": 5.6409, "step": 5140 }, { "epoch": 0.41206593054888785, "grad_norm": 2.778876543045044, "learning_rate": 4.325292827726373e-05, "loss": 5.6948, "step": 5150 }, { "epoch": 0.4128660585693711, "grad_norm": 2.0409047603607178, "learning_rate": 4.323955714820559e-05, "loss": 5.5458, "step": 5160 }, { "epoch": 0.4136661865898544, "grad_norm": 3.1058828830718994, "learning_rate": 4.3226186019147456e-05, "loss": 5.8437, "step": 5170 }, { "epoch": 0.41446631461033767, "grad_norm": 3.306704044342041, "learning_rate": 4.321281489008932e-05, "loss": 5.691, "step": 5180 }, { "epoch": 0.4152664426308209, "grad_norm": 2.9495625495910645, "learning_rate": 4.319944376103118e-05, "loss": 5.6364, "step": 5190 }, { "epoch": 0.4160665706513042, "grad_norm": 2.1773974895477295, "learning_rate": 4.3186072631973044e-05, "loss": 5.6713, "step": 5200 }, { "epoch": 0.4168666986717875, "grad_norm": 2.0897533893585205, "learning_rate": 4.3172701502914906e-05, "loss": 5.6022, "step": 5210 }, { "epoch": 0.4176668266922708, "grad_norm": 2.2131927013397217, "learning_rate": 4.315933037385677e-05, "loss": 5.5728, "step": 5220 }, { "epoch": 0.418466954712754, "grad_norm": 2.225728750228882, "learning_rate": 4.314595924479863e-05, "loss": 5.5374, "step": 5230 }, { "epoch": 0.4192670827332373, "grad_norm": 2.219791889190674, "learning_rate": 4.3132588115740494e-05, "loss": 5.6986, "step": 5240 }, { "epoch": 0.4200672107537206, "grad_norm": 2.720323085784912, "learning_rate": 4.311921698668236e-05, "loss": 5.6046, "step": 5250 }, { "epoch": 0.42086733877420385, "grad_norm": 2.4254257678985596, "learning_rate": 4.310584585762422e-05, "loss": 5.5566, "step": 5260 }, { "epoch": 0.42166746679468714, "grad_norm": 2.2297472953796387, "learning_rate": 4.309247472856608e-05, "loss": 5.7431, "step": 5270 }, { "epoch": 0.42246759481517043, "grad_norm": 2.2767512798309326, "learning_rate": 4.3079103599507945e-05, "loss": 5.6661, "step": 5280 }, { "epoch": 0.4232677228356537, "grad_norm": 2.8959579467773438, "learning_rate": 4.30657324704498e-05, "loss": 5.6584, "step": 5290 }, { "epoch": 0.42406785085613696, "grad_norm": 2.49867844581604, "learning_rate": 4.3052361341391664e-05, "loss": 5.7564, "step": 5300 }, { "epoch": 0.42486797887662026, "grad_norm": 2.1820337772369385, "learning_rate": 4.3038990212333526e-05, "loss": 5.6288, "step": 5310 }, { "epoch": 0.42566810689710355, "grad_norm": 2.7174227237701416, "learning_rate": 4.302561908327539e-05, "loss": 5.6496, "step": 5320 }, { "epoch": 0.42646823491758684, "grad_norm": 2.7261149883270264, "learning_rate": 4.301224795421725e-05, "loss": 5.6557, "step": 5330 }, { "epoch": 0.4272683629380701, "grad_norm": 2.581760883331299, "learning_rate": 4.2998876825159114e-05, "loss": 5.604, "step": 5340 }, { "epoch": 0.42806849095855337, "grad_norm": 2.43254017829895, "learning_rate": 4.298550569610098e-05, "loss": 5.6041, "step": 5350 }, { "epoch": 0.42886861897903666, "grad_norm": 4.465782165527344, "learning_rate": 4.297213456704284e-05, "loss": 5.7158, "step": 5360 }, { "epoch": 0.4296687469995199, "grad_norm": 2.6434614658355713, "learning_rate": 4.29587634379847e-05, "loss": 5.6347, "step": 5370 }, { "epoch": 0.4304688750200032, "grad_norm": 2.344190835952759, "learning_rate": 4.2945392308926565e-05, "loss": 5.6062, "step": 5380 }, { "epoch": 0.4312690030404865, "grad_norm": 4.311372756958008, "learning_rate": 4.293202117986843e-05, "loss": 5.7356, "step": 5390 }, { "epoch": 0.4320691310609698, "grad_norm": 2.8204123973846436, "learning_rate": 4.291865005081029e-05, "loss": 5.63, "step": 5400 }, { "epoch": 0.432869259081453, "grad_norm": 3.333059072494507, "learning_rate": 4.290527892175215e-05, "loss": 5.5992, "step": 5410 }, { "epoch": 0.4336693871019363, "grad_norm": 2.0647048950195312, "learning_rate": 4.2891907792694016e-05, "loss": 5.691, "step": 5420 }, { "epoch": 0.4344695151224196, "grad_norm": 2.5100045204162598, "learning_rate": 4.287853666363588e-05, "loss": 5.615, "step": 5430 }, { "epoch": 0.43526964314290284, "grad_norm": 2.6120762825012207, "learning_rate": 4.286516553457774e-05, "loss": 5.746, "step": 5440 }, { "epoch": 0.43606977116338613, "grad_norm": 2.2886853218078613, "learning_rate": 4.2851794405519604e-05, "loss": 5.6783, "step": 5450 }, { "epoch": 0.4368698991838694, "grad_norm": 2.6724119186401367, "learning_rate": 4.283842327646147e-05, "loss": 5.6526, "step": 5460 }, { "epoch": 0.4376700272043527, "grad_norm": 2.2408151626586914, "learning_rate": 4.282505214740333e-05, "loss": 5.6314, "step": 5470 }, { "epoch": 0.43847015522483596, "grad_norm": 3.0294084548950195, "learning_rate": 4.281168101834519e-05, "loss": 5.6669, "step": 5480 }, { "epoch": 0.43927028324531925, "grad_norm": 2.1664011478424072, "learning_rate": 4.2798309889287055e-05, "loss": 5.4856, "step": 5490 }, { "epoch": 0.44007041126580254, "grad_norm": 3.4465417861938477, "learning_rate": 4.278493876022892e-05, "loss": 5.5859, "step": 5500 }, { "epoch": 0.4408705392862858, "grad_norm": 2.0116310119628906, "learning_rate": 4.277156763117078e-05, "loss": 5.5982, "step": 5510 }, { "epoch": 0.44167066730676907, "grad_norm": 2.578658103942871, "learning_rate": 4.275819650211264e-05, "loss": 5.4026, "step": 5520 }, { "epoch": 0.44247079532725236, "grad_norm": 3.1201677322387695, "learning_rate": 4.2744825373054506e-05, "loss": 5.7024, "step": 5530 }, { "epoch": 0.44327092334773566, "grad_norm": 2.2246837615966797, "learning_rate": 4.273145424399637e-05, "loss": 5.5842, "step": 5540 }, { "epoch": 0.4440710513682189, "grad_norm": 2.1593568325042725, "learning_rate": 4.271808311493823e-05, "loss": 5.5099, "step": 5550 }, { "epoch": 0.4448711793887022, "grad_norm": 3.082218885421753, "learning_rate": 4.2704711985880094e-05, "loss": 5.5539, "step": 5560 }, { "epoch": 0.4456713074091855, "grad_norm": 3.2272634506225586, "learning_rate": 4.2691340856821956e-05, "loss": 5.73, "step": 5570 }, { "epoch": 0.4464714354296688, "grad_norm": 2.301713466644287, "learning_rate": 4.267796972776382e-05, "loss": 5.5444, "step": 5580 }, { "epoch": 0.447271563450152, "grad_norm": 3.2985429763793945, "learning_rate": 4.2664598598705675e-05, "loss": 5.7499, "step": 5590 }, { "epoch": 0.4480716914706353, "grad_norm": 2.103994607925415, "learning_rate": 4.265122746964754e-05, "loss": 5.5627, "step": 5600 }, { "epoch": 0.4488718194911186, "grad_norm": 3.260099172592163, "learning_rate": 4.26378563405894e-05, "loss": 5.5692, "step": 5610 }, { "epoch": 0.44967194751160183, "grad_norm": 2.740907907485962, "learning_rate": 4.262448521153126e-05, "loss": 5.4984, "step": 5620 }, { "epoch": 0.4504720755320851, "grad_norm": 5.314218997955322, "learning_rate": 4.2611114082473126e-05, "loss": 5.5641, "step": 5630 }, { "epoch": 0.4512722035525684, "grad_norm": 3.0524938106536865, "learning_rate": 4.259774295341499e-05, "loss": 5.6375, "step": 5640 }, { "epoch": 0.4520723315730517, "grad_norm": 3.57781982421875, "learning_rate": 4.258437182435685e-05, "loss": 5.6726, "step": 5650 }, { "epoch": 0.45287245959353495, "grad_norm": 3.094510793685913, "learning_rate": 4.2571000695298714e-05, "loss": 5.7328, "step": 5660 }, { "epoch": 0.45367258761401824, "grad_norm": 2.731092929840088, "learning_rate": 4.2557629566240576e-05, "loss": 5.6667, "step": 5670 }, { "epoch": 0.45447271563450153, "grad_norm": 3.6701395511627197, "learning_rate": 4.254425843718244e-05, "loss": 5.641, "step": 5680 }, { "epoch": 0.45527284365498477, "grad_norm": 1.9017853736877441, "learning_rate": 4.25308873081243e-05, "loss": 5.6521, "step": 5690 }, { "epoch": 0.45607297167546806, "grad_norm": 3.2658119201660156, "learning_rate": 4.2517516179066165e-05, "loss": 5.6431, "step": 5700 }, { "epoch": 0.45687309969595136, "grad_norm": 2.227353572845459, "learning_rate": 4.250414505000803e-05, "loss": 5.6198, "step": 5710 }, { "epoch": 0.45767322771643465, "grad_norm": 1.7804296016693115, "learning_rate": 4.249077392094989e-05, "loss": 5.618, "step": 5720 }, { "epoch": 0.4584733557369179, "grad_norm": 2.9357879161834717, "learning_rate": 4.247740279189175e-05, "loss": 5.5222, "step": 5730 }, { "epoch": 0.4592734837574012, "grad_norm": 5.074959754943848, "learning_rate": 4.2464031662833615e-05, "loss": 5.7604, "step": 5740 }, { "epoch": 0.4600736117778845, "grad_norm": 2.4961061477661133, "learning_rate": 4.245066053377547e-05, "loss": 5.5699, "step": 5750 }, { "epoch": 0.46087373979836777, "grad_norm": 2.636403799057007, "learning_rate": 4.2437289404717334e-05, "loss": 5.745, "step": 5760 }, { "epoch": 0.461673867818851, "grad_norm": 2.4829630851745605, "learning_rate": 4.2423918275659197e-05, "loss": 5.9779, "step": 5770 }, { "epoch": 0.4624739958393343, "grad_norm": 2.389112710952759, "learning_rate": 4.241054714660106e-05, "loss": 5.696, "step": 5780 }, { "epoch": 0.4632741238598176, "grad_norm": 2.3053462505340576, "learning_rate": 4.239717601754292e-05, "loss": 5.6567, "step": 5790 }, { "epoch": 0.4640742518803008, "grad_norm": 2.9635446071624756, "learning_rate": 4.2383804888484785e-05, "loss": 5.7643, "step": 5800 }, { "epoch": 0.4648743799007841, "grad_norm": 3.3227570056915283, "learning_rate": 4.237043375942665e-05, "loss": 5.5425, "step": 5810 }, { "epoch": 0.4656745079212674, "grad_norm": 3.2959067821502686, "learning_rate": 4.235706263036851e-05, "loss": 5.5886, "step": 5820 }, { "epoch": 0.4664746359417507, "grad_norm": 2.497953176498413, "learning_rate": 4.234369150131037e-05, "loss": 5.6248, "step": 5830 }, { "epoch": 0.46727476396223394, "grad_norm": 3.5957205295562744, "learning_rate": 4.2330320372252235e-05, "loss": 5.5345, "step": 5840 }, { "epoch": 0.46807489198271723, "grad_norm": 2.9113316535949707, "learning_rate": 4.23169492431941e-05, "loss": 5.7358, "step": 5850 }, { "epoch": 0.4688750200032005, "grad_norm": 3.8617255687713623, "learning_rate": 4.230357811413596e-05, "loss": 5.7451, "step": 5860 }, { "epoch": 0.46967514802368376, "grad_norm": 2.5546538829803467, "learning_rate": 4.2290206985077824e-05, "loss": 5.5874, "step": 5870 }, { "epoch": 0.47047527604416706, "grad_norm": 3.7215869426727295, "learning_rate": 4.2276835856019686e-05, "loss": 5.5462, "step": 5880 }, { "epoch": 0.47127540406465035, "grad_norm": 3.3122622966766357, "learning_rate": 4.226346472696155e-05, "loss": 5.7368, "step": 5890 }, { "epoch": 0.47207553208513364, "grad_norm": 2.3962459564208984, "learning_rate": 4.2250093597903405e-05, "loss": 5.7328, "step": 5900 }, { "epoch": 0.4728756601056169, "grad_norm": 2.497668504714966, "learning_rate": 4.223672246884527e-05, "loss": 5.7063, "step": 5910 }, { "epoch": 0.4736757881261002, "grad_norm": 2.301725387573242, "learning_rate": 4.222335133978713e-05, "loss": 5.6029, "step": 5920 }, { "epoch": 0.47447591614658347, "grad_norm": 3.840155839920044, "learning_rate": 4.220998021072899e-05, "loss": 5.825, "step": 5930 }, { "epoch": 0.47527604416706676, "grad_norm": 3.1776278018951416, "learning_rate": 4.2196609081670856e-05, "loss": 5.6421, "step": 5940 }, { "epoch": 0.47607617218755, "grad_norm": 2.1823127269744873, "learning_rate": 4.218323795261272e-05, "loss": 5.7154, "step": 5950 }, { "epoch": 0.4768763002080333, "grad_norm": 2.944390058517456, "learning_rate": 4.216986682355458e-05, "loss": 5.5429, "step": 5960 }, { "epoch": 0.4776764282285166, "grad_norm": 2.035430431365967, "learning_rate": 4.2156495694496444e-05, "loss": 5.8187, "step": 5970 }, { "epoch": 0.4784765562489998, "grad_norm": 3.167098045349121, "learning_rate": 4.2143124565438306e-05, "loss": 5.5891, "step": 5980 }, { "epoch": 0.4792766842694831, "grad_norm": 1.9377233982086182, "learning_rate": 4.212975343638017e-05, "loss": 5.7428, "step": 5990 }, { "epoch": 0.4800768122899664, "grad_norm": 2.759096622467041, "learning_rate": 4.211638230732203e-05, "loss": 5.5572, "step": 6000 }, { "epoch": 0.4808769403104497, "grad_norm": 2.074033498764038, "learning_rate": 4.2103011178263894e-05, "loss": 5.517, "step": 6010 }, { "epoch": 0.48167706833093293, "grad_norm": 2.2866854667663574, "learning_rate": 4.208964004920576e-05, "loss": 5.6539, "step": 6020 }, { "epoch": 0.4824771963514162, "grad_norm": 1.9909095764160156, "learning_rate": 4.207626892014762e-05, "loss": 5.5532, "step": 6030 }, { "epoch": 0.4832773243718995, "grad_norm": 3.245906114578247, "learning_rate": 4.206289779108948e-05, "loss": 5.6797, "step": 6040 }, { "epoch": 0.48407745239238276, "grad_norm": 2.013009786605835, "learning_rate": 4.2049526662031345e-05, "loss": 5.6378, "step": 6050 }, { "epoch": 0.48487758041286605, "grad_norm": 2.5478925704956055, "learning_rate": 4.20361555329732e-05, "loss": 5.555, "step": 6060 }, { "epoch": 0.48567770843334934, "grad_norm": 3.079225778579712, "learning_rate": 4.2022784403915064e-05, "loss": 5.7618, "step": 6070 }, { "epoch": 0.48647783645383263, "grad_norm": 2.2639927864074707, "learning_rate": 4.2009413274856926e-05, "loss": 5.8063, "step": 6080 }, { "epoch": 0.48727796447431587, "grad_norm": 4.630524158477783, "learning_rate": 4.199604214579879e-05, "loss": 5.6403, "step": 6090 }, { "epoch": 0.48807809249479917, "grad_norm": 3.11018967628479, "learning_rate": 4.198267101674065e-05, "loss": 5.7517, "step": 6100 }, { "epoch": 0.48887822051528246, "grad_norm": 8.462982177734375, "learning_rate": 4.1969299887682515e-05, "loss": 5.7311, "step": 6110 }, { "epoch": 0.4896783485357657, "grad_norm": 2.418065071105957, "learning_rate": 4.195592875862438e-05, "loss": 5.6239, "step": 6120 }, { "epoch": 0.490478476556249, "grad_norm": 2.5452466011047363, "learning_rate": 4.194255762956624e-05, "loss": 5.7417, "step": 6130 }, { "epoch": 0.4912786045767323, "grad_norm": 2.986041307449341, "learning_rate": 4.19291865005081e-05, "loss": 5.663, "step": 6140 }, { "epoch": 0.4920787325972156, "grad_norm": 2.7642807960510254, "learning_rate": 4.1915815371449965e-05, "loss": 5.5379, "step": 6150 }, { "epoch": 0.4928788606176988, "grad_norm": 4.326907157897949, "learning_rate": 4.190244424239183e-05, "loss": 5.8058, "step": 6160 }, { "epoch": 0.4936789886381821, "grad_norm": 1.9514706134796143, "learning_rate": 4.188907311333369e-05, "loss": 5.7004, "step": 6170 }, { "epoch": 0.4944791166586654, "grad_norm": 2.5721428394317627, "learning_rate": 4.187570198427555e-05, "loss": 5.6959, "step": 6180 }, { "epoch": 0.4952792446791487, "grad_norm": 2.6619083881378174, "learning_rate": 4.1862330855217416e-05, "loss": 5.7196, "step": 6190 }, { "epoch": 0.4960793726996319, "grad_norm": 2.322341203689575, "learning_rate": 4.184895972615928e-05, "loss": 5.5998, "step": 6200 }, { "epoch": 0.4968795007201152, "grad_norm": 2.280777931213379, "learning_rate": 4.183558859710114e-05, "loss": 5.5171, "step": 6210 }, { "epoch": 0.4976796287405985, "grad_norm": 1.9774320125579834, "learning_rate": 4.1822217468043004e-05, "loss": 5.6368, "step": 6220 }, { "epoch": 0.49847975676108175, "grad_norm": 2.199708938598633, "learning_rate": 4.180884633898487e-05, "loss": 5.4638, "step": 6230 }, { "epoch": 0.49927988478156504, "grad_norm": 2.0054879188537598, "learning_rate": 4.179547520992673e-05, "loss": 5.4624, "step": 6240 }, { "epoch": 0.5000800128020483, "grad_norm": 2.0623903274536133, "learning_rate": 4.178210408086859e-05, "loss": 5.6554, "step": 6250 }, { "epoch": 0.5008801408225316, "grad_norm": 2.5907487869262695, "learning_rate": 4.1768732951810455e-05, "loss": 5.4989, "step": 6260 }, { "epoch": 0.5016802688430149, "grad_norm": 2.181987762451172, "learning_rate": 4.175536182275232e-05, "loss": 5.624, "step": 6270 }, { "epoch": 0.5024803968634982, "grad_norm": 2.9678001403808594, "learning_rate": 4.174199069369418e-05, "loss": 5.6545, "step": 6280 }, { "epoch": 0.5032805248839815, "grad_norm": 5.213638782501221, "learning_rate": 4.172861956463604e-05, "loss": 5.7048, "step": 6290 }, { "epoch": 0.5040806529044647, "grad_norm": 2.465900182723999, "learning_rate": 4.1715248435577906e-05, "loss": 5.646, "step": 6300 }, { "epoch": 0.504880780924948, "grad_norm": 2.94570255279541, "learning_rate": 4.170187730651977e-05, "loss": 5.6274, "step": 6310 }, { "epoch": 0.5056809089454313, "grad_norm": 3.5255651473999023, "learning_rate": 4.168850617746163e-05, "loss": 5.5336, "step": 6320 }, { "epoch": 0.5064810369659145, "grad_norm": 2.3499608039855957, "learning_rate": 4.1675135048403494e-05, "loss": 5.7768, "step": 6330 }, { "epoch": 0.5072811649863979, "grad_norm": 2.0476951599121094, "learning_rate": 4.1661763919345356e-05, "loss": 5.5927, "step": 6340 }, { "epoch": 0.5080812930068811, "grad_norm": 2.4708118438720703, "learning_rate": 4.164839279028722e-05, "loss": 5.6458, "step": 6350 }, { "epoch": 0.5088814210273643, "grad_norm": 2.465075731277466, "learning_rate": 4.163502166122908e-05, "loss": 5.5744, "step": 6360 }, { "epoch": 0.5096815490478477, "grad_norm": 2.9378490447998047, "learning_rate": 4.162165053217094e-05, "loss": 5.6963, "step": 6370 }, { "epoch": 0.5104816770683309, "grad_norm": 2.201359987258911, "learning_rate": 4.16082794031128e-05, "loss": 5.613, "step": 6380 }, { "epoch": 0.5112818050888142, "grad_norm": 1.8427401781082153, "learning_rate": 4.159490827405466e-05, "loss": 5.5494, "step": 6390 }, { "epoch": 0.5120819331092975, "grad_norm": 1.9969813823699951, "learning_rate": 4.1581537144996526e-05, "loss": 5.5783, "step": 6400 }, { "epoch": 0.5128820611297807, "grad_norm": 2.9670321941375732, "learning_rate": 4.156816601593839e-05, "loss": 5.7176, "step": 6410 }, { "epoch": 0.5136821891502641, "grad_norm": 2.76875901222229, "learning_rate": 4.155479488688025e-05, "loss": 5.5584, "step": 6420 }, { "epoch": 0.5144823171707473, "grad_norm": 3.2874600887298584, "learning_rate": 4.1541423757822114e-05, "loss": 5.8726, "step": 6430 }, { "epoch": 0.5152824451912306, "grad_norm": 2.4672482013702393, "learning_rate": 4.1528052628763977e-05, "loss": 5.764, "step": 6440 }, { "epoch": 0.5160825732117139, "grad_norm": 3.5424506664276123, "learning_rate": 4.151468149970584e-05, "loss": 5.6612, "step": 6450 }, { "epoch": 0.5168827012321972, "grad_norm": 2.7947871685028076, "learning_rate": 4.15013103706477e-05, "loss": 5.668, "step": 6460 }, { "epoch": 0.5176828292526804, "grad_norm": 2.624370574951172, "learning_rate": 4.1487939241589565e-05, "loss": 5.577, "step": 6470 }, { "epoch": 0.5184829572731637, "grad_norm": 2.276289701461792, "learning_rate": 4.147456811253143e-05, "loss": 5.7592, "step": 6480 }, { "epoch": 0.519283085293647, "grad_norm": 2.751945972442627, "learning_rate": 4.146119698347329e-05, "loss": 5.6251, "step": 6490 }, { "epoch": 0.5200832133141302, "grad_norm": 2.1990444660186768, "learning_rate": 4.144782585441515e-05, "loss": 5.5141, "step": 6500 }, { "epoch": 0.5208833413346136, "grad_norm": 2.732024908065796, "learning_rate": 4.1434454725357015e-05, "loss": 5.5938, "step": 6510 }, { "epoch": 0.5216834693550968, "grad_norm": 2.6876533031463623, "learning_rate": 4.142108359629887e-05, "loss": 5.7126, "step": 6520 }, { "epoch": 0.5224835973755801, "grad_norm": 2.660323143005371, "learning_rate": 4.1407712467240734e-05, "loss": 5.6261, "step": 6530 }, { "epoch": 0.5232837253960634, "grad_norm": 2.567084550857544, "learning_rate": 4.13943413381826e-05, "loss": 5.5248, "step": 6540 }, { "epoch": 0.5240838534165466, "grad_norm": 4.317018032073975, "learning_rate": 4.138097020912446e-05, "loss": 5.4444, "step": 6550 }, { "epoch": 0.52488398143703, "grad_norm": 2.0361647605895996, "learning_rate": 4.136759908006632e-05, "loss": 5.7532, "step": 6560 }, { "epoch": 0.5256841094575132, "grad_norm": 2.0946271419525146, "learning_rate": 4.1354227951008185e-05, "loss": 5.6343, "step": 6570 }, { "epoch": 0.5264842374779964, "grad_norm": 3.3724842071533203, "learning_rate": 4.134085682195005e-05, "loss": 5.6455, "step": 6580 }, { "epoch": 0.5272843654984798, "grad_norm": 4.078947067260742, "learning_rate": 4.132748569289191e-05, "loss": 5.6681, "step": 6590 }, { "epoch": 0.528084493518963, "grad_norm": 4.288105010986328, "learning_rate": 4.131411456383377e-05, "loss": 5.7152, "step": 6600 }, { "epoch": 0.5288846215394463, "grad_norm": 2.5208754539489746, "learning_rate": 4.1300743434775635e-05, "loss": 5.5715, "step": 6610 }, { "epoch": 0.5296847495599296, "grad_norm": 2.6902217864990234, "learning_rate": 4.12873723057175e-05, "loss": 5.4997, "step": 6620 }, { "epoch": 0.5304848775804129, "grad_norm": 2.4580068588256836, "learning_rate": 4.127400117665936e-05, "loss": 5.7656, "step": 6630 }, { "epoch": 0.5312850056008962, "grad_norm": 2.5117955207824707, "learning_rate": 4.1260630047601224e-05, "loss": 5.6373, "step": 6640 }, { "epoch": 0.5320851336213794, "grad_norm": 2.660921096801758, "learning_rate": 4.1247258918543086e-05, "loss": 5.6829, "step": 6650 }, { "epoch": 0.5328852616418627, "grad_norm": 2.4601287841796875, "learning_rate": 4.123388778948495e-05, "loss": 5.7702, "step": 6660 }, { "epoch": 0.533685389662346, "grad_norm": 2.9025120735168457, "learning_rate": 4.122051666042681e-05, "loss": 5.6374, "step": 6670 }, { "epoch": 0.5344855176828293, "grad_norm": 2.8221569061279297, "learning_rate": 4.120714553136867e-05, "loss": 5.5568, "step": 6680 }, { "epoch": 0.5352856457033125, "grad_norm": 2.3035178184509277, "learning_rate": 4.119377440231053e-05, "loss": 5.5845, "step": 6690 }, { "epoch": 0.5360857737237958, "grad_norm": 2.0955657958984375, "learning_rate": 4.118040327325239e-05, "loss": 5.687, "step": 6700 }, { "epoch": 0.5368859017442791, "grad_norm": 2.530156135559082, "learning_rate": 4.1167032144194256e-05, "loss": 5.5772, "step": 6710 }, { "epoch": 0.5376860297647623, "grad_norm": 2.2060387134552, "learning_rate": 4.115366101513612e-05, "loss": 5.5964, "step": 6720 }, { "epoch": 0.5384861577852457, "grad_norm": 2.720702886581421, "learning_rate": 4.114028988607798e-05, "loss": 5.5432, "step": 6730 }, { "epoch": 0.5392862858057289, "grad_norm": 2.2585232257843018, "learning_rate": 4.1126918757019844e-05, "loss": 5.77, "step": 6740 }, { "epoch": 0.5400864138262121, "grad_norm": 2.052316904067993, "learning_rate": 4.1113547627961706e-05, "loss": 5.5679, "step": 6750 }, { "epoch": 0.5408865418466955, "grad_norm": 2.772500991821289, "learning_rate": 4.110017649890357e-05, "loss": 5.5608, "step": 6760 }, { "epoch": 0.5416866698671787, "grad_norm": 2.158129930496216, "learning_rate": 4.108680536984543e-05, "loss": 5.6612, "step": 6770 }, { "epoch": 0.5424867978876621, "grad_norm": 2.874685287475586, "learning_rate": 4.1073434240787294e-05, "loss": 5.5999, "step": 6780 }, { "epoch": 0.5432869259081453, "grad_norm": 2.2797632217407227, "learning_rate": 4.106006311172916e-05, "loss": 5.7243, "step": 6790 }, { "epoch": 0.5440870539286286, "grad_norm": 2.998309850692749, "learning_rate": 4.1048029095576836e-05, "loss": 5.5031, "step": 6800 }, { "epoch": 0.5448871819491119, "grad_norm": 2.8155364990234375, "learning_rate": 4.10346579665187e-05, "loss": 5.7631, "step": 6810 }, { "epoch": 0.5456873099695951, "grad_norm": 2.327279806137085, "learning_rate": 4.102128683746056e-05, "loss": 5.6293, "step": 6820 }, { "epoch": 0.5464874379900784, "grad_norm": 3.3200621604919434, "learning_rate": 4.100791570840242e-05, "loss": 5.717, "step": 6830 }, { "epoch": 0.5472875660105617, "grad_norm": 2.521144390106201, "learning_rate": 4.099454457934428e-05, "loss": 5.5705, "step": 6840 }, { "epoch": 0.548087694031045, "grad_norm": 2.7198219299316406, "learning_rate": 4.098117345028614e-05, "loss": 5.5931, "step": 6850 }, { "epoch": 0.5488878220515282, "grad_norm": 2.701251268386841, "learning_rate": 4.0967802321228006e-05, "loss": 5.4706, "step": 6860 }, { "epoch": 0.5496879500720115, "grad_norm": 2.2789149284362793, "learning_rate": 4.095443119216987e-05, "loss": 5.5883, "step": 6870 }, { "epoch": 0.5504880780924948, "grad_norm": 2.8821568489074707, "learning_rate": 4.094106006311173e-05, "loss": 5.7525, "step": 6880 }, { "epoch": 0.5512882061129781, "grad_norm": 2.3450064659118652, "learning_rate": 4.0927688934053594e-05, "loss": 5.5166, "step": 6890 }, { "epoch": 0.5520883341334614, "grad_norm": 2.639960527420044, "learning_rate": 4.0914317804995456e-05, "loss": 5.7001, "step": 6900 }, { "epoch": 0.5528884621539446, "grad_norm": 2.6743710041046143, "learning_rate": 4.090094667593732e-05, "loss": 5.7049, "step": 6910 }, { "epoch": 0.553688590174428, "grad_norm": 2.7540199756622314, "learning_rate": 4.088757554687918e-05, "loss": 5.5705, "step": 6920 }, { "epoch": 0.5544887181949112, "grad_norm": 3.2703442573547363, "learning_rate": 4.0874204417821044e-05, "loss": 5.5585, "step": 6930 }, { "epoch": 0.5552888462153944, "grad_norm": 3.684135913848877, "learning_rate": 4.086083328876291e-05, "loss": 5.6561, "step": 6940 }, { "epoch": 0.5560889742358778, "grad_norm": 2.918989896774292, "learning_rate": 4.084746215970477e-05, "loss": 5.5171, "step": 6950 }, { "epoch": 0.556889102256361, "grad_norm": 2.5902323722839355, "learning_rate": 4.083409103064663e-05, "loss": 5.6703, "step": 6960 }, { "epoch": 0.5576892302768442, "grad_norm": 2.23820161819458, "learning_rate": 4.0820719901588495e-05, "loss": 5.7048, "step": 6970 }, { "epoch": 0.5584893582973276, "grad_norm": 2.4339401721954346, "learning_rate": 4.080734877253036e-05, "loss": 5.4264, "step": 6980 }, { "epoch": 0.5592894863178108, "grad_norm": 3.3097031116485596, "learning_rate": 4.0793977643472214e-05, "loss": 5.5931, "step": 6990 }, { "epoch": 0.5600896143382941, "grad_norm": 2.6903202533721924, "learning_rate": 4.0780606514414077e-05, "loss": 5.5349, "step": 7000 }, { "epoch": 0.5600896143382941, "eval_loss": 5.870830535888672, "eval_runtime": 13.3044, "eval_samples_per_second": 3.007, "eval_steps_per_second": 0.376, "step": 7000 } ], "logging_steps": 10, "max_steps": 37494, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 7000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }