diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,26333 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 7000, + "global_step": 37494, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_loss": 9.306169509887695, + "eval_runtime": 10.9126, + "eval_samples_per_second": 3.665, + "eval_steps_per_second": 0.458, + "step": 0 + }, + { + "epoch": 0.0008001280204832773, + "grad_norm": 8.51533031463623, + "learning_rate": 3.5000000000000004e-06, + "loss": 8.786, + "step": 10 + }, + { + "epoch": 0.0016002560409665546, + "grad_norm": 10.90935230255127, + "learning_rate": 8.500000000000002e-06, + "loss": 8.3433, + "step": 20 + }, + { + "epoch": 0.002400384061449832, + "grad_norm": 7.269016265869141, + "learning_rate": 1.3500000000000001e-05, + "loss": 7.549, + "step": 30 + }, + { + "epoch": 0.003200512081933109, + "grad_norm": 8.790578842163086, + "learning_rate": 1.85e-05, + "loss": 7.2574, + "step": 40 + }, + { + "epoch": 0.004000640102416387, + "grad_norm": 6.52068567276001, + "learning_rate": 2.35e-05, + "loss": 7.0024, + "step": 50 + }, + { + "epoch": 0.004800768122899664, + "grad_norm": 6.902959823608398, + "learning_rate": 2.8499999999999998e-05, + "loss": 6.9074, + "step": 60 + }, + { + "epoch": 0.005600896143382941, + "grad_norm": 5.350945949554443, + "learning_rate": 3.35e-05, + "loss": 6.8765, + "step": 70 + }, + { + "epoch": 0.006401024163866218, + "grad_norm": 5.928489685058594, + "learning_rate": 3.85e-05, + "loss": 6.5663, + "step": 80 + }, + { + "epoch": 0.007201152184349496, + "grad_norm": 9.222543716430664, + "learning_rate": 4.35e-05, + "loss": 6.6131, + "step": 90 + }, + { + "epoch": 0.008001280204832774, + "grad_norm": 6.57027006149292, + "learning_rate": 4.85e-05, + "loss": 6.5829, + "step": 100 + }, + { + "epoch": 0.00880140822531605, + 
"grad_norm": 5.280848503112793, + "learning_rate": 4.999064020965931e-05, + "loss": 6.5996, + "step": 110 + }, + { + "epoch": 0.009601536245799328, + "grad_norm": 5.950971603393555, + "learning_rate": 4.997726908060117e-05, + "loss": 6.6075, + "step": 120 + }, + { + "epoch": 0.010401664266282605, + "grad_norm": 4.300549507141113, + "learning_rate": 4.996389795154303e-05, + "loss": 6.5074, + "step": 130 + }, + { + "epoch": 0.011201792286765882, + "grad_norm": 4.824333190917969, + "learning_rate": 4.9950526822484896e-05, + "loss": 6.6072, + "step": 140 + }, + { + "epoch": 0.01200192030724916, + "grad_norm": 5.4324116706848145, + "learning_rate": 4.993715569342676e-05, + "loss": 6.6183, + "step": 150 + }, + { + "epoch": 0.012802048327732437, + "grad_norm": 4.087579250335693, + "learning_rate": 4.992378456436862e-05, + "loss": 6.4806, + "step": 160 + }, + { + "epoch": 0.013602176348215714, + "grad_norm": 7.260207653045654, + "learning_rate": 4.9910413435310484e-05, + "loss": 6.3709, + "step": 170 + }, + { + "epoch": 0.014402304368698993, + "grad_norm": 4.145061016082764, + "learning_rate": 4.9897042306252346e-05, + "loss": 6.2951, + "step": 180 + }, + { + "epoch": 0.01520243238918227, + "grad_norm": 3.2026450634002686, + "learning_rate": 4.98836711771942e-05, + "loss": 6.3255, + "step": 190 + }, + { + "epoch": 0.016002560409665547, + "grad_norm": 3.443145751953125, + "learning_rate": 4.9870300048136065e-05, + "loss": 6.4894, + "step": 200 + }, + { + "epoch": 0.016802688430148822, + "grad_norm": 5.324231147766113, + "learning_rate": 4.985692891907793e-05, + "loss": 6.4312, + "step": 210 + }, + { + "epoch": 0.0176028164506321, + "grad_norm": 3.2833452224731445, + "learning_rate": 4.984355779001979e-05, + "loss": 6.513, + "step": 220 + }, + { + "epoch": 0.018402944471115377, + "grad_norm": 3.8984358310699463, + "learning_rate": 4.983018666096165e-05, + "loss": 6.1683, + "step": 230 + }, + { + "epoch": 0.019203072491598656, + "grad_norm": 4.183676719665527, + 
"learning_rate": 4.9816815531903516e-05, + "loss": 6.329, + "step": 240 + }, + { + "epoch": 0.020003200512081935, + "grad_norm": 3.136693239212036, + "learning_rate": 4.980344440284538e-05, + "loss": 6.466, + "step": 250 + }, + { + "epoch": 0.02080332853256521, + "grad_norm": 4.185967445373535, + "learning_rate": 4.979007327378724e-05, + "loss": 6.4613, + "step": 260 + }, + { + "epoch": 0.02160345655304849, + "grad_norm": 3.105653762817383, + "learning_rate": 4.9776702144729104e-05, + "loss": 6.3596, + "step": 270 + }, + { + "epoch": 0.022403584573531764, + "grad_norm": 3.927561044692993, + "learning_rate": 4.9763331015670967e-05, + "loss": 6.2604, + "step": 280 + }, + { + "epoch": 0.023203712594015043, + "grad_norm": 3.513439178466797, + "learning_rate": 4.974995988661283e-05, + "loss": 6.2747, + "step": 290 + }, + { + "epoch": 0.02400384061449832, + "grad_norm": 3.07377290725708, + "learning_rate": 4.973658875755469e-05, + "loss": 6.202, + "step": 300 + }, + { + "epoch": 0.024803968634981598, + "grad_norm": 3.045619249343872, + "learning_rate": 4.9723217628496555e-05, + "loss": 6.1022, + "step": 310 + }, + { + "epoch": 0.025604096655464873, + "grad_norm": 3.330648183822632, + "learning_rate": 4.970984649943842e-05, + "loss": 6.1544, + "step": 320 + }, + { + "epoch": 0.026404224675948152, + "grad_norm": 3.0299668312072754, + "learning_rate": 4.969647537038028e-05, + "loss": 6.3119, + "step": 330 + }, + { + "epoch": 0.027204352696431428, + "grad_norm": 3.687938928604126, + "learning_rate": 4.9683104241322136e-05, + "loss": 6.333, + "step": 340 + }, + { + "epoch": 0.028004480716914706, + "grad_norm": 4.0919413566589355, + "learning_rate": 4.9669733112264e-05, + "loss": 6.1711, + "step": 350 + }, + { + "epoch": 0.028804608737397985, + "grad_norm": 3.1327242851257324, + "learning_rate": 4.965636198320586e-05, + "loss": 6.3365, + "step": 360 + }, + { + "epoch": 0.02960473675788126, + "grad_norm": 4.531859874725342, + "learning_rate": 4.9642990854147724e-05, + "loss": 
6.2121, + "step": 370 + }, + { + "epoch": 0.03040486477836454, + "grad_norm": 2.522672414779663, + "learning_rate": 4.962961972508959e-05, + "loss": 6.2388, + "step": 380 + }, + { + "epoch": 0.031204992798847815, + "grad_norm": 5.62153959274292, + "learning_rate": 4.961624859603145e-05, + "loss": 6.168, + "step": 390 + }, + { + "epoch": 0.032005120819331094, + "grad_norm": 3.522804021835327, + "learning_rate": 4.960287746697331e-05, + "loss": 6.1207, + "step": 400 + }, + { + "epoch": 0.03280524883981437, + "grad_norm": 7.260324478149414, + "learning_rate": 4.9589506337915175e-05, + "loss": 6.31, + "step": 410 + }, + { + "epoch": 0.033605376860297645, + "grad_norm": 4.309441566467285, + "learning_rate": 4.957613520885704e-05, + "loss": 6.1107, + "step": 420 + }, + { + "epoch": 0.034405504880780924, + "grad_norm": 3.2409913539886475, + "learning_rate": 4.95627640797989e-05, + "loss": 6.2082, + "step": 430 + }, + { + "epoch": 0.0352056329012642, + "grad_norm": 3.9414610862731934, + "learning_rate": 4.954939295074076e-05, + "loss": 6.2102, + "step": 440 + }, + { + "epoch": 0.03600576092174748, + "grad_norm": 2.441235303878784, + "learning_rate": 4.9536021821682626e-05, + "loss": 6.1023, + "step": 450 + }, + { + "epoch": 0.036805888942230754, + "grad_norm": 2.997591972351074, + "learning_rate": 4.952265069262449e-05, + "loss": 6.1147, + "step": 460 + }, + { + "epoch": 0.03760601696271403, + "grad_norm": 3.950436592102051, + "learning_rate": 4.950927956356635e-05, + "loss": 6.0725, + "step": 470 + }, + { + "epoch": 0.03840614498319731, + "grad_norm": 3.4340896606445312, + "learning_rate": 4.9495908434508214e-05, + "loss": 6.1336, + "step": 480 + }, + { + "epoch": 0.03920627300368059, + "grad_norm": 3.28839373588562, + "learning_rate": 4.948253730545007e-05, + "loss": 6.1709, + "step": 490 + }, + { + "epoch": 0.04000640102416387, + "grad_norm": 2.976365566253662, + "learning_rate": 4.946916617639193e-05, + "loss": 6.2074, + "step": 500 + }, + { + "epoch": 
0.04080652904464714, + "grad_norm": 4.156027793884277, + "learning_rate": 4.9455795047333795e-05, + "loss": 6.1694, + "step": 510 + }, + { + "epoch": 0.04160665706513042, + "grad_norm": 3.4855797290802, + "learning_rate": 4.944242391827566e-05, + "loss": 6.1218, + "step": 520 + }, + { + "epoch": 0.0424067850856137, + "grad_norm": 4.489185333251953, + "learning_rate": 4.942905278921752e-05, + "loss": 6.1507, + "step": 530 + }, + { + "epoch": 0.04320691310609698, + "grad_norm": 3.2751166820526123, + "learning_rate": 4.941568166015938e-05, + "loss": 6.1055, + "step": 540 + }, + { + "epoch": 0.04400704112658025, + "grad_norm": 2.4234585762023926, + "learning_rate": 4.9402310531101246e-05, + "loss": 6.1755, + "step": 550 + }, + { + "epoch": 0.04480716914706353, + "grad_norm": 3.4436991214752197, + "learning_rate": 4.938893940204311e-05, + "loss": 6.1882, + "step": 560 + }, + { + "epoch": 0.04560729716754681, + "grad_norm": 3.3731908798217773, + "learning_rate": 4.937556827298497e-05, + "loss": 6.0648, + "step": 570 + }, + { + "epoch": 0.04640742518803009, + "grad_norm": 3.8733670711517334, + "learning_rate": 4.9362197143926834e-05, + "loss": 6.0621, + "step": 580 + }, + { + "epoch": 0.04720755320851336, + "grad_norm": 4.126636505126953, + "learning_rate": 4.9348826014868696e-05, + "loss": 6.122, + "step": 590 + }, + { + "epoch": 0.04800768122899664, + "grad_norm": 3.8605775833129883, + "learning_rate": 4.933545488581056e-05, + "loss": 5.9788, + "step": 600 + }, + { + "epoch": 0.048807809249479917, + "grad_norm": 2.9509966373443604, + "learning_rate": 4.932208375675242e-05, + "loss": 6.2045, + "step": 610 + }, + { + "epoch": 0.049607937269963195, + "grad_norm": 4.4266510009765625, + "learning_rate": 4.9308712627694285e-05, + "loss": 5.9981, + "step": 620 + }, + { + "epoch": 0.050408065290446474, + "grad_norm": 2.79042649269104, + "learning_rate": 4.929534149863615e-05, + "loss": 6.1882, + "step": 630 + }, + { + "epoch": 0.051208193310929746, + "grad_norm": 
2.8986568450927734, + "learning_rate": 4.928197036957801e-05, + "loss": 6.1739, + "step": 640 + }, + { + "epoch": 0.052008321331413025, + "grad_norm": 4.294217586517334, + "learning_rate": 4.926859924051987e-05, + "loss": 6.0566, + "step": 650 + }, + { + "epoch": 0.052808449351896304, + "grad_norm": 8.848836898803711, + "learning_rate": 4.9255228111461735e-05, + "loss": 6.2994, + "step": 660 + }, + { + "epoch": 0.05360857737237958, + "grad_norm": 3.2204337120056152, + "learning_rate": 4.92418569824036e-05, + "loss": 6.0573, + "step": 670 + }, + { + "epoch": 0.054408705392862855, + "grad_norm": 4.775251865386963, + "learning_rate": 4.922848585334546e-05, + "loss": 5.9764, + "step": 680 + }, + { + "epoch": 0.055208833413346134, + "grad_norm": 3.5426905155181885, + "learning_rate": 4.921511472428732e-05, + "loss": 6.0402, + "step": 690 + }, + { + "epoch": 0.05600896143382941, + "grad_norm": 10.72481632232666, + "learning_rate": 4.9201743595229186e-05, + "loss": 6.0024, + "step": 700 + }, + { + "epoch": 0.05680908945431269, + "grad_norm": 2.441681385040283, + "learning_rate": 4.918837246617105e-05, + "loss": 6.1122, + "step": 710 + }, + { + "epoch": 0.05760921747479597, + "grad_norm": 3.375319480895996, + "learning_rate": 4.917500133711291e-05, + "loss": 6.058, + "step": 720 + }, + { + "epoch": 0.05840934549527924, + "grad_norm": 2.821507453918457, + "learning_rate": 4.9161630208054774e-05, + "loss": 6.0586, + "step": 730 + }, + { + "epoch": 0.05920947351576252, + "grad_norm": 2.8658957481384277, + "learning_rate": 4.914825907899664e-05, + "loss": 6.0115, + "step": 740 + }, + { + "epoch": 0.0600096015362458, + "grad_norm": 2.239774227142334, + "learning_rate": 4.91348879499385e-05, + "loss": 6.0669, + "step": 750 + }, + { + "epoch": 0.06080972955672908, + "grad_norm": 3.5249900817871094, + "learning_rate": 4.912151682088036e-05, + "loss": 6.1013, + "step": 760 + }, + { + "epoch": 0.06160985757721235, + "grad_norm": 2.790356159210205, + "learning_rate": 
4.9108145691822225e-05, + "loss": 6.0099, + "step": 770 + }, + { + "epoch": 0.06240998559769563, + "grad_norm": 3.0729963779449463, + "learning_rate": 4.909477456276409e-05, + "loss": 6.1376, + "step": 780 + }, + { + "epoch": 0.06321011361817891, + "grad_norm": 2.9490275382995605, + "learning_rate": 4.908140343370595e-05, + "loss": 6.1457, + "step": 790 + }, + { + "epoch": 0.06401024163866219, + "grad_norm": 2.7475438117980957, + "learning_rate": 4.9068032304647806e-05, + "loss": 6.0041, + "step": 800 + }, + { + "epoch": 0.06481036965914547, + "grad_norm": 2.755703926086426, + "learning_rate": 4.905466117558967e-05, + "loss": 6.0242, + "step": 810 + }, + { + "epoch": 0.06561049767962875, + "grad_norm": 2.724515676498413, + "learning_rate": 4.904129004653153e-05, + "loss": 6.1827, + "step": 820 + }, + { + "epoch": 0.06641062570011202, + "grad_norm": 4.498260974884033, + "learning_rate": 4.9027918917473394e-05, + "loss": 6.0892, + "step": 830 + }, + { + "epoch": 0.06721075372059529, + "grad_norm": 2.4399070739746094, + "learning_rate": 4.901454778841526e-05, + "loss": 6.0197, + "step": 840 + }, + { + "epoch": 0.06801088174107857, + "grad_norm": 2.7584304809570312, + "learning_rate": 4.900117665935712e-05, + "loss": 5.9056, + "step": 850 + }, + { + "epoch": 0.06881100976156185, + "grad_norm": 2.8177144527435303, + "learning_rate": 4.898780553029898e-05, + "loss": 6.1484, + "step": 860 + }, + { + "epoch": 0.06961113778204513, + "grad_norm": 4.181133270263672, + "learning_rate": 4.8974434401240845e-05, + "loss": 5.9376, + "step": 870 + }, + { + "epoch": 0.0704112658025284, + "grad_norm": 3.677849769592285, + "learning_rate": 4.896106327218271e-05, + "loss": 6.0403, + "step": 880 + }, + { + "epoch": 0.07121139382301168, + "grad_norm": 3.1553192138671875, + "learning_rate": 4.894769214312457e-05, + "loss": 6.0488, + "step": 890 + }, + { + "epoch": 0.07201152184349496, + "grad_norm": 3.2580947875976562, + "learning_rate": 4.893432101406643e-05, + "loss": 6.1002, + "step": 
900 + }, + { + "epoch": 0.07281164986397824, + "grad_norm": 6.328150749206543, + "learning_rate": 4.8920949885008296e-05, + "loss": 6.0225, + "step": 910 + }, + { + "epoch": 0.07361177788446151, + "grad_norm": 2.7467615604400635, + "learning_rate": 4.890757875595016e-05, + "loss": 5.9622, + "step": 920 + }, + { + "epoch": 0.07441190590494479, + "grad_norm": 2.86570405960083, + "learning_rate": 4.889420762689202e-05, + "loss": 5.9718, + "step": 930 + }, + { + "epoch": 0.07521203392542807, + "grad_norm": 2.544917106628418, + "learning_rate": 4.8880836497833884e-05, + "loss": 5.8697, + "step": 940 + }, + { + "epoch": 0.07601216194591134, + "grad_norm": 2.5245840549468994, + "learning_rate": 4.8867465368775746e-05, + "loss": 5.9973, + "step": 950 + }, + { + "epoch": 0.07681228996639462, + "grad_norm": 3.6830902099609375, + "learning_rate": 4.88540942397176e-05, + "loss": 5.943, + "step": 960 + }, + { + "epoch": 0.0776124179868779, + "grad_norm": 2.6643354892730713, + "learning_rate": 4.8840723110659465e-05, + "loss": 5.8958, + "step": 970 + }, + { + "epoch": 0.07841254600736118, + "grad_norm": 6.4623565673828125, + "learning_rate": 4.882735198160133e-05, + "loss": 6.0236, + "step": 980 + }, + { + "epoch": 0.07921267402784446, + "grad_norm": 2.186974048614502, + "learning_rate": 4.881398085254319e-05, + "loss": 6.0481, + "step": 990 + }, + { + "epoch": 0.08001280204832774, + "grad_norm": 2.4983859062194824, + "learning_rate": 4.880060972348505e-05, + "loss": 6.075, + "step": 1000 + }, + { + "epoch": 0.080812930068811, + "grad_norm": 2.778280258178711, + "learning_rate": 4.8787238594426916e-05, + "loss": 6.0757, + "step": 1010 + }, + { + "epoch": 0.08161305808929428, + "grad_norm": 2.706965923309326, + "learning_rate": 4.877386746536878e-05, + "loss": 6.1504, + "step": 1020 + }, + { + "epoch": 0.08241318610977756, + "grad_norm": 3.4069600105285645, + "learning_rate": 4.876049633631064e-05, + "loss": 6.0889, + "step": 1030 + }, + { + "epoch": 0.08321331413026084, + 
"grad_norm": 3.179551124572754, + "learning_rate": 4.8747125207252504e-05, + "loss": 6.0057, + "step": 1040 + }, + { + "epoch": 0.08401344215074412, + "grad_norm": 2.924018383026123, + "learning_rate": 4.873375407819437e-05, + "loss": 5.8406, + "step": 1050 + }, + { + "epoch": 0.0848135701712274, + "grad_norm": 3.103912115097046, + "learning_rate": 4.872038294913623e-05, + "loss": 6.0351, + "step": 1060 + }, + { + "epoch": 0.08561369819171068, + "grad_norm": 2.8037219047546387, + "learning_rate": 4.870701182007809e-05, + "loss": 6.0272, + "step": 1070 + }, + { + "epoch": 0.08641382621219396, + "grad_norm": 2.477062940597534, + "learning_rate": 4.8693640691019955e-05, + "loss": 5.9269, + "step": 1080 + }, + { + "epoch": 0.08721395423267723, + "grad_norm": 2.748488187789917, + "learning_rate": 4.868026956196182e-05, + "loss": 5.943, + "step": 1090 + }, + { + "epoch": 0.0880140822531605, + "grad_norm": 3.3991920948028564, + "learning_rate": 4.866689843290368e-05, + "loss": 6.1455, + "step": 1100 + }, + { + "epoch": 0.08881421027364378, + "grad_norm": 3.208509683609009, + "learning_rate": 4.8653527303845536e-05, + "loss": 5.9746, + "step": 1110 + }, + { + "epoch": 0.08961433829412706, + "grad_norm": 3.3378469944000244, + "learning_rate": 4.86401561747874e-05, + "loss": 5.9185, + "step": 1120 + }, + { + "epoch": 0.09041446631461034, + "grad_norm": 2.269606113433838, + "learning_rate": 4.862678504572926e-05, + "loss": 5.9369, + "step": 1130 + }, + { + "epoch": 0.09121459433509362, + "grad_norm": 2.749335765838623, + "learning_rate": 4.8613413916671124e-05, + "loss": 6.0648, + "step": 1140 + }, + { + "epoch": 0.0920147223555769, + "grad_norm": 2.821913480758667, + "learning_rate": 4.860004278761299e-05, + "loss": 5.952, + "step": 1150 + }, + { + "epoch": 0.09281485037606017, + "grad_norm": 2.640990734100342, + "learning_rate": 4.858667165855485e-05, + "loss": 6.0537, + "step": 1160 + }, + { + "epoch": 0.09361497839654345, + "grad_norm": 3.570896625518799, + 
"learning_rate": 4.857330052949671e-05, + "loss": 5.7721, + "step": 1170 + }, + { + "epoch": 0.09441510641702672, + "grad_norm": 3.245318651199341, + "learning_rate": 4.8559929400438575e-05, + "loss": 5.7305, + "step": 1180 + }, + { + "epoch": 0.09521523443751, + "grad_norm": 4.075076580047607, + "learning_rate": 4.854655827138044e-05, + "loss": 5.974, + "step": 1190 + }, + { + "epoch": 0.09601536245799328, + "grad_norm": 2.429893732070923, + "learning_rate": 4.85331871423223e-05, + "loss": 5.7828, + "step": 1200 + }, + { + "epoch": 0.09681549047847655, + "grad_norm": 2.7077040672302246, + "learning_rate": 4.851981601326416e-05, + "loss": 5.9143, + "step": 1210 + }, + { + "epoch": 0.09761561849895983, + "grad_norm": 2.767918586730957, + "learning_rate": 4.8506444884206026e-05, + "loss": 5.9449, + "step": 1220 + }, + { + "epoch": 0.09841574651944311, + "grad_norm": 2.4544034004211426, + "learning_rate": 4.849307375514789e-05, + "loss": 6.0034, + "step": 1230 + }, + { + "epoch": 0.09921587453992639, + "grad_norm": 5.215607643127441, + "learning_rate": 4.847970262608975e-05, + "loss": 5.867, + "step": 1240 + }, + { + "epoch": 0.10001600256040967, + "grad_norm": 2.7856080532073975, + "learning_rate": 4.8466331497031614e-05, + "loss": 6.0213, + "step": 1250 + }, + { + "epoch": 0.10081613058089295, + "grad_norm": 2.5528719425201416, + "learning_rate": 4.8452960367973476e-05, + "loss": 5.9634, + "step": 1260 + }, + { + "epoch": 0.10161625860137621, + "grad_norm": 2.4917409420013428, + "learning_rate": 4.843958923891533e-05, + "loss": 5.887, + "step": 1270 + }, + { + "epoch": 0.10241638662185949, + "grad_norm": 6.125699520111084, + "learning_rate": 4.8426218109857195e-05, + "loss": 6.1189, + "step": 1280 + }, + { + "epoch": 0.10321651464234277, + "grad_norm": 2.783156156539917, + "learning_rate": 4.841284698079906e-05, + "loss": 5.9064, + "step": 1290 + }, + { + "epoch": 0.10401664266282605, + "grad_norm": 3.611070156097412, + "learning_rate": 4.839947585174092e-05, + 
"loss": 5.9405, + "step": 1300 + }, + { + "epoch": 0.10481677068330933, + "grad_norm": 4.296909809112549, + "learning_rate": 4.838610472268278e-05, + "loss": 5.9067, + "step": 1310 + }, + { + "epoch": 0.10561689870379261, + "grad_norm": 2.4273040294647217, + "learning_rate": 4.8372733593624646e-05, + "loss": 5.888, + "step": 1320 + }, + { + "epoch": 0.10641702672427589, + "grad_norm": 2.6499924659729004, + "learning_rate": 4.835936246456651e-05, + "loss": 5.9683, + "step": 1330 + }, + { + "epoch": 0.10721715474475917, + "grad_norm": 3.1474297046661377, + "learning_rate": 4.834599133550837e-05, + "loss": 5.8946, + "step": 1340 + }, + { + "epoch": 0.10801728276524244, + "grad_norm": 3.5050199031829834, + "learning_rate": 4.8332620206450234e-05, + "loss": 5.9179, + "step": 1350 + }, + { + "epoch": 0.10881741078572571, + "grad_norm": 2.693700075149536, + "learning_rate": 4.8319249077392096e-05, + "loss": 5.7965, + "step": 1360 + }, + { + "epoch": 0.10961753880620899, + "grad_norm": 2.8202953338623047, + "learning_rate": 4.830587794833396e-05, + "loss": 5.9526, + "step": 1370 + }, + { + "epoch": 0.11041766682669227, + "grad_norm": 2.514862060546875, + "learning_rate": 4.829250681927582e-05, + "loss": 5.936, + "step": 1380 + }, + { + "epoch": 0.11121779484717555, + "grad_norm": 3.18804931640625, + "learning_rate": 4.8279135690217685e-05, + "loss": 5.9246, + "step": 1390 + }, + { + "epoch": 0.11201792286765883, + "grad_norm": 2.77697491645813, + "learning_rate": 4.826576456115955e-05, + "loss": 5.9576, + "step": 1400 + }, + { + "epoch": 0.1128180508881421, + "grad_norm": 2.762524127960205, + "learning_rate": 4.825239343210141e-05, + "loss": 5.9085, + "step": 1410 + }, + { + "epoch": 0.11361817890862538, + "grad_norm": 2.4407670497894287, + "learning_rate": 4.8239022303043266e-05, + "loss": 5.9518, + "step": 1420 + }, + { + "epoch": 0.11441830692910866, + "grad_norm": 3.1036713123321533, + "learning_rate": 4.822565117398513e-05, + "loss": 5.8412, + "step": 1430 + }, + { + 
"epoch": 0.11521843494959194, + "grad_norm": 3.319058418273926, + "learning_rate": 4.821228004492699e-05, + "loss": 5.9733, + "step": 1440 + }, + { + "epoch": 0.1160185629700752, + "grad_norm": 2.13468599319458, + "learning_rate": 4.8198908915868854e-05, + "loss": 5.9193, + "step": 1450 + }, + { + "epoch": 0.11681869099055849, + "grad_norm": 2.6057028770446777, + "learning_rate": 4.8185537786810717e-05, + "loss": 5.9807, + "step": 1460 + }, + { + "epoch": 0.11761881901104176, + "grad_norm": 2.7509753704071045, + "learning_rate": 4.817216665775258e-05, + "loss": 5.9534, + "step": 1470 + }, + { + "epoch": 0.11841894703152504, + "grad_norm": 2.111055850982666, + "learning_rate": 4.815879552869444e-05, + "loss": 5.9207, + "step": 1480 + }, + { + "epoch": 0.11921907505200832, + "grad_norm": 2.5271990299224854, + "learning_rate": 4.8145424399636305e-05, + "loss": 5.7148, + "step": 1490 + }, + { + "epoch": 0.1200192030724916, + "grad_norm": 2.814138174057007, + "learning_rate": 4.813205327057817e-05, + "loss": 5.9498, + "step": 1500 + }, + { + "epoch": 0.12081933109297488, + "grad_norm": 3.449355363845825, + "learning_rate": 4.811868214152003e-05, + "loss": 5.7814, + "step": 1510 + }, + { + "epoch": 0.12161945911345816, + "grad_norm": 2.813746213912964, + "learning_rate": 4.810531101246189e-05, + "loss": 5.9517, + "step": 1520 + }, + { + "epoch": 0.12241958713394142, + "grad_norm": 2.529242753982544, + "learning_rate": 4.8091939883403755e-05, + "loss": 5.8227, + "step": 1530 + }, + { + "epoch": 0.1232197151544247, + "grad_norm": 2.2425034046173096, + "learning_rate": 4.807856875434562e-05, + "loss": 6.1064, + "step": 1540 + }, + { + "epoch": 0.12401984317490798, + "grad_norm": 2.7732784748077393, + "learning_rate": 4.806519762528748e-05, + "loss": 5.8888, + "step": 1550 + }, + { + "epoch": 0.12481997119539126, + "grad_norm": 2.5558009147644043, + "learning_rate": 4.8051826496229343e-05, + "loss": 5.8185, + "step": 1560 + }, + { + "epoch": 0.12562009921587455, + 
"grad_norm": 2.884411096572876, + "learning_rate": 4.8038455367171206e-05, + "loss": 6.0534, + "step": 1570 + }, + { + "epoch": 0.12642022723635782, + "grad_norm": 2.5747668743133545, + "learning_rate": 4.802508423811307e-05, + "loss": 5.8186, + "step": 1580 + }, + { + "epoch": 0.12722035525684108, + "grad_norm": 2.324767827987671, + "learning_rate": 4.801171310905493e-05, + "loss": 5.8642, + "step": 1590 + }, + { + "epoch": 0.12802048327732438, + "grad_norm": 2.2255160808563232, + "learning_rate": 4.7998341979996794e-05, + "loss": 5.8559, + "step": 1600 + }, + { + "epoch": 0.12882061129780764, + "grad_norm": 2.97525954246521, + "learning_rate": 4.798497085093866e-05, + "loss": 5.8744, + "step": 1610 + }, + { + "epoch": 0.12962073931829093, + "grad_norm": 2.23962664604187, + "learning_rate": 4.797159972188052e-05, + "loss": 5.7545, + "step": 1620 + }, + { + "epoch": 0.1304208673387742, + "grad_norm": 3.6182124614715576, + "learning_rate": 4.795822859282238e-05, + "loss": 5.8872, + "step": 1630 + }, + { + "epoch": 0.1312209953592575, + "grad_norm": 4.068545341491699, + "learning_rate": 4.7944857463764245e-05, + "loss": 5.9008, + "step": 1640 + }, + { + "epoch": 0.13202112337974076, + "grad_norm": 3.627082109451294, + "learning_rate": 4.793148633470611e-05, + "loss": 5.8215, + "step": 1650 + }, + { + "epoch": 0.13282125140022405, + "grad_norm": 3.0080721378326416, + "learning_rate": 4.791811520564797e-05, + "loss": 5.9086, + "step": 1660 + }, + { + "epoch": 0.13362137942070731, + "grad_norm": 2.5463860034942627, + "learning_rate": 4.790474407658983e-05, + "loss": 5.776, + "step": 1670 + }, + { + "epoch": 0.13442150744119058, + "grad_norm": 2.212488889694214, + "learning_rate": 4.7891372947531696e-05, + "loss": 6.006, + "step": 1680 + }, + { + "epoch": 0.13522163546167387, + "grad_norm": 4.147563934326172, + "learning_rate": 4.787800181847356e-05, + "loss": 5.886, + "step": 1690 + }, + { + "epoch": 0.13602176348215714, + "grad_norm": 2.6021018028259277, + 
"learning_rate": 4.786463068941542e-05, + "loss": 5.9182, + "step": 1700 + }, + { + "epoch": 0.13682189150264043, + "grad_norm": 2.3109893798828125, + "learning_rate": 4.7851259560357284e-05, + "loss": 5.8084, + "step": 1710 + }, + { + "epoch": 0.1376220195231237, + "grad_norm": 2.8678529262542725, + "learning_rate": 4.7837888431299147e-05, + "loss": 6.0363, + "step": 1720 + }, + { + "epoch": 0.138422147543607, + "grad_norm": 2.1921958923339844, + "learning_rate": 4.7824517302241e-05, + "loss": 5.7667, + "step": 1730 + }, + { + "epoch": 0.13922227556409025, + "grad_norm": 2.6883316040039062, + "learning_rate": 4.7811146173182865e-05, + "loss": 5.7906, + "step": 1740 + }, + { + "epoch": 0.14002240358457352, + "grad_norm": 2.4079957008361816, + "learning_rate": 4.779777504412473e-05, + "loss": 5.7698, + "step": 1750 + }, + { + "epoch": 0.1408225316050568, + "grad_norm": 4.29390287399292, + "learning_rate": 4.778440391506659e-05, + "loss": 5.9639, + "step": 1760 + }, + { + "epoch": 0.14162265962554008, + "grad_norm": 4.133132457733154, + "learning_rate": 4.777103278600845e-05, + "loss": 6.0901, + "step": 1770 + }, + { + "epoch": 0.14242278764602337, + "grad_norm": 3.871561288833618, + "learning_rate": 4.7757661656950316e-05, + "loss": 5.7455, + "step": 1780 + }, + { + "epoch": 0.14322291566650663, + "grad_norm": 4.266111850738525, + "learning_rate": 4.774429052789218e-05, + "loss": 5.9971, + "step": 1790 + }, + { + "epoch": 0.14402304368698993, + "grad_norm": 2.9000513553619385, + "learning_rate": 4.773091939883404e-05, + "loss": 5.9025, + "step": 1800 + }, + { + "epoch": 0.1448231717074732, + "grad_norm": 2.549964189529419, + "learning_rate": 4.7717548269775904e-05, + "loss": 5.768, + "step": 1810 + }, + { + "epoch": 0.14562329972795648, + "grad_norm": 2.2882704734802246, + "learning_rate": 4.770417714071777e-05, + "loss": 6.022, + "step": 1820 + }, + { + "epoch": 0.14642342774843975, + "grad_norm": 2.6501784324645996, + "learning_rate": 4.769080601165963e-05, + 
"loss": 5.8539, + "step": 1830 + }, + { + "epoch": 0.14722355576892301, + "grad_norm": 2.3417108058929443, + "learning_rate": 4.767743488260149e-05, + "loss": 5.7734, + "step": 1840 + }, + { + "epoch": 0.1480236837894063, + "grad_norm": 2.2151668071746826, + "learning_rate": 4.7664063753543355e-05, + "loss": 5.84, + "step": 1850 + }, + { + "epoch": 0.14882381180988957, + "grad_norm": 3.114260196685791, + "learning_rate": 4.765069262448522e-05, + "loss": 5.9409, + "step": 1860 + }, + { + "epoch": 0.14962393983037287, + "grad_norm": 2.4931910037994385, + "learning_rate": 4.763732149542708e-05, + "loss": 5.9396, + "step": 1870 + }, + { + "epoch": 0.15042406785085613, + "grad_norm": 3.736487865447998, + "learning_rate": 4.7623950366368936e-05, + "loss": 5.7427, + "step": 1880 + }, + { + "epoch": 0.15122419587133942, + "grad_norm": 4.730785846710205, + "learning_rate": 4.76105792373108e-05, + "loss": 5.9181, + "step": 1890 + }, + { + "epoch": 0.1520243238918227, + "grad_norm": 2.9264132976531982, + "learning_rate": 4.759720810825266e-05, + "loss": 5.8967, + "step": 1900 + }, + { + "epoch": 0.15282445191230598, + "grad_norm": 3.2538132667541504, + "learning_rate": 4.7583836979194524e-05, + "loss": 5.8459, + "step": 1910 + }, + { + "epoch": 0.15362457993278925, + "grad_norm": 2.7208549976348877, + "learning_rate": 4.757046585013639e-05, + "loss": 5.7038, + "step": 1920 + }, + { + "epoch": 0.1544247079532725, + "grad_norm": 2.7510788440704346, + "learning_rate": 4.755709472107825e-05, + "loss": 5.8524, + "step": 1930 + }, + { + "epoch": 0.1552248359737558, + "grad_norm": 2.6565892696380615, + "learning_rate": 4.754372359202011e-05, + "loss": 5.6324, + "step": 1940 + }, + { + "epoch": 0.15602496399423907, + "grad_norm": 2.954798936843872, + "learning_rate": 4.7530352462961975e-05, + "loss": 5.8388, + "step": 1950 + }, + { + "epoch": 0.15682509201472236, + "grad_norm": 2.291714668273926, + "learning_rate": 4.751698133390384e-05, + "loss": 5.7504, + "step": 1960 + }, + { + 
"epoch": 0.15762522003520563, + "grad_norm": 2.1387598514556885, + "learning_rate": 4.75036102048457e-05, + "loss": 5.7556, + "step": 1970 + }, + { + "epoch": 0.15842534805568892, + "grad_norm": 2.290407180786133, + "learning_rate": 4.749023907578756e-05, + "loss": 5.7089, + "step": 1980 + }, + { + "epoch": 0.15922547607617218, + "grad_norm": 2.852696657180786, + "learning_rate": 4.7476867946729426e-05, + "loss": 5.8656, + "step": 1990 + }, + { + "epoch": 0.16002560409665548, + "grad_norm": 2.8190526962280273, + "learning_rate": 4.746349681767129e-05, + "loss": 6.0134, + "step": 2000 + }, + { + "epoch": 0.16082573211713874, + "grad_norm": 2.705008029937744, + "learning_rate": 4.745012568861315e-05, + "loss": 5.8713, + "step": 2010 + }, + { + "epoch": 0.161625860137622, + "grad_norm": 3.571394205093384, + "learning_rate": 4.7436754559555014e-05, + "loss": 5.8329, + "step": 2020 + }, + { + "epoch": 0.1624259881581053, + "grad_norm": 2.687455177307129, + "learning_rate": 4.7423383430496876e-05, + "loss": 5.8355, + "step": 2030 + }, + { + "epoch": 0.16322611617858857, + "grad_norm": 2.6158690452575684, + "learning_rate": 4.741001230143873e-05, + "loss": 5.6938, + "step": 2040 + }, + { + "epoch": 0.16402624419907186, + "grad_norm": 2.9657154083251953, + "learning_rate": 4.7396641172380595e-05, + "loss": 5.7514, + "step": 2050 + }, + { + "epoch": 0.16482637221955512, + "grad_norm": 2.310607433319092, + "learning_rate": 4.738327004332246e-05, + "loss": 5.7397, + "step": 2060 + }, + { + "epoch": 0.16562650024003842, + "grad_norm": 2.855271339416504, + "learning_rate": 4.736989891426432e-05, + "loss": 5.7645, + "step": 2070 + }, + { + "epoch": 0.16642662826052168, + "grad_norm": 2.778768301010132, + "learning_rate": 4.735652778520618e-05, + "loss": 5.9582, + "step": 2080 + }, + { + "epoch": 0.16722675628100497, + "grad_norm": 3.069973945617676, + "learning_rate": 4.7343156656148046e-05, + "loss": 5.8205, + "step": 2090 + }, + { + "epoch": 0.16802688430148824, + "grad_norm": 
3.5799551010131836, + "learning_rate": 4.732978552708991e-05, + "loss": 5.9001, + "step": 2100 + }, + { + "epoch": 0.1688270123219715, + "grad_norm": 2.556668758392334, + "learning_rate": 4.731641439803177e-05, + "loss": 5.7258, + "step": 2110 + }, + { + "epoch": 0.1696271403424548, + "grad_norm": 2.7847707271575928, + "learning_rate": 4.7303043268973634e-05, + "loss": 5.9007, + "step": 2120 + }, + { + "epoch": 0.17042726836293806, + "grad_norm": 4.071508407592773, + "learning_rate": 4.7289672139915496e-05, + "loss": 5.7035, + "step": 2130 + }, + { + "epoch": 0.17122739638342135, + "grad_norm": 2.6188418865203857, + "learning_rate": 4.727630101085736e-05, + "loss": 5.651, + "step": 2140 + }, + { + "epoch": 0.17202752440390462, + "grad_norm": 1.952249526977539, + "learning_rate": 4.726292988179922e-05, + "loss": 6.1107, + "step": 2150 + }, + { + "epoch": 0.1728276524243879, + "grad_norm": 2.299018144607544, + "learning_rate": 4.7249558752741085e-05, + "loss": 5.7609, + "step": 2160 + }, + { + "epoch": 0.17362778044487118, + "grad_norm": 2.5578439235687256, + "learning_rate": 4.723618762368295e-05, + "loss": 5.792, + "step": 2170 + }, + { + "epoch": 0.17442790846535447, + "grad_norm": 3.9921529293060303, + "learning_rate": 4.722281649462481e-05, + "loss": 5.7233, + "step": 2180 + }, + { + "epoch": 0.17522803648583773, + "grad_norm": 2.5521302223205566, + "learning_rate": 4.7209445365566666e-05, + "loss": 5.807, + "step": 2190 + }, + { + "epoch": 0.176028164506321, + "grad_norm": 2.71401047706604, + "learning_rate": 4.719607423650853e-05, + "loss": 5.6689, + "step": 2200 + }, + { + "epoch": 0.1768282925268043, + "grad_norm": 3.782607316970825, + "learning_rate": 4.718270310745039e-05, + "loss": 5.734, + "step": 2210 + }, + { + "epoch": 0.17762842054728756, + "grad_norm": 2.57356333732605, + "learning_rate": 4.7169331978392254e-05, + "loss": 5.8101, + "step": 2220 + }, + { + "epoch": 0.17842854856777085, + "grad_norm": 2.7005815505981445, + "learning_rate": 
4.715596084933412e-05, + "loss": 6.0603, + "step": 2230 + }, + { + "epoch": 0.17922867658825412, + "grad_norm": 2.081550359725952, + "learning_rate": 4.714258972027598e-05, + "loss": 5.7677, + "step": 2240 + }, + { + "epoch": 0.1800288046087374, + "grad_norm": 3.6565728187561035, + "learning_rate": 4.712921859121784e-05, + "loss": 5.9672, + "step": 2250 + }, + { + "epoch": 0.18082893262922067, + "grad_norm": 2.4702320098876953, + "learning_rate": 4.7115847462159705e-05, + "loss": 5.8397, + "step": 2260 + }, + { + "epoch": 0.18162906064970397, + "grad_norm": 3.335736036300659, + "learning_rate": 4.710247633310157e-05, + "loss": 5.7021, + "step": 2270 + }, + { + "epoch": 0.18242918867018723, + "grad_norm": 3.3939075469970703, + "learning_rate": 4.708910520404343e-05, + "loss": 5.8464, + "step": 2280 + }, + { + "epoch": 0.1832293166906705, + "grad_norm": 2.4869279861450195, + "learning_rate": 4.707573407498529e-05, + "loss": 5.6904, + "step": 2290 + }, + { + "epoch": 0.1840294447111538, + "grad_norm": 2.4240360260009766, + "learning_rate": 4.7062362945927155e-05, + "loss": 5.7227, + "step": 2300 + }, + { + "epoch": 0.18482957273163705, + "grad_norm": 2.428786039352417, + "learning_rate": 4.704899181686902e-05, + "loss": 5.8295, + "step": 2310 + }, + { + "epoch": 0.18562970075212035, + "grad_norm": 3.3214187622070312, + "learning_rate": 4.703562068781088e-05, + "loss": 5.8341, + "step": 2320 + }, + { + "epoch": 0.1864298287726036, + "grad_norm": 3.2146456241607666, + "learning_rate": 4.7022249558752744e-05, + "loss": 5.7217, + "step": 2330 + }, + { + "epoch": 0.1872299567930869, + "grad_norm": 4.442914009094238, + "learning_rate": 4.7008878429694606e-05, + "loss": 5.9003, + "step": 2340 + }, + { + "epoch": 0.18803008481357017, + "grad_norm": 1.9268267154693604, + "learning_rate": 4.699550730063646e-05, + "loss": 5.8292, + "step": 2350 + }, + { + "epoch": 0.18883021283405343, + "grad_norm": 3.130021095275879, + "learning_rate": 4.6982136171578325e-05, + "loss": 5.6864, 
+ "step": 2360 + }, + { + "epoch": 0.18963034085453673, + "grad_norm": 2.8835690021514893, + "learning_rate": 4.696876504252019e-05, + "loss": 5.829, + "step": 2370 + }, + { + "epoch": 0.19043046887502, + "grad_norm": 2.4171135425567627, + "learning_rate": 4.695539391346205e-05, + "loss": 5.7972, + "step": 2380 + }, + { + "epoch": 0.19123059689550329, + "grad_norm": 3.782817840576172, + "learning_rate": 4.694202278440391e-05, + "loss": 5.8497, + "step": 2390 + }, + { + "epoch": 0.19203072491598655, + "grad_norm": 2.475249767303467, + "learning_rate": 4.6928651655345776e-05, + "loss": 5.9237, + "step": 2400 + }, + { + "epoch": 0.19283085293646984, + "grad_norm": 2.5809242725372314, + "learning_rate": 4.691528052628764e-05, + "loss": 5.7756, + "step": 2410 + }, + { + "epoch": 0.1936309809569531, + "grad_norm": 2.6922059059143066, + "learning_rate": 4.69019093972295e-05, + "loss": 5.9326, + "step": 2420 + }, + { + "epoch": 0.1944311089774364, + "grad_norm": 2.7542431354522705, + "learning_rate": 4.6888538268171364e-05, + "loss": 5.6279, + "step": 2430 + }, + { + "epoch": 0.19523123699791967, + "grad_norm": 2.4063303470611572, + "learning_rate": 4.6875167139113226e-05, + "loss": 5.91, + "step": 2440 + }, + { + "epoch": 0.19603136501840293, + "grad_norm": 4.855547904968262, + "learning_rate": 4.686179601005509e-05, + "loss": 5.7286, + "step": 2450 + }, + { + "epoch": 0.19683149303888622, + "grad_norm": 2.9875595569610596, + "learning_rate": 4.684842488099695e-05, + "loss": 5.8299, + "step": 2460 + }, + { + "epoch": 0.1976316210593695, + "grad_norm": 4.467639923095703, + "learning_rate": 4.6835053751938814e-05, + "loss": 5.8469, + "step": 2470 + }, + { + "epoch": 0.19843174907985278, + "grad_norm": 2.2144124507904053, + "learning_rate": 4.682168262288068e-05, + "loss": 5.7871, + "step": 2480 + }, + { + "epoch": 0.19923187710033605, + "grad_norm": 2.4507012367248535, + "learning_rate": 4.680831149382254e-05, + "loss": 5.7529, + "step": 2490 + }, + { + "epoch": 
0.20003200512081934, + "grad_norm": 2.208648681640625, + "learning_rate": 4.67949403647644e-05, + "loss": 5.7265, + "step": 2500 + }, + { + "epoch": 0.2008321331413026, + "grad_norm": 2.560302257537842, + "learning_rate": 4.6781569235706265e-05, + "loss": 5.7842, + "step": 2510 + }, + { + "epoch": 0.2016322611617859, + "grad_norm": 2.354292154312134, + "learning_rate": 4.676819810664813e-05, + "loss": 5.8468, + "step": 2520 + }, + { + "epoch": 0.20243238918226916, + "grad_norm": 2.9559860229492188, + "learning_rate": 4.675482697758999e-05, + "loss": 5.7003, + "step": 2530 + }, + { + "epoch": 0.20323251720275243, + "grad_norm": 3.251077651977539, + "learning_rate": 4.674145584853185e-05, + "loss": 5.8129, + "step": 2540 + }, + { + "epoch": 0.20403264522323572, + "grad_norm": 2.7863471508026123, + "learning_rate": 4.6728084719473716e-05, + "loss": 5.6814, + "step": 2550 + }, + { + "epoch": 0.20483277324371899, + "grad_norm": 2.9006989002227783, + "learning_rate": 4.671471359041558e-05, + "loss": 5.8292, + "step": 2560 + }, + { + "epoch": 0.20563290126420228, + "grad_norm": 2.930689573287964, + "learning_rate": 4.670134246135744e-05, + "loss": 5.8825, + "step": 2570 + }, + { + "epoch": 0.20643302928468554, + "grad_norm": 2.3105032444000244, + "learning_rate": 4.6687971332299304e-05, + "loss": 5.7039, + "step": 2580 + }, + { + "epoch": 0.20723315730516884, + "grad_norm": 3.1141879558563232, + "learning_rate": 4.667460020324117e-05, + "loss": 5.8692, + "step": 2590 + }, + { + "epoch": 0.2080332853256521, + "grad_norm": 3.5017199516296387, + "learning_rate": 4.666122907418303e-05, + "loss": 5.7922, + "step": 2600 + }, + { + "epoch": 0.2088334133461354, + "grad_norm": 2.657975912094116, + "learning_rate": 4.664785794512489e-05, + "loss": 5.7736, + "step": 2610 + }, + { + "epoch": 0.20963354136661866, + "grad_norm": 3.246952772140503, + "learning_rate": 4.6634486816066755e-05, + "loss": 5.768, + "step": 2620 + }, + { + "epoch": 0.21043366938710192, + "grad_norm": 
6.832335948944092, + "learning_rate": 4.662111568700862e-05, + "loss": 5.6752, + "step": 2630 + }, + { + "epoch": 0.21123379740758522, + "grad_norm": 3.2479753494262695, + "learning_rate": 4.660774455795048e-05, + "loss": 5.8015, + "step": 2640 + }, + { + "epoch": 0.21203392542806848, + "grad_norm": 2.809082508087158, + "learning_rate": 4.659437342889234e-05, + "loss": 5.8663, + "step": 2650 + }, + { + "epoch": 0.21283405344855177, + "grad_norm": 3.7948036193847656, + "learning_rate": 4.65810022998342e-05, + "loss": 5.889, + "step": 2660 + }, + { + "epoch": 0.21363418146903504, + "grad_norm": 2.836090564727783, + "learning_rate": 4.656763117077606e-05, + "loss": 5.7516, + "step": 2670 + }, + { + "epoch": 0.21443430948951833, + "grad_norm": 3.0940232276916504, + "learning_rate": 4.6554260041717924e-05, + "loss": 5.7033, + "step": 2680 + }, + { + "epoch": 0.2152344375100016, + "grad_norm": 2.436757802963257, + "learning_rate": 4.654088891265979e-05, + "loss": 5.746, + "step": 2690 + }, + { + "epoch": 0.2160345655304849, + "grad_norm": 2.4339609146118164, + "learning_rate": 4.652751778360165e-05, + "loss": 5.828, + "step": 2700 + }, + { + "epoch": 0.21683469355096816, + "grad_norm": 2.379366874694824, + "learning_rate": 4.651414665454351e-05, + "loss": 5.719, + "step": 2710 + }, + { + "epoch": 0.21763482157145142, + "grad_norm": 2.1722371578216553, + "learning_rate": 4.6500775525485375e-05, + "loss": 5.7875, + "step": 2720 + }, + { + "epoch": 0.2184349495919347, + "grad_norm": 3.633279800415039, + "learning_rate": 4.648740439642724e-05, + "loss": 5.802, + "step": 2730 + }, + { + "epoch": 0.21923507761241798, + "grad_norm": 2.4091219902038574, + "learning_rate": 4.64740332673691e-05, + "loss": 5.8197, + "step": 2740 + }, + { + "epoch": 0.22003520563290127, + "grad_norm": 2.7289021015167236, + "learning_rate": 4.646066213831096e-05, + "loss": 5.9445, + "step": 2750 + }, + { + "epoch": 0.22083533365338454, + "grad_norm": 2.376481294631958, + "learning_rate": 
4.6447291009252826e-05, + "loss": 5.9943, + "step": 2760 + }, + { + "epoch": 0.22163546167386783, + "grad_norm": 2.6542563438415527, + "learning_rate": 4.643391988019469e-05, + "loss": 5.6049, + "step": 2770 + }, + { + "epoch": 0.2224355896943511, + "grad_norm": 2.320472240447998, + "learning_rate": 4.642054875113655e-05, + "loss": 5.7637, + "step": 2780 + }, + { + "epoch": 0.2232357177148344, + "grad_norm": 2.8923239707946777, + "learning_rate": 4.6407177622078414e-05, + "loss": 5.9666, + "step": 2790 + }, + { + "epoch": 0.22403584573531765, + "grad_norm": 4.277271270751953, + "learning_rate": 4.6393806493020276e-05, + "loss": 5.8393, + "step": 2800 + }, + { + "epoch": 0.22483597375580092, + "grad_norm": 2.797428607940674, + "learning_rate": 4.638043536396213e-05, + "loss": 5.759, + "step": 2810 + }, + { + "epoch": 0.2256361017762842, + "grad_norm": 2.1849517822265625, + "learning_rate": 4.6367064234903995e-05, + "loss": 5.7514, + "step": 2820 + }, + { + "epoch": 0.22643622979676747, + "grad_norm": 2.8607492446899414, + "learning_rate": 4.635369310584586e-05, + "loss": 5.7545, + "step": 2830 + }, + { + "epoch": 0.22723635781725077, + "grad_norm": 3.722041130065918, + "learning_rate": 4.634032197678772e-05, + "loss": 5.8011, + "step": 2840 + }, + { + "epoch": 0.22803648583773403, + "grad_norm": 2.8563833236694336, + "learning_rate": 4.632695084772958e-05, + "loss": 5.8569, + "step": 2850 + }, + { + "epoch": 0.22883661385821732, + "grad_norm": 3.5724806785583496, + "learning_rate": 4.6313579718671446e-05, + "loss": 5.9649, + "step": 2860 + }, + { + "epoch": 0.2296367418787006, + "grad_norm": 2.380469560623169, + "learning_rate": 4.630020858961331e-05, + "loss": 5.7467, + "step": 2870 + }, + { + "epoch": 0.23043686989918388, + "grad_norm": 3.1629838943481445, + "learning_rate": 4.628683746055517e-05, + "loss": 5.642, + "step": 2880 + }, + { + "epoch": 0.23123699791966715, + "grad_norm": 2.1239373683929443, + "learning_rate": 4.6273466331497034e-05, + "loss": 5.6483, 
+ "step": 2890 + }, + { + "epoch": 0.2320371259401504, + "grad_norm": 3.049079418182373, + "learning_rate": 4.6260095202438897e-05, + "loss": 5.9736, + "step": 2900 + }, + { + "epoch": 0.2328372539606337, + "grad_norm": 2.556830406188965, + "learning_rate": 4.624672407338076e-05, + "loss": 5.6037, + "step": 2910 + }, + { + "epoch": 0.23363738198111697, + "grad_norm": 2.8762035369873047, + "learning_rate": 4.623335294432262e-05, + "loss": 5.6345, + "step": 2920 + }, + { + "epoch": 0.23443751000160026, + "grad_norm": 2.11167573928833, + "learning_rate": 4.6219981815264485e-05, + "loss": 5.7822, + "step": 2930 + }, + { + "epoch": 0.23523763802208353, + "grad_norm": 4.623869895935059, + "learning_rate": 4.620661068620635e-05, + "loss": 5.7063, + "step": 2940 + }, + { + "epoch": 0.23603776604256682, + "grad_norm": 2.4420578479766846, + "learning_rate": 4.619323955714821e-05, + "loss": 5.686, + "step": 2950 + }, + { + "epoch": 0.2368378940630501, + "grad_norm": 2.6543869972229004, + "learning_rate": 4.617986842809007e-05, + "loss": 5.7802, + "step": 2960 + }, + { + "epoch": 0.23763802208353338, + "grad_norm": 2.6264312267303467, + "learning_rate": 4.616649729903193e-05, + "loss": 5.6667, + "step": 2970 + }, + { + "epoch": 0.23843815010401664, + "grad_norm": 2.4579195976257324, + "learning_rate": 4.615312616997379e-05, + "loss": 5.6738, + "step": 2980 + }, + { + "epoch": 0.2392382781244999, + "grad_norm": 2.299448251724243, + "learning_rate": 4.6139755040915654e-05, + "loss": 5.8622, + "step": 2990 + }, + { + "epoch": 0.2400384061449832, + "grad_norm": 3.6527328491210938, + "learning_rate": 4.612638391185752e-05, + "loss": 5.6346, + "step": 3000 + }, + { + "epoch": 0.24083853416546647, + "grad_norm": 2.217876434326172, + "learning_rate": 4.611301278279938e-05, + "loss": 5.7892, + "step": 3010 + }, + { + "epoch": 0.24163866218594976, + "grad_norm": 3.500544309616089, + "learning_rate": 4.609964165374124e-05, + "loss": 5.8026, + "step": 3020 + }, + { + "epoch": 
0.24243879020643302, + "grad_norm": 3.1694483757019043, + "learning_rate": 4.6086270524683105e-05, + "loss": 5.827, + "step": 3030 + }, + { + "epoch": 0.24323891822691632, + "grad_norm": 2.899625778198242, + "learning_rate": 4.607289939562497e-05, + "loss": 5.7384, + "step": 3040 + }, + { + "epoch": 0.24403904624739958, + "grad_norm": 2.8286776542663574, + "learning_rate": 4.605952826656683e-05, + "loss": 5.7629, + "step": 3050 + }, + { + "epoch": 0.24483917426788285, + "grad_norm": 2.7585489749908447, + "learning_rate": 4.604615713750869e-05, + "loss": 5.7462, + "step": 3060 + }, + { + "epoch": 0.24563930228836614, + "grad_norm": 2.2017667293548584, + "learning_rate": 4.6032786008450555e-05, + "loss": 5.844, + "step": 3070 + }, + { + "epoch": 0.2464394303088494, + "grad_norm": 4.679725170135498, + "learning_rate": 4.601941487939242e-05, + "loss": 5.7254, + "step": 3080 + }, + { + "epoch": 0.2472395583293327, + "grad_norm": 2.923884868621826, + "learning_rate": 4.600604375033428e-05, + "loss": 5.703, + "step": 3090 + }, + { + "epoch": 0.24803968634981596, + "grad_norm": 2.2205090522766113, + "learning_rate": 4.5992672621276144e-05, + "loss": 5.7185, + "step": 3100 + }, + { + "epoch": 0.24883981437029926, + "grad_norm": 2.852313280105591, + "learning_rate": 4.5979301492218006e-05, + "loss": 5.5653, + "step": 3110 + }, + { + "epoch": 0.24963994239078252, + "grad_norm": 2.7683911323547363, + "learning_rate": 4.596593036315986e-05, + "loss": 5.7262, + "step": 3120 + }, + { + "epoch": 0.2504400704112658, + "grad_norm": 3.1315665245056152, + "learning_rate": 4.5952559234101725e-05, + "loss": 5.7524, + "step": 3130 + }, + { + "epoch": 0.2512401984317491, + "grad_norm": 2.5233592987060547, + "learning_rate": 4.593918810504359e-05, + "loss": 5.7443, + "step": 3140 + }, + { + "epoch": 0.25204032645223234, + "grad_norm": 2.3802831172943115, + "learning_rate": 4.592581697598545e-05, + "loss": 5.8091, + "step": 3150 + }, + { + "epoch": 0.25284045447271564, + "grad_norm": 
2.378218412399292, + "learning_rate": 4.591244584692731e-05, + "loss": 5.7741, + "step": 3160 + }, + { + "epoch": 0.25364058249319893, + "grad_norm": 4.712483882904053, + "learning_rate": 4.5899074717869176e-05, + "loss": 5.8643, + "step": 3170 + }, + { + "epoch": 0.25444071051368217, + "grad_norm": 2.798752784729004, + "learning_rate": 4.588570358881104e-05, + "loss": 5.7984, + "step": 3180 + }, + { + "epoch": 0.25524083853416546, + "grad_norm": 2.302037477493286, + "learning_rate": 4.58723324597529e-05, + "loss": 5.6548, + "step": 3190 + }, + { + "epoch": 0.25604096655464875, + "grad_norm": 2.8621273040771484, + "learning_rate": 4.5858961330694764e-05, + "loss": 5.6875, + "step": 3200 + }, + { + "epoch": 0.25684109457513205, + "grad_norm": 2.9079480171203613, + "learning_rate": 4.5845590201636626e-05, + "loss": 5.8801, + "step": 3210 + }, + { + "epoch": 0.2576412225956153, + "grad_norm": 2.9576847553253174, + "learning_rate": 4.583221907257849e-05, + "loss": 5.6646, + "step": 3220 + }, + { + "epoch": 0.2584413506160986, + "grad_norm": 4.085951805114746, + "learning_rate": 4.581884794352035e-05, + "loss": 5.9078, + "step": 3230 + }, + { + "epoch": 0.25924147863658187, + "grad_norm": 2.622903347015381, + "learning_rate": 4.5805476814462214e-05, + "loss": 5.6821, + "step": 3240 + }, + { + "epoch": 0.2600416066570651, + "grad_norm": 1.794255256652832, + "learning_rate": 4.579210568540408e-05, + "loss": 5.751, + "step": 3250 + }, + { + "epoch": 0.2608417346775484, + "grad_norm": 3.074042558670044, + "learning_rate": 4.577873455634594e-05, + "loss": 5.7864, + "step": 3260 + }, + { + "epoch": 0.2616418626980317, + "grad_norm": 2.3138844966888428, + "learning_rate": 4.57653634272878e-05, + "loss": 5.693, + "step": 3270 + }, + { + "epoch": 0.262441990718515, + "grad_norm": 3.8877549171447754, + "learning_rate": 4.5751992298229665e-05, + "loss": 5.7154, + "step": 3280 + }, + { + "epoch": 0.2632421187389982, + "grad_norm": 2.9623680114746094, + "learning_rate": 
4.573862116917153e-05, + "loss": 5.7514, + "step": 3290 + }, + { + "epoch": 0.2640422467594815, + "grad_norm": 2.840122938156128, + "learning_rate": 4.572525004011339e-05, + "loss": 5.7397, + "step": 3300 + }, + { + "epoch": 0.2648423747799648, + "grad_norm": 2.9699277877807617, + "learning_rate": 4.571187891105525e-05, + "loss": 5.7626, + "step": 3310 + }, + { + "epoch": 0.2656425028004481, + "grad_norm": 2.6493773460388184, + "learning_rate": 4.5698507781997116e-05, + "loss": 5.7619, + "step": 3320 + }, + { + "epoch": 0.26644263082093134, + "grad_norm": 2.283259868621826, + "learning_rate": 4.568513665293898e-05, + "loss": 5.8409, + "step": 3330 + }, + { + "epoch": 0.26724275884141463, + "grad_norm": 1.9254164695739746, + "learning_rate": 4.567176552388084e-05, + "loss": 5.8218, + "step": 3340 + }, + { + "epoch": 0.2680428868618979, + "grad_norm": 2.382345676422119, + "learning_rate": 4.5658394394822704e-05, + "loss": 5.6865, + "step": 3350 + }, + { + "epoch": 0.26884301488238116, + "grad_norm": 2.6039271354675293, + "learning_rate": 4.564502326576457e-05, + "loss": 5.7254, + "step": 3360 + }, + { + "epoch": 0.26964314290286445, + "grad_norm": 2.0948996543884277, + "learning_rate": 4.563165213670643e-05, + "loss": 5.7589, + "step": 3370 + }, + { + "epoch": 0.27044327092334774, + "grad_norm": 2.939955711364746, + "learning_rate": 4.561828100764829e-05, + "loss": 5.8298, + "step": 3380 + }, + { + "epoch": 0.27124339894383104, + "grad_norm": 2.748307466506958, + "learning_rate": 4.5604909878590155e-05, + "loss": 5.8505, + "step": 3390 + }, + { + "epoch": 0.2720435269643143, + "grad_norm": 2.7122459411621094, + "learning_rate": 4.559153874953202e-05, + "loss": 5.9027, + "step": 3400 + }, + { + "epoch": 0.27284365498479757, + "grad_norm": 3.6053593158721924, + "learning_rate": 4.557816762047388e-05, + "loss": 5.6746, + "step": 3410 + }, + { + "epoch": 0.27364378300528086, + "grad_norm": 4.433299541473389, + "learning_rate": 4.556479649141574e-05, + "loss": 5.7713, + 
"step": 3420 + }, + { + "epoch": 0.2744439110257641, + "grad_norm": 2.5253539085388184, + "learning_rate": 4.55514253623576e-05, + "loss": 5.8219, + "step": 3430 + }, + { + "epoch": 0.2752440390462474, + "grad_norm": 4.9358062744140625, + "learning_rate": 4.553805423329946e-05, + "loss": 5.7971, + "step": 3440 + }, + { + "epoch": 0.2760441670667307, + "grad_norm": 2.6247594356536865, + "learning_rate": 4.5524683104241324e-05, + "loss": 5.1528, + "step": 3450 + }, + { + "epoch": 0.276844295087214, + "grad_norm": 2.8152048587799072, + "learning_rate": 4.551131197518319e-05, + "loss": 5.7955, + "step": 3460 + }, + { + "epoch": 0.2776444231076972, + "grad_norm": 2.143275499343872, + "learning_rate": 4.549794084612505e-05, + "loss": 5.6875, + "step": 3470 + }, + { + "epoch": 0.2784445511281805, + "grad_norm": 2.9896023273468018, + "learning_rate": 4.548456971706691e-05, + "loss": 5.7981, + "step": 3480 + }, + { + "epoch": 0.2792446791486638, + "grad_norm": 3.5231759548187256, + "learning_rate": 4.5471198588008775e-05, + "loss": 5.7343, + "step": 3490 + }, + { + "epoch": 0.28004480716914704, + "grad_norm": 2.391721487045288, + "learning_rate": 4.545782745895064e-05, + "loss": 5.6821, + "step": 3500 + }, + { + "epoch": 0.28084493518963033, + "grad_norm": 2.414992332458496, + "learning_rate": 4.54444563298925e-05, + "loss": 5.7357, + "step": 3510 + }, + { + "epoch": 0.2816450632101136, + "grad_norm": 2.7502214908599854, + "learning_rate": 4.543108520083436e-05, + "loss": 5.6511, + "step": 3520 + }, + { + "epoch": 0.2824451912305969, + "grad_norm": 2.1601436138153076, + "learning_rate": 4.5417714071776226e-05, + "loss": 5.6249, + "step": 3530 + }, + { + "epoch": 0.28324531925108015, + "grad_norm": 2.89013671875, + "learning_rate": 4.540434294271809e-05, + "loss": 5.7583, + "step": 3540 + }, + { + "epoch": 0.28404544727156344, + "grad_norm": 2.4915778636932373, + "learning_rate": 4.539097181365995e-05, + "loss": 5.6957, + "step": 3550 + }, + { + "epoch": 0.28484557529204674, 
+ "grad_norm": 5.053386688232422, + "learning_rate": 4.5377600684601814e-05, + "loss": 5.632, + "step": 3560 + }, + { + "epoch": 0.28564570331253003, + "grad_norm": 2.6207687854766846, + "learning_rate": 4.5364229555543676e-05, + "loss": 5.8514, + "step": 3570 + }, + { + "epoch": 0.28644583133301327, + "grad_norm": 4.157670497894287, + "learning_rate": 4.535085842648553e-05, + "loss": 5.7608, + "step": 3580 + }, + { + "epoch": 0.28724595935349656, + "grad_norm": 3.4464797973632812, + "learning_rate": 4.5337487297427395e-05, + "loss": 5.6737, + "step": 3590 + }, + { + "epoch": 0.28804608737397985, + "grad_norm": 4.255002498626709, + "learning_rate": 4.532411616836926e-05, + "loss": 5.7977, + "step": 3600 + }, + { + "epoch": 0.2888462153944631, + "grad_norm": 2.7926547527313232, + "learning_rate": 4.531074503931112e-05, + "loss": 5.6891, + "step": 3610 + }, + { + "epoch": 0.2896463434149464, + "grad_norm": 3.150400400161743, + "learning_rate": 4.529737391025298e-05, + "loss": 5.7931, + "step": 3620 + }, + { + "epoch": 0.2904464714354297, + "grad_norm": 2.1223199367523193, + "learning_rate": 4.5284002781194846e-05, + "loss": 5.8646, + "step": 3630 + }, + { + "epoch": 0.29124659945591297, + "grad_norm": 3.950665235519409, + "learning_rate": 4.527063165213671e-05, + "loss": 5.7008, + "step": 3640 + }, + { + "epoch": 0.2920467274763962, + "grad_norm": 2.995692729949951, + "learning_rate": 4.525726052307857e-05, + "loss": 5.688, + "step": 3650 + }, + { + "epoch": 0.2928468554968795, + "grad_norm": 2.041736125946045, + "learning_rate": 4.5243889394020434e-05, + "loss": 5.7301, + "step": 3660 + }, + { + "epoch": 0.2936469835173628, + "grad_norm": 2.541757106781006, + "learning_rate": 4.5230518264962297e-05, + "loss": 5.5606, + "step": 3670 + }, + { + "epoch": 0.29444711153784603, + "grad_norm": 2.140761613845825, + "learning_rate": 4.521714713590416e-05, + "loss": 5.7671, + "step": 3680 + }, + { + "epoch": 0.2952472395583293, + "grad_norm": 2.6869146823883057, + 
"learning_rate": 4.520377600684602e-05, + "loss": 5.6452, + "step": 3690 + }, + { + "epoch": 0.2960473675788126, + "grad_norm": 3.072376012802124, + "learning_rate": 4.5190404877787885e-05, + "loss": 5.6956, + "step": 3700 + }, + { + "epoch": 0.2968474955992959, + "grad_norm": 2.5933837890625, + "learning_rate": 4.517703374872975e-05, + "loss": 5.6212, + "step": 3710 + }, + { + "epoch": 0.29764762361977914, + "grad_norm": 3.0443103313446045, + "learning_rate": 4.516366261967161e-05, + "loss": 5.7849, + "step": 3720 + }, + { + "epoch": 0.29844775164026244, + "grad_norm": 2.673583745956421, + "learning_rate": 4.515029149061347e-05, + "loss": 5.6186, + "step": 3730 + }, + { + "epoch": 0.29924787966074573, + "grad_norm": 2.3276283740997314, + "learning_rate": 4.513692036155533e-05, + "loss": 5.9188, + "step": 3740 + }, + { + "epoch": 0.300048007681229, + "grad_norm": 5.504491329193115, + "learning_rate": 4.512354923249719e-05, + "loss": 5.5676, + "step": 3750 + }, + { + "epoch": 0.30084813570171226, + "grad_norm": 2.4181482791900635, + "learning_rate": 4.5110178103439054e-05, + "loss": 5.6852, + "step": 3760 + }, + { + "epoch": 0.30164826372219555, + "grad_norm": 2.2489006519317627, + "learning_rate": 4.509680697438092e-05, + "loss": 5.7003, + "step": 3770 + }, + { + "epoch": 0.30244839174267885, + "grad_norm": 2.6925253868103027, + "learning_rate": 4.508343584532278e-05, + "loss": 5.8176, + "step": 3780 + }, + { + "epoch": 0.3032485197631621, + "grad_norm": 2.904318332672119, + "learning_rate": 4.507006471626464e-05, + "loss": 5.6912, + "step": 3790 + }, + { + "epoch": 0.3040486477836454, + "grad_norm": 3.3189070224761963, + "learning_rate": 4.5056693587206505e-05, + "loss": 5.8706, + "step": 3800 + }, + { + "epoch": 0.30484877580412867, + "grad_norm": 2.8324170112609863, + "learning_rate": 4.504332245814837e-05, + "loss": 5.8795, + "step": 3810 + }, + { + "epoch": 0.30564890382461196, + "grad_norm": 3.113417148590088, + "learning_rate": 4.502995132909023e-05, + 
"loss": 5.8689, + "step": 3820 + }, + { + "epoch": 0.3064490318450952, + "grad_norm": 2.469269275665283, + "learning_rate": 4.501658020003209e-05, + "loss": 5.7934, + "step": 3830 + }, + { + "epoch": 0.3072491598655785, + "grad_norm": 2.778571128845215, + "learning_rate": 4.5003209070973956e-05, + "loss": 5.8577, + "step": 3840 + }, + { + "epoch": 0.3080492878860618, + "grad_norm": 3.4269161224365234, + "learning_rate": 4.498983794191582e-05, + "loss": 5.8378, + "step": 3850 + }, + { + "epoch": 0.308849415906545, + "grad_norm": 3.417850971221924, + "learning_rate": 4.497646681285768e-05, + "loss": 5.6532, + "step": 3860 + }, + { + "epoch": 0.3096495439270283, + "grad_norm": 2.389784097671509, + "learning_rate": 4.4963095683799544e-05, + "loss": 5.5454, + "step": 3870 + }, + { + "epoch": 0.3104496719475116, + "grad_norm": 2.384453296661377, + "learning_rate": 4.4949724554741406e-05, + "loss": 5.8014, + "step": 3880 + }, + { + "epoch": 0.3112497999679949, + "grad_norm": 1.913668155670166, + "learning_rate": 4.493635342568326e-05, + "loss": 5.6033, + "step": 3890 + }, + { + "epoch": 0.31204992798847814, + "grad_norm": 3.4930074214935303, + "learning_rate": 4.4922982296625125e-05, + "loss": 5.7649, + "step": 3900 + }, + { + "epoch": 0.31285005600896143, + "grad_norm": 3.517458200454712, + "learning_rate": 4.490961116756699e-05, + "loss": 5.5635, + "step": 3910 + }, + { + "epoch": 0.3136501840294447, + "grad_norm": 2.611274480819702, + "learning_rate": 4.489624003850885e-05, + "loss": 5.8121, + "step": 3920 + }, + { + "epoch": 0.314450312049928, + "grad_norm": 2.373997926712036, + "learning_rate": 4.488286890945071e-05, + "loss": 5.6002, + "step": 3930 + }, + { + "epoch": 0.31525044007041125, + "grad_norm": 2.554847002029419, + "learning_rate": 4.4869497780392576e-05, + "loss": 5.6432, + "step": 3940 + }, + { + "epoch": 0.31605056809089455, + "grad_norm": 3.3720595836639404, + "learning_rate": 4.485612665133444e-05, + "loss": 5.5794, + "step": 3950 + }, + { + "epoch": 
0.31685069611137784, + "grad_norm": 2.2308788299560547, + "learning_rate": 4.48427555222763e-05, + "loss": 5.794, + "step": 3960 + }, + { + "epoch": 0.3176508241318611, + "grad_norm": 2.0659661293029785, + "learning_rate": 4.4829384393218164e-05, + "loss": 5.5383, + "step": 3970 + }, + { + "epoch": 0.31845095215234437, + "grad_norm": 3.2644894123077393, + "learning_rate": 4.4816013264160026e-05, + "loss": 5.6979, + "step": 3980 + }, + { + "epoch": 0.31925108017282766, + "grad_norm": 2.3485729694366455, + "learning_rate": 4.480264213510189e-05, + "loss": 5.7214, + "step": 3990 + }, + { + "epoch": 0.32005120819331095, + "grad_norm": 2.7470600605010986, + "learning_rate": 4.478927100604375e-05, + "loss": 5.6032, + "step": 4000 + }, + { + "epoch": 0.3208513362137942, + "grad_norm": 2.1622989177703857, + "learning_rate": 4.4775899876985614e-05, + "loss": 5.7976, + "step": 4010 + }, + { + "epoch": 0.3216514642342775, + "grad_norm": 2.7463905811309814, + "learning_rate": 4.476252874792748e-05, + "loss": 5.7181, + "step": 4020 + }, + { + "epoch": 0.3224515922547608, + "grad_norm": 3.503662109375, + "learning_rate": 4.474915761886934e-05, + "loss": 5.8092, + "step": 4030 + }, + { + "epoch": 0.323251720275244, + "grad_norm": 2.6073853969573975, + "learning_rate": 4.47357864898112e-05, + "loss": 5.7876, + "step": 4040 + }, + { + "epoch": 0.3240518482957273, + "grad_norm": 3.354768991470337, + "learning_rate": 4.472241536075306e-05, + "loss": 5.7741, + "step": 4050 + }, + { + "epoch": 0.3248519763162106, + "grad_norm": 2.648145914077759, + "learning_rate": 4.470904423169492e-05, + "loss": 5.7522, + "step": 4060 + }, + { + "epoch": 0.3256521043366939, + "grad_norm": 3.086655378341675, + "learning_rate": 4.4695673102636784e-05, + "loss": 5.81, + "step": 4070 + }, + { + "epoch": 0.32645223235717713, + "grad_norm": 2.230905771255493, + "learning_rate": 4.4682301973578647e-05, + "loss": 5.8839, + "step": 4080 + }, + { + "epoch": 0.3272523603776604, + "grad_norm": 
2.5391674041748047, + "learning_rate": 4.466893084452051e-05, + "loss": 5.5535, + "step": 4090 + }, + { + "epoch": 0.3280524883981437, + "grad_norm": 2.7574117183685303, + "learning_rate": 4.465555971546237e-05, + "loss": 5.8275, + "step": 4100 + }, + { + "epoch": 0.32885261641862695, + "grad_norm": 3.1114678382873535, + "learning_rate": 4.4642188586404235e-05, + "loss": 5.6876, + "step": 4110 + }, + { + "epoch": 0.32965274443911025, + "grad_norm": 2.404892683029175, + "learning_rate": 4.46288174573461e-05, + "loss": 5.6876, + "step": 4120 + }, + { + "epoch": 0.33045287245959354, + "grad_norm": 2.590759754180908, + "learning_rate": 4.461544632828796e-05, + "loss": 5.802, + "step": 4130 + }, + { + "epoch": 0.33125300048007683, + "grad_norm": 2.4358649253845215, + "learning_rate": 4.460207519922982e-05, + "loss": 5.632, + "step": 4140 + }, + { + "epoch": 0.33205312850056007, + "grad_norm": 3.9567458629608154, + "learning_rate": 4.4588704070171685e-05, + "loss": 5.8761, + "step": 4150 + }, + { + "epoch": 0.33285325652104336, + "grad_norm": 2.3808743953704834, + "learning_rate": 4.457533294111355e-05, + "loss": 5.6815, + "step": 4160 + }, + { + "epoch": 0.33365338454152665, + "grad_norm": 2.6527156829833984, + "learning_rate": 4.456196181205541e-05, + "loss": 5.805, + "step": 4170 + }, + { + "epoch": 0.33445351256200995, + "grad_norm": 2.351062536239624, + "learning_rate": 4.4548590682997273e-05, + "loss": 5.6681, + "step": 4180 + }, + { + "epoch": 0.3352536405824932, + "grad_norm": 2.3213460445404053, + "learning_rate": 4.4535219553939136e-05, + "loss": 5.6363, + "step": 4190 + }, + { + "epoch": 0.3360537686029765, + "grad_norm": 1.9470767974853516, + "learning_rate": 4.4521848424881e-05, + "loss": 5.8772, + "step": 4200 + }, + { + "epoch": 0.33685389662345977, + "grad_norm": 4.303500652313232, + "learning_rate": 4.450847729582286e-05, + "loss": 5.6185, + "step": 4210 + }, + { + "epoch": 0.337654024643943, + "grad_norm": 2.713275909423828, + "learning_rate": 
4.4495106166764724e-05, + "loss": 5.6754, + "step": 4220 + }, + { + "epoch": 0.3384541526644263, + "grad_norm": 2.34993314743042, + "learning_rate": 4.448173503770659e-05, + "loss": 5.7003, + "step": 4230 + }, + { + "epoch": 0.3392542806849096, + "grad_norm": 2.276228666305542, + "learning_rate": 4.446836390864845e-05, + "loss": 5.6, + "step": 4240 + }, + { + "epoch": 0.3400544087053929, + "grad_norm": 2.3635685443878174, + "learning_rate": 4.445499277959031e-05, + "loss": 5.7373, + "step": 4250 + }, + { + "epoch": 0.3408545367258761, + "grad_norm": 3.100604772567749, + "learning_rate": 4.4441621650532175e-05, + "loss": 5.7354, + "step": 4260 + }, + { + "epoch": 0.3416546647463594, + "grad_norm": 2.6743876934051514, + "learning_rate": 4.442825052147404e-05, + "loss": 5.7544, + "step": 4270 + }, + { + "epoch": 0.3424547927668427, + "grad_norm": 2.5783612728118896, + "learning_rate": 4.44148793924159e-05, + "loss": 5.8826, + "step": 4280 + }, + { + "epoch": 0.34325492078732595, + "grad_norm": 2.8976659774780273, + "learning_rate": 4.440150826335776e-05, + "loss": 5.5418, + "step": 4290 + }, + { + "epoch": 0.34405504880780924, + "grad_norm": 2.1061089038848877, + "learning_rate": 4.4388137134299626e-05, + "loss": 5.6406, + "step": 4300 + }, + { + "epoch": 0.34485517682829253, + "grad_norm": 2.1303789615631104, + "learning_rate": 4.437476600524149e-05, + "loss": 5.6491, + "step": 4310 + }, + { + "epoch": 0.3456553048487758, + "grad_norm": 2.6240499019622803, + "learning_rate": 4.436139487618335e-05, + "loss": 5.7161, + "step": 4320 + }, + { + "epoch": 0.34645543286925906, + "grad_norm": 2.325155019760132, + "learning_rate": 4.4348023747125214e-05, + "loss": 5.6172, + "step": 4330 + }, + { + "epoch": 0.34725556088974235, + "grad_norm": 2.8844404220581055, + "learning_rate": 4.4334652618067076e-05, + "loss": 5.7438, + "step": 4340 + }, + { + "epoch": 0.34805568891022565, + "grad_norm": 2.375324249267578, + "learning_rate": 4.432128148900894e-05, + "loss": 5.8335, + 
"step": 4350 + }, + { + "epoch": 0.34885581693070894, + "grad_norm": 2.1572377681732178, + "learning_rate": 4.4307910359950795e-05, + "loss": 5.706, + "step": 4360 + }, + { + "epoch": 0.3496559449511922, + "grad_norm": 2.5218889713287354, + "learning_rate": 4.429453923089266e-05, + "loss": 5.7487, + "step": 4370 + }, + { + "epoch": 0.35045607297167547, + "grad_norm": 2.636223554611206, + "learning_rate": 4.428116810183452e-05, + "loss": 5.8327, + "step": 4380 + }, + { + "epoch": 0.35125620099215876, + "grad_norm": 2.436155080795288, + "learning_rate": 4.426779697277638e-05, + "loss": 5.6895, + "step": 4390 + }, + { + "epoch": 0.352056329012642, + "grad_norm": 3.4435484409332275, + "learning_rate": 4.4254425843718246e-05, + "loss": 5.6171, + "step": 4400 + }, + { + "epoch": 0.3528564570331253, + "grad_norm": 2.3990628719329834, + "learning_rate": 4.424105471466011e-05, + "loss": 5.7574, + "step": 4410 + }, + { + "epoch": 0.3536565850536086, + "grad_norm": 2.544774293899536, + "learning_rate": 4.422768358560197e-05, + "loss": 5.558, + "step": 4420 + }, + { + "epoch": 0.3544567130740919, + "grad_norm": 2.389491081237793, + "learning_rate": 4.4214312456543834e-05, + "loss": 5.6628, + "step": 4430 + }, + { + "epoch": 0.3552568410945751, + "grad_norm": 5.203212261199951, + "learning_rate": 4.4200941327485697e-05, + "loss": 5.5403, + "step": 4440 + }, + { + "epoch": 0.3560569691150584, + "grad_norm": 2.0861873626708984, + "learning_rate": 4.418757019842756e-05, + "loss": 5.625, + "step": 4450 + }, + { + "epoch": 0.3568570971355417, + "grad_norm": 2.2355470657348633, + "learning_rate": 4.417419906936942e-05, + "loss": 5.614, + "step": 4460 + }, + { + "epoch": 0.35765722515602494, + "grad_norm": 2.2239274978637695, + "learning_rate": 4.4160827940311285e-05, + "loss": 5.6885, + "step": 4470 + }, + { + "epoch": 0.35845735317650823, + "grad_norm": 4.571592807769775, + "learning_rate": 4.414745681125315e-05, + "loss": 5.8495, + "step": 4480 + }, + { + "epoch": 
0.3592574811969915, + "grad_norm": 2.6501150131225586, + "learning_rate": 4.413408568219501e-05, + "loss": 5.6158, + "step": 4490 + }, + { + "epoch": 0.3600576092174748, + "grad_norm": 2.8568902015686035, + "learning_rate": 4.412071455313687e-05, + "loss": 5.6403, + "step": 4500 + }, + { + "epoch": 0.36085773723795805, + "grad_norm": 2.4179179668426514, + "learning_rate": 4.410734342407873e-05, + "loss": 5.749, + "step": 4510 + }, + { + "epoch": 0.36165786525844135, + "grad_norm": 2.950491189956665, + "learning_rate": 4.409397229502059e-05, + "loss": 5.7128, + "step": 4520 + }, + { + "epoch": 0.36245799327892464, + "grad_norm": 3.731049060821533, + "learning_rate": 4.4080601165962454e-05, + "loss": 5.6397, + "step": 4530 + }, + { + "epoch": 0.36325812129940793, + "grad_norm": 2.255730390548706, + "learning_rate": 4.406723003690432e-05, + "loss": 5.626, + "step": 4540 + }, + { + "epoch": 0.36405824931989117, + "grad_norm": 2.623455047607422, + "learning_rate": 4.405385890784618e-05, + "loss": 5.6792, + "step": 4550 + }, + { + "epoch": 0.36485837734037446, + "grad_norm": 2.366481065750122, + "learning_rate": 4.404048777878804e-05, + "loss": 5.5455, + "step": 4560 + }, + { + "epoch": 0.36565850536085776, + "grad_norm": 2.56351375579834, + "learning_rate": 4.4027116649729905e-05, + "loss": 5.7982, + "step": 4570 + }, + { + "epoch": 0.366458633381341, + "grad_norm": 2.3203811645507812, + "learning_rate": 4.401374552067177e-05, + "loss": 5.7969, + "step": 4580 + }, + { + "epoch": 0.3672587614018243, + "grad_norm": 2.3838179111480713, + "learning_rate": 4.400037439161363e-05, + "loss": 5.7484, + "step": 4590 + }, + { + "epoch": 0.3680588894223076, + "grad_norm": 2.0725440979003906, + "learning_rate": 4.398700326255549e-05, + "loss": 5.8405, + "step": 4600 + }, + { + "epoch": 0.36885901744279087, + "grad_norm": 3.49495005607605, + "learning_rate": 4.3973632133497356e-05, + "loss": 5.7151, + "step": 4610 + }, + { + "epoch": 0.3696591454632741, + "grad_norm": 
2.643007755279541, + "learning_rate": 4.396026100443922e-05, + "loss": 5.6374, + "step": 4620 + }, + { + "epoch": 0.3704592734837574, + "grad_norm": 2.282304286956787, + "learning_rate": 4.394688987538108e-05, + "loss": 5.589, + "step": 4630 + }, + { + "epoch": 0.3712594015042407, + "grad_norm": 2.244058609008789, + "learning_rate": 4.3933518746322944e-05, + "loss": 5.7516, + "step": 4640 + }, + { + "epoch": 0.37205952952472393, + "grad_norm": 2.44496488571167, + "learning_rate": 4.3920147617264806e-05, + "loss": 5.8393, + "step": 4650 + }, + { + "epoch": 0.3728596575452072, + "grad_norm": 2.6613078117370605, + "learning_rate": 4.390677648820667e-05, + "loss": 5.6764, + "step": 4660 + }, + { + "epoch": 0.3736597855656905, + "grad_norm": 3.99092173576355, + "learning_rate": 4.3893405359148525e-05, + "loss": 5.8658, + "step": 4670 + }, + { + "epoch": 0.3744599135861738, + "grad_norm": 1.6338485479354858, + "learning_rate": 4.388003423009039e-05, + "loss": 5.7527, + "step": 4680 + }, + { + "epoch": 0.37526004160665705, + "grad_norm": 2.3723371028900146, + "learning_rate": 4.386666310103225e-05, + "loss": 5.7482, + "step": 4690 + }, + { + "epoch": 0.37606016962714034, + "grad_norm": 2.630424976348877, + "learning_rate": 4.385329197197411e-05, + "loss": 5.7539, + "step": 4700 + }, + { + "epoch": 0.37686029764762363, + "grad_norm": 2.3873038291931152, + "learning_rate": 4.3839920842915976e-05, + "loss": 5.6729, + "step": 4710 + }, + { + "epoch": 0.37766042566810687, + "grad_norm": 1.9391748905181885, + "learning_rate": 4.382654971385784e-05, + "loss": 5.6794, + "step": 4720 + }, + { + "epoch": 0.37846055368859016, + "grad_norm": 2.103975296020508, + "learning_rate": 4.38131785847997e-05, + "loss": 5.5104, + "step": 4730 + }, + { + "epoch": 0.37926068170907346, + "grad_norm": 3.731184959411621, + "learning_rate": 4.3799807455741564e-05, + "loss": 5.6699, + "step": 4740 + }, + { + "epoch": 0.38006080972955675, + "grad_norm": 2.881068468093872, + "learning_rate": 
4.3786436326683426e-05, + "loss": 5.6394, + "step": 4750 + }, + { + "epoch": 0.38086093775004, + "grad_norm": 2.5963799953460693, + "learning_rate": 4.377306519762529e-05, + "loss": 5.784, + "step": 4760 + }, + { + "epoch": 0.3816610657705233, + "grad_norm": 1.9520230293273926, + "learning_rate": 4.375969406856715e-05, + "loss": 5.7608, + "step": 4770 + }, + { + "epoch": 0.38246119379100657, + "grad_norm": 2.386702537536621, + "learning_rate": 4.374766005241483e-05, + "loss": 5.5725, + "step": 4780 + }, + { + "epoch": 0.38326132181148986, + "grad_norm": 2.3830511569976807, + "learning_rate": 4.3734288923356694e-05, + "loss": 5.5584, + "step": 4790 + }, + { + "epoch": 0.3840614498319731, + "grad_norm": 2.1514739990234375, + "learning_rate": 4.3720917794298556e-05, + "loss": 5.6621, + "step": 4800 + }, + { + "epoch": 0.3848615778524564, + "grad_norm": 2.5376317501068115, + "learning_rate": 4.370754666524042e-05, + "loss": 5.4138, + "step": 4810 + }, + { + "epoch": 0.3856617058729397, + "grad_norm": 3.425899028778076, + "learning_rate": 4.3694175536182275e-05, + "loss": 5.6478, + "step": 4820 + }, + { + "epoch": 0.3864618338934229, + "grad_norm": 2.7518632411956787, + "learning_rate": 4.368080440712414e-05, + "loss": 5.6556, + "step": 4830 + }, + { + "epoch": 0.3872619619139062, + "grad_norm": 3.119227647781372, + "learning_rate": 4.3667433278066e-05, + "loss": 5.7925, + "step": 4840 + }, + { + "epoch": 0.3880620899343895, + "grad_norm": 3.2664616107940674, + "learning_rate": 4.365406214900786e-05, + "loss": 5.7176, + "step": 4850 + }, + { + "epoch": 0.3888622179548728, + "grad_norm": 2.5125045776367188, + "learning_rate": 4.3640691019949726e-05, + "loss": 5.6511, + "step": 4860 + }, + { + "epoch": 0.38966234597535604, + "grad_norm": 2.992112874984741, + "learning_rate": 4.362731989089159e-05, + "loss": 5.6426, + "step": 4870 + }, + { + "epoch": 0.39046247399583933, + "grad_norm": 4.46783971786499, + "learning_rate": 4.361394876183345e-05, + "loss": 5.736, + "step": 
4880 + }, + { + "epoch": 0.3912626020163226, + "grad_norm": 1.8372838497161865, + "learning_rate": 4.3600577632775314e-05, + "loss": 5.7603, + "step": 4890 + }, + { + "epoch": 0.39206273003680586, + "grad_norm": 2.1635375022888184, + "learning_rate": 4.3587206503717176e-05, + "loss": 5.6019, + "step": 4900 + }, + { + "epoch": 0.39286285805728915, + "grad_norm": 2.2425310611724854, + "learning_rate": 4.357383537465904e-05, + "loss": 5.6829, + "step": 4910 + }, + { + "epoch": 0.39366298607777245, + "grad_norm": 2.408907413482666, + "learning_rate": 4.35604642456009e-05, + "loss": 5.6821, + "step": 4920 + }, + { + "epoch": 0.39446311409825574, + "grad_norm": 3.012258291244507, + "learning_rate": 4.3547093116542765e-05, + "loss": 5.7503, + "step": 4930 + }, + { + "epoch": 0.395263242118739, + "grad_norm": 3.187053680419922, + "learning_rate": 4.353372198748463e-05, + "loss": 5.6459, + "step": 4940 + }, + { + "epoch": 0.39606337013922227, + "grad_norm": 2.7528955936431885, + "learning_rate": 4.352035085842649e-05, + "loss": 5.6386, + "step": 4950 + }, + { + "epoch": 0.39686349815970556, + "grad_norm": 2.9744699001312256, + "learning_rate": 4.350697972936835e-05, + "loss": 5.5938, + "step": 4960 + }, + { + "epoch": 0.39766362618018886, + "grad_norm": 2.779604196548462, + "learning_rate": 4.3493608600310215e-05, + "loss": 5.5459, + "step": 4970 + }, + { + "epoch": 0.3984637542006721, + "grad_norm": 2.9092133045196533, + "learning_rate": 4.348023747125207e-05, + "loss": 5.7695, + "step": 4980 + }, + { + "epoch": 0.3992638822211554, + "grad_norm": 2.800872802734375, + "learning_rate": 4.3466866342193934e-05, + "loss": 5.6943, + "step": 4990 + }, + { + "epoch": 0.4000640102416387, + "grad_norm": 3.299595832824707, + "learning_rate": 4.3453495213135797e-05, + "loss": 5.4432, + "step": 5000 + }, + { + "epoch": 0.4008641382621219, + "grad_norm": 2.2425456047058105, + "learning_rate": 4.344012408407766e-05, + "loss": 5.6688, + "step": 5010 + }, + { + "epoch": 0.4016642662826052, 
+ "grad_norm": 2.269378423690796, + "learning_rate": 4.342675295501952e-05, + "loss": 5.7713, + "step": 5020 + }, + { + "epoch": 0.4024643943030885, + "grad_norm": 2.3903868198394775, + "learning_rate": 4.3413381825961385e-05, + "loss": 5.5926, + "step": 5030 + }, + { + "epoch": 0.4032645223235718, + "grad_norm": 3.267918109893799, + "learning_rate": 4.340001069690325e-05, + "loss": 5.6806, + "step": 5040 + }, + { + "epoch": 0.40406465034405503, + "grad_norm": 3.2075066566467285, + "learning_rate": 4.338663956784511e-05, + "loss": 5.6582, + "step": 5050 + }, + { + "epoch": 0.4048647783645383, + "grad_norm": 2.5458226203918457, + "learning_rate": 4.337326843878697e-05, + "loss": 5.6576, + "step": 5060 + }, + { + "epoch": 0.4056649063850216, + "grad_norm": 2.0331077575683594, + "learning_rate": 4.3359897309728835e-05, + "loss": 5.6725, + "step": 5070 + }, + { + "epoch": 0.40646503440550485, + "grad_norm": 2.406907796859741, + "learning_rate": 4.33465261806707e-05, + "loss": 5.5168, + "step": 5080 + }, + { + "epoch": 0.40726516242598815, + "grad_norm": 2.661137580871582, + "learning_rate": 4.333315505161256e-05, + "loss": 5.5953, + "step": 5090 + }, + { + "epoch": 0.40806529044647144, + "grad_norm": 2.857725143432617, + "learning_rate": 4.3319783922554423e-05, + "loss": 5.6702, + "step": 5100 + }, + { + "epoch": 0.40886541846695473, + "grad_norm": 2.7894747257232666, + "learning_rate": 4.3306412793496286e-05, + "loss": 5.6228, + "step": 5110 + }, + { + "epoch": 0.40966554648743797, + "grad_norm": 2.8865861892700195, + "learning_rate": 4.329304166443815e-05, + "loss": 5.6859, + "step": 5120 + }, + { + "epoch": 0.41046567450792126, + "grad_norm": 2.1493608951568604, + "learning_rate": 4.3279670535380005e-05, + "loss": 5.5516, + "step": 5130 + }, + { + "epoch": 0.41126580252840456, + "grad_norm": 3.112820863723755, + "learning_rate": 4.326629940632187e-05, + "loss": 5.6409, + "step": 5140 + }, + { + "epoch": 0.41206593054888785, + "grad_norm": 2.778876543045044, + 
"learning_rate": 4.325292827726373e-05, + "loss": 5.6948, + "step": 5150 + }, + { + "epoch": 0.4128660585693711, + "grad_norm": 2.0409047603607178, + "learning_rate": 4.323955714820559e-05, + "loss": 5.5458, + "step": 5160 + }, + { + "epoch": 0.4136661865898544, + "grad_norm": 3.1058828830718994, + "learning_rate": 4.3226186019147456e-05, + "loss": 5.8437, + "step": 5170 + }, + { + "epoch": 0.41446631461033767, + "grad_norm": 3.306704044342041, + "learning_rate": 4.321281489008932e-05, + "loss": 5.691, + "step": 5180 + }, + { + "epoch": 0.4152664426308209, + "grad_norm": 2.9495625495910645, + "learning_rate": 4.319944376103118e-05, + "loss": 5.6364, + "step": 5190 + }, + { + "epoch": 0.4160665706513042, + "grad_norm": 2.1773974895477295, + "learning_rate": 4.3186072631973044e-05, + "loss": 5.6713, + "step": 5200 + }, + { + "epoch": 0.4168666986717875, + "grad_norm": 2.0897533893585205, + "learning_rate": 4.3172701502914906e-05, + "loss": 5.6022, + "step": 5210 + }, + { + "epoch": 0.4176668266922708, + "grad_norm": 2.2131927013397217, + "learning_rate": 4.315933037385677e-05, + "loss": 5.5728, + "step": 5220 + }, + { + "epoch": 0.418466954712754, + "grad_norm": 2.225728750228882, + "learning_rate": 4.314595924479863e-05, + "loss": 5.5374, + "step": 5230 + }, + { + "epoch": 0.4192670827332373, + "grad_norm": 2.219791889190674, + "learning_rate": 4.3132588115740494e-05, + "loss": 5.6986, + "step": 5240 + }, + { + "epoch": 0.4200672107537206, + "grad_norm": 2.720323085784912, + "learning_rate": 4.311921698668236e-05, + "loss": 5.6046, + "step": 5250 + }, + { + "epoch": 0.42086733877420385, + "grad_norm": 2.4254257678985596, + "learning_rate": 4.310584585762422e-05, + "loss": 5.5566, + "step": 5260 + }, + { + "epoch": 0.42166746679468714, + "grad_norm": 2.2297472953796387, + "learning_rate": 4.309247472856608e-05, + "loss": 5.7431, + "step": 5270 + }, + { + "epoch": 0.42246759481517043, + "grad_norm": 2.2767512798309326, + "learning_rate": 4.3079103599507945e-05, + 
"loss": 5.6661, + "step": 5280 + }, + { + "epoch": 0.4232677228356537, + "grad_norm": 2.8959579467773438, + "learning_rate": 4.30657324704498e-05, + "loss": 5.6584, + "step": 5290 + }, + { + "epoch": 0.42406785085613696, + "grad_norm": 2.49867844581604, + "learning_rate": 4.3052361341391664e-05, + "loss": 5.7564, + "step": 5300 + }, + { + "epoch": 0.42486797887662026, + "grad_norm": 2.1820337772369385, + "learning_rate": 4.3038990212333526e-05, + "loss": 5.6288, + "step": 5310 + }, + { + "epoch": 0.42566810689710355, + "grad_norm": 2.7174227237701416, + "learning_rate": 4.302561908327539e-05, + "loss": 5.6496, + "step": 5320 + }, + { + "epoch": 0.42646823491758684, + "grad_norm": 2.7261149883270264, + "learning_rate": 4.301224795421725e-05, + "loss": 5.6557, + "step": 5330 + }, + { + "epoch": 0.4272683629380701, + "grad_norm": 2.581760883331299, + "learning_rate": 4.2998876825159114e-05, + "loss": 5.604, + "step": 5340 + }, + { + "epoch": 0.42806849095855337, + "grad_norm": 2.43254017829895, + "learning_rate": 4.298550569610098e-05, + "loss": 5.6041, + "step": 5350 + }, + { + "epoch": 0.42886861897903666, + "grad_norm": 4.465782165527344, + "learning_rate": 4.297213456704284e-05, + "loss": 5.7158, + "step": 5360 + }, + { + "epoch": 0.4296687469995199, + "grad_norm": 2.6434614658355713, + "learning_rate": 4.29587634379847e-05, + "loss": 5.6347, + "step": 5370 + }, + { + "epoch": 0.4304688750200032, + "grad_norm": 2.344190835952759, + "learning_rate": 4.2945392308926565e-05, + "loss": 5.6062, + "step": 5380 + }, + { + "epoch": 0.4312690030404865, + "grad_norm": 4.311372756958008, + "learning_rate": 4.293202117986843e-05, + "loss": 5.7356, + "step": 5390 + }, + { + "epoch": 0.4320691310609698, + "grad_norm": 2.8204123973846436, + "learning_rate": 4.291865005081029e-05, + "loss": 5.63, + "step": 5400 + }, + { + "epoch": 0.432869259081453, + "grad_norm": 3.333059072494507, + "learning_rate": 4.290527892175215e-05, + "loss": 5.5992, + "step": 5410 + }, + { + "epoch": 
0.4336693871019363, + "grad_norm": 2.0647048950195312, + "learning_rate": 4.2891907792694016e-05, + "loss": 5.691, + "step": 5420 + }, + { + "epoch": 0.4344695151224196, + "grad_norm": 2.5100045204162598, + "learning_rate": 4.287853666363588e-05, + "loss": 5.615, + "step": 5430 + }, + { + "epoch": 0.43526964314290284, + "grad_norm": 2.6120762825012207, + "learning_rate": 4.286516553457774e-05, + "loss": 5.746, + "step": 5440 + }, + { + "epoch": 0.43606977116338613, + "grad_norm": 2.2886853218078613, + "learning_rate": 4.2851794405519604e-05, + "loss": 5.6783, + "step": 5450 + }, + { + "epoch": 0.4368698991838694, + "grad_norm": 2.6724119186401367, + "learning_rate": 4.283842327646147e-05, + "loss": 5.6526, + "step": 5460 + }, + { + "epoch": 0.4376700272043527, + "grad_norm": 2.2408151626586914, + "learning_rate": 4.282505214740333e-05, + "loss": 5.6314, + "step": 5470 + }, + { + "epoch": 0.43847015522483596, + "grad_norm": 3.0294084548950195, + "learning_rate": 4.281168101834519e-05, + "loss": 5.6669, + "step": 5480 + }, + { + "epoch": 0.43927028324531925, + "grad_norm": 2.1664011478424072, + "learning_rate": 4.2798309889287055e-05, + "loss": 5.4856, + "step": 5490 + }, + { + "epoch": 0.44007041126580254, + "grad_norm": 3.4465417861938477, + "learning_rate": 4.278493876022892e-05, + "loss": 5.5859, + "step": 5500 + }, + { + "epoch": 0.4408705392862858, + "grad_norm": 2.0116310119628906, + "learning_rate": 4.277156763117078e-05, + "loss": 5.5982, + "step": 5510 + }, + { + "epoch": 0.44167066730676907, + "grad_norm": 2.578658103942871, + "learning_rate": 4.275819650211264e-05, + "loss": 5.4026, + "step": 5520 + }, + { + "epoch": 0.44247079532725236, + "grad_norm": 3.1201677322387695, + "learning_rate": 4.2744825373054506e-05, + "loss": 5.7024, + "step": 5530 + }, + { + "epoch": 0.44327092334773566, + "grad_norm": 2.2246837615966797, + "learning_rate": 4.273145424399637e-05, + "loss": 5.5842, + "step": 5540 + }, + { + "epoch": 0.4440710513682189, + "grad_norm": 
2.1593568325042725, + "learning_rate": 4.271808311493823e-05, + "loss": 5.5099, + "step": 5550 + }, + { + "epoch": 0.4448711793887022, + "grad_norm": 3.082218885421753, + "learning_rate": 4.2704711985880094e-05, + "loss": 5.5539, + "step": 5560 + }, + { + "epoch": 0.4456713074091855, + "grad_norm": 3.2272634506225586, + "learning_rate": 4.2691340856821956e-05, + "loss": 5.73, + "step": 5570 + }, + { + "epoch": 0.4464714354296688, + "grad_norm": 2.301713466644287, + "learning_rate": 4.267796972776382e-05, + "loss": 5.5444, + "step": 5580 + }, + { + "epoch": 0.447271563450152, + "grad_norm": 3.2985429763793945, + "learning_rate": 4.2664598598705675e-05, + "loss": 5.7499, + "step": 5590 + }, + { + "epoch": 0.4480716914706353, + "grad_norm": 2.103994607925415, + "learning_rate": 4.265122746964754e-05, + "loss": 5.5627, + "step": 5600 + }, + { + "epoch": 0.4488718194911186, + "grad_norm": 3.260099172592163, + "learning_rate": 4.26378563405894e-05, + "loss": 5.5692, + "step": 5610 + }, + { + "epoch": 0.44967194751160183, + "grad_norm": 2.740907907485962, + "learning_rate": 4.262448521153126e-05, + "loss": 5.4984, + "step": 5620 + }, + { + "epoch": 0.4504720755320851, + "grad_norm": 5.314218997955322, + "learning_rate": 4.2611114082473126e-05, + "loss": 5.5641, + "step": 5630 + }, + { + "epoch": 0.4512722035525684, + "grad_norm": 3.0524938106536865, + "learning_rate": 4.259774295341499e-05, + "loss": 5.6375, + "step": 5640 + }, + { + "epoch": 0.4520723315730517, + "grad_norm": 3.57781982421875, + "learning_rate": 4.258437182435685e-05, + "loss": 5.6726, + "step": 5650 + }, + { + "epoch": 0.45287245959353495, + "grad_norm": 3.094510793685913, + "learning_rate": 4.2571000695298714e-05, + "loss": 5.7328, + "step": 5660 + }, + { + "epoch": 0.45367258761401824, + "grad_norm": 2.731092929840088, + "learning_rate": 4.2557629566240576e-05, + "loss": 5.6667, + "step": 5670 + }, + { + "epoch": 0.45447271563450153, + "grad_norm": 3.6701395511627197, + "learning_rate": 
4.254425843718244e-05, + "loss": 5.641, + "step": 5680 + }, + { + "epoch": 0.45527284365498477, + "grad_norm": 1.9017853736877441, + "learning_rate": 4.25308873081243e-05, + "loss": 5.6521, + "step": 5690 + }, + { + "epoch": 0.45607297167546806, + "grad_norm": 3.2658119201660156, + "learning_rate": 4.2517516179066165e-05, + "loss": 5.6431, + "step": 5700 + }, + { + "epoch": 0.45687309969595136, + "grad_norm": 2.227353572845459, + "learning_rate": 4.250414505000803e-05, + "loss": 5.6198, + "step": 5710 + }, + { + "epoch": 0.45767322771643465, + "grad_norm": 1.7804296016693115, + "learning_rate": 4.249077392094989e-05, + "loss": 5.618, + "step": 5720 + }, + { + "epoch": 0.4584733557369179, + "grad_norm": 2.9357879161834717, + "learning_rate": 4.247740279189175e-05, + "loss": 5.5222, + "step": 5730 + }, + { + "epoch": 0.4592734837574012, + "grad_norm": 5.074959754943848, + "learning_rate": 4.2464031662833615e-05, + "loss": 5.7604, + "step": 5740 + }, + { + "epoch": 0.4600736117778845, + "grad_norm": 2.4961061477661133, + "learning_rate": 4.245066053377547e-05, + "loss": 5.5699, + "step": 5750 + }, + { + "epoch": 0.46087373979836777, + "grad_norm": 2.636403799057007, + "learning_rate": 4.2437289404717334e-05, + "loss": 5.745, + "step": 5760 + }, + { + "epoch": 0.461673867818851, + "grad_norm": 2.4829630851745605, + "learning_rate": 4.2423918275659197e-05, + "loss": 5.9779, + "step": 5770 + }, + { + "epoch": 0.4624739958393343, + "grad_norm": 2.389112710952759, + "learning_rate": 4.241054714660106e-05, + "loss": 5.696, + "step": 5780 + }, + { + "epoch": 0.4632741238598176, + "grad_norm": 2.3053462505340576, + "learning_rate": 4.239717601754292e-05, + "loss": 5.6567, + "step": 5790 + }, + { + "epoch": 0.4640742518803008, + "grad_norm": 2.9635446071624756, + "learning_rate": 4.2383804888484785e-05, + "loss": 5.7643, + "step": 5800 + }, + { + "epoch": 0.4648743799007841, + "grad_norm": 3.3227570056915283, + "learning_rate": 4.237043375942665e-05, + "loss": 5.5425, + 
"step": 5810 + }, + { + "epoch": 0.4656745079212674, + "grad_norm": 3.2959067821502686, + "learning_rate": 4.235706263036851e-05, + "loss": 5.5886, + "step": 5820 + }, + { + "epoch": 0.4664746359417507, + "grad_norm": 2.497953176498413, + "learning_rate": 4.234369150131037e-05, + "loss": 5.6248, + "step": 5830 + }, + { + "epoch": 0.46727476396223394, + "grad_norm": 3.5957205295562744, + "learning_rate": 4.2330320372252235e-05, + "loss": 5.5345, + "step": 5840 + }, + { + "epoch": 0.46807489198271723, + "grad_norm": 2.9113316535949707, + "learning_rate": 4.23169492431941e-05, + "loss": 5.7358, + "step": 5850 + }, + { + "epoch": 0.4688750200032005, + "grad_norm": 3.8617255687713623, + "learning_rate": 4.230357811413596e-05, + "loss": 5.7451, + "step": 5860 + }, + { + "epoch": 0.46967514802368376, + "grad_norm": 2.5546538829803467, + "learning_rate": 4.2290206985077824e-05, + "loss": 5.5874, + "step": 5870 + }, + { + "epoch": 0.47047527604416706, + "grad_norm": 3.7215869426727295, + "learning_rate": 4.2276835856019686e-05, + "loss": 5.5462, + "step": 5880 + }, + { + "epoch": 0.47127540406465035, + "grad_norm": 3.3122622966766357, + "learning_rate": 4.226346472696155e-05, + "loss": 5.7368, + "step": 5890 + }, + { + "epoch": 0.47207553208513364, + "grad_norm": 2.3962459564208984, + "learning_rate": 4.2250093597903405e-05, + "loss": 5.7328, + "step": 5900 + }, + { + "epoch": 0.4728756601056169, + "grad_norm": 2.497668504714966, + "learning_rate": 4.223672246884527e-05, + "loss": 5.7063, + "step": 5910 + }, + { + "epoch": 0.4736757881261002, + "grad_norm": 2.301725387573242, + "learning_rate": 4.222335133978713e-05, + "loss": 5.6029, + "step": 5920 + }, + { + "epoch": 0.47447591614658347, + "grad_norm": 3.840155839920044, + "learning_rate": 4.220998021072899e-05, + "loss": 5.825, + "step": 5930 + }, + { + "epoch": 0.47527604416706676, + "grad_norm": 3.1776278018951416, + "learning_rate": 4.2196609081670856e-05, + "loss": 5.6421, + "step": 5940 + }, + { + "epoch": 
0.47607617218755, + "grad_norm": 2.1823127269744873, + "learning_rate": 4.218323795261272e-05, + "loss": 5.7154, + "step": 5950 + }, + { + "epoch": 0.4768763002080333, + "grad_norm": 2.944390058517456, + "learning_rate": 4.216986682355458e-05, + "loss": 5.5429, + "step": 5960 + }, + { + "epoch": 0.4776764282285166, + "grad_norm": 2.035430431365967, + "learning_rate": 4.2156495694496444e-05, + "loss": 5.8187, + "step": 5970 + }, + { + "epoch": 0.4784765562489998, + "grad_norm": 3.167098045349121, + "learning_rate": 4.2143124565438306e-05, + "loss": 5.5891, + "step": 5980 + }, + { + "epoch": 0.4792766842694831, + "grad_norm": 1.9377233982086182, + "learning_rate": 4.212975343638017e-05, + "loss": 5.7428, + "step": 5990 + }, + { + "epoch": 0.4800768122899664, + "grad_norm": 2.759096622467041, + "learning_rate": 4.211638230732203e-05, + "loss": 5.5572, + "step": 6000 + }, + { + "epoch": 0.4808769403104497, + "grad_norm": 2.074033498764038, + "learning_rate": 4.2103011178263894e-05, + "loss": 5.517, + "step": 6010 + }, + { + "epoch": 0.48167706833093293, + "grad_norm": 2.2866854667663574, + "learning_rate": 4.208964004920576e-05, + "loss": 5.6539, + "step": 6020 + }, + { + "epoch": 0.4824771963514162, + "grad_norm": 1.9909095764160156, + "learning_rate": 4.207626892014762e-05, + "loss": 5.5532, + "step": 6030 + }, + { + "epoch": 0.4832773243718995, + "grad_norm": 3.245906114578247, + "learning_rate": 4.206289779108948e-05, + "loss": 5.6797, + "step": 6040 + }, + { + "epoch": 0.48407745239238276, + "grad_norm": 2.013009786605835, + "learning_rate": 4.2049526662031345e-05, + "loss": 5.6378, + "step": 6050 + }, + { + "epoch": 0.48487758041286605, + "grad_norm": 2.5478925704956055, + "learning_rate": 4.20361555329732e-05, + "loss": 5.555, + "step": 6060 + }, + { + "epoch": 0.48567770843334934, + "grad_norm": 3.079225778579712, + "learning_rate": 4.2022784403915064e-05, + "loss": 5.7618, + "step": 6070 + }, + { + "epoch": 0.48647783645383263, + "grad_norm": 
2.2639927864074707, + "learning_rate": 4.2009413274856926e-05, + "loss": 5.8063, + "step": 6080 + }, + { + "epoch": 0.48727796447431587, + "grad_norm": 4.630524158477783, + "learning_rate": 4.199604214579879e-05, + "loss": 5.6403, + "step": 6090 + }, + { + "epoch": 0.48807809249479917, + "grad_norm": 3.11018967628479, + "learning_rate": 4.198267101674065e-05, + "loss": 5.7517, + "step": 6100 + }, + { + "epoch": 0.48887822051528246, + "grad_norm": 8.462982177734375, + "learning_rate": 4.1969299887682515e-05, + "loss": 5.7311, + "step": 6110 + }, + { + "epoch": 0.4896783485357657, + "grad_norm": 2.418065071105957, + "learning_rate": 4.195592875862438e-05, + "loss": 5.6239, + "step": 6120 + }, + { + "epoch": 0.490478476556249, + "grad_norm": 2.5452466011047363, + "learning_rate": 4.194255762956624e-05, + "loss": 5.7417, + "step": 6130 + }, + { + "epoch": 0.4912786045767323, + "grad_norm": 2.986041307449341, + "learning_rate": 4.19291865005081e-05, + "loss": 5.663, + "step": 6140 + }, + { + "epoch": 0.4920787325972156, + "grad_norm": 2.7642807960510254, + "learning_rate": 4.1915815371449965e-05, + "loss": 5.5379, + "step": 6150 + }, + { + "epoch": 0.4928788606176988, + "grad_norm": 4.326907157897949, + "learning_rate": 4.190244424239183e-05, + "loss": 5.8058, + "step": 6160 + }, + { + "epoch": 0.4936789886381821, + "grad_norm": 1.9514706134796143, + "learning_rate": 4.188907311333369e-05, + "loss": 5.7004, + "step": 6170 + }, + { + "epoch": 0.4944791166586654, + "grad_norm": 2.5721428394317627, + "learning_rate": 4.187570198427555e-05, + "loss": 5.6959, + "step": 6180 + }, + { + "epoch": 0.4952792446791487, + "grad_norm": 2.6619083881378174, + "learning_rate": 4.1862330855217416e-05, + "loss": 5.7196, + "step": 6190 + }, + { + "epoch": 0.4960793726996319, + "grad_norm": 2.322341203689575, + "learning_rate": 4.184895972615928e-05, + "loss": 5.5998, + "step": 6200 + }, + { + "epoch": 0.4968795007201152, + "grad_norm": 2.280777931213379, + "learning_rate": 
4.183558859710114e-05, + "loss": 5.5171, + "step": 6210 + }, + { + "epoch": 0.4976796287405985, + "grad_norm": 1.9774320125579834, + "learning_rate": 4.1822217468043004e-05, + "loss": 5.6368, + "step": 6220 + }, + { + "epoch": 0.49847975676108175, + "grad_norm": 2.199708938598633, + "learning_rate": 4.180884633898487e-05, + "loss": 5.4638, + "step": 6230 + }, + { + "epoch": 0.49927988478156504, + "grad_norm": 2.0054879188537598, + "learning_rate": 4.179547520992673e-05, + "loss": 5.4624, + "step": 6240 + }, + { + "epoch": 0.5000800128020483, + "grad_norm": 2.0623903274536133, + "learning_rate": 4.178210408086859e-05, + "loss": 5.6554, + "step": 6250 + }, + { + "epoch": 0.5008801408225316, + "grad_norm": 2.5907487869262695, + "learning_rate": 4.1768732951810455e-05, + "loss": 5.4989, + "step": 6260 + }, + { + "epoch": 0.5016802688430149, + "grad_norm": 2.181987762451172, + "learning_rate": 4.175536182275232e-05, + "loss": 5.624, + "step": 6270 + }, + { + "epoch": 0.5024803968634982, + "grad_norm": 2.9678001403808594, + "learning_rate": 4.174199069369418e-05, + "loss": 5.6545, + "step": 6280 + }, + { + "epoch": 0.5032805248839815, + "grad_norm": 5.213638782501221, + "learning_rate": 4.172861956463604e-05, + "loss": 5.7048, + "step": 6290 + }, + { + "epoch": 0.5040806529044647, + "grad_norm": 2.465900182723999, + "learning_rate": 4.1715248435577906e-05, + "loss": 5.646, + "step": 6300 + }, + { + "epoch": 0.504880780924948, + "grad_norm": 2.94570255279541, + "learning_rate": 4.170187730651977e-05, + "loss": 5.6274, + "step": 6310 + }, + { + "epoch": 0.5056809089454313, + "grad_norm": 3.5255651473999023, + "learning_rate": 4.168850617746163e-05, + "loss": 5.5336, + "step": 6320 + }, + { + "epoch": 0.5064810369659145, + "grad_norm": 2.3499608039855957, + "learning_rate": 4.1675135048403494e-05, + "loss": 5.7768, + "step": 6330 + }, + { + "epoch": 0.5072811649863979, + "grad_norm": 2.0476951599121094, + "learning_rate": 4.1661763919345356e-05, + "loss": 5.5927, + "step": 
6340 + }, + { + "epoch": 0.5080812930068811, + "grad_norm": 2.4708118438720703, + "learning_rate": 4.164839279028722e-05, + "loss": 5.6458, + "step": 6350 + }, + { + "epoch": 0.5088814210273643, + "grad_norm": 2.465075731277466, + "learning_rate": 4.163502166122908e-05, + "loss": 5.5744, + "step": 6360 + }, + { + "epoch": 0.5096815490478477, + "grad_norm": 2.9378490447998047, + "learning_rate": 4.162165053217094e-05, + "loss": 5.6963, + "step": 6370 + }, + { + "epoch": 0.5104816770683309, + "grad_norm": 2.201359987258911, + "learning_rate": 4.16082794031128e-05, + "loss": 5.613, + "step": 6380 + }, + { + "epoch": 0.5112818050888142, + "grad_norm": 1.8427401781082153, + "learning_rate": 4.159490827405466e-05, + "loss": 5.5494, + "step": 6390 + }, + { + "epoch": 0.5120819331092975, + "grad_norm": 1.9969813823699951, + "learning_rate": 4.1581537144996526e-05, + "loss": 5.5783, + "step": 6400 + }, + { + "epoch": 0.5128820611297807, + "grad_norm": 2.9670321941375732, + "learning_rate": 4.156816601593839e-05, + "loss": 5.7176, + "step": 6410 + }, + { + "epoch": 0.5136821891502641, + "grad_norm": 2.76875901222229, + "learning_rate": 4.155479488688025e-05, + "loss": 5.5584, + "step": 6420 + }, + { + "epoch": 0.5144823171707473, + "grad_norm": 3.2874600887298584, + "learning_rate": 4.1541423757822114e-05, + "loss": 5.8726, + "step": 6430 + }, + { + "epoch": 0.5152824451912306, + "grad_norm": 2.4672482013702393, + "learning_rate": 4.1528052628763977e-05, + "loss": 5.764, + "step": 6440 + }, + { + "epoch": 0.5160825732117139, + "grad_norm": 3.5424506664276123, + "learning_rate": 4.151468149970584e-05, + "loss": 5.6612, + "step": 6450 + }, + { + "epoch": 0.5168827012321972, + "grad_norm": 2.7947871685028076, + "learning_rate": 4.15013103706477e-05, + "loss": 5.668, + "step": 6460 + }, + { + "epoch": 0.5176828292526804, + "grad_norm": 2.624370574951172, + "learning_rate": 4.1487939241589565e-05, + "loss": 5.577, + "step": 6470 + }, + { + "epoch": 0.5184829572731637, + 
"grad_norm": 2.276289701461792, + "learning_rate": 4.147456811253143e-05, + "loss": 5.7592, + "step": 6480 + }, + { + "epoch": 0.519283085293647, + "grad_norm": 2.751945972442627, + "learning_rate": 4.146119698347329e-05, + "loss": 5.6251, + "step": 6490 + }, + { + "epoch": 0.5200832133141302, + "grad_norm": 2.1990444660186768, + "learning_rate": 4.144782585441515e-05, + "loss": 5.5141, + "step": 6500 + }, + { + "epoch": 0.5208833413346136, + "grad_norm": 2.732024908065796, + "learning_rate": 4.1434454725357015e-05, + "loss": 5.5938, + "step": 6510 + }, + { + "epoch": 0.5216834693550968, + "grad_norm": 2.6876533031463623, + "learning_rate": 4.142108359629887e-05, + "loss": 5.7126, + "step": 6520 + }, + { + "epoch": 0.5224835973755801, + "grad_norm": 2.660323143005371, + "learning_rate": 4.1407712467240734e-05, + "loss": 5.6261, + "step": 6530 + }, + { + "epoch": 0.5232837253960634, + "grad_norm": 2.567084550857544, + "learning_rate": 4.13943413381826e-05, + "loss": 5.5248, + "step": 6540 + }, + { + "epoch": 0.5240838534165466, + "grad_norm": 4.317018032073975, + "learning_rate": 4.138097020912446e-05, + "loss": 5.4444, + "step": 6550 + }, + { + "epoch": 0.52488398143703, + "grad_norm": 2.0361647605895996, + "learning_rate": 4.136759908006632e-05, + "loss": 5.7532, + "step": 6560 + }, + { + "epoch": 0.5256841094575132, + "grad_norm": 2.0946271419525146, + "learning_rate": 4.1354227951008185e-05, + "loss": 5.6343, + "step": 6570 + }, + { + "epoch": 0.5264842374779964, + "grad_norm": 3.3724842071533203, + "learning_rate": 4.134085682195005e-05, + "loss": 5.6455, + "step": 6580 + }, + { + "epoch": 0.5272843654984798, + "grad_norm": 4.078947067260742, + "learning_rate": 4.132748569289191e-05, + "loss": 5.6681, + "step": 6590 + }, + { + "epoch": 0.528084493518963, + "grad_norm": 4.288105010986328, + "learning_rate": 4.131411456383377e-05, + "loss": 5.7152, + "step": 6600 + }, + { + "epoch": 0.5288846215394463, + "grad_norm": 2.5208754539489746, + "learning_rate": 
4.1300743434775635e-05, + "loss": 5.5715, + "step": 6610 + }, + { + "epoch": 0.5296847495599296, + "grad_norm": 2.6902217864990234, + "learning_rate": 4.12873723057175e-05, + "loss": 5.4997, + "step": 6620 + }, + { + "epoch": 0.5304848775804129, + "grad_norm": 2.4580068588256836, + "learning_rate": 4.127400117665936e-05, + "loss": 5.7656, + "step": 6630 + }, + { + "epoch": 0.5312850056008962, + "grad_norm": 2.5117955207824707, + "learning_rate": 4.1260630047601224e-05, + "loss": 5.6373, + "step": 6640 + }, + { + "epoch": 0.5320851336213794, + "grad_norm": 2.660921096801758, + "learning_rate": 4.1247258918543086e-05, + "loss": 5.6829, + "step": 6650 + }, + { + "epoch": 0.5328852616418627, + "grad_norm": 2.4601287841796875, + "learning_rate": 4.123388778948495e-05, + "loss": 5.7702, + "step": 6660 + }, + { + "epoch": 0.533685389662346, + "grad_norm": 2.9025120735168457, + "learning_rate": 4.122051666042681e-05, + "loss": 5.6374, + "step": 6670 + }, + { + "epoch": 0.5344855176828293, + "grad_norm": 2.8221569061279297, + "learning_rate": 4.120714553136867e-05, + "loss": 5.5568, + "step": 6680 + }, + { + "epoch": 0.5352856457033125, + "grad_norm": 2.3035178184509277, + "learning_rate": 4.119377440231053e-05, + "loss": 5.5845, + "step": 6690 + }, + { + "epoch": 0.5360857737237958, + "grad_norm": 2.0955657958984375, + "learning_rate": 4.118040327325239e-05, + "loss": 5.687, + "step": 6700 + }, + { + "epoch": 0.5368859017442791, + "grad_norm": 2.530156135559082, + "learning_rate": 4.1167032144194256e-05, + "loss": 5.5772, + "step": 6710 + }, + { + "epoch": 0.5376860297647623, + "grad_norm": 2.2060387134552, + "learning_rate": 4.115366101513612e-05, + "loss": 5.5964, + "step": 6720 + }, + { + "epoch": 0.5384861577852457, + "grad_norm": 2.720702886581421, + "learning_rate": 4.114028988607798e-05, + "loss": 5.5432, + "step": 6730 + }, + { + "epoch": 0.5392862858057289, + "grad_norm": 2.2585232257843018, + "learning_rate": 4.1126918757019844e-05, + "loss": 5.77, + "step": 6740 
+ }, + { + "epoch": 0.5400864138262121, + "grad_norm": 2.052316904067993, + "learning_rate": 4.1113547627961706e-05, + "loss": 5.5679, + "step": 6750 + }, + { + "epoch": 0.5408865418466955, + "grad_norm": 2.772500991821289, + "learning_rate": 4.110017649890357e-05, + "loss": 5.5608, + "step": 6760 + }, + { + "epoch": 0.5416866698671787, + "grad_norm": 2.158129930496216, + "learning_rate": 4.108680536984543e-05, + "loss": 5.6612, + "step": 6770 + }, + { + "epoch": 0.5424867978876621, + "grad_norm": 2.874685287475586, + "learning_rate": 4.1073434240787294e-05, + "loss": 5.5999, + "step": 6780 + }, + { + "epoch": 0.5432869259081453, + "grad_norm": 2.2797632217407227, + "learning_rate": 4.106006311172916e-05, + "loss": 5.7243, + "step": 6790 + }, + { + "epoch": 0.5440870539286286, + "grad_norm": 2.998309850692749, + "learning_rate": 4.1048029095576836e-05, + "loss": 5.5031, + "step": 6800 + }, + { + "epoch": 0.5448871819491119, + "grad_norm": 2.8155364990234375, + "learning_rate": 4.10346579665187e-05, + "loss": 5.7631, + "step": 6810 + }, + { + "epoch": 0.5456873099695951, + "grad_norm": 2.327279806137085, + "learning_rate": 4.102128683746056e-05, + "loss": 5.6293, + "step": 6820 + }, + { + "epoch": 0.5464874379900784, + "grad_norm": 3.3200621604919434, + "learning_rate": 4.100791570840242e-05, + "loss": 5.717, + "step": 6830 + }, + { + "epoch": 0.5472875660105617, + "grad_norm": 2.521144390106201, + "learning_rate": 4.099454457934428e-05, + "loss": 5.5705, + "step": 6840 + }, + { + "epoch": 0.548087694031045, + "grad_norm": 2.7198219299316406, + "learning_rate": 4.098117345028614e-05, + "loss": 5.5931, + "step": 6850 + }, + { + "epoch": 0.5488878220515282, + "grad_norm": 2.701251268386841, + "learning_rate": 4.0967802321228006e-05, + "loss": 5.4706, + "step": 6860 + }, + { + "epoch": 0.5496879500720115, + "grad_norm": 2.2789149284362793, + "learning_rate": 4.095443119216987e-05, + "loss": 5.5883, + "step": 6870 + }, + { + "epoch": 0.5504880780924948, + "grad_norm": 
2.8821568489074707, + "learning_rate": 4.094106006311173e-05, + "loss": 5.7525, + "step": 6880 + }, + { + "epoch": 0.5512882061129781, + "grad_norm": 2.3450064659118652, + "learning_rate": 4.0927688934053594e-05, + "loss": 5.5166, + "step": 6890 + }, + { + "epoch": 0.5520883341334614, + "grad_norm": 2.639960527420044, + "learning_rate": 4.0914317804995456e-05, + "loss": 5.7001, + "step": 6900 + }, + { + "epoch": 0.5528884621539446, + "grad_norm": 2.6743710041046143, + "learning_rate": 4.090094667593732e-05, + "loss": 5.7049, + "step": 6910 + }, + { + "epoch": 0.553688590174428, + "grad_norm": 2.7540199756622314, + "learning_rate": 4.088757554687918e-05, + "loss": 5.5705, + "step": 6920 + }, + { + "epoch": 0.5544887181949112, + "grad_norm": 3.2703442573547363, + "learning_rate": 4.0874204417821044e-05, + "loss": 5.5585, + "step": 6930 + }, + { + "epoch": 0.5552888462153944, + "grad_norm": 3.684135913848877, + "learning_rate": 4.086083328876291e-05, + "loss": 5.6561, + "step": 6940 + }, + { + "epoch": 0.5560889742358778, + "grad_norm": 2.918989896774292, + "learning_rate": 4.084746215970477e-05, + "loss": 5.5171, + "step": 6950 + }, + { + "epoch": 0.556889102256361, + "grad_norm": 2.5902323722839355, + "learning_rate": 4.083409103064663e-05, + "loss": 5.6703, + "step": 6960 + }, + { + "epoch": 0.5576892302768442, + "grad_norm": 2.23820161819458, + "learning_rate": 4.0820719901588495e-05, + "loss": 5.7048, + "step": 6970 + }, + { + "epoch": 0.5584893582973276, + "grad_norm": 2.4339401721954346, + "learning_rate": 4.080734877253036e-05, + "loss": 5.4264, + "step": 6980 + }, + { + "epoch": 0.5592894863178108, + "grad_norm": 3.3097031116485596, + "learning_rate": 4.0793977643472214e-05, + "loss": 5.5931, + "step": 6990 + }, + { + "epoch": 0.5600896143382941, + "grad_norm": 2.6903202533721924, + "learning_rate": 4.0780606514414077e-05, + "loss": 5.5349, + "step": 7000 + }, + { + "epoch": 0.5600896143382941, + "eval_loss": 5.870830535888672, + "eval_runtime": 13.3044, + 
"eval_samples_per_second": 3.007, + "eval_steps_per_second": 0.376, + "step": 7000 + }, + { + "epoch": 0.5608897423587774, + "grad_norm": 2.144684314727783, + "learning_rate": 4.076723538535594e-05, + "loss": 5.6295, + "step": 7010 + }, + { + "epoch": 0.5616898703792607, + "grad_norm": 3.227046489715576, + "learning_rate": 4.07538642562978e-05, + "loss": 5.5506, + "step": 7020 + }, + { + "epoch": 0.562489998399744, + "grad_norm": 2.7323713302612305, + "learning_rate": 4.0740493127239665e-05, + "loss": 5.5441, + "step": 7030 + }, + { + "epoch": 0.5632901264202272, + "grad_norm": 2.3682384490966797, + "learning_rate": 4.072712199818153e-05, + "loss": 5.6632, + "step": 7040 + }, + { + "epoch": 0.5640902544407105, + "grad_norm": 3.006518602371216, + "learning_rate": 4.071375086912339e-05, + "loss": 5.5702, + "step": 7050 + }, + { + "epoch": 0.5648903824611938, + "grad_norm": 2.554481029510498, + "learning_rate": 4.070037974006525e-05, + "loss": 5.4405, + "step": 7060 + }, + { + "epoch": 0.5656905104816771, + "grad_norm": 2.2349042892456055, + "learning_rate": 4.0687008611007115e-05, + "loss": 5.5774, + "step": 7070 + }, + { + "epoch": 0.5664906385021603, + "grad_norm": 2.24906325340271, + "learning_rate": 4.067363748194898e-05, + "loss": 5.6362, + "step": 7080 + }, + { + "epoch": 0.5672907665226437, + "grad_norm": 2.2345407009124756, + "learning_rate": 4.066026635289084e-05, + "loss": 5.642, + "step": 7090 + }, + { + "epoch": 0.5680908945431269, + "grad_norm": 3.2273216247558594, + "learning_rate": 4.0646895223832703e-05, + "loss": 5.5204, + "step": 7100 + }, + { + "epoch": 0.5688910225636101, + "grad_norm": 2.689624071121216, + "learning_rate": 4.0633524094774566e-05, + "loss": 5.5565, + "step": 7110 + }, + { + "epoch": 0.5696911505840935, + "grad_norm": 3.4473490715026855, + "learning_rate": 4.062015296571643e-05, + "loss": 5.4041, + "step": 7120 + }, + { + "epoch": 0.5704912786045767, + "grad_norm": 2.528700590133667, + "learning_rate": 4.060678183665829e-05, + 
"loss": 5.4294, + "step": 7130 + }, + { + "epoch": 0.5712914066250601, + "grad_norm": 2.6679399013519287, + "learning_rate": 4.059341070760015e-05, + "loss": 5.6018, + "step": 7140 + }, + { + "epoch": 0.5720915346455433, + "grad_norm": 2.0572123527526855, + "learning_rate": 4.058003957854201e-05, + "loss": 5.6527, + "step": 7150 + }, + { + "epoch": 0.5728916626660265, + "grad_norm": 2.446279287338257, + "learning_rate": 4.056666844948387e-05, + "loss": 5.5862, + "step": 7160 + }, + { + "epoch": 0.5736917906865099, + "grad_norm": 2.067232131958008, + "learning_rate": 4.0553297320425735e-05, + "loss": 5.5159, + "step": 7170 + }, + { + "epoch": 0.5744919187069931, + "grad_norm": 2.225755214691162, + "learning_rate": 4.05399261913676e-05, + "loss": 5.6483, + "step": 7180 + }, + { + "epoch": 0.5752920467274764, + "grad_norm": 2.3613367080688477, + "learning_rate": 4.052655506230946e-05, + "loss": 5.6226, + "step": 7190 + }, + { + "epoch": 0.5760921747479597, + "grad_norm": 2.4239625930786133, + "learning_rate": 4.0513183933251324e-05, + "loss": 5.6164, + "step": 7200 + }, + { + "epoch": 0.5768923027684429, + "grad_norm": 3.5525450706481934, + "learning_rate": 4.0499812804193186e-05, + "loss": 5.4503, + "step": 7210 + }, + { + "epoch": 0.5776924307889262, + "grad_norm": 2.664311170578003, + "learning_rate": 4.048644167513505e-05, + "loss": 5.5188, + "step": 7220 + }, + { + "epoch": 0.5784925588094095, + "grad_norm": 2.4020540714263916, + "learning_rate": 4.047307054607691e-05, + "loss": 5.5481, + "step": 7230 + }, + { + "epoch": 0.5792926868298928, + "grad_norm": 2.256044626235962, + "learning_rate": 4.0459699417018774e-05, + "loss": 5.6097, + "step": 7240 + }, + { + "epoch": 0.5800928148503761, + "grad_norm": 2.1168150901794434, + "learning_rate": 4.044632828796064e-05, + "loss": 5.5249, + "step": 7250 + }, + { + "epoch": 0.5808929428708594, + "grad_norm": 2.329375743865967, + "learning_rate": 4.04329571589025e-05, + "loss": 5.504, + "step": 7260 + }, + { + "epoch": 
0.5816930708913426, + "grad_norm": 2.1734092235565186, + "learning_rate": 4.041958602984436e-05, + "loss": 5.5017, + "step": 7270 + }, + { + "epoch": 0.5824931989118259, + "grad_norm": 3.232649564743042, + "learning_rate": 4.0406214900786225e-05, + "loss": 5.6462, + "step": 7280 + }, + { + "epoch": 0.5832933269323092, + "grad_norm": 3.140702724456787, + "learning_rate": 4.039284377172809e-05, + "loss": 5.4393, + "step": 7290 + }, + { + "epoch": 0.5840934549527924, + "grad_norm": 2.284515619277954, + "learning_rate": 4.0379472642669944e-05, + "loss": 5.4891, + "step": 7300 + }, + { + "epoch": 0.5848935829732758, + "grad_norm": 4.518533706665039, + "learning_rate": 4.0366101513611806e-05, + "loss": 5.7371, + "step": 7310 + }, + { + "epoch": 0.585693710993759, + "grad_norm": 2.2323620319366455, + "learning_rate": 4.035273038455367e-05, + "loss": 5.6324, + "step": 7320 + }, + { + "epoch": 0.5864938390142422, + "grad_norm": 3.123394012451172, + "learning_rate": 4.033935925549553e-05, + "loss": 5.6266, + "step": 7330 + }, + { + "epoch": 0.5872939670347256, + "grad_norm": 2.577545642852783, + "learning_rate": 4.0325988126437394e-05, + "loss": 5.6541, + "step": 7340 + }, + { + "epoch": 0.5880940950552088, + "grad_norm": 2.8590281009674072, + "learning_rate": 4.031261699737926e-05, + "loss": 5.6927, + "step": 7350 + }, + { + "epoch": 0.5888942230756921, + "grad_norm": 3.0693793296813965, + "learning_rate": 4.029924586832112e-05, + "loss": 5.5101, + "step": 7360 + }, + { + "epoch": 0.5896943510961754, + "grad_norm": 2.5813119411468506, + "learning_rate": 4.028587473926298e-05, + "loss": 5.625, + "step": 7370 + }, + { + "epoch": 0.5904944791166586, + "grad_norm": 2.7804691791534424, + "learning_rate": 4.0272503610204845e-05, + "loss": 5.6264, + "step": 7380 + }, + { + "epoch": 0.591294607137142, + "grad_norm": 2.4291296005249023, + "learning_rate": 4.025913248114671e-05, + "loss": 5.5024, + "step": 7390 + }, + { + "epoch": 0.5920947351576252, + "grad_norm": 
2.6989386081695557, + "learning_rate": 4.024576135208857e-05, + "loss": 5.4484, + "step": 7400 + }, + { + "epoch": 0.5928948631781085, + "grad_norm": 2.42767596244812, + "learning_rate": 4.023239022303043e-05, + "loss": 5.5537, + "step": 7410 + }, + { + "epoch": 0.5936949911985918, + "grad_norm": 2.492577075958252, + "learning_rate": 4.0219019093972296e-05, + "loss": 5.616, + "step": 7420 + }, + { + "epoch": 0.594495119219075, + "grad_norm": 2.4696478843688965, + "learning_rate": 4.020564796491416e-05, + "loss": 5.62, + "step": 7430 + }, + { + "epoch": 0.5952952472395583, + "grad_norm": 3.2339985370635986, + "learning_rate": 4.019227683585602e-05, + "loss": 5.485, + "step": 7440 + }, + { + "epoch": 0.5960953752600416, + "grad_norm": 3.9647512435913086, + "learning_rate": 4.0178905706797884e-05, + "loss": 5.5868, + "step": 7450 + }, + { + "epoch": 0.5968955032805249, + "grad_norm": 2.36417293548584, + "learning_rate": 4.016553457773975e-05, + "loss": 5.5179, + "step": 7460 + }, + { + "epoch": 0.5976956313010081, + "grad_norm": 2.1484084129333496, + "learning_rate": 4.015216344868161e-05, + "loss": 5.6915, + "step": 7470 + }, + { + "epoch": 0.5984957593214915, + "grad_norm": 2.5233757495880127, + "learning_rate": 4.013879231962347e-05, + "loss": 5.4879, + "step": 7480 + }, + { + "epoch": 0.5992958873419747, + "grad_norm": 3.3730146884918213, + "learning_rate": 4.0125421190565335e-05, + "loss": 5.6531, + "step": 7490 + }, + { + "epoch": 0.600096015362458, + "grad_norm": 3.0788846015930176, + "learning_rate": 4.01120500615072e-05, + "loss": 5.6058, + "step": 7500 + }, + { + "epoch": 0.6008961433829413, + "grad_norm": 2.93515944480896, + "learning_rate": 4.009867893244906e-05, + "loss": 5.4777, + "step": 7510 + }, + { + "epoch": 0.6016962714034245, + "grad_norm": 2.6020236015319824, + "learning_rate": 4.008530780339092e-05, + "loss": 5.6444, + "step": 7520 + }, + { + "epoch": 0.6024963994239079, + "grad_norm": 2.4522392749786377, + "learning_rate": 
4.0071936674332786e-05, + "loss": 5.6157, + "step": 7530 + }, + { + "epoch": 0.6032965274443911, + "grad_norm": 3.1317343711853027, + "learning_rate": 4.005856554527465e-05, + "loss": 5.5527, + "step": 7540 + }, + { + "epoch": 0.6040966554648743, + "grad_norm": 2.485154390335083, + "learning_rate": 4.004519441621651e-05, + "loss": 5.6467, + "step": 7550 + }, + { + "epoch": 0.6048967834853577, + "grad_norm": 2.2032833099365234, + "learning_rate": 4.0031823287158374e-05, + "loss": 5.4957, + "step": 7560 + }, + { + "epoch": 0.6056969115058409, + "grad_norm": 3.1787898540496826, + "learning_rate": 4.0018452158100236e-05, + "loss": 5.6204, + "step": 7570 + }, + { + "epoch": 0.6064970395263242, + "grad_norm": 2.9925789833068848, + "learning_rate": 4.00050810290421e-05, + "loss": 5.6732, + "step": 7580 + }, + { + "epoch": 0.6072971675468075, + "grad_norm": 2.7631521224975586, + "learning_rate": 3.999170989998396e-05, + "loss": 5.6743, + "step": 7590 + }, + { + "epoch": 0.6080972955672908, + "grad_norm": 2.808265447616577, + "learning_rate": 3.997833877092582e-05, + "loss": 5.5951, + "step": 7600 + }, + { + "epoch": 0.608897423587774, + "grad_norm": 3.6244983673095703, + "learning_rate": 3.996496764186768e-05, + "loss": 5.5216, + "step": 7610 + }, + { + "epoch": 0.6096975516082573, + "grad_norm": 2.4245145320892334, + "learning_rate": 3.995159651280954e-05, + "loss": 5.5844, + "step": 7620 + }, + { + "epoch": 0.6104976796287406, + "grad_norm": 2.2855565547943115, + "learning_rate": 3.9938225383751406e-05, + "loss": 5.5674, + "step": 7630 + }, + { + "epoch": 0.6112978076492239, + "grad_norm": 2.2801260948181152, + "learning_rate": 3.992485425469327e-05, + "loss": 5.4406, + "step": 7640 + }, + { + "epoch": 0.6120979356697072, + "grad_norm": 2.0117592811584473, + "learning_rate": 3.991148312563513e-05, + "loss": 5.5463, + "step": 7650 + }, + { + "epoch": 0.6128980636901904, + "grad_norm": 3.110349655151367, + "learning_rate": 3.9898111996576994e-05, + "loss": 5.6124, + 
"step": 7660 + }, + { + "epoch": 0.6136981917106737, + "grad_norm": 2.9789066314697266, + "learning_rate": 3.9884740867518856e-05, + "loss": 5.789, + "step": 7670 + }, + { + "epoch": 0.614498319731157, + "grad_norm": 2.641871213912964, + "learning_rate": 3.987136973846072e-05, + "loss": 5.4838, + "step": 7680 + }, + { + "epoch": 0.6152984477516402, + "grad_norm": 3.82928466796875, + "learning_rate": 3.985799860940258e-05, + "loss": 5.7108, + "step": 7690 + }, + { + "epoch": 0.6160985757721236, + "grad_norm": 3.2533349990844727, + "learning_rate": 3.9844627480344444e-05, + "loss": 5.4167, + "step": 7700 + }, + { + "epoch": 0.6168987037926068, + "grad_norm": 2.4259872436523438, + "learning_rate": 3.983125635128631e-05, + "loss": 5.5539, + "step": 7710 + }, + { + "epoch": 0.61769883181309, + "grad_norm": 3.5356359481811523, + "learning_rate": 3.981788522222817e-05, + "loss": 5.4643, + "step": 7720 + }, + { + "epoch": 0.6184989598335734, + "grad_norm": 2.5774996280670166, + "learning_rate": 3.980451409317003e-05, + "loss": 5.5389, + "step": 7730 + }, + { + "epoch": 0.6192990878540566, + "grad_norm": 2.3197529315948486, + "learning_rate": 3.9791142964111895e-05, + "loss": 5.5724, + "step": 7740 + }, + { + "epoch": 0.62009921587454, + "grad_norm": 2.2660646438598633, + "learning_rate": 3.977777183505376e-05, + "loss": 5.5675, + "step": 7750 + }, + { + "epoch": 0.6208993438950232, + "grad_norm": 2.7596511840820312, + "learning_rate": 3.9764400705995614e-05, + "loss": 5.6168, + "step": 7760 + }, + { + "epoch": 0.6216994719155065, + "grad_norm": 2.4579806327819824, + "learning_rate": 3.9751029576937477e-05, + "loss": 5.4243, + "step": 7770 + }, + { + "epoch": 0.6224995999359898, + "grad_norm": 2.7039647102355957, + "learning_rate": 3.973765844787934e-05, + "loss": 5.633, + "step": 7780 + }, + { + "epoch": 0.623299727956473, + "grad_norm": 2.274777412414551, + "learning_rate": 3.97242873188212e-05, + "loss": 5.5945, + "step": 7790 + }, + { + "epoch": 0.6240998559769563, + 
"grad_norm": 2.4263217449188232, + "learning_rate": 3.9710916189763065e-05, + "loss": 5.6763, + "step": 7800 + }, + { + "epoch": 0.6248999839974396, + "grad_norm": 3.420625686645508, + "learning_rate": 3.969754506070493e-05, + "loss": 5.4884, + "step": 7810 + }, + { + "epoch": 0.6257001120179229, + "grad_norm": 2.1576149463653564, + "learning_rate": 3.968417393164679e-05, + "loss": 5.6325, + "step": 7820 + }, + { + "epoch": 0.6265002400384061, + "grad_norm": 2.4189348220825195, + "learning_rate": 3.967080280258865e-05, + "loss": 5.5113, + "step": 7830 + }, + { + "epoch": 0.6273003680588894, + "grad_norm": 2.533433675765991, + "learning_rate": 3.9657431673530515e-05, + "loss": 5.3743, + "step": 7840 + }, + { + "epoch": 0.6281004960793727, + "grad_norm": 2.2747883796691895, + "learning_rate": 3.964406054447238e-05, + "loss": 5.4912, + "step": 7850 + }, + { + "epoch": 0.628900624099856, + "grad_norm": 2.546261787414551, + "learning_rate": 3.963068941541424e-05, + "loss": 5.6571, + "step": 7860 + }, + { + "epoch": 0.6297007521203393, + "grad_norm": 2.5970914363861084, + "learning_rate": 3.9617318286356103e-05, + "loss": 5.6732, + "step": 7870 + }, + { + "epoch": 0.6305008801408225, + "grad_norm": 2.956646680831909, + "learning_rate": 3.9603947157297966e-05, + "loss": 5.4769, + "step": 7880 + }, + { + "epoch": 0.6313010081613059, + "grad_norm": 2.9553463459014893, + "learning_rate": 3.959057602823983e-05, + "loss": 5.4675, + "step": 7890 + }, + { + "epoch": 0.6321011361817891, + "grad_norm": 2.6471643447875977, + "learning_rate": 3.957720489918169e-05, + "loss": 5.4538, + "step": 7900 + }, + { + "epoch": 0.6329012642022723, + "grad_norm": 2.847944736480713, + "learning_rate": 3.956383377012355e-05, + "loss": 5.384, + "step": 7910 + }, + { + "epoch": 0.6337013922227557, + "grad_norm": 3.6218080520629883, + "learning_rate": 3.955046264106541e-05, + "loss": 5.56, + "step": 7920 + }, + { + "epoch": 0.6345015202432389, + "grad_norm": 2.396426200866699, + "learning_rate": 
3.953709151200727e-05, + "loss": 5.6353, + "step": 7930 + }, + { + "epoch": 0.6353016482637222, + "grad_norm": 2.4465904235839844, + "learning_rate": 3.9523720382949135e-05, + "loss": 5.6698, + "step": 7940 + }, + { + "epoch": 0.6361017762842055, + "grad_norm": 2.6707208156585693, + "learning_rate": 3.9510349253891e-05, + "loss": 5.4316, + "step": 7950 + }, + { + "epoch": 0.6369019043046887, + "grad_norm": 2.982117176055908, + "learning_rate": 3.949697812483286e-05, + "loss": 5.6359, + "step": 7960 + }, + { + "epoch": 0.637702032325172, + "grad_norm": 2.6343331336975098, + "learning_rate": 3.9483606995774724e-05, + "loss": 5.6188, + "step": 7970 + }, + { + "epoch": 0.6385021603456553, + "grad_norm": 2.290728807449341, + "learning_rate": 3.9470235866716586e-05, + "loss": 5.5824, + "step": 7980 + }, + { + "epoch": 0.6393022883661386, + "grad_norm": 2.3056259155273438, + "learning_rate": 3.945686473765845e-05, + "loss": 5.5314, + "step": 7990 + }, + { + "epoch": 0.6401024163866219, + "grad_norm": 2.301790714263916, + "learning_rate": 3.944349360860031e-05, + "loss": 5.497, + "step": 8000 + }, + { + "epoch": 0.6409025444071051, + "grad_norm": 2.2784414291381836, + "learning_rate": 3.9430122479542174e-05, + "loss": 5.6482, + "step": 8010 + }, + { + "epoch": 0.6417026724275884, + "grad_norm": 2.3686752319335938, + "learning_rate": 3.941675135048404e-05, + "loss": 5.449, + "step": 8020 + }, + { + "epoch": 0.6425028004480717, + "grad_norm": 3.0353329181671143, + "learning_rate": 3.94033802214259e-05, + "loss": 5.4544, + "step": 8030 + }, + { + "epoch": 0.643302928468555, + "grad_norm": 3.035477876663208, + "learning_rate": 3.939000909236776e-05, + "loss": 5.4641, + "step": 8040 + }, + { + "epoch": 0.6441030564890382, + "grad_norm": 2.6078028678894043, + "learning_rate": 3.9376637963309625e-05, + "loss": 5.6181, + "step": 8050 + }, + { + "epoch": 0.6449031845095216, + "grad_norm": 2.7835607528686523, + "learning_rate": 3.936326683425149e-05, + "loss": 5.459, + "step": 8060 
+ }, + { + "epoch": 0.6457033125300048, + "grad_norm": 2.465331792831421, + "learning_rate": 3.9349895705193344e-05, + "loss": 5.5365, + "step": 8070 + }, + { + "epoch": 0.646503440550488, + "grad_norm": 2.0666961669921875, + "learning_rate": 3.9336524576135206e-05, + "loss": 5.5158, + "step": 8080 + }, + { + "epoch": 0.6473035685709714, + "grad_norm": 2.2512967586517334, + "learning_rate": 3.932315344707707e-05, + "loss": 5.4235, + "step": 8090 + }, + { + "epoch": 0.6481036965914546, + "grad_norm": 2.081125497817993, + "learning_rate": 3.930978231801893e-05, + "loss": 5.4172, + "step": 8100 + }, + { + "epoch": 0.648903824611938, + "grad_norm": 2.0393776893615723, + "learning_rate": 3.9296411188960794e-05, + "loss": 5.5454, + "step": 8110 + }, + { + "epoch": 0.6497039526324212, + "grad_norm": 2.671065092086792, + "learning_rate": 3.928304005990266e-05, + "loss": 5.4562, + "step": 8120 + }, + { + "epoch": 0.6505040806529044, + "grad_norm": 2.3266165256500244, + "learning_rate": 3.926966893084452e-05, + "loss": 5.5839, + "step": 8130 + }, + { + "epoch": 0.6513042086733878, + "grad_norm": 2.400386333465576, + "learning_rate": 3.925629780178638e-05, + "loss": 5.7815, + "step": 8140 + }, + { + "epoch": 0.652104336693871, + "grad_norm": 2.3798139095306396, + "learning_rate": 3.9242926672728245e-05, + "loss": 5.5736, + "step": 8150 + }, + { + "epoch": 0.6529044647143543, + "grad_norm": 2.4090096950531006, + "learning_rate": 3.922955554367011e-05, + "loss": 5.4634, + "step": 8160 + }, + { + "epoch": 0.6537045927348376, + "grad_norm": 3.5072951316833496, + "learning_rate": 3.921618441461197e-05, + "loss": 5.5608, + "step": 8170 + }, + { + "epoch": 0.6545047207553208, + "grad_norm": 2.364222526550293, + "learning_rate": 3.920281328555383e-05, + "loss": 5.7275, + "step": 8180 + }, + { + "epoch": 0.6553048487758041, + "grad_norm": 4.594448566436768, + "learning_rate": 3.9189442156495696e-05, + "loss": 5.7235, + "step": 8190 + }, + { + "epoch": 0.6561049767962874, + 
"grad_norm": 3.863098621368408, + "learning_rate": 3.917607102743756e-05, + "loss": 5.5359, + "step": 8200 + }, + { + "epoch": 0.6569051048167707, + "grad_norm": 3.201704978942871, + "learning_rate": 3.916269989837942e-05, + "loss": 5.645, + "step": 8210 + }, + { + "epoch": 0.6577052328372539, + "grad_norm": 2.697448492050171, + "learning_rate": 3.9149328769321284e-05, + "loss": 5.523, + "step": 8220 + }, + { + "epoch": 0.6585053608577373, + "grad_norm": 2.4561972618103027, + "learning_rate": 3.913595764026315e-05, + "loss": 5.734, + "step": 8230 + }, + { + "epoch": 0.6593054888782205, + "grad_norm": 4.527692794799805, + "learning_rate": 3.912258651120501e-05, + "loss": 5.4594, + "step": 8240 + }, + { + "epoch": 0.6601056168987038, + "grad_norm": 2.8713691234588623, + "learning_rate": 3.910921538214687e-05, + "loss": 5.7247, + "step": 8250 + }, + { + "epoch": 0.6609057449191871, + "grad_norm": 2.167921304702759, + "learning_rate": 3.9095844253088735e-05, + "loss": 5.6405, + "step": 8260 + }, + { + "epoch": 0.6617058729396703, + "grad_norm": 2.8967878818511963, + "learning_rate": 3.90824731240306e-05, + "loss": 5.4989, + "step": 8270 + }, + { + "epoch": 0.6625060009601537, + "grad_norm": 2.002103090286255, + "learning_rate": 3.906910199497246e-05, + "loss": 5.4434, + "step": 8280 + }, + { + "epoch": 0.6633061289806369, + "grad_norm": 2.187889575958252, + "learning_rate": 3.905573086591432e-05, + "loss": 5.4078, + "step": 8290 + }, + { + "epoch": 0.6641062570011201, + "grad_norm": 2.4078755378723145, + "learning_rate": 3.9042359736856186e-05, + "loss": 5.5381, + "step": 8300 + }, + { + "epoch": 0.6649063850216035, + "grad_norm": 3.071484327316284, + "learning_rate": 3.902898860779805e-05, + "loss": 5.4298, + "step": 8310 + }, + { + "epoch": 0.6657065130420867, + "grad_norm": 3.8413217067718506, + "learning_rate": 3.901561747873991e-05, + "loss": 5.4844, + "step": 8320 + }, + { + "epoch": 0.66650664106257, + "grad_norm": 3.0394554138183594, + "learning_rate": 
3.9002246349681774e-05, + "loss": 5.5524, + "step": 8330 + }, + { + "epoch": 0.6673067690830533, + "grad_norm": 2.635354518890381, + "learning_rate": 3.8988875220623636e-05, + "loss": 5.5727, + "step": 8340 + }, + { + "epoch": 0.6681068971035365, + "grad_norm": 2.2557764053344727, + "learning_rate": 3.89755040915655e-05, + "loss": 5.3455, + "step": 8350 + }, + { + "epoch": 0.6689070251240199, + "grad_norm": 2.837040662765503, + "learning_rate": 3.896213296250736e-05, + "loss": 5.3729, + "step": 8360 + }, + { + "epoch": 0.6697071531445031, + "grad_norm": 6.783266067504883, + "learning_rate": 3.8948761833449224e-05, + "loss": 5.4372, + "step": 8370 + }, + { + "epoch": 0.6705072811649864, + "grad_norm": 2.20611310005188, + "learning_rate": 3.893539070439108e-05, + "loss": 5.4983, + "step": 8380 + }, + { + "epoch": 0.6713074091854697, + "grad_norm": 2.378692626953125, + "learning_rate": 3.892201957533294e-05, + "loss": 5.6309, + "step": 8390 + }, + { + "epoch": 0.672107537205953, + "grad_norm": 2.7219278812408447, + "learning_rate": 3.8908648446274806e-05, + "loss": 5.67, + "step": 8400 + }, + { + "epoch": 0.6729076652264362, + "grad_norm": 2.7383148670196533, + "learning_rate": 3.889527731721667e-05, + "loss": 5.5648, + "step": 8410 + }, + { + "epoch": 0.6737077932469195, + "grad_norm": 1.882124423980713, + "learning_rate": 3.888190618815853e-05, + "loss": 5.5879, + "step": 8420 + }, + { + "epoch": 0.6745079212674028, + "grad_norm": 2.5975465774536133, + "learning_rate": 3.8868535059100394e-05, + "loss": 5.5644, + "step": 8430 + }, + { + "epoch": 0.675308049287886, + "grad_norm": 3.4361534118652344, + "learning_rate": 3.8855163930042256e-05, + "loss": 5.6302, + "step": 8440 + }, + { + "epoch": 0.6761081773083694, + "grad_norm": 2.241267442703247, + "learning_rate": 3.884179280098412e-05, + "loss": 5.5003, + "step": 8450 + }, + { + "epoch": 0.6769083053288526, + "grad_norm": 1.9234975576400757, + "learning_rate": 3.882842167192598e-05, + "loss": 5.4739, + "step": 8460 
+ }, + { + "epoch": 0.677708433349336, + "grad_norm": 2.05928897857666, + "learning_rate": 3.8815050542867845e-05, + "loss": 5.5566, + "step": 8470 + }, + { + "epoch": 0.6785085613698192, + "grad_norm": 2.5602006912231445, + "learning_rate": 3.880167941380971e-05, + "loss": 5.6363, + "step": 8480 + }, + { + "epoch": 0.6793086893903024, + "grad_norm": 2.36325740814209, + "learning_rate": 3.878830828475157e-05, + "loss": 5.4635, + "step": 8490 + }, + { + "epoch": 0.6801088174107858, + "grad_norm": 3.087769031524658, + "learning_rate": 3.877493715569343e-05, + "loss": 5.5537, + "step": 8500 + }, + { + "epoch": 0.680908945431269, + "grad_norm": 2.759660482406616, + "learning_rate": 3.8761566026635295e-05, + "loss": 5.5427, + "step": 8510 + }, + { + "epoch": 0.6817090734517522, + "grad_norm": 2.7726991176605225, + "learning_rate": 3.874819489757716e-05, + "loss": 5.4868, + "step": 8520 + }, + { + "epoch": 0.6825092014722356, + "grad_norm": 3.408202648162842, + "learning_rate": 3.8734823768519014e-05, + "loss": 5.5416, + "step": 8530 + }, + { + "epoch": 0.6833093294927188, + "grad_norm": 3.801959753036499, + "learning_rate": 3.8721452639460877e-05, + "loss": 5.5577, + "step": 8540 + }, + { + "epoch": 0.6841094575132021, + "grad_norm": 2.7447824478149414, + "learning_rate": 3.870808151040274e-05, + "loss": 5.5837, + "step": 8550 + }, + { + "epoch": 0.6849095855336854, + "grad_norm": 3.7551326751708984, + "learning_rate": 3.86947103813446e-05, + "loss": 5.4772, + "step": 8560 + }, + { + "epoch": 0.6857097135541687, + "grad_norm": 2.036146640777588, + "learning_rate": 3.8681339252286465e-05, + "loss": 5.659, + "step": 8570 + }, + { + "epoch": 0.6865098415746519, + "grad_norm": 2.392986536026001, + "learning_rate": 3.866796812322833e-05, + "loss": 5.3913, + "step": 8580 + }, + { + "epoch": 0.6873099695951352, + "grad_norm": 2.7194063663482666, + "learning_rate": 3.865459699417019e-05, + "loss": 5.418, + "step": 8590 + }, + { + "epoch": 0.6881100976156185, + "grad_norm": 
2.2499608993530273, + "learning_rate": 3.864122586511205e-05, + "loss": 5.4924, + "step": 8600 + }, + { + "epoch": 0.6889102256361018, + "grad_norm": 3.661318302154541, + "learning_rate": 3.8627854736053915e-05, + "loss": 5.5578, + "step": 8610 + }, + { + "epoch": 0.6897103536565851, + "grad_norm": 3.076019048690796, + "learning_rate": 3.861448360699578e-05, + "loss": 5.6017, + "step": 8620 + }, + { + "epoch": 0.6905104816770683, + "grad_norm": 2.133923053741455, + "learning_rate": 3.860111247793764e-05, + "loss": 5.5295, + "step": 8630 + }, + { + "epoch": 0.6913106096975516, + "grad_norm": 3.3584773540496826, + "learning_rate": 3.8587741348879503e-05, + "loss": 5.4534, + "step": 8640 + }, + { + "epoch": 0.6921107377180349, + "grad_norm": 2.499058723449707, + "learning_rate": 3.8574370219821366e-05, + "loss": 5.3402, + "step": 8650 + }, + { + "epoch": 0.6929108657385181, + "grad_norm": 2.5099146366119385, + "learning_rate": 3.856099909076323e-05, + "loss": 5.3765, + "step": 8660 + }, + { + "epoch": 0.6937109937590015, + "grad_norm": 2.9601941108703613, + "learning_rate": 3.854762796170509e-05, + "loss": 5.5139, + "step": 8670 + }, + { + "epoch": 0.6945111217794847, + "grad_norm": 3.2487246990203857, + "learning_rate": 3.8534256832646954e-05, + "loss": 5.5665, + "step": 8680 + }, + { + "epoch": 0.695311249799968, + "grad_norm": 2.8433704376220703, + "learning_rate": 3.852088570358881e-05, + "loss": 5.4445, + "step": 8690 + }, + { + "epoch": 0.6961113778204513, + "grad_norm": 2.204953670501709, + "learning_rate": 3.850751457453067e-05, + "loss": 5.5415, + "step": 8700 + }, + { + "epoch": 0.6969115058409345, + "grad_norm": 2.7477571964263916, + "learning_rate": 3.8494143445472536e-05, + "loss": 5.5603, + "step": 8710 + }, + { + "epoch": 0.6977116338614179, + "grad_norm": 3.2059755325317383, + "learning_rate": 3.84807723164144e-05, + "loss": 5.5524, + "step": 8720 + }, + { + "epoch": 0.6985117618819011, + "grad_norm": 3.2654213905334473, + "learning_rate": 
3.846740118735626e-05, + "loss": 5.5482, + "step": 8730 + }, + { + "epoch": 0.6993118899023844, + "grad_norm": 2.3536834716796875, + "learning_rate": 3.8454030058298124e-05, + "loss": 5.6251, + "step": 8740 + }, + { + "epoch": 0.7001120179228677, + "grad_norm": 3.132542371749878, + "learning_rate": 3.8440658929239986e-05, + "loss": 5.762, + "step": 8750 + }, + { + "epoch": 0.7009121459433509, + "grad_norm": 2.3961470127105713, + "learning_rate": 3.842728780018185e-05, + "loss": 5.4919, + "step": 8760 + }, + { + "epoch": 0.7017122739638342, + "grad_norm": 1.9365229606628418, + "learning_rate": 3.841391667112371e-05, + "loss": 5.4369, + "step": 8770 + }, + { + "epoch": 0.7025124019843175, + "grad_norm": 2.227877140045166, + "learning_rate": 3.8400545542065574e-05, + "loss": 5.4361, + "step": 8780 + }, + { + "epoch": 0.7033125300048008, + "grad_norm": 2.521822452545166, + "learning_rate": 3.838717441300744e-05, + "loss": 5.6763, + "step": 8790 + }, + { + "epoch": 0.704112658025284, + "grad_norm": 2.4155185222625732, + "learning_rate": 3.83738032839493e-05, + "loss": 5.7041, + "step": 8800 + }, + { + "epoch": 0.7049127860457673, + "grad_norm": 1.9704358577728271, + "learning_rate": 3.836043215489116e-05, + "loss": 5.5136, + "step": 8810 + }, + { + "epoch": 0.7057129140662506, + "grad_norm": 3.447098731994629, + "learning_rate": 3.8347061025833025e-05, + "loss": 5.5963, + "step": 8820 + }, + { + "epoch": 0.7065130420867338, + "grad_norm": 2.0857930183410645, + "learning_rate": 3.833368989677489e-05, + "loss": 5.5328, + "step": 8830 + }, + { + "epoch": 0.7073131701072172, + "grad_norm": 5.354836940765381, + "learning_rate": 3.8320318767716744e-05, + "loss": 5.561, + "step": 8840 + }, + { + "epoch": 0.7081132981277004, + "grad_norm": 2.1317214965820312, + "learning_rate": 3.8306947638658606e-05, + "loss": 5.7044, + "step": 8850 + }, + { + "epoch": 0.7089134261481838, + "grad_norm": 2.163472890853882, + "learning_rate": 3.829357650960047e-05, + "loss": 5.4564, + "step": 
8860 + }, + { + "epoch": 0.709713554168667, + "grad_norm": 2.155075788497925, + "learning_rate": 3.828020538054233e-05, + "loss": 5.5767, + "step": 8870 + }, + { + "epoch": 0.7105136821891502, + "grad_norm": 2.225407361984253, + "learning_rate": 3.8266834251484194e-05, + "loss": 5.574, + "step": 8880 + }, + { + "epoch": 0.7113138102096336, + "grad_norm": 2.737126350402832, + "learning_rate": 3.825346312242606e-05, + "loss": 5.5425, + "step": 8890 + }, + { + "epoch": 0.7121139382301168, + "grad_norm": 3.4771502017974854, + "learning_rate": 3.824009199336792e-05, + "loss": 5.6085, + "step": 8900 + }, + { + "epoch": 0.7129140662506, + "grad_norm": 3.2826528549194336, + "learning_rate": 3.822672086430978e-05, + "loss": 5.5632, + "step": 8910 + }, + { + "epoch": 0.7137141942710834, + "grad_norm": 2.4936113357543945, + "learning_rate": 3.8213349735251645e-05, + "loss": 5.4818, + "step": 8920 + }, + { + "epoch": 0.7145143222915666, + "grad_norm": 3.6719648838043213, + "learning_rate": 3.819997860619351e-05, + "loss": 5.5637, + "step": 8930 + }, + { + "epoch": 0.7153144503120499, + "grad_norm": 2.7252962589263916, + "learning_rate": 3.818660747713537e-05, + "loss": 5.5623, + "step": 8940 + }, + { + "epoch": 0.7161145783325332, + "grad_norm": 3.8873820304870605, + "learning_rate": 3.817323634807723e-05, + "loss": 5.5009, + "step": 8950 + }, + { + "epoch": 0.7169147063530165, + "grad_norm": 2.6248092651367188, + "learning_rate": 3.8159865219019096e-05, + "loss": 5.6683, + "step": 8960 + }, + { + "epoch": 0.7177148343734998, + "grad_norm": 2.1327767372131348, + "learning_rate": 3.814649408996096e-05, + "loss": 5.373, + "step": 8970 + }, + { + "epoch": 0.718514962393983, + "grad_norm": 3.1641392707824707, + "learning_rate": 3.813312296090282e-05, + "loss": 5.6192, + "step": 8980 + }, + { + "epoch": 0.7193150904144663, + "grad_norm": 2.533423662185669, + "learning_rate": 3.811975183184468e-05, + "loss": 5.4736, + "step": 8990 + }, + { + "epoch": 0.7201152184349496, + 
"grad_norm": 2.892228841781616, + "learning_rate": 3.810638070278654e-05, + "loss": 5.437, + "step": 9000 + }, + { + "epoch": 0.7209153464554329, + "grad_norm": 2.295328140258789, + "learning_rate": 3.80930095737284e-05, + "loss": 5.4327, + "step": 9010 + }, + { + "epoch": 0.7217154744759161, + "grad_norm": 2.4300477504730225, + "learning_rate": 3.8079638444670265e-05, + "loss": 5.6341, + "step": 9020 + }, + { + "epoch": 0.7225156024963995, + "grad_norm": 4.092593669891357, + "learning_rate": 3.806626731561213e-05, + "loss": 5.5062, + "step": 9030 + }, + { + "epoch": 0.7233157305168827, + "grad_norm": 2.7330925464630127, + "learning_rate": 3.805289618655399e-05, + "loss": 5.4915, + "step": 9040 + }, + { + "epoch": 0.7241158585373659, + "grad_norm": 2.0372865200042725, + "learning_rate": 3.8039525057495853e-05, + "loss": 5.5056, + "step": 9050 + }, + { + "epoch": 0.7249159865578493, + "grad_norm": 2.5585618019104004, + "learning_rate": 3.8026153928437716e-05, + "loss": 5.4614, + "step": 9060 + }, + { + "epoch": 0.7257161145783325, + "grad_norm": 2.653251886367798, + "learning_rate": 3.801278279937958e-05, + "loss": 5.4437, + "step": 9070 + }, + { + "epoch": 0.7265162425988159, + "grad_norm": 2.7902703285217285, + "learning_rate": 3.799941167032144e-05, + "loss": 5.4927, + "step": 9080 + }, + { + "epoch": 0.7273163706192991, + "grad_norm": 3.366363525390625, + "learning_rate": 3.7986040541263304e-05, + "loss": 5.382, + "step": 9090 + }, + { + "epoch": 0.7281164986397823, + "grad_norm": 2.065732479095459, + "learning_rate": 3.797266941220517e-05, + "loss": 5.5663, + "step": 9100 + }, + { + "epoch": 0.7289166266602657, + "grad_norm": 3.823241710662842, + "learning_rate": 3.795929828314703e-05, + "loss": 5.4697, + "step": 9110 + }, + { + "epoch": 0.7297167546807489, + "grad_norm": 2.3972017765045166, + "learning_rate": 3.794592715408889e-05, + "loss": 5.5508, + "step": 9120 + }, + { + "epoch": 0.7305168827012322, + "grad_norm": 2.4955368041992188, + "learning_rate": 
3.7932556025030755e-05, + "loss": 5.5437, + "step": 9130 + }, + { + "epoch": 0.7313170107217155, + "grad_norm": 5.454606533050537, + "learning_rate": 3.791918489597262e-05, + "loss": 5.4974, + "step": 9140 + }, + { + "epoch": 0.7321171387421987, + "grad_norm": 2.6541287899017334, + "learning_rate": 3.790581376691448e-05, + "loss": 5.5327, + "step": 9150 + }, + { + "epoch": 0.732917266762682, + "grad_norm": 2.974902391433716, + "learning_rate": 3.789244263785634e-05, + "loss": 5.5352, + "step": 9160 + }, + { + "epoch": 0.7337173947831653, + "grad_norm": 7.2000274658203125, + "learning_rate": 3.7879071508798206e-05, + "loss": 5.5946, + "step": 9170 + }, + { + "epoch": 0.7345175228036486, + "grad_norm": 2.418121576309204, + "learning_rate": 3.786570037974007e-05, + "loss": 5.4985, + "step": 9180 + }, + { + "epoch": 0.7353176508241318, + "grad_norm": 2.3174428939819336, + "learning_rate": 3.785232925068193e-05, + "loss": 5.6393, + "step": 9190 + }, + { + "epoch": 0.7361177788446152, + "grad_norm": 2.172489643096924, + "learning_rate": 3.7838958121623794e-05, + "loss": 5.6173, + "step": 9200 + }, + { + "epoch": 0.7369179068650984, + "grad_norm": 3.9107019901275635, + "learning_rate": 3.7825586992565656e-05, + "loss": 5.4436, + "step": 9210 + }, + { + "epoch": 0.7377180348855817, + "grad_norm": 2.3483355045318604, + "learning_rate": 3.781221586350752e-05, + "loss": 5.4981, + "step": 9220 + }, + { + "epoch": 0.738518162906065, + "grad_norm": 3.839348077774048, + "learning_rate": 3.779884473444938e-05, + "loss": 5.5541, + "step": 9230 + }, + { + "epoch": 0.7393182909265482, + "grad_norm": 1.686996579170227, + "learning_rate": 3.7785473605391245e-05, + "loss": 5.6328, + "step": 9240 + }, + { + "epoch": 0.7401184189470316, + "grad_norm": 2.7277584075927734, + "learning_rate": 3.777210247633311e-05, + "loss": 5.5787, + "step": 9250 + }, + { + "epoch": 0.7409185469675148, + "grad_norm": 2.60896635055542, + "learning_rate": 3.775873134727497e-05, + "loss": 5.5082, + "step": 
9260 + }, + { + "epoch": 0.741718674987998, + "grad_norm": 2.957674264907837, + "learning_rate": 3.774669733112264e-05, + "loss": 5.516, + "step": 9270 + }, + { + "epoch": 0.7425188030084814, + "grad_norm": 2.223433017730713, + "learning_rate": 3.7733326202064505e-05, + "loss": 5.502, + "step": 9280 + }, + { + "epoch": 0.7433189310289646, + "grad_norm": 2.6075685024261475, + "learning_rate": 3.771995507300637e-05, + "loss": 5.5067, + "step": 9290 + }, + { + "epoch": 0.7441190590494479, + "grad_norm": 2.6572721004486084, + "learning_rate": 3.7706583943948224e-05, + "loss": 5.6304, + "step": 9300 + }, + { + "epoch": 0.7449191870699312, + "grad_norm": 2.0563318729400635, + "learning_rate": 3.7693212814890086e-05, + "loss": 5.4974, + "step": 9310 + }, + { + "epoch": 0.7457193150904144, + "grad_norm": 2.032820463180542, + "learning_rate": 3.767984168583195e-05, + "loss": 5.6016, + "step": 9320 + }, + { + "epoch": 0.7465194431108978, + "grad_norm": 5.646316051483154, + "learning_rate": 3.766647055677381e-05, + "loss": 5.6661, + "step": 9330 + }, + { + "epoch": 0.747319571131381, + "grad_norm": 2.5043859481811523, + "learning_rate": 3.7653099427715674e-05, + "loss": 5.6445, + "step": 9340 + }, + { + "epoch": 0.7481196991518643, + "grad_norm": 2.817434787750244, + "learning_rate": 3.763972829865754e-05, + "loss": 5.3901, + "step": 9350 + }, + { + "epoch": 0.7489198271723476, + "grad_norm": 2.4041759967803955, + "learning_rate": 3.76263571695994e-05, + "loss": 5.7132, + "step": 9360 + }, + { + "epoch": 0.7497199551928309, + "grad_norm": 1.8806638717651367, + "learning_rate": 3.761298604054126e-05, + "loss": 5.5203, + "step": 9370 + }, + { + "epoch": 0.7505200832133141, + "grad_norm": 2.088700532913208, + "learning_rate": 3.7599614911483125e-05, + "loss": 5.4414, + "step": 9380 + }, + { + "epoch": 0.7513202112337974, + "grad_norm": 2.519188165664673, + "learning_rate": 3.758624378242499e-05, + "loss": 5.4094, + "step": 9390 + }, + { + "epoch": 0.7521203392542807, + 
"grad_norm": 4.597784042358398, + "learning_rate": 3.757287265336685e-05, + "loss": 5.6246, + "step": 9400 + }, + { + "epoch": 0.7529204672747639, + "grad_norm": 2.0422868728637695, + "learning_rate": 3.755950152430871e-05, + "loss": 5.3393, + "step": 9410 + }, + { + "epoch": 0.7537205952952473, + "grad_norm": 3.0451338291168213, + "learning_rate": 3.7546130395250576e-05, + "loss": 5.618, + "step": 9420 + }, + { + "epoch": 0.7545207233157305, + "grad_norm": 2.3379099369049072, + "learning_rate": 3.753275926619244e-05, + "loss": 5.4859, + "step": 9430 + }, + { + "epoch": 0.7553208513362137, + "grad_norm": 2.6721060276031494, + "learning_rate": 3.75193881371343e-05, + "loss": 5.5349, + "step": 9440 + }, + { + "epoch": 0.7561209793566971, + "grad_norm": 2.495716094970703, + "learning_rate": 3.7506017008076164e-05, + "loss": 5.626, + "step": 9450 + }, + { + "epoch": 0.7569211073771803, + "grad_norm": 2.9002442359924316, + "learning_rate": 3.749264587901803e-05, + "loss": 5.5438, + "step": 9460 + }, + { + "epoch": 0.7577212353976637, + "grad_norm": 2.3616931438446045, + "learning_rate": 3.747927474995989e-05, + "loss": 5.6381, + "step": 9470 + }, + { + "epoch": 0.7585213634181469, + "grad_norm": 2.389329433441162, + "learning_rate": 3.746590362090175e-05, + "loss": 5.4326, + "step": 9480 + }, + { + "epoch": 0.7593214914386301, + "grad_norm": 2.1870810985565186, + "learning_rate": 3.7452532491843615e-05, + "loss": 5.5129, + "step": 9490 + }, + { + "epoch": 0.7601216194591135, + "grad_norm": 2.2454891204833984, + "learning_rate": 3.743916136278548e-05, + "loss": 5.3963, + "step": 9500 + }, + { + "epoch": 0.7609217474795967, + "grad_norm": 2.5803539752960205, + "learning_rate": 3.742579023372734e-05, + "loss": 5.5237, + "step": 9510 + }, + { + "epoch": 0.76172187550008, + "grad_norm": 2.5508155822753906, + "learning_rate": 3.74124191046692e-05, + "loss": 5.4525, + "step": 9520 + }, + { + "epoch": 0.7625220035205633, + "grad_norm": 3.693437337875366, + "learning_rate": 
3.7399047975611065e-05, + "loss": 5.5101, + "step": 9530 + }, + { + "epoch": 0.7633221315410466, + "grad_norm": 2.4398484230041504, + "learning_rate": 3.738567684655293e-05, + "loss": 5.5372, + "step": 9540 + }, + { + "epoch": 0.7641222595615298, + "grad_norm": 2.226680278778076, + "learning_rate": 3.737230571749479e-05, + "loss": 5.3711, + "step": 9550 + }, + { + "epoch": 0.7649223875820131, + "grad_norm": 2.182704210281372, + "learning_rate": 3.7358934588436654e-05, + "loss": 5.4957, + "step": 9560 + }, + { + "epoch": 0.7657225156024964, + "grad_norm": 3.145799398422241, + "learning_rate": 3.7345563459378516e-05, + "loss": 5.5411, + "step": 9570 + }, + { + "epoch": 0.7665226436229797, + "grad_norm": 2.656719923019409, + "learning_rate": 3.733219233032038e-05, + "loss": 5.4737, + "step": 9580 + }, + { + "epoch": 0.767322771643463, + "grad_norm": 2.2230639457702637, + "learning_rate": 3.731882120126224e-05, + "loss": 5.5192, + "step": 9590 + }, + { + "epoch": 0.7681228996639462, + "grad_norm": 4.286400318145752, + "learning_rate": 3.7305450072204104e-05, + "loss": 5.6413, + "step": 9600 + }, + { + "epoch": 0.7689230276844295, + "grad_norm": 2.3106577396392822, + "learning_rate": 3.729207894314596e-05, + "loss": 5.5998, + "step": 9610 + }, + { + "epoch": 0.7697231557049128, + "grad_norm": 2.7155752182006836, + "learning_rate": 3.727870781408782e-05, + "loss": 5.4494, + "step": 9620 + }, + { + "epoch": 0.770523283725396, + "grad_norm": 2.082399368286133, + "learning_rate": 3.7265336685029686e-05, + "loss": 5.4897, + "step": 9630 + }, + { + "epoch": 0.7713234117458794, + "grad_norm": 2.0752410888671875, + "learning_rate": 3.725196555597155e-05, + "loss": 5.537, + "step": 9640 + }, + { + "epoch": 0.7721235397663626, + "grad_norm": 2.258284091949463, + "learning_rate": 3.723859442691341e-05, + "loss": 5.6481, + "step": 9650 + }, + { + "epoch": 0.7729236677868458, + "grad_norm": 2.8548264503479004, + "learning_rate": 3.7225223297855274e-05, + "loss": 5.5508, + "step": 
9660 + }, + { + "epoch": 0.7737237958073292, + "grad_norm": 3.375497579574585, + "learning_rate": 3.7211852168797136e-05, + "loss": 5.3847, + "step": 9670 + }, + { + "epoch": 0.7745239238278124, + "grad_norm": 2.6680548191070557, + "learning_rate": 3.7198481039739e-05, + "loss": 5.3742, + "step": 9680 + }, + { + "epoch": 0.7753240518482958, + "grad_norm": 2.2915420532226562, + "learning_rate": 3.718510991068086e-05, + "loss": 5.5593, + "step": 9690 + }, + { + "epoch": 0.776124179868779, + "grad_norm": 3.224327325820923, + "learning_rate": 3.7171738781622724e-05, + "loss": 5.5711, + "step": 9700 + }, + { + "epoch": 0.7769243078892623, + "grad_norm": 3.025899887084961, + "learning_rate": 3.715836765256459e-05, + "loss": 5.3164, + "step": 9710 + }, + { + "epoch": 0.7777244359097456, + "grad_norm": 1.9424941539764404, + "learning_rate": 3.714499652350645e-05, + "loss": 5.4804, + "step": 9720 + }, + { + "epoch": 0.7785245639302288, + "grad_norm": 2.863312005996704, + "learning_rate": 3.713162539444831e-05, + "loss": 5.3353, + "step": 9730 + }, + { + "epoch": 0.7793246919507121, + "grad_norm": 2.0607283115386963, + "learning_rate": 3.7118254265390175e-05, + "loss": 5.5311, + "step": 9740 + }, + { + "epoch": 0.7801248199711954, + "grad_norm": 2.225666046142578, + "learning_rate": 3.710488313633204e-05, + "loss": 5.5315, + "step": 9750 + }, + { + "epoch": 0.7809249479916787, + "grad_norm": 2.1531851291656494, + "learning_rate": 3.70915120072739e-05, + "loss": 5.5311, + "step": 9760 + }, + { + "epoch": 0.7817250760121619, + "grad_norm": 2.6129846572875977, + "learning_rate": 3.7078140878215756e-05, + "loss": 5.5927, + "step": 9770 + }, + { + "epoch": 0.7825252040326452, + "grad_norm": 3.1822173595428467, + "learning_rate": 3.706476974915762e-05, + "loss": 5.5403, + "step": 9780 + }, + { + "epoch": 0.7833253320531285, + "grad_norm": 5.453544616699219, + "learning_rate": 3.705139862009948e-05, + "loss": 5.4393, + "step": 9790 + }, + { + "epoch": 0.7841254600736117, + 
"grad_norm": 2.573024272918701, + "learning_rate": 3.7038027491041345e-05, + "loss": 5.5677, + "step": 9800 + }, + { + "epoch": 0.7849255880940951, + "grad_norm": 2.283381700515747, + "learning_rate": 3.702465636198321e-05, + "loss": 5.3814, + "step": 9810 + }, + { + "epoch": 0.7857257161145783, + "grad_norm": 3.119277238845825, + "learning_rate": 3.701128523292507e-05, + "loss": 5.5022, + "step": 9820 + }, + { + "epoch": 0.7865258441350617, + "grad_norm": 5.085709571838379, + "learning_rate": 3.699791410386693e-05, + "loss": 5.5322, + "step": 9830 + }, + { + "epoch": 0.7873259721555449, + "grad_norm": 2.4339115619659424, + "learning_rate": 3.6984542974808795e-05, + "loss": 5.5885, + "step": 9840 + }, + { + "epoch": 0.7881261001760281, + "grad_norm": 2.2715206146240234, + "learning_rate": 3.697117184575066e-05, + "loss": 5.4657, + "step": 9850 + }, + { + "epoch": 0.7889262281965115, + "grad_norm": 2.1434290409088135, + "learning_rate": 3.695780071669252e-05, + "loss": 5.5571, + "step": 9860 + }, + { + "epoch": 0.7897263562169947, + "grad_norm": 2.235814094543457, + "learning_rate": 3.694442958763438e-05, + "loss": 5.5054, + "step": 9870 + }, + { + "epoch": 0.790526484237478, + "grad_norm": 4.322607517242432, + "learning_rate": 3.6931058458576246e-05, + "loss": 5.3727, + "step": 9880 + }, + { + "epoch": 0.7913266122579613, + "grad_norm": 2.0876612663269043, + "learning_rate": 3.691768732951811e-05, + "loss": 5.5682, + "step": 9890 + }, + { + "epoch": 0.7921267402784445, + "grad_norm": 1.9573509693145752, + "learning_rate": 3.690431620045997e-05, + "loss": 5.4981, + "step": 9900 + }, + { + "epoch": 0.7929268682989278, + "grad_norm": 2.527776002883911, + "learning_rate": 3.6890945071401834e-05, + "loss": 5.3799, + "step": 9910 + }, + { + "epoch": 0.7937269963194111, + "grad_norm": 3.043266773223877, + "learning_rate": 3.687757394234369e-05, + "loss": 5.5366, + "step": 9920 + }, + { + "epoch": 0.7945271243398944, + "grad_norm": 2.502704381942749, + "learning_rate": 
3.686420281328555e-05, + "loss": 5.576, + "step": 9930 + }, + { + "epoch": 0.7953272523603777, + "grad_norm": 2.863032817840576, + "learning_rate": 3.6850831684227415e-05, + "loss": 5.4838, + "step": 9940 + }, + { + "epoch": 0.796127380380861, + "grad_norm": 2.4610373973846436, + "learning_rate": 3.683746055516928e-05, + "loss": 5.6119, + "step": 9950 + }, + { + "epoch": 0.7969275084013442, + "grad_norm": 2.193134069442749, + "learning_rate": 3.682408942611114e-05, + "loss": 5.3948, + "step": 9960 + }, + { + "epoch": 0.7977276364218275, + "grad_norm": 3.6384451389312744, + "learning_rate": 3.6810718297053003e-05, + "loss": 5.5381, + "step": 9970 + }, + { + "epoch": 0.7985277644423108, + "grad_norm": 2.5201289653778076, + "learning_rate": 3.6797347167994866e-05, + "loss": 5.4386, + "step": 9980 + }, + { + "epoch": 0.799327892462794, + "grad_norm": 2.3459038734436035, + "learning_rate": 3.678397603893673e-05, + "loss": 5.8173, + "step": 9990 + }, + { + "epoch": 0.8001280204832774, + "grad_norm": 2.575666904449463, + "learning_rate": 3.677060490987859e-05, + "loss": 5.4436, + "step": 10000 + }, + { + "epoch": 0.8009281485037606, + "grad_norm": 4.0012712478637695, + "learning_rate": 3.6757233780820454e-05, + "loss": 5.5222, + "step": 10010 + }, + { + "epoch": 0.8017282765242438, + "grad_norm": 2.3244402408599854, + "learning_rate": 3.674386265176232e-05, + "loss": 5.398, + "step": 10020 + }, + { + "epoch": 0.8025284045447272, + "grad_norm": 2.2298974990844727, + "learning_rate": 3.673049152270418e-05, + "loss": 5.4749, + "step": 10030 + }, + { + "epoch": 0.8033285325652104, + "grad_norm": 3.589245080947876, + "learning_rate": 3.671712039364604e-05, + "loss": 5.5091, + "step": 10040 + }, + { + "epoch": 0.8041286605856938, + "grad_norm": 2.2426655292510986, + "learning_rate": 3.6703749264587905e-05, + "loss": 5.5136, + "step": 10050 + }, + { + "epoch": 0.804928788606177, + "grad_norm": 2.5258290767669678, + "learning_rate": 3.669037813552977e-05, + "loss": 5.522, + 
"step": 10060 + }, + { + "epoch": 0.8057289166266602, + "grad_norm": 3.040107250213623, + "learning_rate": 3.667700700647163e-05, + "loss": 5.5748, + "step": 10070 + }, + { + "epoch": 0.8065290446471436, + "grad_norm": 2.561196804046631, + "learning_rate": 3.6663635877413486e-05, + "loss": 5.5973, + "step": 10080 + }, + { + "epoch": 0.8073291726676268, + "grad_norm": 2.4179880619049072, + "learning_rate": 3.665026474835535e-05, + "loss": 5.5915, + "step": 10090 + }, + { + "epoch": 0.8081293006881101, + "grad_norm": 2.393134593963623, + "learning_rate": 3.663689361929721e-05, + "loss": 5.4809, + "step": 10100 + }, + { + "epoch": 0.8089294287085934, + "grad_norm": 3.107543468475342, + "learning_rate": 3.6623522490239074e-05, + "loss": 5.6127, + "step": 10110 + }, + { + "epoch": 0.8097295567290766, + "grad_norm": 2.8467986583709717, + "learning_rate": 3.661015136118094e-05, + "loss": 5.5274, + "step": 10120 + }, + { + "epoch": 0.8105296847495599, + "grad_norm": 2.49955153465271, + "learning_rate": 3.65967802321228e-05, + "loss": 5.4469, + "step": 10130 + }, + { + "epoch": 0.8113298127700432, + "grad_norm": 2.817401885986328, + "learning_rate": 3.658340910306466e-05, + "loss": 5.5901, + "step": 10140 + }, + { + "epoch": 0.8121299407905265, + "grad_norm": 2.284855842590332, + "learning_rate": 3.6570037974006525e-05, + "loss": 5.588, + "step": 10150 + }, + { + "epoch": 0.8129300688110097, + "grad_norm": 3.13712739944458, + "learning_rate": 3.655666684494839e-05, + "loss": 5.5035, + "step": 10160 + }, + { + "epoch": 0.8137301968314931, + "grad_norm": 2.7964253425598145, + "learning_rate": 3.654329571589025e-05, + "loss": 5.4622, + "step": 10170 + }, + { + "epoch": 0.8145303248519763, + "grad_norm": 3.7489845752716064, + "learning_rate": 3.652992458683211e-05, + "loss": 5.6106, + "step": 10180 + }, + { + "epoch": 0.8153304528724596, + "grad_norm": 2.0697953701019287, + "learning_rate": 3.6516553457773976e-05, + "loss": 5.4128, + "step": 10190 + }, + { + "epoch": 
0.8161305808929429, + "grad_norm": 2.495635986328125, + "learning_rate": 3.650318232871584e-05, + "loss": 5.3183, + "step": 10200 + }, + { + "epoch": 0.8169307089134261, + "grad_norm": 1.9717586040496826, + "learning_rate": 3.64898111996577e-05, + "loss": 5.4251, + "step": 10210 + }, + { + "epoch": 0.8177308369339095, + "grad_norm": 2.591371774673462, + "learning_rate": 3.6476440070599564e-05, + "loss": 5.3903, + "step": 10220 + }, + { + "epoch": 0.8185309649543927, + "grad_norm": 2.9142751693725586, + "learning_rate": 3.646306894154142e-05, + "loss": 5.4119, + "step": 10230 + }, + { + "epoch": 0.8193310929748759, + "grad_norm": 2.1791203022003174, + "learning_rate": 3.644969781248328e-05, + "loss": 5.5931, + "step": 10240 + }, + { + "epoch": 0.8201312209953593, + "grad_norm": 2.787339925765991, + "learning_rate": 3.6436326683425145e-05, + "loss": 5.5301, + "step": 10250 + }, + { + "epoch": 0.8209313490158425, + "grad_norm": 2.722717523574829, + "learning_rate": 3.642295555436701e-05, + "loss": 5.5608, + "step": 10260 + }, + { + "epoch": 0.8217314770363258, + "grad_norm": 2.937549114227295, + "learning_rate": 3.640958442530887e-05, + "loss": 5.5967, + "step": 10270 + }, + { + "epoch": 0.8225316050568091, + "grad_norm": 3.0384104251861572, + "learning_rate": 3.639621329625073e-05, + "loss": 5.5901, + "step": 10280 + }, + { + "epoch": 0.8233317330772923, + "grad_norm": 2.6817758083343506, + "learning_rate": 3.6382842167192596e-05, + "loss": 5.4188, + "step": 10290 + }, + { + "epoch": 0.8241318610977757, + "grad_norm": 2.6184494495391846, + "learning_rate": 3.636947103813446e-05, + "loss": 5.5194, + "step": 10300 + }, + { + "epoch": 0.8249319891182589, + "grad_norm": 2.613208293914795, + "learning_rate": 3.635609990907632e-05, + "loss": 5.4968, + "step": 10310 + }, + { + "epoch": 0.8257321171387422, + "grad_norm": 6.223053932189941, + "learning_rate": 3.6342728780018184e-05, + "loss": 5.3478, + "step": 10320 + }, + { + "epoch": 0.8265322451592255, + "grad_norm": 
3.294417381286621, + "learning_rate": 3.632935765096005e-05, + "loss": 5.5736, + "step": 10330 + }, + { + "epoch": 0.8273323731797088, + "grad_norm": 2.3347206115722656, + "learning_rate": 3.631598652190191e-05, + "loss": 5.6787, + "step": 10340 + }, + { + "epoch": 0.828132501200192, + "grad_norm": 3.219491958618164, + "learning_rate": 3.630261539284377e-05, + "loss": 5.5125, + "step": 10350 + }, + { + "epoch": 0.8289326292206753, + "grad_norm": 2.5759575366973877, + "learning_rate": 3.6289244263785635e-05, + "loss": 5.4405, + "step": 10360 + }, + { + "epoch": 0.8297327572411586, + "grad_norm": 2.4145963191986084, + "learning_rate": 3.62758731347275e-05, + "loss": 5.479, + "step": 10370 + }, + { + "epoch": 0.8305328852616418, + "grad_norm": 2.7548952102661133, + "learning_rate": 3.626250200566936e-05, + "loss": 5.5466, + "step": 10380 + }, + { + "epoch": 0.8313330132821252, + "grad_norm": 1.9488781690597534, + "learning_rate": 3.624913087661122e-05, + "loss": 5.5063, + "step": 10390 + }, + { + "epoch": 0.8321331413026084, + "grad_norm": 2.648233652114868, + "learning_rate": 3.6235759747553086e-05, + "loss": 5.4158, + "step": 10400 + }, + { + "epoch": 0.8329332693230916, + "grad_norm": 2.8808720111846924, + "learning_rate": 3.622238861849495e-05, + "loss": 5.5431, + "step": 10410 + }, + { + "epoch": 0.833733397343575, + "grad_norm": 3.4570131301879883, + "learning_rate": 3.620901748943681e-05, + "loss": 5.4842, + "step": 10420 + }, + { + "epoch": 0.8345335253640582, + "grad_norm": 4.246754169464111, + "learning_rate": 3.6195646360378674e-05, + "loss": 5.5809, + "step": 10430 + }, + { + "epoch": 0.8353336533845416, + "grad_norm": 1.8645952939987183, + "learning_rate": 3.6182275231320536e-05, + "loss": 5.4272, + "step": 10440 + }, + { + "epoch": 0.8361337814050248, + "grad_norm": 3.3832550048828125, + "learning_rate": 3.61689041022624e-05, + "loss": 5.4291, + "step": 10450 + }, + { + "epoch": 0.836933909425508, + "grad_norm": 2.1454830169677734, + "learning_rate": 
3.615553297320426e-05, + "loss": 5.4457, + "step": 10460 + }, + { + "epoch": 0.8377340374459914, + "grad_norm": 2.9275059700012207, + "learning_rate": 3.6142161844146124e-05, + "loss": 5.3577, + "step": 10470 + }, + { + "epoch": 0.8385341654664746, + "grad_norm": 2.9177403450012207, + "learning_rate": 3.612879071508799e-05, + "loss": 5.5011, + "step": 10480 + }, + { + "epoch": 0.8393342934869579, + "grad_norm": 2.9115045070648193, + "learning_rate": 3.611541958602985e-05, + "loss": 5.4961, + "step": 10490 + }, + { + "epoch": 0.8401344215074412, + "grad_norm": 3.270296335220337, + "learning_rate": 3.610204845697171e-05, + "loss": 5.4651, + "step": 10500 + }, + { + "epoch": 0.8409345495279245, + "grad_norm": 2.2930686473846436, + "learning_rate": 3.6088677327913575e-05, + "loss": 5.4363, + "step": 10510 + }, + { + "epoch": 0.8417346775484077, + "grad_norm": 3.168717622756958, + "learning_rate": 3.607530619885544e-05, + "loss": 5.361, + "step": 10520 + }, + { + "epoch": 0.842534805568891, + "grad_norm": 2.009021759033203, + "learning_rate": 3.60619350697973e-05, + "loss": 5.4435, + "step": 10530 + }, + { + "epoch": 0.8433349335893743, + "grad_norm": 3.454181432723999, + "learning_rate": 3.6048563940739156e-05, + "loss": 5.4134, + "step": 10540 + }, + { + "epoch": 0.8441350616098576, + "grad_norm": 2.8601911067962646, + "learning_rate": 3.603519281168102e-05, + "loss": 5.3224, + "step": 10550 + }, + { + "epoch": 0.8449351896303409, + "grad_norm": 2.612689733505249, + "learning_rate": 3.602182168262288e-05, + "loss": 5.3947, + "step": 10560 + }, + { + "epoch": 0.8457353176508241, + "grad_norm": 2.813868284225464, + "learning_rate": 3.6008450553564745e-05, + "loss": 5.4598, + "step": 10570 + }, + { + "epoch": 0.8465354456713075, + "grad_norm": 2.226395606994629, + "learning_rate": 3.599507942450661e-05, + "loss": 5.4401, + "step": 10580 + }, + { + "epoch": 0.8473355736917907, + "grad_norm": 3.4722280502319336, + "learning_rate": 3.598170829544847e-05, + "loss": 5.4831, + 
"step": 10590 + }, + { + "epoch": 0.8481357017122739, + "grad_norm": 3.270322799682617, + "learning_rate": 3.596833716639033e-05, + "loss": 5.6256, + "step": 10600 + }, + { + "epoch": 0.8489358297327573, + "grad_norm": 1.9735034704208374, + "learning_rate": 3.5954966037332195e-05, + "loss": 5.491, + "step": 10610 + }, + { + "epoch": 0.8497359577532405, + "grad_norm": 2.9609665870666504, + "learning_rate": 3.594159490827406e-05, + "loss": 5.5421, + "step": 10620 + }, + { + "epoch": 0.8505360857737237, + "grad_norm": 3.1109185218811035, + "learning_rate": 3.592822377921592e-05, + "loss": 5.5718, + "step": 10630 + }, + { + "epoch": 0.8513362137942071, + "grad_norm": 2.68784761428833, + "learning_rate": 3.5914852650157783e-05, + "loss": 5.4769, + "step": 10640 + }, + { + "epoch": 0.8521363418146903, + "grad_norm": 2.2947535514831543, + "learning_rate": 3.5901481521099646e-05, + "loss": 5.4901, + "step": 10650 + }, + { + "epoch": 0.8529364698351737, + "grad_norm": 1.894142746925354, + "learning_rate": 3.588811039204151e-05, + "loss": 5.5021, + "step": 10660 + }, + { + "epoch": 0.8537365978556569, + "grad_norm": 2.800260543823242, + "learning_rate": 3.587473926298337e-05, + "loss": 5.6767, + "step": 10670 + }, + { + "epoch": 0.8545367258761402, + "grad_norm": 3.055172920227051, + "learning_rate": 3.5861368133925234e-05, + "loss": 5.5765, + "step": 10680 + }, + { + "epoch": 0.8553368538966235, + "grad_norm": 2.3778443336486816, + "learning_rate": 3.58479970048671e-05, + "loss": 5.5377, + "step": 10690 + }, + { + "epoch": 0.8561369819171067, + "grad_norm": 4.772058486938477, + "learning_rate": 3.583462587580895e-05, + "loss": 5.432, + "step": 10700 + }, + { + "epoch": 0.85693710993759, + "grad_norm": 1.9563825130462646, + "learning_rate": 3.5821254746750815e-05, + "loss": 5.4832, + "step": 10710 + }, + { + "epoch": 0.8577372379580733, + "grad_norm": 2.149519205093384, + "learning_rate": 3.580788361769268e-05, + "loss": 5.491, + "step": 10720 + }, + { + "epoch": 
0.8585373659785566, + "grad_norm": 3.5061347484588623, + "learning_rate": 3.579451248863454e-05, + "loss": 5.5747, + "step": 10730 + }, + { + "epoch": 0.8593374939990398, + "grad_norm": 2.74947452545166, + "learning_rate": 3.5781141359576404e-05, + "loss": 5.3591, + "step": 10740 + }, + { + "epoch": 0.8601376220195232, + "grad_norm": 2.818753719329834, + "learning_rate": 3.5767770230518266e-05, + "loss": 5.4722, + "step": 10750 + }, + { + "epoch": 0.8609377500400064, + "grad_norm": 2.7501718997955322, + "learning_rate": 3.575439910146013e-05, + "loss": 5.4531, + "step": 10760 + }, + { + "epoch": 0.8617378780604896, + "grad_norm": 2.314549207687378, + "learning_rate": 3.574102797240199e-05, + "loss": 5.5488, + "step": 10770 + }, + { + "epoch": 0.862538006080973, + "grad_norm": 2.583895683288574, + "learning_rate": 3.5727656843343854e-05, + "loss": 5.5101, + "step": 10780 + }, + { + "epoch": 0.8633381341014562, + "grad_norm": 2.778087854385376, + "learning_rate": 3.571428571428572e-05, + "loss": 5.421, + "step": 10790 + }, + { + "epoch": 0.8641382621219396, + "grad_norm": 3.679514169692993, + "learning_rate": 3.570091458522758e-05, + "loss": 5.5277, + "step": 10800 + }, + { + "epoch": 0.8649383901424228, + "grad_norm": 3.3869597911834717, + "learning_rate": 3.568754345616944e-05, + "loss": 5.5185, + "step": 10810 + }, + { + "epoch": 0.865738518162906, + "grad_norm": 3.1094346046447754, + "learning_rate": 3.5674172327111305e-05, + "loss": 5.396, + "step": 10820 + }, + { + "epoch": 0.8665386461833894, + "grad_norm": 2.3561792373657227, + "learning_rate": 3.566080119805317e-05, + "loss": 5.5995, + "step": 10830 + }, + { + "epoch": 0.8673387742038726, + "grad_norm": 2.7533133029937744, + "learning_rate": 3.564743006899503e-05, + "loss": 5.4848, + "step": 10840 + }, + { + "epoch": 0.8681389022243559, + "grad_norm": 2.923741579055786, + "learning_rate": 3.5634058939936886e-05, + "loss": 5.5549, + "step": 10850 + }, + { + "epoch": 0.8689390302448392, + "grad_norm": 
2.002704381942749, + "learning_rate": 3.562068781087875e-05, + "loss": 5.4354, + "step": 10860 + }, + { + "epoch": 0.8697391582653224, + "grad_norm": 2.277064085006714, + "learning_rate": 3.560731668182061e-05, + "loss": 5.4404, + "step": 10870 + }, + { + "epoch": 0.8705392862858057, + "grad_norm": 2.23490047454834, + "learning_rate": 3.5593945552762474e-05, + "loss": 5.7253, + "step": 10880 + }, + { + "epoch": 0.871339414306289, + "grad_norm": 2.42874813079834, + "learning_rate": 3.558057442370434e-05, + "loss": 5.4351, + "step": 10890 + }, + { + "epoch": 0.8721395423267723, + "grad_norm": 2.097278118133545, + "learning_rate": 3.55672032946462e-05, + "loss": 5.4772, + "step": 10900 + }, + { + "epoch": 0.8729396703472556, + "grad_norm": 2.045832395553589, + "learning_rate": 3.555383216558806e-05, + "loss": 5.4132, + "step": 10910 + }, + { + "epoch": 0.8737397983677389, + "grad_norm": 2.695033550262451, + "learning_rate": 3.5540461036529925e-05, + "loss": 5.3975, + "step": 10920 + }, + { + "epoch": 0.8745399263882221, + "grad_norm": 2.62748384475708, + "learning_rate": 3.552708990747179e-05, + "loss": 5.5843, + "step": 10930 + }, + { + "epoch": 0.8753400544087054, + "grad_norm": 2.6703569889068604, + "learning_rate": 3.551371877841365e-05, + "loss": 5.548, + "step": 10940 + }, + { + "epoch": 0.8761401824291887, + "grad_norm": 2.7184908390045166, + "learning_rate": 3.550034764935551e-05, + "loss": 5.4833, + "step": 10950 + }, + { + "epoch": 0.8769403104496719, + "grad_norm": 2.6194417476654053, + "learning_rate": 3.5486976520297376e-05, + "loss": 5.3647, + "step": 10960 + }, + { + "epoch": 0.8777404384701553, + "grad_norm": 2.5021440982818604, + "learning_rate": 3.547360539123924e-05, + "loss": 5.4775, + "step": 10970 + }, + { + "epoch": 0.8785405664906385, + "grad_norm": 3.3758370876312256, + "learning_rate": 3.54602342621811e-05, + "loss": 5.4144, + "step": 10980 + }, + { + "epoch": 0.8793406945111217, + "grad_norm": 2.7361087799072266, + "learning_rate": 
3.5446863133122964e-05, + "loss": 5.3614, + "step": 10990 + }, + { + "epoch": 0.8801408225316051, + "grad_norm": 3.831631660461426, + "learning_rate": 3.543349200406482e-05, + "loss": 5.4672, + "step": 11000 + }, + { + "epoch": 0.8809409505520883, + "grad_norm": 2.9705264568328857, + "learning_rate": 3.542012087500668e-05, + "loss": 5.5334, + "step": 11010 + }, + { + "epoch": 0.8817410785725716, + "grad_norm": 3.578693389892578, + "learning_rate": 3.5406749745948545e-05, + "loss": 5.4943, + "step": 11020 + }, + { + "epoch": 0.8825412065930549, + "grad_norm": 2.0674843788146973, + "learning_rate": 3.539337861689041e-05, + "loss": 5.4054, + "step": 11030 + }, + { + "epoch": 0.8833413346135381, + "grad_norm": 2.1904194355010986, + "learning_rate": 3.538000748783227e-05, + "loss": 5.37, + "step": 11040 + }, + { + "epoch": 0.8841414626340215, + "grad_norm": 3.7718141078948975, + "learning_rate": 3.536663635877413e-05, + "loss": 5.6004, + "step": 11050 + }, + { + "epoch": 0.8849415906545047, + "grad_norm": 2.7325282096862793, + "learning_rate": 3.5353265229715996e-05, + "loss": 5.4552, + "step": 11060 + }, + { + "epoch": 0.885741718674988, + "grad_norm": 3.3750839233398438, + "learning_rate": 3.533989410065786e-05, + "loss": 5.5041, + "step": 11070 + }, + { + "epoch": 0.8865418466954713, + "grad_norm": 2.5617001056671143, + "learning_rate": 3.532652297159972e-05, + "loss": 5.4912, + "step": 11080 + }, + { + "epoch": 0.8873419747159546, + "grad_norm": 1.9870737791061401, + "learning_rate": 3.5313151842541584e-05, + "loss": 5.4576, + "step": 11090 + }, + { + "epoch": 0.8881421027364378, + "grad_norm": 2.458249568939209, + "learning_rate": 3.529978071348345e-05, + "loss": 5.7306, + "step": 11100 + }, + { + "epoch": 0.8889422307569211, + "grad_norm": 3.1406562328338623, + "learning_rate": 3.528640958442531e-05, + "loss": 5.5833, + "step": 11110 + }, + { + "epoch": 0.8897423587774044, + "grad_norm": 2.4337878227233887, + "learning_rate": 3.527303845536717e-05, + "loss": 
5.4938, + "step": 11120 + }, + { + "epoch": 0.8905424867978876, + "grad_norm": 2.925147294998169, + "learning_rate": 3.5259667326309035e-05, + "loss": 5.5591, + "step": 11130 + }, + { + "epoch": 0.891342614818371, + "grad_norm": 2.5177969932556152, + "learning_rate": 3.52462961972509e-05, + "loss": 5.5199, + "step": 11140 + }, + { + "epoch": 0.8921427428388542, + "grad_norm": 2.3133068084716797, + "learning_rate": 3.523292506819276e-05, + "loss": 5.3506, + "step": 11150 + }, + { + "epoch": 0.8929428708593375, + "grad_norm": 2.1670310497283936, + "learning_rate": 3.521955393913462e-05, + "loss": 5.3459, + "step": 11160 + }, + { + "epoch": 0.8937429988798208, + "grad_norm": 2.875126838684082, + "learning_rate": 3.5206182810076486e-05, + "loss": 5.3948, + "step": 11170 + }, + { + "epoch": 0.894543126900304, + "grad_norm": 2.3784403800964355, + "learning_rate": 3.519281168101835e-05, + "loss": 5.431, + "step": 11180 + }, + { + "epoch": 0.8953432549207874, + "grad_norm": 2.400426149368286, + "learning_rate": 3.517944055196021e-05, + "loss": 5.4228, + "step": 11190 + }, + { + "epoch": 0.8961433829412706, + "grad_norm": 2.2166919708251953, + "learning_rate": 3.5166069422902074e-05, + "loss": 5.6408, + "step": 11200 + }, + { + "epoch": 0.8969435109617538, + "grad_norm": 1.7938240766525269, + "learning_rate": 3.5152698293843936e-05, + "loss": 5.3972, + "step": 11210 + }, + { + "epoch": 0.8977436389822372, + "grad_norm": 2.4942996501922607, + "learning_rate": 3.51393271647858e-05, + "loss": 5.5523, + "step": 11220 + }, + { + "epoch": 0.8985437670027204, + "grad_norm": 2.706131935119629, + "learning_rate": 3.512595603572766e-05, + "loss": 5.6029, + "step": 11230 + }, + { + "epoch": 0.8993438950232037, + "grad_norm": 3.6749794483184814, + "learning_rate": 3.5112584906669524e-05, + "loss": 5.5903, + "step": 11240 + }, + { + "epoch": 0.900144023043687, + "grad_norm": 2.8764829635620117, + "learning_rate": 3.509921377761139e-05, + "loss": 5.392, + "step": 11250 + }, + { + 
"epoch": 0.9009441510641703, + "grad_norm": 1.9971251487731934, + "learning_rate": 3.508584264855325e-05, + "loss": 5.5115, + "step": 11260 + }, + { + "epoch": 0.9017442790846536, + "grad_norm": 1.9127808809280396, + "learning_rate": 3.507247151949511e-05, + "loss": 5.6273, + "step": 11270 + }, + { + "epoch": 0.9025444071051368, + "grad_norm": 2.679152727127075, + "learning_rate": 3.5059100390436975e-05, + "loss": 5.5216, + "step": 11280 + }, + { + "epoch": 0.9033445351256201, + "grad_norm": 3.1412837505340576, + "learning_rate": 3.504572926137884e-05, + "loss": 5.5665, + "step": 11290 + }, + { + "epoch": 0.9041446631461034, + "grad_norm": 3.2604153156280518, + "learning_rate": 3.50323581323207e-05, + "loss": 5.6283, + "step": 11300 + }, + { + "epoch": 0.9049447911665867, + "grad_norm": 2.2050578594207764, + "learning_rate": 3.5018987003262557e-05, + "loss": 5.429, + "step": 11310 + }, + { + "epoch": 0.9057449191870699, + "grad_norm": 3.6569366455078125, + "learning_rate": 3.500561587420442e-05, + "loss": 5.5833, + "step": 11320 + }, + { + "epoch": 0.9065450472075532, + "grad_norm": 2.38771653175354, + "learning_rate": 3.499224474514628e-05, + "loss": 5.4127, + "step": 11330 + }, + { + "epoch": 0.9073451752280365, + "grad_norm": 2.1471800804138184, + "learning_rate": 3.4978873616088145e-05, + "loss": 5.4064, + "step": 11340 + }, + { + "epoch": 0.9081453032485197, + "grad_norm": 2.340174674987793, + "learning_rate": 3.496550248703001e-05, + "loss": 5.5581, + "step": 11350 + }, + { + "epoch": 0.9089454312690031, + "grad_norm": 2.771235466003418, + "learning_rate": 3.495213135797187e-05, + "loss": 5.4221, + "step": 11360 + }, + { + "epoch": 0.9097455592894863, + "grad_norm": 2.7797491550445557, + "learning_rate": 3.493876022891373e-05, + "loss": 5.5604, + "step": 11370 + }, + { + "epoch": 0.9105456873099695, + "grad_norm": 2.0206966400146484, + "learning_rate": 3.4925389099855595e-05, + "loss": 5.3382, + "step": 11380 + }, + { + "epoch": 0.9113458153304529, + 
"grad_norm": 3.5101125240325928, + "learning_rate": 3.491201797079746e-05, + "loss": 5.5358, + "step": 11390 + }, + { + "epoch": 0.9121459433509361, + "grad_norm": 2.3375003337860107, + "learning_rate": 3.489864684173932e-05, + "loss": 5.5492, + "step": 11400 + }, + { + "epoch": 0.9129460713714195, + "grad_norm": 2.4977264404296875, + "learning_rate": 3.4885275712681183e-05, + "loss": 5.507, + "step": 11410 + }, + { + "epoch": 0.9137461993919027, + "grad_norm": 2.0408174991607666, + "learning_rate": 3.4871904583623046e-05, + "loss": 5.5587, + "step": 11420 + }, + { + "epoch": 0.914546327412386, + "grad_norm": 2.525320053100586, + "learning_rate": 3.485853345456491e-05, + "loss": 5.5013, + "step": 11430 + }, + { + "epoch": 0.9153464554328693, + "grad_norm": 2.946377992630005, + "learning_rate": 3.484516232550677e-05, + "loss": 5.5959, + "step": 11440 + }, + { + "epoch": 0.9161465834533525, + "grad_norm": 2.138331174850464, + "learning_rate": 3.4831791196448634e-05, + "loss": 5.4817, + "step": 11450 + }, + { + "epoch": 0.9169467114738358, + "grad_norm": 1.7159631252288818, + "learning_rate": 3.48184200673905e-05, + "loss": 5.5036, + "step": 11460 + }, + { + "epoch": 0.9177468394943191, + "grad_norm": 2.5576088428497314, + "learning_rate": 3.480504893833235e-05, + "loss": 5.4721, + "step": 11470 + }, + { + "epoch": 0.9185469675148024, + "grad_norm": 2.057349443435669, + "learning_rate": 3.4791677809274215e-05, + "loss": 5.5468, + "step": 11480 + }, + { + "epoch": 0.9193470955352856, + "grad_norm": 2.4942944049835205, + "learning_rate": 3.477830668021608e-05, + "loss": 5.5999, + "step": 11490 + }, + { + "epoch": 0.920147223555769, + "grad_norm": 3.3070192337036133, + "learning_rate": 3.476493555115794e-05, + "loss": 5.5418, + "step": 11500 + }, + { + "epoch": 0.9209473515762522, + "grad_norm": 2.2323672771453857, + "learning_rate": 3.4751564422099804e-05, + "loss": 5.398, + "step": 11510 + }, + { + "epoch": 0.9217474795967355, + "grad_norm": 1.9982457160949707, + 
"learning_rate": 3.4738193293041666e-05, + "loss": 5.4668, + "step": 11520 + }, + { + "epoch": 0.9225476076172188, + "grad_norm": 3.4668660163879395, + "learning_rate": 3.472482216398353e-05, + "loss": 5.5433, + "step": 11530 + }, + { + "epoch": 0.923347735637702, + "grad_norm": 2.7247307300567627, + "learning_rate": 3.471145103492539e-05, + "loss": 5.4156, + "step": 11540 + }, + { + "epoch": 0.9241478636581854, + "grad_norm": 2.42948317527771, + "learning_rate": 3.4698079905867254e-05, + "loss": 5.4336, + "step": 11550 + }, + { + "epoch": 0.9249479916786686, + "grad_norm": 4.134993076324463, + "learning_rate": 3.468470877680912e-05, + "loss": 5.3362, + "step": 11560 + }, + { + "epoch": 0.9257481196991518, + "grad_norm": 2.0852134227752686, + "learning_rate": 3.467133764775098e-05, + "loss": 5.4117, + "step": 11570 + }, + { + "epoch": 0.9265482477196352, + "grad_norm": 2.224235773086548, + "learning_rate": 3.465796651869284e-05, + "loss": 5.4132, + "step": 11580 + }, + { + "epoch": 0.9273483757401184, + "grad_norm": 2.0093464851379395, + "learning_rate": 3.4644595389634705e-05, + "loss": 5.3876, + "step": 11590 + }, + { + "epoch": 0.9281485037606017, + "grad_norm": 1.9892866611480713, + "learning_rate": 3.463122426057657e-05, + "loss": 5.4069, + "step": 11600 + }, + { + "epoch": 0.928948631781085, + "grad_norm": 3.9974398612976074, + "learning_rate": 3.461785313151843e-05, + "loss": 5.4892, + "step": 11610 + }, + { + "epoch": 0.9297487598015682, + "grad_norm": 1.9878896474838257, + "learning_rate": 3.4604482002460286e-05, + "loss": 5.5017, + "step": 11620 + }, + { + "epoch": 0.9305488878220515, + "grad_norm": 3.1477320194244385, + "learning_rate": 3.459111087340215e-05, + "loss": 5.3199, + "step": 11630 + }, + { + "epoch": 0.9313490158425348, + "grad_norm": 2.434946298599243, + "learning_rate": 3.457773974434401e-05, + "loss": 5.4885, + "step": 11640 + }, + { + "epoch": 0.9321491438630181, + "grad_norm": 3.2463152408599854, + "learning_rate": 
3.4564368615285874e-05, + "loss": 5.5232, + "step": 11650 + }, + { + "epoch": 0.9329492718835014, + "grad_norm": 3.733612537384033, + "learning_rate": 3.455099748622774e-05, + "loss": 5.4918, + "step": 11660 + }, + { + "epoch": 0.9337493999039846, + "grad_norm": 3.3726518154144287, + "learning_rate": 3.45376263571696e-05, + "loss": 5.3887, + "step": 11670 + }, + { + "epoch": 0.9345495279244679, + "grad_norm": 2.527639627456665, + "learning_rate": 3.452425522811146e-05, + "loss": 5.4, + "step": 11680 + }, + { + "epoch": 0.9353496559449512, + "grad_norm": 3.3945000171661377, + "learning_rate": 3.4510884099053325e-05, + "loss": 5.4835, + "step": 11690 + }, + { + "epoch": 0.9361497839654345, + "grad_norm": 2.492178201675415, + "learning_rate": 3.449751296999519e-05, + "loss": 5.5472, + "step": 11700 + }, + { + "epoch": 0.9369499119859177, + "grad_norm": 2.2719671726226807, + "learning_rate": 3.448414184093705e-05, + "loss": 5.3069, + "step": 11710 + }, + { + "epoch": 0.937750040006401, + "grad_norm": 4.121431350708008, + "learning_rate": 3.447077071187891e-05, + "loss": 5.3377, + "step": 11720 + }, + { + "epoch": 0.9385501680268843, + "grad_norm": 2.2480831146240234, + "learning_rate": 3.4457399582820776e-05, + "loss": 5.3888, + "step": 11730 + }, + { + "epoch": 0.9393502960473675, + "grad_norm": 3.118621349334717, + "learning_rate": 3.444402845376264e-05, + "loss": 5.3225, + "step": 11740 + }, + { + "epoch": 0.9401504240678509, + "grad_norm": 2.513777494430542, + "learning_rate": 3.44306573247045e-05, + "loss": 5.4971, + "step": 11750 + }, + { + "epoch": 0.9409505520883341, + "grad_norm": 2.491767406463623, + "learning_rate": 3.4417286195646364e-05, + "loss": 5.5061, + "step": 11760 + }, + { + "epoch": 0.9417506801088175, + "grad_norm": 2.8964290618896484, + "learning_rate": 3.440391506658823e-05, + "loss": 5.3395, + "step": 11770 + }, + { + "epoch": 0.9425508081293007, + "grad_norm": 2.1613073348999023, + "learning_rate": 3.439054393753008e-05, + "loss": 5.512, + 
"step": 11780 + }, + { + "epoch": 0.9433509361497839, + "grad_norm": 3.5444371700286865, + "learning_rate": 3.4377172808471945e-05, + "loss": 5.4804, + "step": 11790 + }, + { + "epoch": 0.9441510641702673, + "grad_norm": 3.0833287239074707, + "learning_rate": 3.436380167941381e-05, + "loss": 5.5711, + "step": 11800 + }, + { + "epoch": 0.9449511921907505, + "grad_norm": 2.2267260551452637, + "learning_rate": 3.435043055035567e-05, + "loss": 5.3964, + "step": 11810 + }, + { + "epoch": 0.9457513202112338, + "grad_norm": 3.114546537399292, + "learning_rate": 3.4337059421297533e-05, + "loss": 5.4296, + "step": 11820 + }, + { + "epoch": 0.9465514482317171, + "grad_norm": 3.316612958908081, + "learning_rate": 3.4323688292239396e-05, + "loss": 5.451, + "step": 11830 + }, + { + "epoch": 0.9473515762522003, + "grad_norm": 2.97145414352417, + "learning_rate": 3.431031716318126e-05, + "loss": 5.6184, + "step": 11840 + }, + { + "epoch": 0.9481517042726836, + "grad_norm": 2.2837045192718506, + "learning_rate": 3.429694603412312e-05, + "loss": 5.3398, + "step": 11850 + }, + { + "epoch": 0.9489518322931669, + "grad_norm": 2.2095916271209717, + "learning_rate": 3.4283574905064984e-05, + "loss": 5.3933, + "step": 11860 + }, + { + "epoch": 0.9497519603136502, + "grad_norm": 1.9592795372009277, + "learning_rate": 3.427020377600685e-05, + "loss": 5.4423, + "step": 11870 + }, + { + "epoch": 0.9505520883341335, + "grad_norm": 2.9245188236236572, + "learning_rate": 3.425683264694871e-05, + "loss": 5.4927, + "step": 11880 + }, + { + "epoch": 0.9513522163546168, + "grad_norm": 2.5000531673431396, + "learning_rate": 3.424346151789057e-05, + "loss": 5.3523, + "step": 11890 + }, + { + "epoch": 0.9521523443751, + "grad_norm": 2.4692375659942627, + "learning_rate": 3.4230090388832435e-05, + "loss": 5.5949, + "step": 11900 + }, + { + "epoch": 0.9529524723955833, + "grad_norm": 2.387812852859497, + "learning_rate": 3.42167192597743e-05, + "loss": 5.4971, + "step": 11910 + }, + { + "epoch": 
0.9537526004160666, + "grad_norm": 2.938291072845459, + "learning_rate": 3.420334813071616e-05, + "loss": 5.3849, + "step": 11920 + }, + { + "epoch": 0.9545527284365498, + "grad_norm": 2.608431339263916, + "learning_rate": 3.4189977001658016e-05, + "loss": 5.3414, + "step": 11930 + }, + { + "epoch": 0.9553528564570332, + "grad_norm": 2.695615530014038, + "learning_rate": 3.417660587259988e-05, + "loss": 5.2343, + "step": 11940 + }, + { + "epoch": 0.9561529844775164, + "grad_norm": 3.0142087936401367, + "learning_rate": 3.416323474354174e-05, + "loss": 5.3293, + "step": 11950 + }, + { + "epoch": 0.9569531124979996, + "grad_norm": 2.5953242778778076, + "learning_rate": 3.4149863614483604e-05, + "loss": 5.459, + "step": 11960 + }, + { + "epoch": 0.957753240518483, + "grad_norm": 2.2795822620391846, + "learning_rate": 3.413649248542547e-05, + "loss": 5.5305, + "step": 11970 + }, + { + "epoch": 0.9585533685389662, + "grad_norm": 2.5979270935058594, + "learning_rate": 3.412312135636733e-05, + "loss": 5.4866, + "step": 11980 + }, + { + "epoch": 0.9593534965594495, + "grad_norm": 2.66823673248291, + "learning_rate": 3.410975022730919e-05, + "loss": 5.5734, + "step": 11990 + }, + { + "epoch": 0.9601536245799328, + "grad_norm": 2.3899004459381104, + "learning_rate": 3.4096379098251055e-05, + "loss": 5.5367, + "step": 12000 + }, + { + "epoch": 0.960953752600416, + "grad_norm": 2.233553171157837, + "learning_rate": 3.408300796919292e-05, + "loss": 5.3773, + "step": 12010 + }, + { + "epoch": 0.9617538806208994, + "grad_norm": 2.2967305183410645, + "learning_rate": 3.406963684013478e-05, + "loss": 5.4409, + "step": 12020 + }, + { + "epoch": 0.9625540086413826, + "grad_norm": 2.4291601181030273, + "learning_rate": 3.405626571107664e-05, + "loss": 5.4198, + "step": 12030 + }, + { + "epoch": 0.9633541366618659, + "grad_norm": 2.6325435638427734, + "learning_rate": 3.4042894582018506e-05, + "loss": 5.6044, + "step": 12040 + }, + { + "epoch": 0.9641542646823492, + "grad_norm": 
2.4688518047332764, + "learning_rate": 3.402952345296037e-05, + "loss": 5.3633, + "step": 12050 + }, + { + "epoch": 0.9649543927028325, + "grad_norm": 2.3974521160125732, + "learning_rate": 3.401615232390223e-05, + "loss": 5.3022, + "step": 12060 + }, + { + "epoch": 0.9657545207233157, + "grad_norm": 2.146742105484009, + "learning_rate": 3.4002781194844094e-05, + "loss": 5.2753, + "step": 12070 + }, + { + "epoch": 0.966554648743799, + "grad_norm": 2.1239147186279297, + "learning_rate": 3.3989410065785957e-05, + "loss": 5.466, + "step": 12080 + }, + { + "epoch": 0.9673547767642823, + "grad_norm": 2.939096450805664, + "learning_rate": 3.397603893672782e-05, + "loss": 5.5288, + "step": 12090 + }, + { + "epoch": 0.9681549047847655, + "grad_norm": 2.6875243186950684, + "learning_rate": 3.396266780766968e-05, + "loss": 5.4279, + "step": 12100 + }, + { + "epoch": 0.9689550328052489, + "grad_norm": 3.1991941928863525, + "learning_rate": 3.3949296678611545e-05, + "loss": 5.5397, + "step": 12110 + }, + { + "epoch": 0.9697551608257321, + "grad_norm": 2.4558470249176025, + "learning_rate": 3.393592554955341e-05, + "loss": 5.3246, + "step": 12120 + }, + { + "epoch": 0.9705552888462154, + "grad_norm": 2.2693309783935547, + "learning_rate": 3.392255442049527e-05, + "loss": 5.5941, + "step": 12130 + }, + { + "epoch": 0.9713554168666987, + "grad_norm": 2.8864657878875732, + "learning_rate": 3.390918329143713e-05, + "loss": 5.4632, + "step": 12140 + }, + { + "epoch": 0.9721555448871819, + "grad_norm": 2.3996002674102783, + "learning_rate": 3.3895812162378995e-05, + "loss": 5.4724, + "step": 12150 + }, + { + "epoch": 0.9729556729076653, + "grad_norm": 1.979028582572937, + "learning_rate": 3.388244103332086e-05, + "loss": 5.4229, + "step": 12160 + }, + { + "epoch": 0.9737558009281485, + "grad_norm": 2.0203795433044434, + "learning_rate": 3.386906990426272e-05, + "loss": 5.5592, + "step": 12170 + }, + { + "epoch": 0.9745559289486317, + "grad_norm": 2.0890145301818848, + 
"learning_rate": 3.3855698775204583e-05, + "loss": 5.4313, + "step": 12180 + }, + { + "epoch": 0.9753560569691151, + "grad_norm": 2.4817287921905518, + "learning_rate": 3.3842327646146446e-05, + "loss": 5.5, + "step": 12190 + }, + { + "epoch": 0.9761561849895983, + "grad_norm": 2.2497968673706055, + "learning_rate": 3.382895651708831e-05, + "loss": 5.3126, + "step": 12200 + }, + { + "epoch": 0.9769563130100816, + "grad_norm": 3.2818548679351807, + "learning_rate": 3.381558538803017e-05, + "loss": 5.3421, + "step": 12210 + }, + { + "epoch": 0.9777564410305649, + "grad_norm": 7.580129623413086, + "learning_rate": 3.3802214258972034e-05, + "loss": 5.6585, + "step": 12220 + }, + { + "epoch": 0.9785565690510482, + "grad_norm": 3.0450634956359863, + "learning_rate": 3.37888431299139e-05, + "loss": 5.4403, + "step": 12230 + }, + { + "epoch": 0.9793566970715314, + "grad_norm": 2.5230050086975098, + "learning_rate": 3.377547200085575e-05, + "loss": 5.5331, + "step": 12240 + }, + { + "epoch": 0.9801568250920147, + "grad_norm": 3.398266315460205, + "learning_rate": 3.3762100871797616e-05, + "loss": 5.3996, + "step": 12250 + }, + { + "epoch": 0.980956953112498, + "grad_norm": 2.2126028537750244, + "learning_rate": 3.374872974273948e-05, + "loss": 5.4175, + "step": 12260 + }, + { + "epoch": 0.9817570811329813, + "grad_norm": 3.0015792846679688, + "learning_rate": 3.373535861368134e-05, + "loss": 5.3961, + "step": 12270 + }, + { + "epoch": 0.9825572091534646, + "grad_norm": 2.5461559295654297, + "learning_rate": 3.3721987484623204e-05, + "loss": 5.6026, + "step": 12280 + }, + { + "epoch": 0.9833573371739478, + "grad_norm": 2.498425245285034, + "learning_rate": 3.3708616355565066e-05, + "loss": 5.3524, + "step": 12290 + }, + { + "epoch": 0.9841574651944311, + "grad_norm": 2.9614803791046143, + "learning_rate": 3.369524522650693e-05, + "loss": 5.5101, + "step": 12300 + }, + { + "epoch": 0.9849575932149144, + "grad_norm": 2.7508606910705566, + "learning_rate": 
3.368187409744879e-05, + "loss": 5.3776, + "step": 12310 + }, + { + "epoch": 0.9857577212353976, + "grad_norm": 2.0286755561828613, + "learning_rate": 3.3668502968390654e-05, + "loss": 5.4913, + "step": 12320 + }, + { + "epoch": 0.986557849255881, + "grad_norm": 3.728842258453369, + "learning_rate": 3.365513183933252e-05, + "loss": 5.4477, + "step": 12330 + }, + { + "epoch": 0.9873579772763642, + "grad_norm": 3.3132193088531494, + "learning_rate": 3.364176071027438e-05, + "loss": 5.2361, + "step": 12340 + }, + { + "epoch": 0.9881581052968474, + "grad_norm": 2.515298843383789, + "learning_rate": 3.362838958121624e-05, + "loss": 5.4632, + "step": 12350 + }, + { + "epoch": 0.9889582333173308, + "grad_norm": 2.0937442779541016, + "learning_rate": 3.3615018452158105e-05, + "loss": 5.5075, + "step": 12360 + }, + { + "epoch": 0.989758361337814, + "grad_norm": 3.3019323348999023, + "learning_rate": 3.360164732309997e-05, + "loss": 5.4566, + "step": 12370 + }, + { + "epoch": 0.9905584893582974, + "grad_norm": 3.502408266067505, + "learning_rate": 3.358827619404183e-05, + "loss": 5.464, + "step": 12380 + }, + { + "epoch": 0.9913586173787806, + "grad_norm": 2.3667659759521484, + "learning_rate": 3.357490506498369e-05, + "loss": 5.5423, + "step": 12390 + }, + { + "epoch": 0.9921587453992639, + "grad_norm": 2.15498423576355, + "learning_rate": 3.356153393592555e-05, + "loss": 5.4031, + "step": 12400 + }, + { + "epoch": 0.9929588734197472, + "grad_norm": 2.733090877532959, + "learning_rate": 3.354816280686741e-05, + "loss": 5.4771, + "step": 12410 + }, + { + "epoch": 0.9937590014402304, + "grad_norm": 2.595238208770752, + "learning_rate": 3.3534791677809274e-05, + "loss": 5.4538, + "step": 12420 + }, + { + "epoch": 0.9945591294607137, + "grad_norm": 2.3755598068237305, + "learning_rate": 3.352142054875114e-05, + "loss": 5.432, + "step": 12430 + }, + { + "epoch": 0.995359257481197, + "grad_norm": 2.2179529666900635, + "learning_rate": 3.3508049419693e-05, + "loss": 5.4359, + 
"step": 12440 + }, + { + "epoch": 0.9961593855016803, + "grad_norm": 2.264469623565674, + "learning_rate": 3.349467829063486e-05, + "loss": 5.4514, + "step": 12450 + }, + { + "epoch": 0.9969595135221635, + "grad_norm": 2.9361791610717773, + "learning_rate": 3.3481307161576725e-05, + "loss": 5.4411, + "step": 12460 + }, + { + "epoch": 0.9977596415426468, + "grad_norm": 2.6548573970794678, + "learning_rate": 3.346793603251859e-05, + "loss": 5.4368, + "step": 12470 + }, + { + "epoch": 0.9985597695631301, + "grad_norm": 3.5749149322509766, + "learning_rate": 3.345456490346045e-05, + "loss": 5.6314, + "step": 12480 + }, + { + "epoch": 0.9993598975836134, + "grad_norm": 2.848527193069458, + "learning_rate": 3.344119377440231e-05, + "loss": 5.3849, + "step": 12490 + }, + { + "epoch": 1.0001600256040966, + "grad_norm": 2.036498546600342, + "learning_rate": 3.3427822645344176e-05, + "loss": 5.5973, + "step": 12500 + }, + { + "epoch": 1.00096015362458, + "grad_norm": 3.499455451965332, + "learning_rate": 3.341445151628604e-05, + "loss": 5.1882, + "step": 12510 + }, + { + "epoch": 1.0017602816450633, + "grad_norm": 2.4391655921936035, + "learning_rate": 3.34010803872279e-05, + "loss": 5.0281, + "step": 12520 + }, + { + "epoch": 1.0025604096655465, + "grad_norm": 2.522850513458252, + "learning_rate": 3.3387709258169764e-05, + "loss": 5.1038, + "step": 12530 + }, + { + "epoch": 1.0033605376860297, + "grad_norm": 2.631127119064331, + "learning_rate": 3.337433812911163e-05, + "loss": 4.9671, + "step": 12540 + }, + { + "epoch": 1.004160665706513, + "grad_norm": 2.9861068725585938, + "learning_rate": 3.336096700005348e-05, + "loss": 5.2225, + "step": 12550 + }, + { + "epoch": 1.0049607937269964, + "grad_norm": 2.59002423286438, + "learning_rate": 3.3347595870995345e-05, + "loss": 5.142, + "step": 12560 + }, + { + "epoch": 1.0057609217474797, + "grad_norm": 2.830385208129883, + "learning_rate": 3.333422474193721e-05, + "loss": 5.0919, + "step": 12570 + }, + { + "epoch": 
1.006561049767963, + "grad_norm": 2.6355655193328857, + "learning_rate": 3.332085361287907e-05, + "loss": 5.0604, + "step": 12580 + }, + { + "epoch": 1.0073611777884461, + "grad_norm": 2.8990426063537598, + "learning_rate": 3.3307482483820933e-05, + "loss": 5.0488, + "step": 12590 + }, + { + "epoch": 1.0081613058089294, + "grad_norm": 2.657283067703247, + "learning_rate": 3.3294111354762796e-05, + "loss": 5.157, + "step": 12600 + }, + { + "epoch": 1.0089614338294126, + "grad_norm": 3.652735710144043, + "learning_rate": 3.328074022570466e-05, + "loss": 5.1629, + "step": 12610 + }, + { + "epoch": 1.009761561849896, + "grad_norm": 2.9064295291900635, + "learning_rate": 3.326736909664652e-05, + "loss": 5.1757, + "step": 12620 + }, + { + "epoch": 1.0105616898703793, + "grad_norm": 3.015488386154175, + "learning_rate": 3.3253997967588384e-05, + "loss": 5.2311, + "step": 12630 + }, + { + "epoch": 1.0113618178908625, + "grad_norm": 9.49726390838623, + "learning_rate": 3.324062683853025e-05, + "loss": 5.1402, + "step": 12640 + }, + { + "epoch": 1.0121619459113458, + "grad_norm": 6.71565055847168, + "learning_rate": 3.322725570947211e-05, + "loss": 4.7297, + "step": 12650 + }, + { + "epoch": 1.012962073931829, + "grad_norm": 4.39326286315918, + "learning_rate": 3.321388458041397e-05, + "loss": 5.1663, + "step": 12660 + }, + { + "epoch": 1.0137622019523125, + "grad_norm": 2.8973264694213867, + "learning_rate": 3.3200513451355835e-05, + "loss": 5.0674, + "step": 12670 + }, + { + "epoch": 1.0145623299727957, + "grad_norm": 3.1058743000030518, + "learning_rate": 3.31871423222977e-05, + "loss": 4.9689, + "step": 12680 + }, + { + "epoch": 1.015362457993279, + "grad_norm": 2.688951253890991, + "learning_rate": 3.317377119323956e-05, + "loss": 5.0916, + "step": 12690 + }, + { + "epoch": 1.0161625860137622, + "grad_norm": 2.9495773315429688, + "learning_rate": 3.3160400064181416e-05, + "loss": 5.0939, + "step": 12700 + }, + { + "epoch": 1.0169627140342454, + "grad_norm": 
2.5915777683258057, + "learning_rate": 3.314702893512328e-05, + "loss": 5.134, + "step": 12710 + }, + { + "epoch": 1.0177628420547287, + "grad_norm": 2.703012228012085, + "learning_rate": 3.313365780606514e-05, + "loss": 5.1285, + "step": 12720 + }, + { + "epoch": 1.0185629700752121, + "grad_norm": 3.0492970943450928, + "learning_rate": 3.3120286677007004e-05, + "loss": 5.1477, + "step": 12730 + }, + { + "epoch": 1.0193630980956954, + "grad_norm": 2.756546974182129, + "learning_rate": 3.310691554794887e-05, + "loss": 5.0668, + "step": 12740 + }, + { + "epoch": 1.0201632261161786, + "grad_norm": 4.764959335327148, + "learning_rate": 3.309354441889073e-05, + "loss": 5.1243, + "step": 12750 + }, + { + "epoch": 1.0209633541366618, + "grad_norm": 5.539842128753662, + "learning_rate": 3.308017328983259e-05, + "loss": 5.0519, + "step": 12760 + }, + { + "epoch": 1.021763482157145, + "grad_norm": 3.8945937156677246, + "learning_rate": 3.3066802160774455e-05, + "loss": 5.1758, + "step": 12770 + }, + { + "epoch": 1.0225636101776283, + "grad_norm": 2.5580265522003174, + "learning_rate": 3.305343103171632e-05, + "loss": 5.0893, + "step": 12780 + }, + { + "epoch": 1.0233637381981118, + "grad_norm": 2.8203110694885254, + "learning_rate": 3.304005990265818e-05, + "loss": 5.2472, + "step": 12790 + }, + { + "epoch": 1.024163866218595, + "grad_norm": 3.5090975761413574, + "learning_rate": 3.302668877360004e-05, + "loss": 5.0594, + "step": 12800 + }, + { + "epoch": 1.0249639942390782, + "grad_norm": 2.915062189102173, + "learning_rate": 3.3013317644541906e-05, + "loss": 5.0673, + "step": 12810 + }, + { + "epoch": 1.0257641222595615, + "grad_norm": 2.648737668991089, + "learning_rate": 3.299994651548377e-05, + "loss": 4.8937, + "step": 12820 + }, + { + "epoch": 1.0265642502800447, + "grad_norm": 3.2576730251312256, + "learning_rate": 3.298657538642563e-05, + "loss": 5.1564, + "step": 12830 + }, + { + "epoch": 1.0273643783005282, + "grad_norm": 5.624968528747559, + "learning_rate": 
3.2973204257367494e-05, + "loss": 5.3011, + "step": 12840 + }, + { + "epoch": 1.0281645063210114, + "grad_norm": 2.492978811264038, + "learning_rate": 3.2959833128309357e-05, + "loss": 5.0935, + "step": 12850 + }, + { + "epoch": 1.0289646343414947, + "grad_norm": 2.4655046463012695, + "learning_rate": 3.294646199925121e-05, + "loss": 5.1768, + "step": 12860 + }, + { + "epoch": 1.029764762361978, + "grad_norm": 3.4421567916870117, + "learning_rate": 3.2933090870193075e-05, + "loss": 5.0756, + "step": 12870 + }, + { + "epoch": 1.0305648903824611, + "grad_norm": 2.6774377822875977, + "learning_rate": 3.291971974113494e-05, + "loss": 5.036, + "step": 12880 + }, + { + "epoch": 1.0313650184029444, + "grad_norm": 2.665099859237671, + "learning_rate": 3.29063486120768e-05, + "loss": 5.1284, + "step": 12890 + }, + { + "epoch": 1.0321651464234278, + "grad_norm": 3.7092061042785645, + "learning_rate": 3.289297748301866e-05, + "loss": 4.8892, + "step": 12900 + }, + { + "epoch": 1.032965274443911, + "grad_norm": 2.875427484512329, + "learning_rate": 3.2879606353960526e-05, + "loss": 4.928, + "step": 12910 + }, + { + "epoch": 1.0337654024643943, + "grad_norm": 2.409395694732666, + "learning_rate": 3.286623522490239e-05, + "loss": 5.0545, + "step": 12920 + }, + { + "epoch": 1.0345655304848775, + "grad_norm": 3.936565637588501, + "learning_rate": 3.285286409584425e-05, + "loss": 5.0556, + "step": 12930 + }, + { + "epoch": 1.0353656585053608, + "grad_norm": 3.52986216545105, + "learning_rate": 3.2839492966786114e-05, + "loss": 5.0738, + "step": 12940 + }, + { + "epoch": 1.0361657865258442, + "grad_norm": 3.0732507705688477, + "learning_rate": 3.282612183772798e-05, + "loss": 5.0852, + "step": 12950 + }, + { + "epoch": 1.0369659145463275, + "grad_norm": 2.800020217895508, + "learning_rate": 3.281275070866984e-05, + "loss": 4.9983, + "step": 12960 + }, + { + "epoch": 1.0377660425668107, + "grad_norm": 2.682191848754883, + "learning_rate": 3.27993795796117e-05, + "loss": 4.8372, + 
"step": 12970 + }, + { + "epoch": 1.038566170587294, + "grad_norm": 5.331565856933594, + "learning_rate": 3.2786008450553565e-05, + "loss": 5.1444, + "step": 12980 + }, + { + "epoch": 1.0393662986077772, + "grad_norm": 3.530069589614868, + "learning_rate": 3.277263732149543e-05, + "loss": 5.1467, + "step": 12990 + }, + { + "epoch": 1.0401664266282604, + "grad_norm": 2.296837568283081, + "learning_rate": 3.275926619243729e-05, + "loss": 5.0782, + "step": 13000 + }, + { + "epoch": 1.0409665546487439, + "grad_norm": 4.3493146896362305, + "learning_rate": 3.274589506337915e-05, + "loss": 5.0574, + "step": 13010 + }, + { + "epoch": 1.0417666826692271, + "grad_norm": 3.2167856693267822, + "learning_rate": 3.2732523934321016e-05, + "loss": 5.1219, + "step": 13020 + }, + { + "epoch": 1.0425668106897104, + "grad_norm": 3.200861692428589, + "learning_rate": 3.271915280526288e-05, + "loss": 5.0674, + "step": 13030 + }, + { + "epoch": 1.0433669387101936, + "grad_norm": 2.286841869354248, + "learning_rate": 3.270578167620474e-05, + "loss": 5.0125, + "step": 13040 + }, + { + "epoch": 1.0441670667306768, + "grad_norm": 3.6788413524627686, + "learning_rate": 3.2692410547146604e-05, + "loss": 5.2975, + "step": 13050 + }, + { + "epoch": 1.0449671947511603, + "grad_norm": 2.77284574508667, + "learning_rate": 3.2679039418088466e-05, + "loss": 5.0099, + "step": 13060 + }, + { + "epoch": 1.0457673227716435, + "grad_norm": 4.33493185043335, + "learning_rate": 3.266566828903033e-05, + "loss": 5.0362, + "step": 13070 + }, + { + "epoch": 1.0465674507921268, + "grad_norm": 3.2839553356170654, + "learning_rate": 3.265229715997219e-05, + "loss": 4.9569, + "step": 13080 + }, + { + "epoch": 1.04736757881261, + "grad_norm": 2.9086809158325195, + "learning_rate": 3.2638926030914054e-05, + "loss": 5.0341, + "step": 13090 + }, + { + "epoch": 1.0481677068330932, + "grad_norm": 2.565225124359131, + "learning_rate": 3.262555490185592e-05, + "loss": 5.0601, + "step": 13100 + }, + { + "epoch": 
1.0489678348535765, + "grad_norm": 2.8457388877868652, + "learning_rate": 3.261218377279778e-05, + "loss": 4.9952, + "step": 13110 + }, + { + "epoch": 1.04976796287406, + "grad_norm": 2.5370593070983887, + "learning_rate": 3.259881264373964e-05, + "loss": 5.1425, + "step": 13120 + }, + { + "epoch": 1.0505680908945432, + "grad_norm": 2.504817008972168, + "learning_rate": 3.2585441514681505e-05, + "loss": 4.9605, + "step": 13130 + }, + { + "epoch": 1.0513682189150264, + "grad_norm": 2.9582226276397705, + "learning_rate": 3.257207038562337e-05, + "loss": 5.1436, + "step": 13140 + }, + { + "epoch": 1.0521683469355096, + "grad_norm": 3.7598915100097656, + "learning_rate": 3.255869925656523e-05, + "loss": 5.0743, + "step": 13150 + }, + { + "epoch": 1.0529684749559929, + "grad_norm": 3.2642862796783447, + "learning_rate": 3.254532812750709e-05, + "loss": 5.139, + "step": 13160 + }, + { + "epoch": 1.0537686029764763, + "grad_norm": 3.4917502403259277, + "learning_rate": 3.253195699844895e-05, + "loss": 5.0566, + "step": 13170 + }, + { + "epoch": 1.0545687309969596, + "grad_norm": 2.9878995418548584, + "learning_rate": 3.251858586939081e-05, + "loss": 5.2385, + "step": 13180 + }, + { + "epoch": 1.0553688590174428, + "grad_norm": 2.9996213912963867, + "learning_rate": 3.2505214740332674e-05, + "loss": 5.1138, + "step": 13190 + }, + { + "epoch": 1.056168987037926, + "grad_norm": 5.470676422119141, + "learning_rate": 3.249184361127454e-05, + "loss": 5.0921, + "step": 13200 + }, + { + "epoch": 1.0569691150584093, + "grad_norm": 2.9724602699279785, + "learning_rate": 3.24784724822164e-05, + "loss": 4.9315, + "step": 13210 + }, + { + "epoch": 1.0577692430788925, + "grad_norm": 3.191342353820801, + "learning_rate": 3.246510135315826e-05, + "loss": 5.1095, + "step": 13220 + }, + { + "epoch": 1.058569371099376, + "grad_norm": 4.010619163513184, + "learning_rate": 3.2451730224100125e-05, + "loss": 5.1697, + "step": 13230 + }, + { + "epoch": 1.0593694991198592, + "grad_norm": 
2.828768253326416, + "learning_rate": 3.243835909504199e-05, + "loss": 5.1114, + "step": 13240 + }, + { + "epoch": 1.0601696271403425, + "grad_norm": 4.081239223480225, + "learning_rate": 3.242498796598385e-05, + "loss": 5.0629, + "step": 13250 + }, + { + "epoch": 1.0609697551608257, + "grad_norm": 3.347407817840576, + "learning_rate": 3.241161683692571e-05, + "loss": 5.0355, + "step": 13260 + }, + { + "epoch": 1.061769883181309, + "grad_norm": 2.902289390563965, + "learning_rate": 3.2398245707867576e-05, + "loss": 5.1561, + "step": 13270 + }, + { + "epoch": 1.0625700112017924, + "grad_norm": 15.202888488769531, + "learning_rate": 3.238487457880944e-05, + "loss": 5.2149, + "step": 13280 + }, + { + "epoch": 1.0633701392222756, + "grad_norm": 3.353285551071167, + "learning_rate": 3.23715034497513e-05, + "loss": 4.8566, + "step": 13290 + }, + { + "epoch": 1.0641702672427589, + "grad_norm": 4.258049011230469, + "learning_rate": 3.2358132320693164e-05, + "loss": 5.0358, + "step": 13300 + }, + { + "epoch": 1.064970395263242, + "grad_norm": 2.727367639541626, + "learning_rate": 3.234476119163503e-05, + "loss": 4.9733, + "step": 13310 + }, + { + "epoch": 1.0657705232837253, + "grad_norm": 4.626856803894043, + "learning_rate": 3.233139006257688e-05, + "loss": 5.162, + "step": 13320 + }, + { + "epoch": 1.0665706513042086, + "grad_norm": 3.074949264526367, + "learning_rate": 3.2318018933518745e-05, + "loss": 5.1322, + "step": 13330 + }, + { + "epoch": 1.067370779324692, + "grad_norm": 4.150319576263428, + "learning_rate": 3.230464780446061e-05, + "loss": 5.0567, + "step": 13340 + }, + { + "epoch": 1.0681709073451753, + "grad_norm": 5.132182598114014, + "learning_rate": 3.229127667540247e-05, + "loss": 5.1743, + "step": 13350 + }, + { + "epoch": 1.0689710353656585, + "grad_norm": 4.4582839012146, + "learning_rate": 3.2277905546344333e-05, + "loss": 5.2236, + "step": 13360 + }, + { + "epoch": 1.0697711633861418, + "grad_norm": 2.9640562534332275, + "learning_rate": 
3.2264534417286196e-05, + "loss": 5.0974, + "step": 13370 + }, + { + "epoch": 1.070571291406625, + "grad_norm": 2.8978335857391357, + "learning_rate": 3.225116328822806e-05, + "loss": 5.1591, + "step": 13380 + }, + { + "epoch": 1.0713714194271082, + "grad_norm": 2.773488759994507, + "learning_rate": 3.223779215916992e-05, + "loss": 5.0814, + "step": 13390 + }, + { + "epoch": 1.0721715474475917, + "grad_norm": 2.719374656677246, + "learning_rate": 3.2224421030111784e-05, + "loss": 5.0352, + "step": 13400 + }, + { + "epoch": 1.072971675468075, + "grad_norm": 2.918991804122925, + "learning_rate": 3.221104990105365e-05, + "loss": 5.0955, + "step": 13410 + }, + { + "epoch": 1.0737718034885582, + "grad_norm": 3.3438122272491455, + "learning_rate": 3.219767877199551e-05, + "loss": 5.0205, + "step": 13420 + }, + { + "epoch": 1.0745719315090414, + "grad_norm": 2.915687322616577, + "learning_rate": 3.218430764293737e-05, + "loss": 5.0708, + "step": 13430 + }, + { + "epoch": 1.0753720595295246, + "grad_norm": 2.3897652626037598, + "learning_rate": 3.2170936513879235e-05, + "loss": 5.0898, + "step": 13440 + }, + { + "epoch": 1.076172187550008, + "grad_norm": 2.5261075496673584, + "learning_rate": 3.21575653848211e-05, + "loss": 5.0002, + "step": 13450 + }, + { + "epoch": 1.0769723155704913, + "grad_norm": 4.839473247528076, + "learning_rate": 3.214419425576296e-05, + "loss": 5.1853, + "step": 13460 + }, + { + "epoch": 1.0777724435909746, + "grad_norm": 2.396831512451172, + "learning_rate": 3.213082312670482e-05, + "loss": 5.0397, + "step": 13470 + }, + { + "epoch": 1.0785725716114578, + "grad_norm": 4.165911674499512, + "learning_rate": 3.211745199764668e-05, + "loss": 5.2065, + "step": 13480 + }, + { + "epoch": 1.079372699631941, + "grad_norm": 2.74873423576355, + "learning_rate": 3.210408086858854e-05, + "loss": 5.2217, + "step": 13490 + }, + { + "epoch": 1.0801728276524245, + "grad_norm": 3.480703353881836, + "learning_rate": 3.2090709739530404e-05, + "loss": 4.9929, + 
"step": 13500 + }, + { + "epoch": 1.0809729556729077, + "grad_norm": 3.747199773788452, + "learning_rate": 3.207733861047227e-05, + "loss": 5.1235, + "step": 13510 + }, + { + "epoch": 1.081773083693391, + "grad_norm": 3.634990692138672, + "learning_rate": 3.206396748141413e-05, + "loss": 4.9466, + "step": 13520 + }, + { + "epoch": 1.0825732117138742, + "grad_norm": 3.6419565677642822, + "learning_rate": 3.205059635235599e-05, + "loss": 5.1791, + "step": 13530 + }, + { + "epoch": 1.0833733397343575, + "grad_norm": 3.413770914077759, + "learning_rate": 3.2037225223297855e-05, + "loss": 5.1777, + "step": 13540 + }, + { + "epoch": 1.0841734677548407, + "grad_norm": 5.771011829376221, + "learning_rate": 3.202385409423972e-05, + "loss": 5.0543, + "step": 13550 + }, + { + "epoch": 1.0849735957753242, + "grad_norm": 2.9491965770721436, + "learning_rate": 3.201048296518158e-05, + "loss": 4.9719, + "step": 13560 + }, + { + "epoch": 1.0857737237958074, + "grad_norm": 3.3095767498016357, + "learning_rate": 3.199711183612344e-05, + "loss": 5.2155, + "step": 13570 + }, + { + "epoch": 1.0865738518162906, + "grad_norm": 4.941197395324707, + "learning_rate": 3.1983740707065306e-05, + "loss": 5.073, + "step": 13580 + }, + { + "epoch": 1.0873739798367739, + "grad_norm": 2.3605270385742188, + "learning_rate": 3.197036957800717e-05, + "loss": 5.1746, + "step": 13590 + }, + { + "epoch": 1.088174107857257, + "grad_norm": 2.9810526371002197, + "learning_rate": 3.195699844894903e-05, + "loss": 5.157, + "step": 13600 + }, + { + "epoch": 1.0889742358777403, + "grad_norm": 2.767223358154297, + "learning_rate": 3.1943627319890894e-05, + "loss": 5.0831, + "step": 13610 + }, + { + "epoch": 1.0897743638982238, + "grad_norm": 6.959831714630127, + "learning_rate": 3.193025619083276e-05, + "loss": 4.883, + "step": 13620 + }, + { + "epoch": 1.090574491918707, + "grad_norm": 6.120983123779297, + "learning_rate": 3.191688506177461e-05, + "loss": 5.0368, + "step": 13630 + }, + { + "epoch": 
1.0913746199391903, + "grad_norm": 2.680748462677002, + "learning_rate": 3.1903513932716475e-05, + "loss": 5.1996, + "step": 13640 + }, + { + "epoch": 1.0921747479596735, + "grad_norm": 4.287043571472168, + "learning_rate": 3.189014280365834e-05, + "loss": 4.9824, + "step": 13650 + }, + { + "epoch": 1.0929748759801567, + "grad_norm": 2.647005319595337, + "learning_rate": 3.18767716746002e-05, + "loss": 4.9845, + "step": 13660 + }, + { + "epoch": 1.0937750040006402, + "grad_norm": 2.9568288326263428, + "learning_rate": 3.186340054554206e-05, + "loss": 5.0804, + "step": 13670 + }, + { + "epoch": 1.0945751320211234, + "grad_norm": 4.118317127227783, + "learning_rate": 3.1850029416483926e-05, + "loss": 5.0375, + "step": 13680 + }, + { + "epoch": 1.0953752600416067, + "grad_norm": 3.7457168102264404, + "learning_rate": 3.183665828742579e-05, + "loss": 5.0193, + "step": 13690 + }, + { + "epoch": 1.09617538806209, + "grad_norm": 2.829274892807007, + "learning_rate": 3.182328715836765e-05, + "loss": 5.1896, + "step": 13700 + }, + { + "epoch": 1.0969755160825732, + "grad_norm": 3.568166971206665, + "learning_rate": 3.1809916029309514e-05, + "loss": 5.0527, + "step": 13710 + }, + { + "epoch": 1.0977756441030564, + "grad_norm": 2.8555142879486084, + "learning_rate": 3.179654490025138e-05, + "loss": 5.0873, + "step": 13720 + }, + { + "epoch": 1.0985757721235399, + "grad_norm": 2.9258460998535156, + "learning_rate": 3.178317377119324e-05, + "loss": 4.9293, + "step": 13730 + }, + { + "epoch": 1.099375900144023, + "grad_norm": 3.3614535331726074, + "learning_rate": 3.17698026421351e-05, + "loss": 4.992, + "step": 13740 + }, + { + "epoch": 1.1001760281645063, + "grad_norm": 3.859238624572754, + "learning_rate": 3.1756431513076965e-05, + "loss": 4.9695, + "step": 13750 + }, + { + "epoch": 1.1009761561849896, + "grad_norm": 2.9869918823242188, + "learning_rate": 3.174306038401883e-05, + "loss": 5.0833, + "step": 13760 + }, + { + "epoch": 1.1017762842054728, + "grad_norm": 
2.874736785888672, + "learning_rate": 3.172968925496069e-05, + "loss": 5.0329, + "step": 13770 + }, + { + "epoch": 1.102576412225956, + "grad_norm": 3.2926857471466064, + "learning_rate": 3.171631812590255e-05, + "loss": 5.0431, + "step": 13780 + }, + { + "epoch": 1.1033765402464395, + "grad_norm": 3.0349912643432617, + "learning_rate": 3.1702946996844416e-05, + "loss": 5.0485, + "step": 13790 + }, + { + "epoch": 1.1041766682669227, + "grad_norm": 3.0139970779418945, + "learning_rate": 3.168957586778628e-05, + "loss": 5.0519, + "step": 13800 + }, + { + "epoch": 1.104976796287406, + "grad_norm": 3.5662894248962402, + "learning_rate": 3.167620473872814e-05, + "loss": 5.2053, + "step": 13810 + }, + { + "epoch": 1.1057769243078892, + "grad_norm": 3.348515033721924, + "learning_rate": 3.1662833609670004e-05, + "loss": 5.0588, + "step": 13820 + }, + { + "epoch": 1.1065770523283724, + "grad_norm": 2.439892292022705, + "learning_rate": 3.1649462480611866e-05, + "loss": 5.0894, + "step": 13830 + }, + { + "epoch": 1.107377180348856, + "grad_norm": 3.85776948928833, + "learning_rate": 3.163609135155373e-05, + "loss": 5.0345, + "step": 13840 + }, + { + "epoch": 1.1081773083693391, + "grad_norm": 2.6576287746429443, + "learning_rate": 3.162272022249559e-05, + "loss": 5.0607, + "step": 13850 + }, + { + "epoch": 1.1089774363898224, + "grad_norm": 2.6049861907958984, + "learning_rate": 3.1609349093437454e-05, + "loss": 5.0033, + "step": 13860 + }, + { + "epoch": 1.1097775644103056, + "grad_norm": 2.5496983528137207, + "learning_rate": 3.159597796437932e-05, + "loss": 5.2102, + "step": 13870 + }, + { + "epoch": 1.1105776924307889, + "grad_norm": 4.300173282623291, + "learning_rate": 3.158260683532118e-05, + "loss": 5.1137, + "step": 13880 + }, + { + "epoch": 1.1113778204512723, + "grad_norm": 2.4413559436798096, + "learning_rate": 3.156923570626304e-05, + "loss": 4.9222, + "step": 13890 + }, + { + "epoch": 1.1121779484717556, + "grad_norm": 2.4938573837280273, + "learning_rate": 
3.1555864577204905e-05, + "loss": 5.1414, + "step": 13900 + }, + { + "epoch": 1.1129780764922388, + "grad_norm": 3.333294153213501, + "learning_rate": 3.154249344814677e-05, + "loss": 5.1243, + "step": 13910 + }, + { + "epoch": 1.113778204512722, + "grad_norm": 3.8718490600585938, + "learning_rate": 3.152912231908863e-05, + "loss": 5.2178, + "step": 13920 + }, + { + "epoch": 1.1145783325332053, + "grad_norm": 4.667349338531494, + "learning_rate": 3.151575119003049e-05, + "loss": 5.185, + "step": 13930 + }, + { + "epoch": 1.1153784605536885, + "grad_norm": 3.7269580364227295, + "learning_rate": 3.150238006097235e-05, + "loss": 4.9231, + "step": 13940 + }, + { + "epoch": 1.116178588574172, + "grad_norm": 3.8037633895874023, + "learning_rate": 3.148900893191421e-05, + "loss": 5.0166, + "step": 13950 + }, + { + "epoch": 1.1169787165946552, + "grad_norm": 3.2636613845825195, + "learning_rate": 3.1475637802856075e-05, + "loss": 5.0339, + "step": 13960 + }, + { + "epoch": 1.1177788446151384, + "grad_norm": 4.069303035736084, + "learning_rate": 3.146226667379794e-05, + "loss": 5.1558, + "step": 13970 + }, + { + "epoch": 1.1185789726356217, + "grad_norm": 3.160214424133301, + "learning_rate": 3.14488955447398e-05, + "loss": 5.0048, + "step": 13980 + }, + { + "epoch": 1.119379100656105, + "grad_norm": 2.7678611278533936, + "learning_rate": 3.143552441568166e-05, + "loss": 5.0992, + "step": 13990 + }, + { + "epoch": 1.1201792286765881, + "grad_norm": 3.162316083908081, + "learning_rate": 3.1422153286623525e-05, + "loss": 5.0398, + "step": 14000 + }, + { + "epoch": 1.1201792286765881, + "eval_loss": 5.684463977813721, + "eval_runtime": 11.9219, + "eval_samples_per_second": 3.355, + "eval_steps_per_second": 0.419, + "step": 14000 + }, + { + "epoch": 1.1209793566970716, + "grad_norm": 3.6854958534240723, + "learning_rate": 3.140878215756539e-05, + "loss": 5.0345, + "step": 14010 + }, + { + "epoch": 1.1217794847175548, + "grad_norm": 2.577242851257324, + "learning_rate": 
3.139541102850725e-05, + "loss": 4.9944, + "step": 14020 + }, + { + "epoch": 1.122579612738038, + "grad_norm": 2.181784152984619, + "learning_rate": 3.138203989944911e-05, + "loss": 5.0055, + "step": 14030 + }, + { + "epoch": 1.1233797407585213, + "grad_norm": 4.235867500305176, + "learning_rate": 3.1368668770390976e-05, + "loss": 5.2758, + "step": 14040 + }, + { + "epoch": 1.1241798687790046, + "grad_norm": 2.5550196170806885, + "learning_rate": 3.135529764133284e-05, + "loss": 5.0281, + "step": 14050 + }, + { + "epoch": 1.124979996799488, + "grad_norm": 3.513957977294922, + "learning_rate": 3.13419265122747e-05, + "loss": 5.0031, + "step": 14060 + }, + { + "epoch": 1.1257801248199713, + "grad_norm": 2.731046676635742, + "learning_rate": 3.1328555383216564e-05, + "loss": 5.061, + "step": 14070 + }, + { + "epoch": 1.1265802528404545, + "grad_norm": 2.664210557937622, + "learning_rate": 3.131518425415843e-05, + "loss": 5.0907, + "step": 14080 + }, + { + "epoch": 1.1273803808609377, + "grad_norm": 3.6753573417663574, + "learning_rate": 3.130181312510029e-05, + "loss": 4.9645, + "step": 14090 + }, + { + "epoch": 1.128180508881421, + "grad_norm": 4.166867733001709, + "learning_rate": 3.1288441996042145e-05, + "loss": 5.1689, + "step": 14100 + }, + { + "epoch": 1.1289806369019044, + "grad_norm": 3.4742753505706787, + "learning_rate": 3.127507086698401e-05, + "loss": 5.0116, + "step": 14110 + }, + { + "epoch": 1.1297807649223877, + "grad_norm": 2.653500556945801, + "learning_rate": 3.126169973792587e-05, + "loss": 5.0203, + "step": 14120 + }, + { + "epoch": 1.130580892942871, + "grad_norm": 4.123785495758057, + "learning_rate": 3.1248328608867733e-05, + "loss": 4.8747, + "step": 14130 + }, + { + "epoch": 1.1313810209633541, + "grad_norm": 2.67256498336792, + "learning_rate": 3.1234957479809596e-05, + "loss": 5.0739, + "step": 14140 + }, + { + "epoch": 1.1321811489838374, + "grad_norm": 3.181354284286499, + "learning_rate": 3.122158635075146e-05, + "loss": 5.0416, + 
"step": 14150 + }, + { + "epoch": 1.1329812770043206, + "grad_norm": 3.475081443786621, + "learning_rate": 3.120821522169332e-05, + "loss": 5.126, + "step": 14160 + }, + { + "epoch": 1.1337814050248038, + "grad_norm": 3.9857749938964844, + "learning_rate": 3.1194844092635184e-05, + "loss": 5.088, + "step": 14170 + }, + { + "epoch": 1.1345815330452873, + "grad_norm": 2.8515474796295166, + "learning_rate": 3.118147296357705e-05, + "loss": 5.0137, + "step": 14180 + }, + { + "epoch": 1.1353816610657705, + "grad_norm": 3.1561362743377686, + "learning_rate": 3.116810183451891e-05, + "loss": 5.1099, + "step": 14190 + }, + { + "epoch": 1.1361817890862538, + "grad_norm": 3.4241456985473633, + "learning_rate": 3.115473070546077e-05, + "loss": 5.2076, + "step": 14200 + }, + { + "epoch": 1.136981917106737, + "grad_norm": 5.489968776702881, + "learning_rate": 3.1141359576402635e-05, + "loss": 5.0493, + "step": 14210 + }, + { + "epoch": 1.1377820451272203, + "grad_norm": 4.867628574371338, + "learning_rate": 3.11279884473445e-05, + "loss": 5.0169, + "step": 14220 + }, + { + "epoch": 1.1385821731477037, + "grad_norm": 4.413990497589111, + "learning_rate": 3.111461731828636e-05, + "loss": 5.227, + "step": 14230 + }, + { + "epoch": 1.139382301168187, + "grad_norm": 2.8556582927703857, + "learning_rate": 3.110124618922822e-05, + "loss": 5.0318, + "step": 14240 + }, + { + "epoch": 1.1401824291886702, + "grad_norm": 3.325246572494507, + "learning_rate": 3.108787506017008e-05, + "loss": 5.1337, + "step": 14250 + }, + { + "epoch": 1.1409825572091534, + "grad_norm": 2.323495864868164, + "learning_rate": 3.107450393111194e-05, + "loss": 5.05, + "step": 14260 + }, + { + "epoch": 1.1417826852296367, + "grad_norm": 3.933109998703003, + "learning_rate": 3.1061132802053804e-05, + "loss": 5.0799, + "step": 14270 + }, + { + "epoch": 1.1425828132501201, + "grad_norm": 4.54990816116333, + "learning_rate": 3.104776167299567e-05, + "loss": 5.0432, + "step": 14280 + }, + { + "epoch": 
1.1433829412706034, + "grad_norm": 3.6891863346099854, + "learning_rate": 3.103439054393753e-05, + "loss": 5.281, + "step": 14290 + }, + { + "epoch": 1.1441830692910866, + "grad_norm": 3.3995590209960938, + "learning_rate": 3.102101941487939e-05, + "loss": 5.0141, + "step": 14300 + }, + { + "epoch": 1.1449831973115698, + "grad_norm": 3.7183680534362793, + "learning_rate": 3.1007648285821255e-05, + "loss": 5.1297, + "step": 14310 + }, + { + "epoch": 1.145783325332053, + "grad_norm": 4.312760353088379, + "learning_rate": 3.099427715676312e-05, + "loss": 5.1593, + "step": 14320 + }, + { + "epoch": 1.1465834533525365, + "grad_norm": 3.545175552368164, + "learning_rate": 3.098090602770498e-05, + "loss": 5.0797, + "step": 14330 + }, + { + "epoch": 1.1473835813730198, + "grad_norm": 2.8631224632263184, + "learning_rate": 3.096753489864684e-05, + "loss": 4.8917, + "step": 14340 + }, + { + "epoch": 1.148183709393503, + "grad_norm": 4.92164945602417, + "learning_rate": 3.0954163769588706e-05, + "loss": 5.1036, + "step": 14350 + }, + { + "epoch": 1.1489838374139862, + "grad_norm": 3.5669424533843994, + "learning_rate": 3.094079264053057e-05, + "loss": 5.0415, + "step": 14360 + }, + { + "epoch": 1.1497839654344695, + "grad_norm": 2.990910291671753, + "learning_rate": 3.092742151147243e-05, + "loss": 5.031, + "step": 14370 + }, + { + "epoch": 1.1505840934549527, + "grad_norm": 3.9870245456695557, + "learning_rate": 3.0914050382414294e-05, + "loss": 5.1785, + "step": 14380 + }, + { + "epoch": 1.151384221475436, + "grad_norm": 2.9064061641693115, + "learning_rate": 3.090067925335616e-05, + "loss": 4.8864, + "step": 14390 + }, + { + "epoch": 1.1521843494959194, + "grad_norm": 2.7341842651367188, + "learning_rate": 3.088730812429801e-05, + "loss": 5.0189, + "step": 14400 + }, + { + "epoch": 1.1529844775164027, + "grad_norm": 3.3455512523651123, + "learning_rate": 3.0873936995239875e-05, + "loss": 4.998, + "step": 14410 + }, + { + "epoch": 1.1537846055368859, + "grad_norm": 
3.155400276184082, + "learning_rate": 3.086056586618174e-05, + "loss": 4.9555, + "step": 14420 + }, + { + "epoch": 1.1545847335573691, + "grad_norm": 3.9464547634124756, + "learning_rate": 3.08471947371236e-05, + "loss": 4.9177, + "step": 14430 + }, + { + "epoch": 1.1553848615778524, + "grad_norm": 3.159940242767334, + "learning_rate": 3.083382360806546e-05, + "loss": 5.113, + "step": 14440 + }, + { + "epoch": 1.1561849895983358, + "grad_norm": 5.134779930114746, + "learning_rate": 3.0820452479007326e-05, + "loss": 5.054, + "step": 14450 + }, + { + "epoch": 1.156985117618819, + "grad_norm": 3.6196463108062744, + "learning_rate": 3.080708134994919e-05, + "loss": 5.0461, + "step": 14460 + }, + { + "epoch": 1.1577852456393023, + "grad_norm": 3.5549261569976807, + "learning_rate": 3.079371022089105e-05, + "loss": 5.0545, + "step": 14470 + }, + { + "epoch": 1.1585853736597855, + "grad_norm": 2.656663656234741, + "learning_rate": 3.0780339091832914e-05, + "loss": 5.0538, + "step": 14480 + }, + { + "epoch": 1.1593855016802688, + "grad_norm": 3.927999973297119, + "learning_rate": 3.076696796277478e-05, + "loss": 4.95, + "step": 14490 + }, + { + "epoch": 1.1601856297007522, + "grad_norm": 3.3732807636260986, + "learning_rate": 3.075359683371664e-05, + "loss": 5.1581, + "step": 14500 + }, + { + "epoch": 1.1609857577212355, + "grad_norm": 3.2441952228546143, + "learning_rate": 3.07402257046585e-05, + "loss": 4.9357, + "step": 14510 + }, + { + "epoch": 1.1617858857417187, + "grad_norm": 3.2283682823181152, + "learning_rate": 3.0726854575600365e-05, + "loss": 4.9143, + "step": 14520 + }, + { + "epoch": 1.162586013762202, + "grad_norm": 4.4284234046936035, + "learning_rate": 3.071348344654223e-05, + "loss": 5.0004, + "step": 14530 + }, + { + "epoch": 1.1633861417826852, + "grad_norm": 2.5761866569519043, + "learning_rate": 3.070011231748409e-05, + "loss": 5.08, + "step": 14540 + }, + { + "epoch": 1.1641862698031684, + "grad_norm": 3.1167516708374023, + "learning_rate": 
3.068674118842595e-05, + "loss": 5.0459, + "step": 14550 + }, + { + "epoch": 1.1649863978236519, + "grad_norm": 4.732173442840576, + "learning_rate": 3.067337005936781e-05, + "loss": 5.092, + "step": 14560 + }, + { + "epoch": 1.1657865258441351, + "grad_norm": 3.620969772338867, + "learning_rate": 3.065999893030967e-05, + "loss": 5.0032, + "step": 14570 + }, + { + "epoch": 1.1665866538646184, + "grad_norm": 4.062996864318848, + "learning_rate": 3.0646627801251534e-05, + "loss": 5.1082, + "step": 14580 + }, + { + "epoch": 1.1673867818851016, + "grad_norm": 4.451529502868652, + "learning_rate": 3.06332566721934e-05, + "loss": 4.7973, + "step": 14590 + }, + { + "epoch": 1.1681869099055848, + "grad_norm": 2.944892406463623, + "learning_rate": 3.061988554313526e-05, + "loss": 5.1265, + "step": 14600 + }, + { + "epoch": 1.168987037926068, + "grad_norm": 2.3681206703186035, + "learning_rate": 3.060651441407712e-05, + "loss": 5.036, + "step": 14610 + }, + { + "epoch": 1.1697871659465515, + "grad_norm": 3.83963680267334, + "learning_rate": 3.0593143285018985e-05, + "loss": 5.0027, + "step": 14620 + }, + { + "epoch": 1.1705872939670348, + "grad_norm": 2.4702982902526855, + "learning_rate": 3.057977215596085e-05, + "loss": 5.1936, + "step": 14630 + }, + { + "epoch": 1.171387421987518, + "grad_norm": 3.8022842407226562, + "learning_rate": 3.056640102690271e-05, + "loss": 4.9908, + "step": 14640 + }, + { + "epoch": 1.1721875500080012, + "grad_norm": 7.476479530334473, + "learning_rate": 3.055302989784457e-05, + "loss": 4.9345, + "step": 14650 + }, + { + "epoch": 1.1729876780284845, + "grad_norm": 4.326262474060059, + "learning_rate": 3.0539658768786436e-05, + "loss": 5.0183, + "step": 14660 + }, + { + "epoch": 1.173787806048968, + "grad_norm": 3.567706346511841, + "learning_rate": 3.05262876397283e-05, + "loss": 4.9711, + "step": 14670 + }, + { + "epoch": 1.1745879340694512, + "grad_norm": 5.6367011070251465, + "learning_rate": 3.0512916510670165e-05, + "loss": 5.2, + "step": 
14680 + }, + { + "epoch": 1.1753880620899344, + "grad_norm": 3.228248357772827, + "learning_rate": 3.0499545381612027e-05, + "loss": 5.0736, + "step": 14690 + }, + { + "epoch": 1.1761881901104176, + "grad_norm": 3.1230525970458984, + "learning_rate": 3.048617425255389e-05, + "loss": 5.046, + "step": 14700 + }, + { + "epoch": 1.1769883181309009, + "grad_norm": 3.1620965003967285, + "learning_rate": 3.0472803123495746e-05, + "loss": 5.2402, + "step": 14710 + }, + { + "epoch": 1.1777884461513843, + "grad_norm": 2.9617507457733154, + "learning_rate": 3.045943199443761e-05, + "loss": 4.9144, + "step": 14720 + }, + { + "epoch": 1.1785885741718676, + "grad_norm": 4.437487602233887, + "learning_rate": 3.044606086537947e-05, + "loss": 5.1531, + "step": 14730 + }, + { + "epoch": 1.1793887021923508, + "grad_norm": 4.128335952758789, + "learning_rate": 3.0432689736321334e-05, + "loss": 5.152, + "step": 14740 + }, + { + "epoch": 1.180188830212834, + "grad_norm": 3.35322642326355, + "learning_rate": 3.0419318607263197e-05, + "loss": 5.1341, + "step": 14750 + }, + { + "epoch": 1.1809889582333173, + "grad_norm": 3.4951529502868652, + "learning_rate": 3.040594747820506e-05, + "loss": 5.1443, + "step": 14760 + }, + { + "epoch": 1.1817890862538005, + "grad_norm": 3.2248058319091797, + "learning_rate": 3.0392576349146922e-05, + "loss": 5.221, + "step": 14770 + }, + { + "epoch": 1.1825892142742838, + "grad_norm": 4.069246292114258, + "learning_rate": 3.0379205220088785e-05, + "loss": 5.1644, + "step": 14780 + }, + { + "epoch": 1.1833893422947672, + "grad_norm": 3.8001856803894043, + "learning_rate": 3.0365834091030647e-05, + "loss": 5.1899, + "step": 14790 + }, + { + "epoch": 1.1841894703152505, + "grad_norm": 2.359663248062134, + "learning_rate": 3.035246296197251e-05, + "loss": 5.1497, + "step": 14800 + }, + { + "epoch": 1.1849895983357337, + "grad_norm": 2.8918564319610596, + "learning_rate": 3.0339091832914373e-05, + "loss": 5.0774, + "step": 14810 + }, + { + "epoch": 
1.185789726356217, + "grad_norm": 2.6341822147369385, + "learning_rate": 3.0325720703856235e-05, + "loss": 5.0062, + "step": 14820 + }, + { + "epoch": 1.1865898543767002, + "grad_norm": 2.6374053955078125, + "learning_rate": 3.0312349574798098e-05, + "loss": 5.0424, + "step": 14830 + }, + { + "epoch": 1.1873899823971836, + "grad_norm": 3.653303623199463, + "learning_rate": 3.029897844573996e-05, + "loss": 5.1633, + "step": 14840 + }, + { + "epoch": 1.1881901104176669, + "grad_norm": 3.1818668842315674, + "learning_rate": 3.0286944429587637e-05, + "loss": 5.0724, + "step": 14850 + }, + { + "epoch": 1.18899023843815, + "grad_norm": 3.803523540496826, + "learning_rate": 3.02735733005295e-05, + "loss": 5.0358, + "step": 14860 + }, + { + "epoch": 1.1897903664586333, + "grad_norm": 2.8528194427490234, + "learning_rate": 3.026020217147136e-05, + "loss": 5.2583, + "step": 14870 + }, + { + "epoch": 1.1905904944791166, + "grad_norm": 3.8696224689483643, + "learning_rate": 3.024683104241322e-05, + "loss": 5.1876, + "step": 14880 + }, + { + "epoch": 1.1913906224996, + "grad_norm": 2.9173583984375, + "learning_rate": 3.0233459913355084e-05, + "loss": 5.0444, + "step": 14890 + }, + { + "epoch": 1.1921907505200833, + "grad_norm": 3.349648952484131, + "learning_rate": 3.0220088784296947e-05, + "loss": 5.1046, + "step": 14900 + }, + { + "epoch": 1.1929908785405665, + "grad_norm": 3.1171865463256836, + "learning_rate": 3.020671765523881e-05, + "loss": 5.1088, + "step": 14910 + }, + { + "epoch": 1.1937910065610498, + "grad_norm": 3.1871280670166016, + "learning_rate": 3.0193346526180672e-05, + "loss": 4.9422, + "step": 14920 + }, + { + "epoch": 1.194591134581533, + "grad_norm": 5.893035888671875, + "learning_rate": 3.0179975397122535e-05, + "loss": 5.1122, + "step": 14930 + }, + { + "epoch": 1.1953912626020164, + "grad_norm": 2.8081605434417725, + "learning_rate": 3.0166604268064397e-05, + "loss": 5.0883, + "step": 14940 + }, + { + "epoch": 1.1961913906224997, + "grad_norm": 
3.0510361194610596, + "learning_rate": 3.015323313900626e-05, + "loss": 5.1283, + "step": 14950 + }, + { + "epoch": 1.196991518642983, + "grad_norm": 4.671666622161865, + "learning_rate": 3.0139862009948123e-05, + "loss": 5.0935, + "step": 14960 + }, + { + "epoch": 1.1977916466634662, + "grad_norm": 4.178012847900391, + "learning_rate": 3.0126490880889985e-05, + "loss": 5.0072, + "step": 14970 + }, + { + "epoch": 1.1985917746839494, + "grad_norm": 3.84997820854187, + "learning_rate": 3.0113119751831848e-05, + "loss": 5.01, + "step": 14980 + }, + { + "epoch": 1.1993919027044326, + "grad_norm": 3.262110710144043, + "learning_rate": 3.009974862277371e-05, + "loss": 5.1383, + "step": 14990 + }, + { + "epoch": 1.2001920307249159, + "grad_norm": 2.7314374446868896, + "learning_rate": 3.0086377493715573e-05, + "loss": 5.0762, + "step": 15000 + }, + { + "epoch": 1.2009921587453993, + "grad_norm": 2.8299612998962402, + "learning_rate": 3.0073006364657436e-05, + "loss": 5.0924, + "step": 15010 + }, + { + "epoch": 1.2017922867658826, + "grad_norm": 3.1154122352600098, + "learning_rate": 3.0059635235599292e-05, + "loss": 5.1659, + "step": 15020 + }, + { + "epoch": 1.2025924147863658, + "grad_norm": 3.96730375289917, + "learning_rate": 3.0046264106541155e-05, + "loss": 5.0078, + "step": 15030 + }, + { + "epoch": 1.203392542806849, + "grad_norm": 3.298128604888916, + "learning_rate": 3.0032892977483017e-05, + "loss": 4.9144, + "step": 15040 + }, + { + "epoch": 1.2041926708273323, + "grad_norm": 3.2130002975463867, + "learning_rate": 3.001952184842488e-05, + "loss": 5.0833, + "step": 15050 + }, + { + "epoch": 1.2049927988478157, + "grad_norm": 3.294297456741333, + "learning_rate": 3.0006150719366743e-05, + "loss": 5.0306, + "step": 15060 + }, + { + "epoch": 1.205792926868299, + "grad_norm": 4.316399574279785, + "learning_rate": 2.9992779590308606e-05, + "loss": 5.1813, + "step": 15070 + }, + { + "epoch": 1.2065930548887822, + "grad_norm": 2.5436484813690186, + "learning_rate": 
2.9979408461250468e-05, + "loss": 5.0249, + "step": 15080 + }, + { + "epoch": 1.2073931829092654, + "grad_norm": 3.144047737121582, + "learning_rate": 2.996603733219233e-05, + "loss": 5.1768, + "step": 15090 + }, + { + "epoch": 1.2081933109297487, + "grad_norm": 2.7458813190460205, + "learning_rate": 2.9952666203134194e-05, + "loss": 5.136, + "step": 15100 + }, + { + "epoch": 1.2089934389502321, + "grad_norm": 5.8104095458984375, + "learning_rate": 2.9939295074076056e-05, + "loss": 5.0067, + "step": 15110 + }, + { + "epoch": 1.2097935669707154, + "grad_norm": 4.127492904663086, + "learning_rate": 2.992592394501792e-05, + "loss": 5.1984, + "step": 15120 + }, + { + "epoch": 1.2105936949911986, + "grad_norm": 2.4212141036987305, + "learning_rate": 2.991255281595978e-05, + "loss": 5.0106, + "step": 15130 + }, + { + "epoch": 1.2113938230116819, + "grad_norm": 2.9509193897247314, + "learning_rate": 2.9899181686901644e-05, + "loss": 5.3269, + "step": 15140 + }, + { + "epoch": 1.212193951032165, + "grad_norm": 3.008227825164795, + "learning_rate": 2.9885810557843507e-05, + "loss": 5.2664, + "step": 15150 + }, + { + "epoch": 1.2129940790526483, + "grad_norm": 4.015718936920166, + "learning_rate": 2.987243942878537e-05, + "loss": 4.9637, + "step": 15160 + }, + { + "epoch": 1.2137942070731318, + "grad_norm": 4.144075870513916, + "learning_rate": 2.9859068299727232e-05, + "loss": 5.2009, + "step": 15170 + }, + { + "epoch": 1.214594335093615, + "grad_norm": 2.7135703563690186, + "learning_rate": 2.9845697170669092e-05, + "loss": 5.0274, + "step": 15180 + }, + { + "epoch": 1.2153944631140983, + "grad_norm": 2.703979015350342, + "learning_rate": 2.9832326041610954e-05, + "loss": 4.9492, + "step": 15190 + }, + { + "epoch": 1.2161945911345815, + "grad_norm": 3.669940948486328, + "learning_rate": 2.9818954912552817e-05, + "loss": 5.1698, + "step": 15200 + }, + { + "epoch": 1.2169947191550647, + "grad_norm": 3.5432851314544678, + "learning_rate": 2.980558378349468e-05, + "loss": 
5.0233, + "step": 15210 + }, + { + "epoch": 1.217794847175548, + "grad_norm": 7.245898723602295, + "learning_rate": 2.9792212654436542e-05, + "loss": 5.0977, + "step": 15220 + }, + { + "epoch": 1.2185949751960314, + "grad_norm": 3.1283178329467773, + "learning_rate": 2.9778841525378405e-05, + "loss": 4.9224, + "step": 15230 + }, + { + "epoch": 1.2193951032165147, + "grad_norm": 4.550083160400391, + "learning_rate": 2.9765470396320268e-05, + "loss": 5.1493, + "step": 15240 + }, + { + "epoch": 1.220195231236998, + "grad_norm": 2.429172992706299, + "learning_rate": 2.975209926726213e-05, + "loss": 4.8708, + "step": 15250 + }, + { + "epoch": 1.2209953592574811, + "grad_norm": 3.3362350463867188, + "learning_rate": 2.9738728138203993e-05, + "loss": 5.1264, + "step": 15260 + }, + { + "epoch": 1.2217954872779644, + "grad_norm": 3.737426996231079, + "learning_rate": 2.9725357009145856e-05, + "loss": 5.0933, + "step": 15270 + }, + { + "epoch": 1.2225956152984478, + "grad_norm": 3.679044723510742, + "learning_rate": 2.971198588008772e-05, + "loss": 5.1628, + "step": 15280 + }, + { + "epoch": 1.223395743318931, + "grad_norm": 3.072758436203003, + "learning_rate": 2.969861475102958e-05, + "loss": 5.0404, + "step": 15290 + }, + { + "epoch": 1.2241958713394143, + "grad_norm": 2.881199598312378, + "learning_rate": 2.9685243621971444e-05, + "loss": 5.1415, + "step": 15300 + }, + { + "epoch": 1.2249959993598976, + "grad_norm": 2.628080129623413, + "learning_rate": 2.9671872492913307e-05, + "loss": 4.8588, + "step": 15310 + }, + { + "epoch": 1.2257961273803808, + "grad_norm": 3.700045108795166, + "learning_rate": 2.965850136385517e-05, + "loss": 4.8126, + "step": 15320 + }, + { + "epoch": 1.2265962554008643, + "grad_norm": 3.386517286300659, + "learning_rate": 2.9645130234797025e-05, + "loss": 5.0336, + "step": 15330 + }, + { + "epoch": 1.2273963834213475, + "grad_norm": 3.7533726692199707, + "learning_rate": 2.9631759105738888e-05, + "loss": 4.9332, + "step": 15340 + }, + { + 
"epoch": 1.2281965114418307, + "grad_norm": 3.1511549949645996, + "learning_rate": 2.961838797668075e-05, + "loss": 5.1175, + "step": 15350 + }, + { + "epoch": 1.228996639462314, + "grad_norm": 2.8970258235931396, + "learning_rate": 2.9605016847622613e-05, + "loss": 4.9315, + "step": 15360 + }, + { + "epoch": 1.2297967674827972, + "grad_norm": 4.416659355163574, + "learning_rate": 2.9591645718564476e-05, + "loss": 4.9091, + "step": 15370 + }, + { + "epoch": 1.2305968955032804, + "grad_norm": 3.8760738372802734, + "learning_rate": 2.957827458950634e-05, + "loss": 5.0697, + "step": 15380 + }, + { + "epoch": 1.2313970235237637, + "grad_norm": 2.789149761199951, + "learning_rate": 2.95649034604482e-05, + "loss": 5.129, + "step": 15390 + }, + { + "epoch": 1.2321971515442471, + "grad_norm": 2.7208750247955322, + "learning_rate": 2.9551532331390064e-05, + "loss": 5.1578, + "step": 15400 + }, + { + "epoch": 1.2329972795647304, + "grad_norm": 3.365661144256592, + "learning_rate": 2.9538161202331927e-05, + "loss": 5.0808, + "step": 15410 + }, + { + "epoch": 1.2337974075852136, + "grad_norm": 3.0695741176605225, + "learning_rate": 2.952479007327379e-05, + "loss": 4.7112, + "step": 15420 + }, + { + "epoch": 1.2345975356056968, + "grad_norm": 3.3540539741516113, + "learning_rate": 2.9511418944215652e-05, + "loss": 5.0135, + "step": 15430 + }, + { + "epoch": 1.23539766362618, + "grad_norm": 3.529069185256958, + "learning_rate": 2.9498047815157515e-05, + "loss": 5.001, + "step": 15440 + }, + { + "epoch": 1.2361977916466635, + "grad_norm": 2.7868266105651855, + "learning_rate": 2.9484676686099378e-05, + "loss": 5.0339, + "step": 15450 + }, + { + "epoch": 1.2369979196671468, + "grad_norm": 4.2665839195251465, + "learning_rate": 2.947130555704124e-05, + "loss": 5.0819, + "step": 15460 + }, + { + "epoch": 1.23779804768763, + "grad_norm": 3.5087594985961914, + "learning_rate": 2.9457934427983103e-05, + "loss": 5.1543, + "step": 15470 + }, + { + "epoch": 1.2385981757081133, + 
"grad_norm": 2.8320281505584717, + "learning_rate": 2.9444563298924966e-05, + "loss": 5.0284, + "step": 15480 + }, + { + "epoch": 1.2393983037285965, + "grad_norm": 3.7067158222198486, + "learning_rate": 2.943119216986682e-05, + "loss": 5.1457, + "step": 15490 + }, + { + "epoch": 1.24019843174908, + "grad_norm": 2.533123016357422, + "learning_rate": 2.9417821040808684e-05, + "loss": 4.9353, + "step": 15500 + }, + { + "epoch": 1.2409985597695632, + "grad_norm": 5.649012565612793, + "learning_rate": 2.9404449911750547e-05, + "loss": 4.9736, + "step": 15510 + }, + { + "epoch": 1.2417986877900464, + "grad_norm": 2.7992377281188965, + "learning_rate": 2.939107878269241e-05, + "loss": 5.0501, + "step": 15520 + }, + { + "epoch": 1.2425988158105297, + "grad_norm": 4.733801364898682, + "learning_rate": 2.9377707653634272e-05, + "loss": 5.0347, + "step": 15530 + }, + { + "epoch": 1.243398943831013, + "grad_norm": 3.681401014328003, + "learning_rate": 2.9364336524576135e-05, + "loss": 5.0521, + "step": 15540 + }, + { + "epoch": 1.2441990718514964, + "grad_norm": 3.267540693283081, + "learning_rate": 2.9350965395517998e-05, + "loss": 5.1938, + "step": 15550 + }, + { + "epoch": 1.2449991998719796, + "grad_norm": 3.616941452026367, + "learning_rate": 2.933759426645986e-05, + "loss": 4.959, + "step": 15560 + }, + { + "epoch": 1.2457993278924628, + "grad_norm": 2.2835965156555176, + "learning_rate": 2.9324223137401723e-05, + "loss": 5.1389, + "step": 15570 + }, + { + "epoch": 1.246599455912946, + "grad_norm": 4.935213565826416, + "learning_rate": 2.9310852008343586e-05, + "loss": 5.0684, + "step": 15580 + }, + { + "epoch": 1.2473995839334293, + "grad_norm": 2.770784616470337, + "learning_rate": 2.929748087928545e-05, + "loss": 5.0231, + "step": 15590 + }, + { + "epoch": 1.2481997119539125, + "grad_norm": 3.6176304817199707, + "learning_rate": 2.928410975022731e-05, + "loss": 5.1103, + "step": 15600 + }, + { + "epoch": 1.2489998399743958, + "grad_norm": 2.7759737968444824, + 
"learning_rate": 2.9270738621169174e-05, + "loss": 5.2312, + "step": 15610 + }, + { + "epoch": 1.2497999679948792, + "grad_norm": 3.1174516677856445, + "learning_rate": 2.9257367492111037e-05, + "loss": 5.1001, + "step": 15620 + }, + { + "epoch": 1.2506000960153625, + "grad_norm": 3.429516315460205, + "learning_rate": 2.92439963630529e-05, + "loss": 4.9854, + "step": 15630 + }, + { + "epoch": 1.2514002240358457, + "grad_norm": 3.940547227859497, + "learning_rate": 2.923062523399476e-05, + "loss": 5.1451, + "step": 15640 + }, + { + "epoch": 1.252200352056329, + "grad_norm": 3.9238924980163574, + "learning_rate": 2.921725410493662e-05, + "loss": 5.0231, + "step": 15650 + }, + { + "epoch": 1.2530004800768122, + "grad_norm": 3.668210983276367, + "learning_rate": 2.9203882975878484e-05, + "loss": 4.8446, + "step": 15660 + }, + { + "epoch": 1.2538006080972957, + "grad_norm": 3.138932704925537, + "learning_rate": 2.9190511846820347e-05, + "loss": 5.1351, + "step": 15670 + }, + { + "epoch": 1.254600736117779, + "grad_norm": 2.6472814083099365, + "learning_rate": 2.917714071776221e-05, + "loss": 5.0462, + "step": 15680 + }, + { + "epoch": 1.2554008641382621, + "grad_norm": 3.304532766342163, + "learning_rate": 2.9163769588704072e-05, + "loss": 5.1271, + "step": 15690 + }, + { + "epoch": 1.2562009921587454, + "grad_norm": 3.478391170501709, + "learning_rate": 2.9150398459645935e-05, + "loss": 5.0127, + "step": 15700 + }, + { + "epoch": 1.2570011201792286, + "grad_norm": 2.867481231689453, + "learning_rate": 2.9137027330587797e-05, + "loss": 5.061, + "step": 15710 + }, + { + "epoch": 1.257801248199712, + "grad_norm": 3.4853031635284424, + "learning_rate": 2.912365620152966e-05, + "loss": 4.8585, + "step": 15720 + }, + { + "epoch": 1.2586013762201953, + "grad_norm": 2.6660521030426025, + "learning_rate": 2.9110285072471523e-05, + "loss": 4.906, + "step": 15730 + }, + { + "epoch": 1.2594015042406785, + "grad_norm": 4.05171012878418, + "learning_rate": 2.9096913943413385e-05, + 
"loss": 4.9971, + "step": 15740 + }, + { + "epoch": 1.2602016322611618, + "grad_norm": 2.448624849319458, + "learning_rate": 2.9083542814355248e-05, + "loss": 5.2265, + "step": 15750 + }, + { + "epoch": 1.261001760281645, + "grad_norm": 3.1989893913269043, + "learning_rate": 2.907017168529711e-05, + "loss": 5.0903, + "step": 15760 + }, + { + "epoch": 1.2618018883021285, + "grad_norm": 3.377147674560547, + "learning_rate": 2.9056800556238974e-05, + "loss": 4.9618, + "step": 15770 + }, + { + "epoch": 1.2626020163226115, + "grad_norm": 2.9969663619995117, + "learning_rate": 2.9043429427180836e-05, + "loss": 5.1172, + "step": 15780 + }, + { + "epoch": 1.263402144343095, + "grad_norm": 2.4437100887298584, + "learning_rate": 2.90300582981227e-05, + "loss": 4.9518, + "step": 15790 + }, + { + "epoch": 1.2642022723635782, + "grad_norm": 2.806199073791504, + "learning_rate": 2.9016687169064555e-05, + "loss": 5.0077, + "step": 15800 + }, + { + "epoch": 1.2650024003840614, + "grad_norm": 3.0119595527648926, + "learning_rate": 2.9003316040006417e-05, + "loss": 5.1922, + "step": 15810 + }, + { + "epoch": 1.2658025284045447, + "grad_norm": 4.022315502166748, + "learning_rate": 2.898994491094828e-05, + "loss": 5.0947, + "step": 15820 + }, + { + "epoch": 1.266602656425028, + "grad_norm": 3.117807388305664, + "learning_rate": 2.8976573781890143e-05, + "loss": 5.1026, + "step": 15830 + }, + { + "epoch": 1.2674027844455114, + "grad_norm": 5.5977935791015625, + "learning_rate": 2.8963202652832006e-05, + "loss": 4.9931, + "step": 15840 + }, + { + "epoch": 1.2682029124659946, + "grad_norm": 4.070805549621582, + "learning_rate": 2.8949831523773868e-05, + "loss": 5.0492, + "step": 15850 + }, + { + "epoch": 1.2690030404864778, + "grad_norm": 2.844937324523926, + "learning_rate": 2.893646039471573e-05, + "loss": 5.0453, + "step": 15860 + }, + { + "epoch": 1.269803168506961, + "grad_norm": 3.0533149242401123, + "learning_rate": 2.8923089265657594e-05, + "loss": 5.129, + "step": 15870 + }, + { 
+ "epoch": 1.2706032965274443, + "grad_norm": 4.0250043869018555, + "learning_rate": 2.8909718136599456e-05, + "loss": 4.9406, + "step": 15880 + }, + { + "epoch": 1.2714034245479278, + "grad_norm": 3.149026870727539, + "learning_rate": 2.889634700754132e-05, + "loss": 5.0056, + "step": 15890 + }, + { + "epoch": 1.272203552568411, + "grad_norm": 4.144321918487549, + "learning_rate": 2.8882975878483182e-05, + "loss": 4.9246, + "step": 15900 + }, + { + "epoch": 1.2730036805888942, + "grad_norm": 2.5918688774108887, + "learning_rate": 2.8869604749425044e-05, + "loss": 4.8537, + "step": 15910 + }, + { + "epoch": 1.2738038086093775, + "grad_norm": 2.5930793285369873, + "learning_rate": 2.8856233620366907e-05, + "loss": 4.9614, + "step": 15920 + }, + { + "epoch": 1.2746039366298607, + "grad_norm": 2.6094300746917725, + "learning_rate": 2.884286249130877e-05, + "loss": 4.9636, + "step": 15930 + }, + { + "epoch": 1.2754040646503442, + "grad_norm": 3.7304527759552, + "learning_rate": 2.8829491362250632e-05, + "loss": 5.0208, + "step": 15940 + }, + { + "epoch": 1.2762041926708274, + "grad_norm": 3.657022714614868, + "learning_rate": 2.881612023319249e-05, + "loss": 5.0232, + "step": 15950 + }, + { + "epoch": 1.2770043206913106, + "grad_norm": 2.3147027492523193, + "learning_rate": 2.880274910413435e-05, + "loss": 5.0987, + "step": 15960 + }, + { + "epoch": 1.2778044487117939, + "grad_norm": 2.9794461727142334, + "learning_rate": 2.8789377975076214e-05, + "loss": 5.0097, + "step": 15970 + }, + { + "epoch": 1.2786045767322771, + "grad_norm": 4.13859748840332, + "learning_rate": 2.8776006846018076e-05, + "loss": 5.0925, + "step": 15980 + }, + { + "epoch": 1.2794047047527606, + "grad_norm": 3.289497137069702, + "learning_rate": 2.876263571695994e-05, + "loss": 4.9943, + "step": 15990 + }, + { + "epoch": 1.2802048327732436, + "grad_norm": 4.03442907333374, + "learning_rate": 2.8749264587901802e-05, + "loss": 5.1402, + "step": 16000 + }, + { + "epoch": 1.281004960793727, + 
"grad_norm": 2.7994818687438965, + "learning_rate": 2.8735893458843665e-05, + "loss": 5.1223, + "step": 16010 + }, + { + "epoch": 1.2818050888142103, + "grad_norm": 2.323319673538208, + "learning_rate": 2.8722522329785527e-05, + "loss": 4.9936, + "step": 16020 + }, + { + "epoch": 1.2826052168346935, + "grad_norm": 3.254915952682495, + "learning_rate": 2.870915120072739e-05, + "loss": 4.9216, + "step": 16030 + }, + { + "epoch": 1.2834053448551768, + "grad_norm": 2.253689765930176, + "learning_rate": 2.8695780071669253e-05, + "loss": 4.9689, + "step": 16040 + }, + { + "epoch": 1.28420547287566, + "grad_norm": 2.4922399520874023, + "learning_rate": 2.8682408942611115e-05, + "loss": 5.0267, + "step": 16050 + }, + { + "epoch": 1.2850056008961435, + "grad_norm": 3.0966029167175293, + "learning_rate": 2.8669037813552978e-05, + "loss": 4.9785, + "step": 16060 + }, + { + "epoch": 1.2858057289166267, + "grad_norm": 3.88032603263855, + "learning_rate": 2.865566668449484e-05, + "loss": 4.8793, + "step": 16070 + }, + { + "epoch": 1.28660585693711, + "grad_norm": 3.152989625930786, + "learning_rate": 2.8642295555436703e-05, + "loss": 5.0864, + "step": 16080 + }, + { + "epoch": 1.2874059849575932, + "grad_norm": 2.652728796005249, + "learning_rate": 2.8628924426378566e-05, + "loss": 5.0895, + "step": 16090 + }, + { + "epoch": 1.2882061129780764, + "grad_norm": 3.099534273147583, + "learning_rate": 2.8615553297320425e-05, + "loss": 5.245, + "step": 16100 + }, + { + "epoch": 1.2890062409985599, + "grad_norm": 3.588867664337158, + "learning_rate": 2.8602182168262288e-05, + "loss": 5.0127, + "step": 16110 + }, + { + "epoch": 1.289806369019043, + "grad_norm": 3.4451441764831543, + "learning_rate": 2.858881103920415e-05, + "loss": 5.1003, + "step": 16120 + }, + { + "epoch": 1.2906064970395263, + "grad_norm": 3.004190683364868, + "learning_rate": 2.8575439910146013e-05, + "loss": 5.0977, + "step": 16130 + }, + { + "epoch": 1.2914066250600096, + "grad_norm": 2.912879467010498, + 
"learning_rate": 2.8562068781087876e-05, + "loss": 4.8822, + "step": 16140 + }, + { + "epoch": 1.2922067530804928, + "grad_norm": 3.078666925430298, + "learning_rate": 2.854869765202974e-05, + "loss": 4.9094, + "step": 16150 + }, + { + "epoch": 1.2930068811009763, + "grad_norm": 3.8617477416992188, + "learning_rate": 2.85353265229716e-05, + "loss": 5.0841, + "step": 16160 + }, + { + "epoch": 1.2938070091214593, + "grad_norm": 3.9073874950408936, + "learning_rate": 2.8521955393913464e-05, + "loss": 4.9613, + "step": 16170 + }, + { + "epoch": 1.2946071371419428, + "grad_norm": 4.929398059844971, + "learning_rate": 2.8508584264855327e-05, + "loss": 5.0763, + "step": 16180 + }, + { + "epoch": 1.295407265162426, + "grad_norm": 2.9813883304595947, + "learning_rate": 2.849521313579719e-05, + "loss": 5.0864, + "step": 16190 + }, + { + "epoch": 1.2962073931829092, + "grad_norm": 2.894916296005249, + "learning_rate": 2.8481842006739052e-05, + "loss": 5.0887, + "step": 16200 + }, + { + "epoch": 1.2970075212033925, + "grad_norm": 3.0193276405334473, + "learning_rate": 2.8468470877680915e-05, + "loss": 5.1709, + "step": 16210 + }, + { + "epoch": 1.2978076492238757, + "grad_norm": 2.6719863414764404, + "learning_rate": 2.8455099748622778e-05, + "loss": 5.1043, + "step": 16220 + }, + { + "epoch": 1.2986077772443592, + "grad_norm": 5.449126243591309, + "learning_rate": 2.844172861956464e-05, + "loss": 5.0786, + "step": 16230 + }, + { + "epoch": 1.2994079052648424, + "grad_norm": 3.498746633529663, + "learning_rate": 2.8428357490506503e-05, + "loss": 5.1796, + "step": 16240 + }, + { + "epoch": 1.3002080332853256, + "grad_norm": 3.3248443603515625, + "learning_rate": 2.8414986361448366e-05, + "loss": 5.0488, + "step": 16250 + }, + { + "epoch": 1.3010081613058089, + "grad_norm": 3.4345290660858154, + "learning_rate": 2.840161523239022e-05, + "loss": 5.0962, + "step": 16260 + }, + { + "epoch": 1.3018082893262921, + "grad_norm": 3.6769347190856934, + "learning_rate": 
2.8388244103332084e-05, + "loss": 5.1032, + "step": 16270 + }, + { + "epoch": 1.3026084173467756, + "grad_norm": 6.013178825378418, + "learning_rate": 2.8374872974273947e-05, + "loss": 5.2127, + "step": 16280 + }, + { + "epoch": 1.3034085453672588, + "grad_norm": 3.117189884185791, + "learning_rate": 2.836150184521581e-05, + "loss": 5.0172, + "step": 16290 + }, + { + "epoch": 1.304208673387742, + "grad_norm": 2.8342814445495605, + "learning_rate": 2.8348130716157672e-05, + "loss": 5.185, + "step": 16300 + }, + { + "epoch": 1.3050088014082253, + "grad_norm": 3.0346531867980957, + "learning_rate": 2.8334759587099535e-05, + "loss": 5.0422, + "step": 16310 + }, + { + "epoch": 1.3058089294287085, + "grad_norm": 2.4778048992156982, + "learning_rate": 2.8321388458041398e-05, + "loss": 5.2192, + "step": 16320 + }, + { + "epoch": 1.306609057449192, + "grad_norm": 2.3309547901153564, + "learning_rate": 2.830801732898326e-05, + "loss": 4.8922, + "step": 16330 + }, + { + "epoch": 1.3074091854696752, + "grad_norm": 5.362358093261719, + "learning_rate": 2.8294646199925123e-05, + "loss": 5.1077, + "step": 16340 + }, + { + "epoch": 1.3082093134901585, + "grad_norm": 3.8114402294158936, + "learning_rate": 2.8281275070866986e-05, + "loss": 5.1087, + "step": 16350 + }, + { + "epoch": 1.3090094415106417, + "grad_norm": 3.24176025390625, + "learning_rate": 2.826790394180885e-05, + "loss": 4.9285, + "step": 16360 + }, + { + "epoch": 1.309809569531125, + "grad_norm": 3.4344241619110107, + "learning_rate": 2.825453281275071e-05, + "loss": 5.0988, + "step": 16370 + }, + { + "epoch": 1.3106096975516084, + "grad_norm": 4.151029109954834, + "learning_rate": 2.8241161683692574e-05, + "loss": 5.027, + "step": 16380 + }, + { + "epoch": 1.3114098255720914, + "grad_norm": 3.6375935077667236, + "learning_rate": 2.8227790554634437e-05, + "loss": 4.9798, + "step": 16390 + }, + { + "epoch": 1.3122099535925749, + "grad_norm": 4.077208042144775, + "learning_rate": 2.82144194255763e-05, + "loss": 4.9757, 
+ "step": 16400 + }, + { + "epoch": 1.313010081613058, + "grad_norm": 2.053307056427002, + "learning_rate": 2.8201048296518155e-05, + "loss": 5.0139, + "step": 16410 + }, + { + "epoch": 1.3138102096335413, + "grad_norm": 3.136188268661499, + "learning_rate": 2.8187677167460018e-05, + "loss": 5.2215, + "step": 16420 + }, + { + "epoch": 1.3146103376540246, + "grad_norm": 3.371777296066284, + "learning_rate": 2.817430603840188e-05, + "loss": 5.1862, + "step": 16430 + }, + { + "epoch": 1.3154104656745078, + "grad_norm": 2.6995646953582764, + "learning_rate": 2.8160934909343743e-05, + "loss": 5.0416, + "step": 16440 + }, + { + "epoch": 1.3162105936949913, + "grad_norm": 4.669895172119141, + "learning_rate": 2.8147563780285606e-05, + "loss": 5.1037, + "step": 16450 + }, + { + "epoch": 1.3170107217154745, + "grad_norm": 3.6712257862091064, + "learning_rate": 2.813419265122747e-05, + "loss": 5.0958, + "step": 16460 + }, + { + "epoch": 1.3178108497359577, + "grad_norm": 3.1598026752471924, + "learning_rate": 2.812082152216933e-05, + "loss": 4.8109, + "step": 16470 + }, + { + "epoch": 1.318610977756441, + "grad_norm": 3.089665412902832, + "learning_rate": 2.8107450393111194e-05, + "loss": 5.1861, + "step": 16480 + }, + { + "epoch": 1.3194111057769242, + "grad_norm": 2.7782411575317383, + "learning_rate": 2.8094079264053057e-05, + "loss": 5.0381, + "step": 16490 + }, + { + "epoch": 1.3202112337974077, + "grad_norm": 2.675037145614624, + "learning_rate": 2.808070813499492e-05, + "loss": 5.0606, + "step": 16500 + }, + { + "epoch": 1.321011361817891, + "grad_norm": 2.1817705631256104, + "learning_rate": 2.8067337005936782e-05, + "loss": 4.9736, + "step": 16510 + }, + { + "epoch": 1.3218114898383742, + "grad_norm": 4.562685012817383, + "learning_rate": 2.8053965876878645e-05, + "loss": 5.2282, + "step": 16520 + }, + { + "epoch": 1.3226116178588574, + "grad_norm": 3.4051151275634766, + "learning_rate": 2.8040594747820507e-05, + "loss": 5.0466, + "step": 16530 + }, + { + "epoch": 
1.3234117458793406, + "grad_norm": 3.512916088104248, + "learning_rate": 2.802722361876237e-05, + "loss": 5.1473, + "step": 16540 + }, + { + "epoch": 1.324211873899824, + "grad_norm": 2.6192626953125, + "learning_rate": 2.8013852489704233e-05, + "loss": 4.9312, + "step": 16550 + }, + { + "epoch": 1.3250120019203073, + "grad_norm": 2.950780153274536, + "learning_rate": 2.8000481360646096e-05, + "loss": 5.0987, + "step": 16560 + }, + { + "epoch": 1.3258121299407906, + "grad_norm": 3.2263593673706055, + "learning_rate": 2.7987110231587955e-05, + "loss": 5.041, + "step": 16570 + }, + { + "epoch": 1.3266122579612738, + "grad_norm": 3.2121188640594482, + "learning_rate": 2.7973739102529818e-05, + "loss": 4.9174, + "step": 16580 + }, + { + "epoch": 1.327412385981757, + "grad_norm": 5.739203929901123, + "learning_rate": 2.796036797347168e-05, + "loss": 4.9011, + "step": 16590 + }, + { + "epoch": 1.3282125140022405, + "grad_norm": 3.4774510860443115, + "learning_rate": 2.7946996844413543e-05, + "loss": 4.87, + "step": 16600 + }, + { + "epoch": 1.3290126420227235, + "grad_norm": 3.5726733207702637, + "learning_rate": 2.7933625715355406e-05, + "loss": 4.9914, + "step": 16610 + }, + { + "epoch": 1.329812770043207, + "grad_norm": 6.456293106079102, + "learning_rate": 2.7920254586297268e-05, + "loss": 4.998, + "step": 16620 + }, + { + "epoch": 1.3306128980636902, + "grad_norm": 3.2161965370178223, + "learning_rate": 2.790688345723913e-05, + "loss": 5.1975, + "step": 16630 + }, + { + "epoch": 1.3314130260841734, + "grad_norm": 2.956698179244995, + "learning_rate": 2.7893512328180994e-05, + "loss": 4.8964, + "step": 16640 + }, + { + "epoch": 1.3322131541046567, + "grad_norm": 2.859546661376953, + "learning_rate": 2.7880141199122856e-05, + "loss": 4.9889, + "step": 16650 + }, + { + "epoch": 1.33301328212514, + "grad_norm": 2.9399912357330322, + "learning_rate": 2.786677007006472e-05, + "loss": 5.1039, + "step": 16660 + }, + { + "epoch": 1.3338134101456234, + "grad_norm": 
2.7161428928375244, + "learning_rate": 2.7853398941006582e-05, + "loss": 5.02, + "step": 16670 + }, + { + "epoch": 1.3346135381661066, + "grad_norm": 3.227506399154663, + "learning_rate": 2.7840027811948444e-05, + "loss": 4.859, + "step": 16680 + }, + { + "epoch": 1.3354136661865899, + "grad_norm": 2.404874086380005, + "learning_rate": 2.7826656682890307e-05, + "loss": 5.0912, + "step": 16690 + }, + { + "epoch": 1.336213794207073, + "grad_norm": 3.3921666145324707, + "learning_rate": 2.781328555383217e-05, + "loss": 5.1004, + "step": 16700 + }, + { + "epoch": 1.3370139222275563, + "grad_norm": 3.100717306137085, + "learning_rate": 2.7799914424774032e-05, + "loss": 5.0184, + "step": 16710 + }, + { + "epoch": 1.3378140502480398, + "grad_norm": 4.171880722045898, + "learning_rate": 2.778654329571589e-05, + "loss": 4.9575, + "step": 16720 + }, + { + "epoch": 1.338614178268523, + "grad_norm": 2.831758975982666, + "learning_rate": 2.777317216665775e-05, + "loss": 4.8927, + "step": 16730 + }, + { + "epoch": 1.3394143062890063, + "grad_norm": 3.1294634342193604, + "learning_rate": 2.7759801037599614e-05, + "loss": 5.023, + "step": 16740 + }, + { + "epoch": 1.3402144343094895, + "grad_norm": 3.0629208087921143, + "learning_rate": 2.7746429908541476e-05, + "loss": 5.1852, + "step": 16750 + }, + { + "epoch": 1.3410145623299727, + "grad_norm": 3.2801673412323, + "learning_rate": 2.773305877948334e-05, + "loss": 5.1116, + "step": 16760 + }, + { + "epoch": 1.3418146903504562, + "grad_norm": 3.6318020820617676, + "learning_rate": 2.7719687650425202e-05, + "loss": 4.8689, + "step": 16770 + }, + { + "epoch": 1.3426148183709394, + "grad_norm": 3.2776827812194824, + "learning_rate": 2.7706316521367065e-05, + "loss": 4.8672, + "step": 16780 + }, + { + "epoch": 1.3434149463914227, + "grad_norm": 2.7179038524627686, + "learning_rate": 2.7692945392308927e-05, + "loss": 5.0371, + "step": 16790 + }, + { + "epoch": 1.344215074411906, + "grad_norm": 5.088667869567871, + "learning_rate": 
2.767957426325079e-05, + "loss": 5.0763, + "step": 16800 + }, + { + "epoch": 1.3450152024323891, + "grad_norm": 4.196096420288086, + "learning_rate": 2.7666203134192653e-05, + "loss": 4.9903, + "step": 16810 + }, + { + "epoch": 1.3458153304528724, + "grad_norm": 2.5765233039855957, + "learning_rate": 2.7652832005134515e-05, + "loss": 5.0393, + "step": 16820 + }, + { + "epoch": 1.3466154584733556, + "grad_norm": 2.857628583908081, + "learning_rate": 2.7639460876076378e-05, + "loss": 5.1609, + "step": 16830 + }, + { + "epoch": 1.347415586493839, + "grad_norm": 4.593959808349609, + "learning_rate": 2.762608974701824e-05, + "loss": 5.3302, + "step": 16840 + }, + { + "epoch": 1.3482157145143223, + "grad_norm": 2.7411885261535645, + "learning_rate": 2.7612718617960103e-05, + "loss": 5.0989, + "step": 16850 + }, + { + "epoch": 1.3490158425348056, + "grad_norm": 3.586278200149536, + "learning_rate": 2.7599347488901966e-05, + "loss": 5.0393, + "step": 16860 + }, + { + "epoch": 1.3498159705552888, + "grad_norm": 4.603923797607422, + "learning_rate": 2.758597635984383e-05, + "loss": 5.1441, + "step": 16870 + }, + { + "epoch": 1.350616098575772, + "grad_norm": 5.242514133453369, + "learning_rate": 2.7572605230785685e-05, + "loss": 5.1488, + "step": 16880 + }, + { + "epoch": 1.3514162265962555, + "grad_norm": 5.148528575897217, + "learning_rate": 2.7559234101727547e-05, + "loss": 5.1064, + "step": 16890 + }, + { + "epoch": 1.3522163546167387, + "grad_norm": 3.759023904800415, + "learning_rate": 2.754586297266941e-05, + "loss": 4.9888, + "step": 16900 + }, + { + "epoch": 1.353016482637222, + "grad_norm": 3.1770131587982178, + "learning_rate": 2.7532491843611273e-05, + "loss": 4.972, + "step": 16910 + }, + { + "epoch": 1.3538166106577052, + "grad_norm": 3.5174386501312256, + "learning_rate": 2.7519120714553135e-05, + "loss": 5.1717, + "step": 16920 + }, + { + "epoch": 1.3546167386781884, + "grad_norm": 4.057755470275879, + "learning_rate": 2.7505749585494998e-05, + "loss": 
5.1299, + "step": 16930 + }, + { + "epoch": 1.355416866698672, + "grad_norm": 3.446735382080078, + "learning_rate": 2.749237845643686e-05, + "loss": 4.9329, + "step": 16940 + }, + { + "epoch": 1.3562169947191551, + "grad_norm": 3.140084743499756, + "learning_rate": 2.7479007327378724e-05, + "loss": 5.1446, + "step": 16950 + }, + { + "epoch": 1.3570171227396384, + "grad_norm": 3.435009241104126, + "learning_rate": 2.7465636198320586e-05, + "loss": 5.095, + "step": 16960 + }, + { + "epoch": 1.3578172507601216, + "grad_norm": 3.1479451656341553, + "learning_rate": 2.745226506926245e-05, + "loss": 4.9733, + "step": 16970 + }, + { + "epoch": 1.3586173787806048, + "grad_norm": 2.3184094429016113, + "learning_rate": 2.743889394020431e-05, + "loss": 5.0376, + "step": 16980 + }, + { + "epoch": 1.3594175068010883, + "grad_norm": 3.5338847637176514, + "learning_rate": 2.7425522811146174e-05, + "loss": 4.9706, + "step": 16990 + }, + { + "epoch": 1.3602176348215713, + "grad_norm": 3.1620054244995117, + "learning_rate": 2.7412151682088037e-05, + "loss": 5.0098, + "step": 17000 + }, + { + "epoch": 1.3610177628420548, + "grad_norm": 3.9371540546417236, + "learning_rate": 2.73987805530299e-05, + "loss": 5.2918, + "step": 17010 + }, + { + "epoch": 1.361817890862538, + "grad_norm": 3.9547667503356934, + "learning_rate": 2.7385409423971762e-05, + "loss": 4.9147, + "step": 17020 + }, + { + "epoch": 1.3626180188830213, + "grad_norm": 6.235637664794922, + "learning_rate": 2.737203829491362e-05, + "loss": 5.1119, + "step": 17030 + }, + { + "epoch": 1.3634181469035045, + "grad_norm": 7.610907077789307, + "learning_rate": 2.7358667165855484e-05, + "loss": 5.0062, + "step": 17040 + }, + { + "epoch": 1.3642182749239877, + "grad_norm": 2.9273130893707275, + "learning_rate": 2.7345296036797347e-05, + "loss": 5.1216, + "step": 17050 + }, + { + "epoch": 1.3650184029444712, + "grad_norm": 3.5307564735412598, + "learning_rate": 2.733192490773921e-05, + "loss": 5.283, + "step": 17060 + }, + { + 
"epoch": 1.3658185309649544, + "grad_norm": 4.250466346740723, + "learning_rate": 2.7318553778681072e-05, + "loss": 5.1638, + "step": 17070 + }, + { + "epoch": 1.3666186589854377, + "grad_norm": 3.930469512939453, + "learning_rate": 2.7305182649622935e-05, + "loss": 5.0906, + "step": 17080 + }, + { + "epoch": 1.367418787005921, + "grad_norm": 2.873779773712158, + "learning_rate": 2.7291811520564798e-05, + "loss": 5.106, + "step": 17090 + }, + { + "epoch": 1.3682189150264041, + "grad_norm": 3.261646270751953, + "learning_rate": 2.727844039150666e-05, + "loss": 4.9115, + "step": 17100 + }, + { + "epoch": 1.3690190430468876, + "grad_norm": 3.0499074459075928, + "learning_rate": 2.7265069262448523e-05, + "loss": 5.035, + "step": 17110 + }, + { + "epoch": 1.3698191710673708, + "grad_norm": 3.5967090129852295, + "learning_rate": 2.7251698133390386e-05, + "loss": 5.0455, + "step": 17120 + }, + { + "epoch": 1.370619299087854, + "grad_norm": 5.134222507476807, + "learning_rate": 2.723832700433225e-05, + "loss": 5.0828, + "step": 17130 + }, + { + "epoch": 1.3714194271083373, + "grad_norm": 2.415818929672241, + "learning_rate": 2.722495587527411e-05, + "loss": 4.9838, + "step": 17140 + }, + { + "epoch": 1.3722195551288205, + "grad_norm": 3.8739757537841797, + "learning_rate": 2.7211584746215974e-05, + "loss": 5.145, + "step": 17150 + }, + { + "epoch": 1.373019683149304, + "grad_norm": 3.7621076107025146, + "learning_rate": 2.7198213617157837e-05, + "loss": 5.0735, + "step": 17160 + }, + { + "epoch": 1.3738198111697872, + "grad_norm": 4.789724349975586, + "learning_rate": 2.71848424880997e-05, + "loss": 4.9842, + "step": 17170 + }, + { + "epoch": 1.3746199391902705, + "grad_norm": 2.3499200344085693, + "learning_rate": 2.7171471359041562e-05, + "loss": 5.1707, + "step": 17180 + }, + { + "epoch": 1.3754200672107537, + "grad_norm": 3.083209753036499, + "learning_rate": 2.7158100229983418e-05, + "loss": 5.1676, + "step": 17190 + }, + { + "epoch": 1.376220195231237, + "grad_norm": 
3.0781798362731934, + "learning_rate": 2.714472910092528e-05, + "loss": 4.9164, + "step": 17200 + }, + { + "epoch": 1.3770203232517204, + "grad_norm": 2.9193978309631348, + "learning_rate": 2.7131357971867143e-05, + "loss": 4.9213, + "step": 17210 + }, + { + "epoch": 1.3778204512722034, + "grad_norm": 5.844182014465332, + "learning_rate": 2.7117986842809006e-05, + "loss": 5.2283, + "step": 17220 + }, + { + "epoch": 1.3786205792926869, + "grad_norm": 2.885617733001709, + "learning_rate": 2.710461571375087e-05, + "loss": 4.9839, + "step": 17230 + }, + { + "epoch": 1.3794207073131701, + "grad_norm": 2.9228098392486572, + "learning_rate": 2.709124458469273e-05, + "loss": 5.1437, + "step": 17240 + }, + { + "epoch": 1.3802208353336534, + "grad_norm": 2.6880314350128174, + "learning_rate": 2.7077873455634594e-05, + "loss": 5.0111, + "step": 17250 + }, + { + "epoch": 1.3810209633541366, + "grad_norm": 4.601352691650391, + "learning_rate": 2.7064502326576457e-05, + "loss": 4.9988, + "step": 17260 + }, + { + "epoch": 1.3818210913746198, + "grad_norm": 4.172126293182373, + "learning_rate": 2.705113119751832e-05, + "loss": 5.022, + "step": 17270 + }, + { + "epoch": 1.3826212193951033, + "grad_norm": 2.526599407196045, + "learning_rate": 2.7037760068460182e-05, + "loss": 5.138, + "step": 17280 + }, + { + "epoch": 1.3834213474155865, + "grad_norm": 2.531637191772461, + "learning_rate": 2.7024388939402045e-05, + "loss": 4.9327, + "step": 17290 + }, + { + "epoch": 1.3842214754360698, + "grad_norm": 2.816145896911621, + "learning_rate": 2.7011017810343907e-05, + "loss": 5.0343, + "step": 17300 + }, + { + "epoch": 1.385021603456553, + "grad_norm": 3.2603354454040527, + "learning_rate": 2.699764668128577e-05, + "loss": 5.0129, + "step": 17310 + }, + { + "epoch": 1.3858217314770362, + "grad_norm": 2.7474968433380127, + "learning_rate": 2.6984275552227633e-05, + "loss": 5.0807, + "step": 17320 + }, + { + "epoch": 1.3866218594975197, + "grad_norm": 3.539966344833374, + "learning_rate": 
2.6970904423169496e-05, + "loss": 5.0476, + "step": 17330 + }, + { + "epoch": 1.387421987518003, + "grad_norm": 3.2417261600494385, + "learning_rate": 2.6957533294111355e-05, + "loss": 4.9799, + "step": 17340 + }, + { + "epoch": 1.3882221155384862, + "grad_norm": 3.1689510345458984, + "learning_rate": 2.6944162165053218e-05, + "loss": 5.131, + "step": 17350 + }, + { + "epoch": 1.3890222435589694, + "grad_norm": 3.3699281215667725, + "learning_rate": 2.693079103599508e-05, + "loss": 4.9792, + "step": 17360 + }, + { + "epoch": 1.3898223715794527, + "grad_norm": 3.8069379329681396, + "learning_rate": 2.6917419906936943e-05, + "loss": 5.2615, + "step": 17370 + }, + { + "epoch": 1.3906224995999361, + "grad_norm": 3.1969141960144043, + "learning_rate": 2.6904048777878806e-05, + "loss": 4.8507, + "step": 17380 + }, + { + "epoch": 1.3914226276204193, + "grad_norm": 2.7576959133148193, + "learning_rate": 2.689067764882067e-05, + "loss": 5.1096, + "step": 17390 + }, + { + "epoch": 1.3922227556409026, + "grad_norm": 4.1617608070373535, + "learning_rate": 2.687730651976253e-05, + "loss": 4.9802, + "step": 17400 + }, + { + "epoch": 1.3930228836613858, + "grad_norm": 4.2402567863464355, + "learning_rate": 2.6863935390704394e-05, + "loss": 5.1928, + "step": 17410 + }, + { + "epoch": 1.393823011681869, + "grad_norm": 5.131753921508789, + "learning_rate": 2.6850564261646256e-05, + "loss": 5.099, + "step": 17420 + }, + { + "epoch": 1.3946231397023523, + "grad_norm": 2.932196617126465, + "learning_rate": 2.683719313258812e-05, + "loss": 4.8787, + "step": 17430 + }, + { + "epoch": 1.3954232677228355, + "grad_norm": 3.221860885620117, + "learning_rate": 2.6823822003529982e-05, + "loss": 5.2802, + "step": 17440 + }, + { + "epoch": 1.396223395743319, + "grad_norm": 3.539557456970215, + "learning_rate": 2.6810450874471844e-05, + "loss": 4.8109, + "step": 17450 + }, + { + "epoch": 1.3970235237638022, + "grad_norm": 4.26516056060791, + "learning_rate": 2.6797079745413707e-05, + "loss": 
5.1081, + "step": 17460 + }, + { + "epoch": 1.3978236517842855, + "grad_norm": 6.4403605461120605, + "learning_rate": 2.678370861635557e-05, + "loss": 5.1178, + "step": 17470 + }, + { + "epoch": 1.3986237798047687, + "grad_norm": 3.0701773166656494, + "learning_rate": 2.6770337487297433e-05, + "loss": 5.1088, + "step": 17480 + }, + { + "epoch": 1.399423907825252, + "grad_norm": 3.2516419887542725, + "learning_rate": 2.6756966358239295e-05, + "loss": 5.0524, + "step": 17490 + }, + { + "epoch": 1.4002240358457354, + "grad_norm": 3.4712700843811035, + "learning_rate": 2.674359522918115e-05, + "loss": 4.9977, + "step": 17500 + }, + { + "epoch": 1.4010241638662186, + "grad_norm": 3.4102516174316406, + "learning_rate": 2.6730224100123014e-05, + "loss": 4.8785, + "step": 17510 + }, + { + "epoch": 1.4018242918867019, + "grad_norm": 3.1689910888671875, + "learning_rate": 2.6716852971064877e-05, + "loss": 5.2968, + "step": 17520 + }, + { + "epoch": 1.4026244199071851, + "grad_norm": 3.560192823410034, + "learning_rate": 2.670348184200674e-05, + "loss": 5.1994, + "step": 17530 + }, + { + "epoch": 1.4034245479276684, + "grad_norm": 4.837986469268799, + "learning_rate": 2.6690110712948602e-05, + "loss": 4.913, + "step": 17540 + }, + { + "epoch": 1.4042246759481518, + "grad_norm": 3.8842897415161133, + "learning_rate": 2.6676739583890465e-05, + "loss": 5.1379, + "step": 17550 + }, + { + "epoch": 1.405024803968635, + "grad_norm": 2.939554452896118, + "learning_rate": 2.6663368454832327e-05, + "loss": 4.9999, + "step": 17560 + }, + { + "epoch": 1.4058249319891183, + "grad_norm": 3.137080192565918, + "learning_rate": 2.664999732577419e-05, + "loss": 4.9539, + "step": 17570 + }, + { + "epoch": 1.4066250600096015, + "grad_norm": 3.7876265048980713, + "learning_rate": 2.6636626196716053e-05, + "loss": 5.0494, + "step": 17580 + }, + { + "epoch": 1.4074251880300848, + "grad_norm": 3.8008368015289307, + "learning_rate": 2.6623255067657915e-05, + "loss": 5.0823, + "step": 17590 + }, + { + 
"epoch": 1.4082253160505682, + "grad_norm": 2.5244452953338623, + "learning_rate": 2.6609883938599778e-05, + "loss": 5.0717, + "step": 17600 + }, + { + "epoch": 1.4090254440710512, + "grad_norm": 2.8249638080596924, + "learning_rate": 2.659651280954164e-05, + "loss": 4.8676, + "step": 17610 + }, + { + "epoch": 1.4098255720915347, + "grad_norm": 4.564566135406494, + "learning_rate": 2.6583141680483503e-05, + "loss": 5.1186, + "step": 17620 + }, + { + "epoch": 1.410625700112018, + "grad_norm": 3.5436971187591553, + "learning_rate": 2.6569770551425366e-05, + "loss": 4.9887, + "step": 17630 + }, + { + "epoch": 1.4114258281325012, + "grad_norm": 3.748399019241333, + "learning_rate": 2.655639942236723e-05, + "loss": 5.0972, + "step": 17640 + }, + { + "epoch": 1.4122259561529844, + "grad_norm": 3.347356081008911, + "learning_rate": 2.6543028293309085e-05, + "loss": 5.1476, + "step": 17650 + }, + { + "epoch": 1.4130260841734676, + "grad_norm": 4.2582807540893555, + "learning_rate": 2.6529657164250947e-05, + "loss": 4.893, + "step": 17660 + }, + { + "epoch": 1.413826212193951, + "grad_norm": 3.092129945755005, + "learning_rate": 2.651628603519281e-05, + "loss": 5.0277, + "step": 17670 + }, + { + "epoch": 1.4146263402144343, + "grad_norm": 3.6835200786590576, + "learning_rate": 2.6502914906134673e-05, + "loss": 5.0984, + "step": 17680 + }, + { + "epoch": 1.4154264682349176, + "grad_norm": 2.6071996688842773, + "learning_rate": 2.6489543777076535e-05, + "loss": 5.1737, + "step": 17690 + }, + { + "epoch": 1.4162265962554008, + "grad_norm": 2.6273155212402344, + "learning_rate": 2.6476172648018398e-05, + "loss": 4.9545, + "step": 17700 + }, + { + "epoch": 1.417026724275884, + "grad_norm": 3.8607401847839355, + "learning_rate": 2.646280151896026e-05, + "loss": 5.1669, + "step": 17710 + }, + { + "epoch": 1.4178268522963675, + "grad_norm": 3.5712077617645264, + "learning_rate": 2.6449430389902124e-05, + "loss": 5.1462, + "step": 17720 + }, + { + "epoch": 1.4186269803168507, + 
"grad_norm": 3.0255634784698486, + "learning_rate": 2.6436059260843986e-05, + "loss": 4.9677, + "step": 17730 + }, + { + "epoch": 1.419427108337334, + "grad_norm": 2.684609889984131, + "learning_rate": 2.642268813178585e-05, + "loss": 5.056, + "step": 17740 + }, + { + "epoch": 1.4202272363578172, + "grad_norm": 3.0845911502838135, + "learning_rate": 2.640931700272771e-05, + "loss": 4.9412, + "step": 17750 + }, + { + "epoch": 1.4210273643783005, + "grad_norm": 3.321986198425293, + "learning_rate": 2.6395945873669574e-05, + "loss": 5.1335, + "step": 17760 + }, + { + "epoch": 1.421827492398784, + "grad_norm": 3.656062126159668, + "learning_rate": 2.6382574744611437e-05, + "loss": 4.8692, + "step": 17770 + }, + { + "epoch": 1.4226276204192672, + "grad_norm": 2.865105152130127, + "learning_rate": 2.63692036155533e-05, + "loss": 5.0353, + "step": 17780 + }, + { + "epoch": 1.4234277484397504, + "grad_norm": 3.2242352962493896, + "learning_rate": 2.6355832486495162e-05, + "loss": 5.0441, + "step": 17790 + }, + { + "epoch": 1.4242278764602336, + "grad_norm": 2.8633460998535156, + "learning_rate": 2.634246135743702e-05, + "loss": 5.1531, + "step": 17800 + }, + { + "epoch": 1.4250280044807169, + "grad_norm": 4.637207508087158, + "learning_rate": 2.6329090228378884e-05, + "loss": 4.9218, + "step": 17810 + }, + { + "epoch": 1.4258281325012003, + "grad_norm": 3.403052806854248, + "learning_rate": 2.6315719099320747e-05, + "loss": 4.9892, + "step": 17820 + }, + { + "epoch": 1.4266282605216833, + "grad_norm": 3.6156423091888428, + "learning_rate": 2.630234797026261e-05, + "loss": 5.0119, + "step": 17830 + }, + { + "epoch": 1.4274283885421668, + "grad_norm": 4.8892340660095215, + "learning_rate": 2.6288976841204472e-05, + "loss": 4.9413, + "step": 17840 + }, + { + "epoch": 1.42822851656265, + "grad_norm": 4.089402198791504, + "learning_rate": 2.6275605712146335e-05, + "loss": 5.078, + "step": 17850 + }, + { + "epoch": 1.4290286445831333, + "grad_norm": 4.567444324493408, + 
"learning_rate": 2.6262234583088198e-05, + "loss": 4.9818, + "step": 17860 + }, + { + "epoch": 1.4298287726036165, + "grad_norm": 3.038895845413208, + "learning_rate": 2.624886345403006e-05, + "loss": 4.8535, + "step": 17870 + }, + { + "epoch": 1.4306289006240998, + "grad_norm": 3.1715147495269775, + "learning_rate": 2.6235492324971923e-05, + "loss": 5.118, + "step": 17880 + }, + { + "epoch": 1.4314290286445832, + "grad_norm": 5.904415607452393, + "learning_rate": 2.6222121195913786e-05, + "loss": 5.1062, + "step": 17890 + }, + { + "epoch": 1.4322291566650664, + "grad_norm": 3.092773675918579, + "learning_rate": 2.620875006685565e-05, + "loss": 5.0615, + "step": 17900 + }, + { + "epoch": 1.4330292846855497, + "grad_norm": 4.345393180847168, + "learning_rate": 2.619537893779751e-05, + "loss": 5.0647, + "step": 17910 + }, + { + "epoch": 1.433829412706033, + "grad_norm": 3.2700984477996826, + "learning_rate": 2.6182007808739374e-05, + "loss": 5.0142, + "step": 17920 + }, + { + "epoch": 1.4346295407265162, + "grad_norm": 3.5998690128326416, + "learning_rate": 2.6168636679681237e-05, + "loss": 5.0143, + "step": 17930 + }, + { + "epoch": 1.4354296687469996, + "grad_norm": 3.5779027938842773, + "learning_rate": 2.61552655506231e-05, + "loss": 5.0386, + "step": 17940 + }, + { + "epoch": 1.4362297967674829, + "grad_norm": 4.815369606018066, + "learning_rate": 2.6141894421564962e-05, + "loss": 5.0961, + "step": 17950 + }, + { + "epoch": 1.437029924787966, + "grad_norm": 3.695279836654663, + "learning_rate": 2.6128523292506818e-05, + "loss": 5.048, + "step": 17960 + }, + { + "epoch": 1.4378300528084493, + "grad_norm": 5.449552536010742, + "learning_rate": 2.611515216344868e-05, + "loss": 4.9294, + "step": 17970 + }, + { + "epoch": 1.4386301808289326, + "grad_norm": 3.0460941791534424, + "learning_rate": 2.6101781034390543e-05, + "loss": 4.9002, + "step": 17980 + }, + { + "epoch": 1.439430308849416, + "grad_norm": 4.6357951164245605, + "learning_rate": 2.6088409905332406e-05, 
+ "loss": 5.0084, + "step": 17990 + }, + { + "epoch": 1.4402304368698993, + "grad_norm": 4.996743679046631, + "learning_rate": 2.607503877627427e-05, + "loss": 4.9983, + "step": 18000 + }, + { + "epoch": 1.4410305648903825, + "grad_norm": 3.1133697032928467, + "learning_rate": 2.606166764721613e-05, + "loss": 5.1251, + "step": 18010 + }, + { + "epoch": 1.4418306929108657, + "grad_norm": 2.8803775310516357, + "learning_rate": 2.6048296518157994e-05, + "loss": 5.1401, + "step": 18020 + }, + { + "epoch": 1.442630820931349, + "grad_norm": 2.200620174407959, + "learning_rate": 2.6034925389099857e-05, + "loss": 5.0268, + "step": 18030 + }, + { + "epoch": 1.4434309489518322, + "grad_norm": 6.3594651222229, + "learning_rate": 2.602155426004172e-05, + "loss": 5.0733, + "step": 18040 + }, + { + "epoch": 1.4442310769723155, + "grad_norm": 2.7869396209716797, + "learning_rate": 2.6008183130983582e-05, + "loss": 5.0708, + "step": 18050 + }, + { + "epoch": 1.445031204992799, + "grad_norm": 4.231550693511963, + "learning_rate": 2.5994812001925445e-05, + "loss": 5.0453, + "step": 18060 + }, + { + "epoch": 1.4458313330132821, + "grad_norm": 3.3731446266174316, + "learning_rate": 2.5981440872867308e-05, + "loss": 5.0811, + "step": 18070 + }, + { + "epoch": 1.4466314610337654, + "grad_norm": 2.6151371002197266, + "learning_rate": 2.596806974380917e-05, + "loss": 5.012, + "step": 18080 + }, + { + "epoch": 1.4474315890542486, + "grad_norm": 4.7653937339782715, + "learning_rate": 2.5954698614751033e-05, + "loss": 5.0166, + "step": 18090 + }, + { + "epoch": 1.4482317170747319, + "grad_norm": 2.9431347846984863, + "learning_rate": 2.5941327485692896e-05, + "loss": 4.8474, + "step": 18100 + }, + { + "epoch": 1.4490318450952153, + "grad_norm": 2.9158408641815186, + "learning_rate": 2.592795635663475e-05, + "loss": 4.9953, + "step": 18110 + }, + { + "epoch": 1.4498319731156986, + "grad_norm": 2.303264617919922, + "learning_rate": 2.5914585227576614e-05, + "loss": 4.9232, + "step": 18120 + }, 
+ { + "epoch": 1.4506321011361818, + "grad_norm": 3.09885835647583, + "learning_rate": 2.5901214098518477e-05, + "loss": 4.9656, + "step": 18130 + }, + { + "epoch": 1.451432229156665, + "grad_norm": 4.0016045570373535, + "learning_rate": 2.588784296946034e-05, + "loss": 5.1111, + "step": 18140 + }, + { + "epoch": 1.4522323571771483, + "grad_norm": 3.566770076751709, + "learning_rate": 2.5874471840402202e-05, + "loss": 5.1114, + "step": 18150 + }, + { + "epoch": 1.4530324851976317, + "grad_norm": 3.5348453521728516, + "learning_rate": 2.5861100711344065e-05, + "loss": 5.0469, + "step": 18160 + }, + { + "epoch": 1.453832613218115, + "grad_norm": 4.667888641357422, + "learning_rate": 2.5847729582285928e-05, + "loss": 4.9915, + "step": 18170 + }, + { + "epoch": 1.4546327412385982, + "grad_norm": 3.968620777130127, + "learning_rate": 2.583435845322779e-05, + "loss": 4.9876, + "step": 18180 + }, + { + "epoch": 1.4554328692590814, + "grad_norm": 3.7125017642974854, + "learning_rate": 2.5820987324169653e-05, + "loss": 5.1154, + "step": 18190 + }, + { + "epoch": 1.4562329972795647, + "grad_norm": 4.5525312423706055, + "learning_rate": 2.5807616195111516e-05, + "loss": 5.0959, + "step": 18200 + }, + { + "epoch": 1.4570331253000481, + "grad_norm": 3.8833093643188477, + "learning_rate": 2.579424506605338e-05, + "loss": 5.0564, + "step": 18210 + }, + { + "epoch": 1.4578332533205312, + "grad_norm": 2.3059375286102295, + "learning_rate": 2.578087393699524e-05, + "loss": 5.1766, + "step": 18220 + }, + { + "epoch": 1.4586333813410146, + "grad_norm": 2.98335599899292, + "learning_rate": 2.5767502807937104e-05, + "loss": 4.9341, + "step": 18230 + }, + { + "epoch": 1.4594335093614978, + "grad_norm": 2.6796722412109375, + "learning_rate": 2.5754131678878966e-05, + "loss": 5.1078, + "step": 18240 + }, + { + "epoch": 1.460233637381981, + "grad_norm": 3.41542649269104, + "learning_rate": 2.574076054982083e-05, + "loss": 5.0378, + "step": 18250 + }, + { + "epoch": 1.4610337654024643, + 
"grad_norm": 3.245790481567383, + "learning_rate": 2.5727389420762692e-05, + "loss": 5.1121, + "step": 18260 + }, + { + "epoch": 1.4618338934229476, + "grad_norm": 3.9725239276885986, + "learning_rate": 2.571401829170455e-05, + "loss": 4.9956, + "step": 18270 + }, + { + "epoch": 1.462634021443431, + "grad_norm": 3.3008267879486084, + "learning_rate": 2.5700647162646414e-05, + "loss": 5.032, + "step": 18280 + }, + { + "epoch": 1.4634341494639143, + "grad_norm": 2.838578701019287, + "learning_rate": 2.5687276033588277e-05, + "loss": 4.9847, + "step": 18290 + }, + { + "epoch": 1.4642342774843975, + "grad_norm": 5.162490367889404, + "learning_rate": 2.567390490453014e-05, + "loss": 5.0771, + "step": 18300 + }, + { + "epoch": 1.4650344055048807, + "grad_norm": 3.455357789993286, + "learning_rate": 2.5660533775472002e-05, + "loss": 5.0616, + "step": 18310 + }, + { + "epoch": 1.465834533525364, + "grad_norm": 2.9185259342193604, + "learning_rate": 2.5647162646413865e-05, + "loss": 4.9563, + "step": 18320 + }, + { + "epoch": 1.4666346615458474, + "grad_norm": 3.7584445476531982, + "learning_rate": 2.5633791517355727e-05, + "loss": 5.0305, + "step": 18330 + }, + { + "epoch": 1.4674347895663307, + "grad_norm": 3.1257286071777344, + "learning_rate": 2.562042038829759e-05, + "loss": 5.0208, + "step": 18340 + }, + { + "epoch": 1.468234917586814, + "grad_norm": 2.5908803939819336, + "learning_rate": 2.5607049259239453e-05, + "loss": 5.0216, + "step": 18350 + }, + { + "epoch": 1.4690350456072971, + "grad_norm": 3.889470100402832, + "learning_rate": 2.5593678130181315e-05, + "loss": 5.2175, + "step": 18360 + }, + { + "epoch": 1.4698351736277804, + "grad_norm": 2.6699845790863037, + "learning_rate": 2.5580307001123178e-05, + "loss": 5.0305, + "step": 18370 + }, + { + "epoch": 1.4706353016482638, + "grad_norm": 4.451345920562744, + "learning_rate": 2.556693587206504e-05, + "loss": 4.9873, + "step": 18380 + }, + { + "epoch": 1.471435429668747, + "grad_norm": 3.441887855529785, + 
"learning_rate": 2.5553564743006903e-05, + "loss": 4.958, + "step": 18390 + }, + { + "epoch": 1.4722355576892303, + "grad_norm": 2.9705936908721924, + "learning_rate": 2.5540193613948766e-05, + "loss": 4.9719, + "step": 18400 + }, + { + "epoch": 1.4730356857097135, + "grad_norm": 4.436412334442139, + "learning_rate": 2.552682248489063e-05, + "loss": 4.899, + "step": 18410 + }, + { + "epoch": 1.4738358137301968, + "grad_norm": 3.027207136154175, + "learning_rate": 2.5513451355832485e-05, + "loss": 5.0222, + "step": 18420 + }, + { + "epoch": 1.4746359417506802, + "grad_norm": 3.141206741333008, + "learning_rate": 2.5500080226774347e-05, + "loss": 4.8712, + "step": 18430 + }, + { + "epoch": 1.4754360697711633, + "grad_norm": 2.8075220584869385, + "learning_rate": 2.548670909771621e-05, + "loss": 5.1235, + "step": 18440 + }, + { + "epoch": 1.4762361977916467, + "grad_norm": 3.697413921356201, + "learning_rate": 2.5473337968658073e-05, + "loss": 4.9516, + "step": 18450 + }, + { + "epoch": 1.47703632581213, + "grad_norm": 2.754885673522949, + "learning_rate": 2.5459966839599936e-05, + "loss": 5.0219, + "step": 18460 + }, + { + "epoch": 1.4778364538326132, + "grad_norm": 2.937675714492798, + "learning_rate": 2.5446595710541798e-05, + "loss": 5.0053, + "step": 18470 + }, + { + "epoch": 1.4786365818530964, + "grad_norm": 4.04290771484375, + "learning_rate": 2.543322458148366e-05, + "loss": 5.0199, + "step": 18480 + }, + { + "epoch": 1.4794367098735797, + "grad_norm": 3.605565309524536, + "learning_rate": 2.5419853452425524e-05, + "loss": 4.9443, + "step": 18490 + }, + { + "epoch": 1.4802368378940631, + "grad_norm": 3.089582920074463, + "learning_rate": 2.5406482323367386e-05, + "loss": 5.1402, + "step": 18500 + }, + { + "epoch": 1.4810369659145464, + "grad_norm": 2.7486472129821777, + "learning_rate": 2.539311119430925e-05, + "loss": 4.9903, + "step": 18510 + }, + { + "epoch": 1.4818370939350296, + "grad_norm": 3.1345067024230957, + "learning_rate": 2.537974006525111e-05, + 
"loss": 4.867, + "step": 18520 + }, + { + "epoch": 1.4826372219555128, + "grad_norm": 2.698291540145874, + "learning_rate": 2.5366368936192974e-05, + "loss": 5.0244, + "step": 18530 + }, + { + "epoch": 1.483437349975996, + "grad_norm": 3.02305006980896, + "learning_rate": 2.5352997807134837e-05, + "loss": 5.1695, + "step": 18540 + }, + { + "epoch": 1.4842374779964795, + "grad_norm": 3.198091745376587, + "learning_rate": 2.53396266780767e-05, + "loss": 5.0686, + "step": 18550 + }, + { + "epoch": 1.4850376060169628, + "grad_norm": 3.5371475219726562, + "learning_rate": 2.5326255549018562e-05, + "loss": 5.1243, + "step": 18560 + }, + { + "epoch": 1.485837734037446, + "grad_norm": 6.168661594390869, + "learning_rate": 2.5312884419960425e-05, + "loss": 5.041, + "step": 18570 + }, + { + "epoch": 1.4866378620579292, + "grad_norm": 2.842280864715576, + "learning_rate": 2.529951329090228e-05, + "loss": 5.0337, + "step": 18580 + }, + { + "epoch": 1.4874379900784125, + "grad_norm": 2.6275157928466797, + "learning_rate": 2.5286142161844144e-05, + "loss": 4.8418, + "step": 18590 + }, + { + "epoch": 1.488238118098896, + "grad_norm": 3.1988492012023926, + "learning_rate": 2.5272771032786006e-05, + "loss": 4.9543, + "step": 18600 + }, + { + "epoch": 1.4890382461193792, + "grad_norm": 2.890760660171509, + "learning_rate": 2.525939990372787e-05, + "loss": 5.0444, + "step": 18610 + }, + { + "epoch": 1.4898383741398624, + "grad_norm": 3.2027745246887207, + "learning_rate": 2.5246028774669732e-05, + "loss": 5.1103, + "step": 18620 + }, + { + "epoch": 1.4906385021603457, + "grad_norm": 2.3930492401123047, + "learning_rate": 2.5232657645611594e-05, + "loss": 5.0071, + "step": 18630 + }, + { + "epoch": 1.491438630180829, + "grad_norm": 2.7484025955200195, + "learning_rate": 2.5219286516553457e-05, + "loss": 5.1018, + "step": 18640 + }, + { + "epoch": 1.4922387582013121, + "grad_norm": 4.265021800994873, + "learning_rate": 2.520591538749532e-05, + "loss": 4.9677, + "step": 18650 + }, + { + 
"epoch": 1.4930388862217954, + "grad_norm": 3.4047820568084717, + "learning_rate": 2.5192544258437183e-05, + "loss": 5.0718, + "step": 18660 + }, + { + "epoch": 1.4938390142422788, + "grad_norm": 3.635500192642212, + "learning_rate": 2.5179173129379045e-05, + "loss": 5.159, + "step": 18670 + }, + { + "epoch": 1.494639142262762, + "grad_norm": 3.42348575592041, + "learning_rate": 2.5165802000320908e-05, + "loss": 5.2187, + "step": 18680 + }, + { + "epoch": 1.4954392702832453, + "grad_norm": 3.96763277053833, + "learning_rate": 2.515243087126277e-05, + "loss": 5.0085, + "step": 18690 + }, + { + "epoch": 1.4962393983037285, + "grad_norm": 4.471992015838623, + "learning_rate": 2.5139059742204633e-05, + "loss": 4.8476, + "step": 18700 + }, + { + "epoch": 1.4970395263242118, + "grad_norm": 3.588132858276367, + "learning_rate": 2.5125688613146496e-05, + "loss": 4.9779, + "step": 18710 + }, + { + "epoch": 1.4978396543446952, + "grad_norm": 4.183651924133301, + "learning_rate": 2.511231748408836e-05, + "loss": 5.0973, + "step": 18720 + }, + { + "epoch": 1.4986397823651785, + "grad_norm": 4.290589332580566, + "learning_rate": 2.5098946355030218e-05, + "loss": 5.0105, + "step": 18730 + }, + { + "epoch": 1.4994399103856617, + "grad_norm": 3.7274177074432373, + "learning_rate": 2.508557522597208e-05, + "loss": 5.0491, + "step": 18740 + }, + { + "epoch": 1.500240038406145, + "grad_norm": 4.3094682693481445, + "learning_rate": 2.5072204096913943e-05, + "loss": 5.0653, + "step": 18750 + }, + { + "epoch": 1.5010401664266282, + "grad_norm": 3.182372570037842, + "learning_rate": 2.5058832967855806e-05, + "loss": 5.0284, + "step": 18760 + }, + { + "epoch": 1.5018402944471116, + "grad_norm": 3.6180033683776855, + "learning_rate": 2.504546183879767e-05, + "loss": 5.1201, + "step": 18770 + }, + { + "epoch": 1.5026404224675947, + "grad_norm": 3.979736328125, + "learning_rate": 2.503209070973953e-05, + "loss": 5.0016, + "step": 18780 + }, + { + "epoch": 1.5034405504880781, + "grad_norm": 
3.0754947662353516, + "learning_rate": 2.5018719580681394e-05, + "loss": 4.9988, + "step": 18790 + }, + { + "epoch": 1.5042406785085614, + "grad_norm": 3.5843517780303955, + "learning_rate": 2.5005348451623257e-05, + "loss": 5.003, + "step": 18800 + }, + { + "epoch": 1.5050408065290446, + "grad_norm": 4.648735523223877, + "learning_rate": 2.499197732256512e-05, + "loss": 4.9771, + "step": 18810 + }, + { + "epoch": 1.505840934549528, + "grad_norm": null, + "learning_rate": 2.4979943306412795e-05, + "loss": 5.0095, + "step": 18820 + }, + { + "epoch": 1.506641062570011, + "grad_norm": 3.6529128551483154, + "learning_rate": 2.4966572177354658e-05, + "loss": 5.1996, + "step": 18830 + }, + { + "epoch": 1.5074411905904945, + "grad_norm": 3.5937626361846924, + "learning_rate": 2.495320104829652e-05, + "loss": 4.9082, + "step": 18840 + }, + { + "epoch": 1.5082413186109778, + "grad_norm": 3.3477978706359863, + "learning_rate": 2.493982991923838e-05, + "loss": 5.0897, + "step": 18850 + }, + { + "epoch": 1.509041446631461, + "grad_norm": 2.748612642288208, + "learning_rate": 2.4926458790180243e-05, + "loss": 4.8157, + "step": 18860 + }, + { + "epoch": 1.5098415746519445, + "grad_norm": 3.3521084785461426, + "learning_rate": 2.4913087661122105e-05, + "loss": 5.1061, + "step": 18870 + }, + { + "epoch": 1.5106417026724275, + "grad_norm": 3.596810817718506, + "learning_rate": 2.4899716532063968e-05, + "loss": 4.9414, + "step": 18880 + }, + { + "epoch": 1.511441830692911, + "grad_norm": 3.07938289642334, + "learning_rate": 2.488634540300583e-05, + "loss": 4.9793, + "step": 18890 + }, + { + "epoch": 1.5122419587133942, + "grad_norm": 2.709639310836792, + "learning_rate": 2.4872974273947693e-05, + "loss": 5.0751, + "step": 18900 + }, + { + "epoch": 1.5130420867338774, + "grad_norm": 5.074763774871826, + "learning_rate": 2.4859603144889556e-05, + "loss": 5.0816, + "step": 18910 + }, + { + "epoch": 1.5138422147543606, + "grad_norm": 3.0176215171813965, + "learning_rate": 
2.484623201583142e-05, + "loss": 5.1346, + "step": 18920 + }, + { + "epoch": 1.5146423427748439, + "grad_norm": 4.006988525390625, + "learning_rate": 2.4832860886773278e-05, + "loss": 5.1273, + "step": 18930 + }, + { + "epoch": 1.5154424707953273, + "grad_norm": 4.101680755615234, + "learning_rate": 2.481948975771514e-05, + "loss": 4.975, + "step": 18940 + }, + { + "epoch": 1.5162425988158106, + "grad_norm": 3.672960042953491, + "learning_rate": 2.4806118628657003e-05, + "loss": 5.0561, + "step": 18950 + }, + { + "epoch": 1.5170427268362938, + "grad_norm": 3.214320421218872, + "learning_rate": 2.4792747499598866e-05, + "loss": 4.9711, + "step": 18960 + }, + { + "epoch": 1.517842854856777, + "grad_norm": 4.82122278213501, + "learning_rate": 2.477937637054073e-05, + "loss": 5.136, + "step": 18970 + }, + { + "epoch": 1.5186429828772603, + "grad_norm": 3.8006436824798584, + "learning_rate": 2.476600524148259e-05, + "loss": 5.1249, + "step": 18980 + }, + { + "epoch": 1.5194431108977438, + "grad_norm": 3.1978204250335693, + "learning_rate": 2.4752634112424454e-05, + "loss": 5.2438, + "step": 18990 + }, + { + "epoch": 1.5202432389182268, + "grad_norm": 3.2975521087646484, + "learning_rate": 2.4739262983366317e-05, + "loss": 4.9452, + "step": 19000 + }, + { + "epoch": 1.5210433669387102, + "grad_norm": 2.568472385406494, + "learning_rate": 2.472589185430818e-05, + "loss": 5.1489, + "step": 19010 + }, + { + "epoch": 1.5218434949591935, + "grad_norm": 2.766832113265991, + "learning_rate": 2.4712520725250042e-05, + "loss": 4.9362, + "step": 19020 + }, + { + "epoch": 1.5226436229796767, + "grad_norm": 3.665269136428833, + "learning_rate": 2.4699149596191905e-05, + "loss": 4.9117, + "step": 19030 + }, + { + "epoch": 1.5234437510001602, + "grad_norm": 3.0590641498565674, + "learning_rate": 2.4685778467133768e-05, + "loss": 5.03, + "step": 19040 + }, + { + "epoch": 1.5242438790206432, + "grad_norm": 4.048924446105957, + "learning_rate": 2.467240733807563e-05, + "loss": 5.2367, + 
"step": 19050 + }, + { + "epoch": 1.5250440070411266, + "grad_norm": 3.6891164779663086, + "learning_rate": 2.4659036209017493e-05, + "loss": 5.0999, + "step": 19060 + }, + { + "epoch": 1.5258441350616099, + "grad_norm": 6.118479251861572, + "learning_rate": 2.4645665079959356e-05, + "loss": 5.2272, + "step": 19070 + }, + { + "epoch": 1.5266442630820931, + "grad_norm": 4.393064498901367, + "learning_rate": 2.4632293950901215e-05, + "loss": 5.0277, + "step": 19080 + }, + { + "epoch": 1.5274443911025766, + "grad_norm": 3.447316884994507, + "learning_rate": 2.4618922821843078e-05, + "loss": 5.1005, + "step": 19090 + }, + { + "epoch": 1.5282445191230596, + "grad_norm": 2.9048802852630615, + "learning_rate": 2.460555169278494e-05, + "loss": 5.0995, + "step": 19100 + }, + { + "epoch": 1.529044647143543, + "grad_norm": 3.22967529296875, + "learning_rate": 2.4592180563726803e-05, + "loss": 5.1191, + "step": 19110 + }, + { + "epoch": 1.5298447751640263, + "grad_norm": 5.411147117614746, + "learning_rate": 2.4578809434668666e-05, + "loss": 4.9711, + "step": 19120 + }, + { + "epoch": 1.5306449031845095, + "grad_norm": 2.6100914478302, + "learning_rate": 2.456543830561053e-05, + "loss": 4.8903, + "step": 19130 + }, + { + "epoch": 1.5314450312049928, + "grad_norm": 4.481126308441162, + "learning_rate": 2.455206717655239e-05, + "loss": 5.0353, + "step": 19140 + }, + { + "epoch": 1.532245159225476, + "grad_norm": 2.8812084197998047, + "learning_rate": 2.4538696047494254e-05, + "loss": 5.0517, + "step": 19150 + }, + { + "epoch": 1.5330452872459595, + "grad_norm": 3.40692138671875, + "learning_rate": 2.4525324918436113e-05, + "loss": 5.1662, + "step": 19160 + }, + { + "epoch": 1.5338454152664425, + "grad_norm": 2.64628529548645, + "learning_rate": 2.4511953789377976e-05, + "loss": 5.1305, + "step": 19170 + }, + { + "epoch": 1.534645543286926, + "grad_norm": 3.343109607696533, + "learning_rate": 2.449858266031984e-05, + "loss": 5.0441, + "step": 19180 + }, + { + "epoch": 
1.5354456713074092, + "grad_norm": 3.306133270263672, + "learning_rate": 2.44852115312617e-05, + "loss": 4.9099, + "step": 19190 + }, + { + "epoch": 1.5362457993278924, + "grad_norm": 3.217883825302124, + "learning_rate": 2.4471840402203564e-05, + "loss": 5.0963, + "step": 19200 + }, + { + "epoch": 1.5370459273483759, + "grad_norm": 2.76540470123291, + "learning_rate": 2.4458469273145427e-05, + "loss": 5.0184, + "step": 19210 + }, + { + "epoch": 1.5378460553688589, + "grad_norm": 3.376350164413452, + "learning_rate": 2.444509814408729e-05, + "loss": 5.0837, + "step": 19220 + }, + { + "epoch": 1.5386461833893423, + "grad_norm": 4.767297744750977, + "learning_rate": 2.443172701502915e-05, + "loss": 5.0034, + "step": 19230 + }, + { + "epoch": 1.5394463114098256, + "grad_norm": 3.958462953567505, + "learning_rate": 2.441835588597101e-05, + "loss": 5.1961, + "step": 19240 + }, + { + "epoch": 1.5402464394303088, + "grad_norm": 2.8455073833465576, + "learning_rate": 2.4404984756912874e-05, + "loss": 4.92, + "step": 19250 + }, + { + "epoch": 1.5410465674507923, + "grad_norm": 2.7165021896362305, + "learning_rate": 2.4391613627854737e-05, + "loss": 4.9634, + "step": 19260 + }, + { + "epoch": 1.5418466954712753, + "grad_norm": 4.948694705963135, + "learning_rate": 2.43782424987966e-05, + "loss": 4.9641, + "step": 19270 + }, + { + "epoch": 1.5426468234917587, + "grad_norm": 3.5397555828094482, + "learning_rate": 2.4364871369738462e-05, + "loss": 4.891, + "step": 19280 + }, + { + "epoch": 1.543446951512242, + "grad_norm": 3.971384048461914, + "learning_rate": 2.4351500240680325e-05, + "loss": 5.0002, + "step": 19290 + }, + { + "epoch": 1.5442470795327252, + "grad_norm": 2.627703905105591, + "learning_rate": 2.4338129111622187e-05, + "loss": 4.9231, + "step": 19300 + }, + { + "epoch": 1.5450472075532087, + "grad_norm": 6.132839202880859, + "learning_rate": 2.4324757982564047e-05, + "loss": 4.9914, + "step": 19310 + }, + { + "epoch": 1.5458473355736917, + "grad_norm": 
2.6523935794830322, + "learning_rate": 2.431138685350591e-05, + "loss": 4.9752, + "step": 19320 + }, + { + "epoch": 1.5466474635941752, + "grad_norm": 3.1848411560058594, + "learning_rate": 2.4298015724447772e-05, + "loss": 5.1569, + "step": 19330 + }, + { + "epoch": 1.5474475916146584, + "grad_norm": 3.3984134197235107, + "learning_rate": 2.4284644595389635e-05, + "loss": 4.967, + "step": 19340 + }, + { + "epoch": 1.5482477196351416, + "grad_norm": 3.3843936920166016, + "learning_rate": 2.4271273466331497e-05, + "loss": 5.0225, + "step": 19350 + }, + { + "epoch": 1.5490478476556249, + "grad_norm": 3.611131191253662, + "learning_rate": 2.425790233727336e-05, + "loss": 4.9721, + "step": 19360 + }, + { + "epoch": 1.549847975676108, + "grad_norm": 3.0888559818267822, + "learning_rate": 2.4244531208215223e-05, + "loss": 5.0042, + "step": 19370 + }, + { + "epoch": 1.5506481036965916, + "grad_norm": 3.0562126636505127, + "learning_rate": 2.4231160079157086e-05, + "loss": 5.1078, + "step": 19380 + }, + { + "epoch": 1.5514482317170746, + "grad_norm": 2.9445252418518066, + "learning_rate": 2.4217788950098945e-05, + "loss": 4.9544, + "step": 19390 + }, + { + "epoch": 1.552248359737558, + "grad_norm": 2.643602132797241, + "learning_rate": 2.4204417821040808e-05, + "loss": 5.0008, + "step": 19400 + }, + { + "epoch": 1.5530484877580413, + "grad_norm": 3.939926862716675, + "learning_rate": 2.419104669198267e-05, + "loss": 5.0934, + "step": 19410 + }, + { + "epoch": 1.5538486157785245, + "grad_norm": 4.864276885986328, + "learning_rate": 2.4177675562924533e-05, + "loss": 4.7946, + "step": 19420 + }, + { + "epoch": 1.554648743799008, + "grad_norm": 3.8411593437194824, + "learning_rate": 2.4164304433866396e-05, + "loss": 4.9201, + "step": 19430 + }, + { + "epoch": 1.555448871819491, + "grad_norm": 3.1794567108154297, + "learning_rate": 2.4150933304808258e-05, + "loss": 5.0872, + "step": 19440 + }, + { + "epoch": 1.5562489998399744, + "grad_norm": 3.6120903491973877, + 
"learning_rate": 2.413756217575012e-05, + "loss": 5.091, + "step": 19450 + }, + { + "epoch": 1.5570491278604577, + "grad_norm": 2.759181022644043, + "learning_rate": 2.4124191046691984e-05, + "loss": 5.0926, + "step": 19460 + }, + { + "epoch": 1.557849255880941, + "grad_norm": 3.43410062789917, + "learning_rate": 2.4110819917633846e-05, + "loss": 5.0082, + "step": 19470 + }, + { + "epoch": 1.5586493839014244, + "grad_norm": 5.191288948059082, + "learning_rate": 2.409744878857571e-05, + "loss": 5.1893, + "step": 19480 + }, + { + "epoch": 1.5594495119219074, + "grad_norm": 2.750684976577759, + "learning_rate": 2.4084077659517572e-05, + "loss": 5.0553, + "step": 19490 + }, + { + "epoch": 1.5602496399423909, + "grad_norm": 3.0791661739349365, + "learning_rate": 2.4070706530459434e-05, + "loss": 5.0705, + "step": 19500 + }, + { + "epoch": 1.561049767962874, + "grad_norm": 3.4672889709472656, + "learning_rate": 2.4057335401401297e-05, + "loss": 5.131, + "step": 19510 + }, + { + "epoch": 1.5618498959833573, + "grad_norm": 4.692345142364502, + "learning_rate": 2.404396427234316e-05, + "loss": 5.1392, + "step": 19520 + }, + { + "epoch": 1.5626500240038406, + "grad_norm": 3.5774660110473633, + "learning_rate": 2.4030593143285023e-05, + "loss": 5.055, + "step": 19530 + }, + { + "epoch": 1.5634501520243238, + "grad_norm": 2.421363592147827, + "learning_rate": 2.4017222014226882e-05, + "loss": 5.0437, + "step": 19540 + }, + { + "epoch": 1.5642502800448073, + "grad_norm": 4.430188179016113, + "learning_rate": 2.4003850885168745e-05, + "loss": 5.1267, + "step": 19550 + }, + { + "epoch": 1.5650504080652905, + "grad_norm": 3.8973233699798584, + "learning_rate": 2.3990479756110607e-05, + "loss": 4.9341, + "step": 19560 + }, + { + "epoch": 1.5658505360857737, + "grad_norm": 4.119020462036133, + "learning_rate": 2.397710862705247e-05, + "loss": 5.0242, + "step": 19570 + }, + { + "epoch": 1.566650664106257, + "grad_norm": 7.76237154006958, + "learning_rate": 2.3963737497994333e-05, + 
"loss": 5.1869, + "step": 19580 + }, + { + "epoch": 1.5674507921267402, + "grad_norm": 2.831879138946533, + "learning_rate": 2.3950366368936195e-05, + "loss": 5.043, + "step": 19590 + }, + { + "epoch": 1.5682509201472237, + "grad_norm": 3.3284389972686768, + "learning_rate": 2.3936995239878058e-05, + "loss": 4.9872, + "step": 19600 + }, + { + "epoch": 1.5690510481677067, + "grad_norm": 2.6948013305664062, + "learning_rate": 2.392362411081992e-05, + "loss": 5.0638, + "step": 19610 + }, + { + "epoch": 1.5698511761881901, + "grad_norm": 2.664315700531006, + "learning_rate": 2.391025298176178e-05, + "loss": 4.9393, + "step": 19620 + }, + { + "epoch": 1.5706513042086734, + "grad_norm": 3.3716866970062256, + "learning_rate": 2.3896881852703643e-05, + "loss": 4.9731, + "step": 19630 + }, + { + "epoch": 1.5714514322291566, + "grad_norm": 3.558425188064575, + "learning_rate": 2.3883510723645505e-05, + "loss": 5.0486, + "step": 19640 + }, + { + "epoch": 1.57225156024964, + "grad_norm": 3.051264762878418, + "learning_rate": 2.3870139594587368e-05, + "loss": 5.1662, + "step": 19650 + }, + { + "epoch": 1.573051688270123, + "grad_norm": 4.363539695739746, + "learning_rate": 2.385676846552923e-05, + "loss": 4.8295, + "step": 19660 + }, + { + "epoch": 1.5738518162906066, + "grad_norm": 3.811920166015625, + "learning_rate": 2.3843397336471093e-05, + "loss": 5.0468, + "step": 19670 + }, + { + "epoch": 1.5746519443110898, + "grad_norm": 4.834085464477539, + "learning_rate": 2.3830026207412956e-05, + "loss": 5.0537, + "step": 19680 + }, + { + "epoch": 1.575452072331573, + "grad_norm": 3.673466444015503, + "learning_rate": 2.381665507835482e-05, + "loss": 5.0796, + "step": 19690 + }, + { + "epoch": 1.5762522003520565, + "grad_norm": 2.913201093673706, + "learning_rate": 2.3803283949296678e-05, + "loss": 5.0079, + "step": 19700 + }, + { + "epoch": 1.5770523283725395, + "grad_norm": 2.7734830379486084, + "learning_rate": 2.378991282023854e-05, + "loss": 5.0544, + "step": 19710 + }, + { + 
"epoch": 1.577852456393023, + "grad_norm": 2.8796842098236084, + "learning_rate": 2.3776541691180403e-05, + "loss": 4.9974, + "step": 19720 + }, + { + "epoch": 1.5786525844135062, + "grad_norm": 3.4903788566589355, + "learning_rate": 2.3763170562122266e-05, + "loss": 5.1065, + "step": 19730 + }, + { + "epoch": 1.5794527124339894, + "grad_norm": 2.96079683303833, + "learning_rate": 2.374979943306413e-05, + "loss": 4.8157, + "step": 19740 + }, + { + "epoch": 1.5802528404544727, + "grad_norm": 3.280221700668335, + "learning_rate": 2.373642830400599e-05, + "loss": 4.9894, + "step": 19750 + }, + { + "epoch": 1.581052968474956, + "grad_norm": 3.4602150917053223, + "learning_rate": 2.3723057174947854e-05, + "loss": 5.0472, + "step": 19760 + }, + { + "epoch": 1.5818530964954394, + "grad_norm": 2.6225857734680176, + "learning_rate": 2.3709686045889717e-05, + "loss": 4.9227, + "step": 19770 + }, + { + "epoch": 1.5826532245159224, + "grad_norm": 2.760751247406006, + "learning_rate": 2.3696314916831576e-05, + "loss": 4.9783, + "step": 19780 + }, + { + "epoch": 1.5834533525364058, + "grad_norm": 3.081970691680908, + "learning_rate": 2.368294378777344e-05, + "loss": 5.1022, + "step": 19790 + }, + { + "epoch": 1.584253480556889, + "grad_norm": 4.927330017089844, + "learning_rate": 2.36695726587153e-05, + "loss": 5.0991, + "step": 19800 + }, + { + "epoch": 1.5850536085773723, + "grad_norm": 3.2381088733673096, + "learning_rate": 2.3656201529657164e-05, + "loss": 4.9632, + "step": 19810 + }, + { + "epoch": 1.5858537365978558, + "grad_norm": 3.803109884262085, + "learning_rate": 2.3642830400599027e-05, + "loss": 5.2309, + "step": 19820 + }, + { + "epoch": 1.5866538646183388, + "grad_norm": 3.326345682144165, + "learning_rate": 2.362945927154089e-05, + "loss": 4.9104, + "step": 19830 + }, + { + "epoch": 1.5874539926388223, + "grad_norm": 3.231013059616089, + "learning_rate": 2.3616088142482752e-05, + "loss": 4.9701, + "step": 19840 + }, + { + "epoch": 1.5882541206593055, + 
"grad_norm": 4.239320278167725, + "learning_rate": 2.3602717013424615e-05, + "loss": 4.8555, + "step": 19850 + }, + { + "epoch": 1.5890542486797887, + "grad_norm": 2.4793667793273926, + "learning_rate": 2.3589345884366478e-05, + "loss": 4.9989, + "step": 19860 + }, + { + "epoch": 1.5898543767002722, + "grad_norm": 2.578479528427124, + "learning_rate": 2.357597475530834e-05, + "loss": 5.0905, + "step": 19870 + }, + { + "epoch": 1.5906545047207552, + "grad_norm": 3.510375499725342, + "learning_rate": 2.3562603626250203e-05, + "loss": 5.1081, + "step": 19880 + }, + { + "epoch": 1.5914546327412387, + "grad_norm": 2.9421463012695312, + "learning_rate": 2.3549232497192066e-05, + "loss": 5.1245, + "step": 19890 + }, + { + "epoch": 1.592254760761722, + "grad_norm": 2.8770906925201416, + "learning_rate": 2.353586136813393e-05, + "loss": 4.8955, + "step": 19900 + }, + { + "epoch": 1.5930548887822051, + "grad_norm": 3.7492361068725586, + "learning_rate": 2.352249023907579e-05, + "loss": 4.9209, + "step": 19910 + }, + { + "epoch": 1.5938550168026886, + "grad_norm": 3.8520376682281494, + "learning_rate": 2.3509119110017654e-05, + "loss": 5.0284, + "step": 19920 + }, + { + "epoch": 1.5946551448231716, + "grad_norm": 4.247003555297852, + "learning_rate": 2.3495747980959513e-05, + "loss": 4.9151, + "step": 19930 + }, + { + "epoch": 1.595455272843655, + "grad_norm": 2.5604374408721924, + "learning_rate": 2.3482376851901376e-05, + "loss": 4.9954, + "step": 19940 + }, + { + "epoch": 1.5962554008641383, + "grad_norm": 3.5611424446105957, + "learning_rate": 2.346900572284324e-05, + "loss": 5.2244, + "step": 19950 + }, + { + "epoch": 1.5970555288846215, + "grad_norm": 3.0666303634643555, + "learning_rate": 2.34556345937851e-05, + "loss": 5.0632, + "step": 19960 + }, + { + "epoch": 1.5978556569051048, + "grad_norm": 3.0092382431030273, + "learning_rate": 2.3442263464726964e-05, + "loss": 4.9118, + "step": 19970 + }, + { + "epoch": 1.598655784925588, + "grad_norm": 3.099194288253784, + 
"learning_rate": 2.3428892335668827e-05, + "loss": 4.877, + "step": 19980 + }, + { + "epoch": 1.5994559129460715, + "grad_norm": 3.5368402004241943, + "learning_rate": 2.341552120661069e-05, + "loss": 5.1603, + "step": 19990 + }, + { + "epoch": 1.6002560409665545, + "grad_norm": 3.597376823425293, + "learning_rate": 2.3402150077552552e-05, + "loss": 4.974, + "step": 20000 + }, + { + "epoch": 1.601056168987038, + "grad_norm": 2.8193488121032715, + "learning_rate": 2.338877894849441e-05, + "loss": 5.0046, + "step": 20010 + }, + { + "epoch": 1.6018562970075212, + "grad_norm": 3.2118396759033203, + "learning_rate": 2.3375407819436274e-05, + "loss": 5.1282, + "step": 20020 + }, + { + "epoch": 1.6026564250280044, + "grad_norm": 3.20345401763916, + "learning_rate": 2.3362036690378137e-05, + "loss": 4.998, + "step": 20030 + }, + { + "epoch": 1.6034565530484879, + "grad_norm": 2.5816469192504883, + "learning_rate": 2.334866556132e-05, + "loss": 4.988, + "step": 20040 + }, + { + "epoch": 1.604256681068971, + "grad_norm": 3.370908260345459, + "learning_rate": 2.3335294432261862e-05, + "loss": 4.8653, + "step": 20050 + }, + { + "epoch": 1.6050568090894544, + "grad_norm": 3.355635404586792, + "learning_rate": 2.3321923303203725e-05, + "loss": 4.9665, + "step": 20060 + }, + { + "epoch": 1.6058569371099376, + "grad_norm": 4.375133991241455, + "learning_rate": 2.3308552174145587e-05, + "loss": 4.8952, + "step": 20070 + }, + { + "epoch": 1.6066570651304208, + "grad_norm": 3.30694580078125, + "learning_rate": 2.3295181045087447e-05, + "loss": 4.8787, + "step": 20080 + }, + { + "epoch": 1.6074571931509043, + "grad_norm": 3.211714267730713, + "learning_rate": 2.328180991602931e-05, + "loss": 5.0676, + "step": 20090 + }, + { + "epoch": 1.6082573211713873, + "grad_norm": 2.7749686241149902, + "learning_rate": 2.3268438786971172e-05, + "loss": 5.0726, + "step": 20100 + }, + { + "epoch": 1.6090574491918708, + "grad_norm": 2.468235731124878, + "learning_rate": 2.3255067657913035e-05, + 
"loss": 5.1256, + "step": 20110 + }, + { + "epoch": 1.609857577212354, + "grad_norm": 2.8571157455444336, + "learning_rate": 2.3241696528854898e-05, + "loss": 5.0278, + "step": 20120 + }, + { + "epoch": 1.6106577052328372, + "grad_norm": 3.286158561706543, + "learning_rate": 2.322832539979676e-05, + "loss": 5.0154, + "step": 20130 + }, + { + "epoch": 1.6114578332533205, + "grad_norm": 3.3127472400665283, + "learning_rate": 2.3214954270738623e-05, + "loss": 5.0027, + "step": 20140 + }, + { + "epoch": 1.6122579612738037, + "grad_norm": 2.9354875087738037, + "learning_rate": 2.3201583141680486e-05, + "loss": 4.9334, + "step": 20150 + }, + { + "epoch": 1.6130580892942872, + "grad_norm": 5.583163261413574, + "learning_rate": 2.3188212012622345e-05, + "loss": 5.0311, + "step": 20160 + }, + { + "epoch": 1.6138582173147704, + "grad_norm": 2.93721079826355, + "learning_rate": 2.3174840883564208e-05, + "loss": 5.0904, + "step": 20170 + }, + { + "epoch": 1.6146583453352537, + "grad_norm": 3.2356340885162354, + "learning_rate": 2.316146975450607e-05, + "loss": 4.8532, + "step": 20180 + }, + { + "epoch": 1.615458473355737, + "grad_norm": 2.2506487369537354, + "learning_rate": 2.3148098625447933e-05, + "loss": 4.9251, + "step": 20190 + }, + { + "epoch": 1.6162586013762201, + "grad_norm": 4.617792129516602, + "learning_rate": 2.3134727496389796e-05, + "loss": 5.0233, + "step": 20200 + }, + { + "epoch": 1.6170587293967036, + "grad_norm": 5.237868309020996, + "learning_rate": 2.312135636733166e-05, + "loss": 4.9328, + "step": 20210 + }, + { + "epoch": 1.6178588574171866, + "grad_norm": 3.0345842838287354, + "learning_rate": 2.310798523827352e-05, + "loss": 4.9304, + "step": 20220 + }, + { + "epoch": 1.61865898543767, + "grad_norm": 4.008810997009277, + "learning_rate": 2.3094614109215384e-05, + "loss": 5.0552, + "step": 20230 + }, + { + "epoch": 1.6194591134581533, + "grad_norm": 3.126352310180664, + "learning_rate": 2.3081242980157243e-05, + "loss": 5.1723, + "step": 20240 + }, + 
{ + "epoch": 1.6202592414786365, + "grad_norm": 2.8254294395446777, + "learning_rate": 2.3067871851099106e-05, + "loss": 5.0642, + "step": 20250 + }, + { + "epoch": 1.62105936949912, + "grad_norm": 5.126392364501953, + "learning_rate": 2.305450072204097e-05, + "loss": 5.0199, + "step": 20260 + }, + { + "epoch": 1.621859497519603, + "grad_norm": 3.706127882003784, + "learning_rate": 2.304112959298283e-05, + "loss": 4.7973, + "step": 20270 + }, + { + "epoch": 1.6226596255400865, + "grad_norm": 2.41340970993042, + "learning_rate": 2.3027758463924694e-05, + "loss": 5.0053, + "step": 20280 + }, + { + "epoch": 1.6234597535605697, + "grad_norm": 3.6797945499420166, + "learning_rate": 2.3014387334866556e-05, + "loss": 5.0205, + "step": 20290 + }, + { + "epoch": 1.624259881581053, + "grad_norm": 3.8211288452148438, + "learning_rate": 2.300101620580842e-05, + "loss": 4.9126, + "step": 20300 + }, + { + "epoch": 1.6250600096015364, + "grad_norm": 4.667059421539307, + "learning_rate": 2.2987645076750282e-05, + "loss": 4.8957, + "step": 20310 + }, + { + "epoch": 1.6258601376220194, + "grad_norm": 3.175122022628784, + "learning_rate": 2.2974273947692145e-05, + "loss": 4.9999, + "step": 20320 + }, + { + "epoch": 1.6266602656425029, + "grad_norm": 5.523460865020752, + "learning_rate": 2.2960902818634007e-05, + "loss": 5.0519, + "step": 20330 + }, + { + "epoch": 1.6274603936629861, + "grad_norm": 4.125027656555176, + "learning_rate": 2.294753168957587e-05, + "loss": 5.0168, + "step": 20340 + }, + { + "epoch": 1.6282605216834694, + "grad_norm": 3.6286473274230957, + "learning_rate": 2.2934160560517733e-05, + "loss": 5.0361, + "step": 20350 + }, + { + "epoch": 1.6290606497039526, + "grad_norm": 3.122196674346924, + "learning_rate": 2.2920789431459595e-05, + "loss": 4.9964, + "step": 20360 + }, + { + "epoch": 1.6298607777244358, + "grad_norm": 3.153103828430176, + "learning_rate": 2.2907418302401458e-05, + "loss": 5.2442, + "step": 20370 + }, + { + "epoch": 1.6306609057449193, + 
"grad_norm": 3.096548557281494, + "learning_rate": 2.289404717334332e-05, + "loss": 4.9076, + "step": 20380 + }, + { + "epoch": 1.6314610337654023, + "grad_norm": 3.5147552490234375, + "learning_rate": 2.288067604428518e-05, + "loss": 4.9899, + "step": 20390 + }, + { + "epoch": 1.6322611617858858, + "grad_norm": 2.5290558338165283, + "learning_rate": 2.2867304915227043e-05, + "loss": 4.8719, + "step": 20400 + }, + { + "epoch": 1.633061289806369, + "grad_norm": 3.173197031021118, + "learning_rate": 2.2853933786168905e-05, + "loss": 5.1526, + "step": 20410 + }, + { + "epoch": 1.6338614178268522, + "grad_norm": 3.5563409328460693, + "learning_rate": 2.2840562657110768e-05, + "loss": 5.1405, + "step": 20420 + }, + { + "epoch": 1.6346615458473357, + "grad_norm": 4.4011335372924805, + "learning_rate": 2.282719152805263e-05, + "loss": 5.0407, + "step": 20430 + }, + { + "epoch": 1.6354616738678187, + "grad_norm": 2.9350881576538086, + "learning_rate": 2.2813820398994493e-05, + "loss": 5.0016, + "step": 20440 + }, + { + "epoch": 1.6362618018883022, + "grad_norm": 3.446441650390625, + "learning_rate": 2.2800449269936356e-05, + "loss": 5.0064, + "step": 20450 + }, + { + "epoch": 1.6370619299087854, + "grad_norm": 4.476505756378174, + "learning_rate": 2.278707814087822e-05, + "loss": 4.9877, + "step": 20460 + }, + { + "epoch": 1.6378620579292686, + "grad_norm": 3.0098717212677, + "learning_rate": 2.2773707011820078e-05, + "loss": 4.9854, + "step": 20470 + }, + { + "epoch": 1.638662185949752, + "grad_norm": 3.7491648197174072, + "learning_rate": 2.276033588276194e-05, + "loss": 4.9484, + "step": 20480 + }, + { + "epoch": 1.6394623139702351, + "grad_norm": 2.899272918701172, + "learning_rate": 2.2746964753703803e-05, + "loss": 5.0725, + "step": 20490 + }, + { + "epoch": 1.6402624419907186, + "grad_norm": 4.026328086853027, + "learning_rate": 2.2733593624645666e-05, + "loss": 5.1211, + "step": 20500 + }, + { + "epoch": 1.6410625700112018, + "grad_norm": 3.2892467975616455, + 
"learning_rate": 2.272022249558753e-05, + "loss": 5.138, + "step": 20510 + }, + { + "epoch": 1.641862698031685, + "grad_norm": 3.549945116043091, + "learning_rate": 2.270685136652939e-05, + "loss": 5.1562, + "step": 20520 + }, + { + "epoch": 1.6426628260521685, + "grad_norm": 2.596651792526245, + "learning_rate": 2.2693480237471254e-05, + "loss": 4.9889, + "step": 20530 + }, + { + "epoch": 1.6434629540726515, + "grad_norm": 4.061959266662598, + "learning_rate": 2.2680109108413117e-05, + "loss": 4.7925, + "step": 20540 + }, + { + "epoch": 1.644263082093135, + "grad_norm": 4.479005336761475, + "learning_rate": 2.2666737979354976e-05, + "loss": 5.0266, + "step": 20550 + }, + { + "epoch": 1.6450632101136182, + "grad_norm": 3.3911259174346924, + "learning_rate": 2.265336685029684e-05, + "loss": 5.0193, + "step": 20560 + }, + { + "epoch": 1.6458633381341015, + "grad_norm": 3.546149969100952, + "learning_rate": 2.26399957212387e-05, + "loss": 5.1076, + "step": 20570 + }, + { + "epoch": 1.6466634661545847, + "grad_norm": 3.0582120418548584, + "learning_rate": 2.2626624592180564e-05, + "loss": 5.0789, + "step": 20580 + }, + { + "epoch": 1.647463594175068, + "grad_norm": 3.0280404090881348, + "learning_rate": 2.2613253463122427e-05, + "loss": 5.1195, + "step": 20590 + }, + { + "epoch": 1.6482637221955514, + "grad_norm": 3.3636317253112793, + "learning_rate": 2.259988233406429e-05, + "loss": 4.7961, + "step": 20600 + }, + { + "epoch": 1.6490638502160344, + "grad_norm": 2.9316253662109375, + "learning_rate": 2.2586511205006152e-05, + "loss": 4.9303, + "step": 20610 + }, + { + "epoch": 1.6498639782365179, + "grad_norm": 2.799799919128418, + "learning_rate": 2.2573140075948015e-05, + "loss": 5.0877, + "step": 20620 + }, + { + "epoch": 1.650664106257001, + "grad_norm": 4.057336807250977, + "learning_rate": 2.2559768946889874e-05, + "loss": 5.0723, + "step": 20630 + }, + { + "epoch": 1.6514642342774843, + "grad_norm": 3.0030903816223145, + "learning_rate": 2.2546397817831737e-05, 
+ "loss": 5.0231, + "step": 20640 + }, + { + "epoch": 1.6522643622979678, + "grad_norm": 3.6355438232421875, + "learning_rate": 2.25330266887736e-05, + "loss": 4.9928, + "step": 20650 + }, + { + "epoch": 1.6530644903184508, + "grad_norm": 2.6018145084381104, + "learning_rate": 2.2519655559715462e-05, + "loss": 4.9014, + "step": 20660 + }, + { + "epoch": 1.6538646183389343, + "grad_norm": 2.9779539108276367, + "learning_rate": 2.2506284430657325e-05, + "loss": 5.1838, + "step": 20670 + }, + { + "epoch": 1.6546647463594175, + "grad_norm": 2.8421037197113037, + "learning_rate": 2.2492913301599188e-05, + "loss": 5.0329, + "step": 20680 + }, + { + "epoch": 1.6554648743799008, + "grad_norm": 4.846928119659424, + "learning_rate": 2.247954217254105e-05, + "loss": 4.8454, + "step": 20690 + }, + { + "epoch": 1.6562650024003842, + "grad_norm": 2.6106202602386475, + "learning_rate": 2.246617104348291e-05, + "loss": 5.0867, + "step": 20700 + }, + { + "epoch": 1.6570651304208672, + "grad_norm": 4.7461676597595215, + "learning_rate": 2.2452799914424773e-05, + "loss": 4.9611, + "step": 20710 + }, + { + "epoch": 1.6578652584413507, + "grad_norm": 2.6487231254577637, + "learning_rate": 2.2439428785366635e-05, + "loss": 4.9811, + "step": 20720 + }, + { + "epoch": 1.658665386461834, + "grad_norm": 3.8583147525787354, + "learning_rate": 2.2426057656308498e-05, + "loss": 5.0762, + "step": 20730 + }, + { + "epoch": 1.6594655144823172, + "grad_norm": 5.3724846839904785, + "learning_rate": 2.241268652725036e-05, + "loss": 4.9821, + "step": 20740 + }, + { + "epoch": 1.6602656425028004, + "grad_norm": 3.061331033706665, + "learning_rate": 2.2399315398192223e-05, + "loss": 4.9907, + "step": 20750 + }, + { + "epoch": 1.6610657705232836, + "grad_norm": 2.9054677486419678, + "learning_rate": 2.2385944269134086e-05, + "loss": 5.0538, + "step": 20760 + }, + { + "epoch": 1.661865898543767, + "grad_norm": 2.819784164428711, + "learning_rate": 2.237257314007595e-05, + "loss": 5.142, + "step": 20770 + 
}, + { + "epoch": 1.6626660265642503, + "grad_norm": 2.849148750305176, + "learning_rate": 2.235920201101781e-05, + "loss": 4.9267, + "step": 20780 + }, + { + "epoch": 1.6634661545847336, + "grad_norm": 3.2724263668060303, + "learning_rate": 2.2345830881959674e-05, + "loss": 4.9058, + "step": 20790 + }, + { + "epoch": 1.6642662826052168, + "grad_norm": 2.774827480316162, + "learning_rate": 2.2332459752901537e-05, + "loss": 5.0322, + "step": 20800 + }, + { + "epoch": 1.6650664106257, + "grad_norm": 5.824799537658691, + "learning_rate": 2.23190886238434e-05, + "loss": 4.9, + "step": 20810 + }, + { + "epoch": 1.6658665386461835, + "grad_norm": 2.275923490524292, + "learning_rate": 2.2305717494785262e-05, + "loss": 5.037, + "step": 20820 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 3.7803714275360107, + "learning_rate": 2.2292346365727125e-05, + "loss": 4.9231, + "step": 20830 + }, + { + "epoch": 1.66746679468715, + "grad_norm": 3.1279492378234863, + "learning_rate": 2.2278975236668987e-05, + "loss": 5.1147, + "step": 20840 + }, + { + "epoch": 1.6682669227076332, + "grad_norm": 2.410947561264038, + "learning_rate": 2.226560410761085e-05, + "loss": 5.0152, + "step": 20850 + }, + { + "epoch": 1.6690670507281165, + "grad_norm": 3.348628282546997, + "learning_rate": 2.225223297855271e-05, + "loss": 4.9903, + "step": 20860 + }, + { + "epoch": 1.6698671787486, + "grad_norm": 3.7435998916625977, + "learning_rate": 2.2238861849494572e-05, + "loss": 5.1829, + "step": 20870 + }, + { + "epoch": 1.670667306769083, + "grad_norm": 2.872425079345703, + "learning_rate": 2.2225490720436435e-05, + "loss": 5.0331, + "step": 20880 + }, + { + "epoch": 1.6714674347895664, + "grad_norm": 3.688359022140503, + "learning_rate": 2.2212119591378298e-05, + "loss": 4.9848, + "step": 20890 + }, + { + "epoch": 1.6722675628100496, + "grad_norm": 2.4183199405670166, + "learning_rate": 2.219874846232016e-05, + "loss": 4.9103, + "step": 20900 + }, + { + "epoch": 1.6730676908305329, + 
"grad_norm": 3.6013360023498535, + "learning_rate": 2.2185377333262023e-05, + "loss": 4.9898, + "step": 20910 + }, + { + "epoch": 1.6738678188510163, + "grad_norm": 3.179523229598999, + "learning_rate": 2.2172006204203886e-05, + "loss": 5.0444, + "step": 20920 + }, + { + "epoch": 1.6746679468714993, + "grad_norm": 3.9969305992126465, + "learning_rate": 2.2158635075145745e-05, + "loss": 5.2334, + "step": 20930 + }, + { + "epoch": 1.6754680748919828, + "grad_norm": 3.4864439964294434, + "learning_rate": 2.2145263946087608e-05, + "loss": 4.9699, + "step": 20940 + }, + { + "epoch": 1.676268202912466, + "grad_norm": 3.6860687732696533, + "learning_rate": 2.213189281702947e-05, + "loss": 5.0731, + "step": 20950 + }, + { + "epoch": 1.6770683309329493, + "grad_norm": 5.067797660827637, + "learning_rate": 2.2118521687971333e-05, + "loss": 5.0058, + "step": 20960 + }, + { + "epoch": 1.6778684589534325, + "grad_norm": 3.295374631881714, + "learning_rate": 2.2105150558913196e-05, + "loss": 5.2059, + "step": 20970 + }, + { + "epoch": 1.6786685869739157, + "grad_norm": 2.855257987976074, + "learning_rate": 2.209177942985506e-05, + "loss": 5.134, + "step": 20980 + }, + { + "epoch": 1.6794687149943992, + "grad_norm": 3.441474199295044, + "learning_rate": 2.207840830079692e-05, + "loss": 5.0258, + "step": 20990 + }, + { + "epoch": 1.6802688430148822, + "grad_norm": 2.9601447582244873, + "learning_rate": 2.2065037171738784e-05, + "loss": 4.9197, + "step": 21000 + }, + { + "epoch": 1.6802688430148822, + "eval_loss": 5.565999984741211, + "eval_runtime": 13.2003, + "eval_samples_per_second": 3.03, + "eval_steps_per_second": 0.379, + "step": 21000 + }, + { + "epoch": 1.6810689710353657, + "grad_norm": 3.3317766189575195, + "learning_rate": 2.2051666042680643e-05, + "loss": 4.8904, + "step": 21010 + }, + { + "epoch": 1.681869099055849, + "grad_norm": 3.8202877044677734, + "learning_rate": 2.2038294913622506e-05, + "loss": 4.9713, + "step": 21020 + }, + { + "epoch": 1.6826692270763322, + 
"grad_norm": 4.571699142456055, + "learning_rate": 2.202492378456437e-05, + "loss": 4.8879, + "step": 21030 + }, + { + "epoch": 1.6834693550968156, + "grad_norm": 3.028830051422119, + "learning_rate": 2.201155265550623e-05, + "loss": 5.1747, + "step": 21040 + }, + { + "epoch": 1.6842694831172986, + "grad_norm": 3.196197271347046, + "learning_rate": 2.1998181526448094e-05, + "loss": 5.0718, + "step": 21050 + }, + { + "epoch": 1.685069611137782, + "grad_norm": 2.611711263656616, + "learning_rate": 2.1984810397389957e-05, + "loss": 4.8901, + "step": 21060 + }, + { + "epoch": 1.6858697391582653, + "grad_norm": 2.8445308208465576, + "learning_rate": 2.197143926833182e-05, + "loss": 4.8063, + "step": 21070 + }, + { + "epoch": 1.6866698671787486, + "grad_norm": 3.1951494216918945, + "learning_rate": 2.1958068139273682e-05, + "loss": 4.9334, + "step": 21080 + }, + { + "epoch": 1.687469995199232, + "grad_norm": 2.3488590717315674, + "learning_rate": 2.194469701021554e-05, + "loss": 4.9107, + "step": 21090 + }, + { + "epoch": 1.688270123219715, + "grad_norm": 4.008801460266113, + "learning_rate": 2.1931325881157404e-05, + "loss": 5.0342, + "step": 21100 + }, + { + "epoch": 1.6890702512401985, + "grad_norm": 2.185612201690674, + "learning_rate": 2.1917954752099267e-05, + "loss": 5.0431, + "step": 21110 + }, + { + "epoch": 1.6898703792606817, + "grad_norm": 2.571093797683716, + "learning_rate": 2.190458362304113e-05, + "loss": 4.9865, + "step": 21120 + }, + { + "epoch": 1.690670507281165, + "grad_norm": 3.084378957748413, + "learning_rate": 2.1891212493982992e-05, + "loss": 4.9913, + "step": 21130 + }, + { + "epoch": 1.6914706353016484, + "grad_norm": 2.625178813934326, + "learning_rate": 2.1877841364924855e-05, + "loss": 4.8234, + "step": 21140 + }, + { + "epoch": 1.6922707633221314, + "grad_norm": 3.9121668338775635, + "learning_rate": 2.1864470235866717e-05, + "loss": 4.9822, + "step": 21150 + }, + { + "epoch": 1.693070891342615, + "grad_norm": 3.599107265472412, + 
"learning_rate": 2.185109910680858e-05, + "loss": 5.0993, + "step": 21160 + }, + { + "epoch": 1.6938710193630981, + "grad_norm": 2.523613929748535, + "learning_rate": 2.1837727977750443e-05, + "loss": 4.7632, + "step": 21170 + }, + { + "epoch": 1.6946711473835814, + "grad_norm": 5.175262928009033, + "learning_rate": 2.1824356848692305e-05, + "loss": 5.0006, + "step": 21180 + }, + { + "epoch": 1.6954712754040646, + "grad_norm": 2.815415382385254, + "learning_rate": 2.1810985719634168e-05, + "loss": 4.9402, + "step": 21190 + }, + { + "epoch": 1.6962714034245479, + "grad_norm": 3.0755605697631836, + "learning_rate": 2.179761459057603e-05, + "loss": 4.8869, + "step": 21200 + }, + { + "epoch": 1.6970715314450313, + "grad_norm": 2.684621810913086, + "learning_rate": 2.1784243461517893e-05, + "loss": 4.9828, + "step": 21210 + }, + { + "epoch": 1.6978716594655143, + "grad_norm": 2.6758275032043457, + "learning_rate": 2.1770872332459756e-05, + "loss": 5.1145, + "step": 21220 + }, + { + "epoch": 1.6986717874859978, + "grad_norm": 3.088541030883789, + "learning_rate": 2.175750120340162e-05, + "loss": 4.9545, + "step": 21230 + }, + { + "epoch": 1.699471915506481, + "grad_norm": 2.9712045192718506, + "learning_rate": 2.1744130074343478e-05, + "loss": 4.8429, + "step": 21240 + }, + { + "epoch": 1.7002720435269643, + "grad_norm": 2.9696614742279053, + "learning_rate": 2.173075894528534e-05, + "loss": 5.097, + "step": 21250 + }, + { + "epoch": 1.7010721715474477, + "grad_norm": 3.073406934738159, + "learning_rate": 2.1717387816227204e-05, + "loss": 4.9124, + "step": 21260 + }, + { + "epoch": 1.7018722995679307, + "grad_norm": 2.623845100402832, + "learning_rate": 2.1704016687169066e-05, + "loss": 4.9771, + "step": 21270 + }, + { + "epoch": 1.7026724275884142, + "grad_norm": 3.0245361328125, + "learning_rate": 2.169064555811093e-05, + "loss": 4.98, + "step": 21280 + }, + { + "epoch": 1.7034725556088974, + "grad_norm": 5.218230724334717, + "learning_rate": 2.167727442905279e-05, + 
"loss": 5.0837, + "step": 21290 + }, + { + "epoch": 1.7042726836293807, + "grad_norm": 4.287439823150635, + "learning_rate": 2.1663903299994654e-05, + "loss": 4.9261, + "step": 21300 + }, + { + "epoch": 1.7050728116498641, + "grad_norm": 2.7261388301849365, + "learning_rate": 2.1650532170936517e-05, + "loss": 4.9462, + "step": 21310 + }, + { + "epoch": 1.7058729396703471, + "grad_norm": 3.0269289016723633, + "learning_rate": 2.1637161041878376e-05, + "loss": 4.9972, + "step": 21320 + }, + { + "epoch": 1.7066730676908306, + "grad_norm": 3.629207134246826, + "learning_rate": 2.162378991282024e-05, + "loss": 5.0556, + "step": 21330 + }, + { + "epoch": 1.7074731957113138, + "grad_norm": 4.204544544219971, + "learning_rate": 2.16104187837621e-05, + "loss": 5.0391, + "step": 21340 + }, + { + "epoch": 1.708273323731797, + "grad_norm": 3.9278762340545654, + "learning_rate": 2.1597047654703964e-05, + "loss": 5.0107, + "step": 21350 + }, + { + "epoch": 1.7090734517522803, + "grad_norm": 3.9020843505859375, + "learning_rate": 2.1583676525645827e-05, + "loss": 4.9798, + "step": 21360 + }, + { + "epoch": 1.7098735797727636, + "grad_norm": 3.871673107147217, + "learning_rate": 2.157030539658769e-05, + "loss": 5.0305, + "step": 21370 + }, + { + "epoch": 1.710673707793247, + "grad_norm": 3.754272937774658, + "learning_rate": 2.1556934267529552e-05, + "loss": 5.1621, + "step": 21380 + }, + { + "epoch": 1.7114738358137302, + "grad_norm": 3.5809175968170166, + "learning_rate": 2.1543563138471415e-05, + "loss": 5.0017, + "step": 21390 + }, + { + "epoch": 1.7122739638342135, + "grad_norm": 3.7547807693481445, + "learning_rate": 2.1530192009413274e-05, + "loss": 5.0926, + "step": 21400 + }, + { + "epoch": 1.7130740918546967, + "grad_norm": 3.9002151489257812, + "learning_rate": 2.1516820880355137e-05, + "loss": 4.9253, + "step": 21410 + }, + { + "epoch": 1.71387421987518, + "grad_norm": 3.080944299697876, + "learning_rate": 2.1503449751297e-05, + "loss": 4.963, + "step": 21420 + }, + { 
+ "epoch": 1.7146743478956634, + "grad_norm": 4.989534378051758, + "learning_rate": 2.1490078622238862e-05, + "loss": 5.149, + "step": 21430 + }, + { + "epoch": 1.7154744759161464, + "grad_norm": 3.4675042629241943, + "learning_rate": 2.1476707493180725e-05, + "loss": 4.9175, + "step": 21440 + }, + { + "epoch": 1.71627460393663, + "grad_norm": 2.9454565048217773, + "learning_rate": 2.1463336364122588e-05, + "loss": 5.0187, + "step": 21450 + }, + { + "epoch": 1.7170747319571131, + "grad_norm": 3.1735284328460693, + "learning_rate": 2.144996523506445e-05, + "loss": 4.932, + "step": 21460 + }, + { + "epoch": 1.7178748599775964, + "grad_norm": 4.5251054763793945, + "learning_rate": 2.1436594106006313e-05, + "loss": 5.0622, + "step": 21470 + }, + { + "epoch": 1.7186749879980798, + "grad_norm": 3.7694296836853027, + "learning_rate": 2.1423222976948173e-05, + "loss": 5.071, + "step": 21480 + }, + { + "epoch": 1.7194751160185628, + "grad_norm": 3.9317219257354736, + "learning_rate": 2.1409851847890035e-05, + "loss": 4.9159, + "step": 21490 + }, + { + "epoch": 1.7202752440390463, + "grad_norm": 3.568376064300537, + "learning_rate": 2.1396480718831898e-05, + "loss": 5.1121, + "step": 21500 + }, + { + "epoch": 1.7210753720595295, + "grad_norm": 3.1742336750030518, + "learning_rate": 2.138310958977376e-05, + "loss": 5.0329, + "step": 21510 + }, + { + "epoch": 1.7218755000800128, + "grad_norm": 4.287962913513184, + "learning_rate": 2.1369738460715623e-05, + "loss": 4.9761, + "step": 21520 + }, + { + "epoch": 1.7226756281004962, + "grad_norm": 3.787036180496216, + "learning_rate": 2.1356367331657486e-05, + "loss": 4.9856, + "step": 21530 + }, + { + "epoch": 1.7234757561209793, + "grad_norm": 3.4677791595458984, + "learning_rate": 2.134299620259935e-05, + "loss": 5.0777, + "step": 21540 + }, + { + "epoch": 1.7242758841414627, + "grad_norm": 3.511162757873535, + "learning_rate": 2.1329625073541208e-05, + "loss": 5.0944, + "step": 21550 + }, + { + "epoch": 1.725076012161946, + 
"grad_norm": 3.1692442893981934, + "learning_rate": 2.131625394448307e-05, + "loss": 4.936, + "step": 21560 + }, + { + "epoch": 1.7258761401824292, + "grad_norm": 3.253077507019043, + "learning_rate": 2.1302882815424933e-05, + "loss": 5.1004, + "step": 21570 + }, + { + "epoch": 1.7266762682029124, + "grad_norm": 4.503719806671143, + "learning_rate": 2.1289511686366796e-05, + "loss": 5.0816, + "step": 21580 + }, + { + "epoch": 1.7274763962233957, + "grad_norm": 3.9515204429626465, + "learning_rate": 2.127614055730866e-05, + "loss": 5.0213, + "step": 21590 + }, + { + "epoch": 1.7282765242438791, + "grad_norm": 3.2743678092956543, + "learning_rate": 2.126276942825052e-05, + "loss": 4.945, + "step": 21600 + }, + { + "epoch": 1.7290766522643621, + "grad_norm": 3.2138924598693848, + "learning_rate": 2.1249398299192384e-05, + "loss": 4.9907, + "step": 21610 + }, + { + "epoch": 1.7298767802848456, + "grad_norm": 2.7698373794555664, + "learning_rate": 2.1236027170134247e-05, + "loss": 5.0068, + "step": 21620 + }, + { + "epoch": 1.7306769083053288, + "grad_norm": 3.4661061763763428, + "learning_rate": 2.122265604107611e-05, + "loss": 4.7865, + "step": 21630 + }, + { + "epoch": 1.731477036325812, + "grad_norm": 4.473834991455078, + "learning_rate": 2.1209284912017972e-05, + "loss": 5.0263, + "step": 21640 + }, + { + "epoch": 1.7322771643462955, + "grad_norm": 4.1776275634765625, + "learning_rate": 2.1195913782959835e-05, + "loss": 4.8809, + "step": 21650 + }, + { + "epoch": 1.7330772923667785, + "grad_norm": 3.335580348968506, + "learning_rate": 2.1182542653901698e-05, + "loss": 5.0557, + "step": 21660 + }, + { + "epoch": 1.733877420387262, + "grad_norm": 4.314727783203125, + "learning_rate": 2.116917152484356e-05, + "loss": 4.8628, + "step": 21670 + }, + { + "epoch": 1.7346775484077452, + "grad_norm": 3.1467959880828857, + "learning_rate": 2.1155800395785423e-05, + "loss": 5.0191, + "step": 21680 + }, + { + "epoch": 1.7354776764282285, + "grad_norm": 3.9090895652770996, + 
"learning_rate": 2.1142429266727286e-05, + "loss": 4.8245, + "step": 21690 + }, + { + "epoch": 1.736277804448712, + "grad_norm": 2.6189544200897217, + "learning_rate": 2.112905813766915e-05, + "loss": 4.8945, + "step": 21700 + }, + { + "epoch": 1.737077932469195, + "grad_norm": 3.156756639480591, + "learning_rate": 2.1115687008611008e-05, + "loss": 5.0731, + "step": 21710 + }, + { + "epoch": 1.7378780604896784, + "grad_norm": 3.3744938373565674, + "learning_rate": 2.110231587955287e-05, + "loss": 5.202, + "step": 21720 + }, + { + "epoch": 1.7386781885101616, + "grad_norm": 3.677507162094116, + "learning_rate": 2.1088944750494733e-05, + "loss": 5.0053, + "step": 21730 + }, + { + "epoch": 1.7394783165306449, + "grad_norm": 2.5175113677978516, + "learning_rate": 2.1075573621436596e-05, + "loss": 4.8895, + "step": 21740 + }, + { + "epoch": 1.7402784445511283, + "grad_norm": 3.9189088344573975, + "learning_rate": 2.106220249237846e-05, + "loss": 4.9188, + "step": 21750 + }, + { + "epoch": 1.7410785725716114, + "grad_norm": 4.0822625160217285, + "learning_rate": 2.104883136332032e-05, + "loss": 4.9906, + "step": 21760 + }, + { + "epoch": 1.7418787005920948, + "grad_norm": 3.3237664699554443, + "learning_rate": 2.1035460234262184e-05, + "loss": 5.0885, + "step": 21770 + }, + { + "epoch": 1.742678828612578, + "grad_norm": 4.276275634765625, + "learning_rate": 2.1022089105204043e-05, + "loss": 4.9108, + "step": 21780 + }, + { + "epoch": 1.7434789566330613, + "grad_norm": 4.213038444519043, + "learning_rate": 2.1008717976145906e-05, + "loss": 4.9961, + "step": 21790 + }, + { + "epoch": 1.7442790846535445, + "grad_norm": 3.0935757160186768, + "learning_rate": 2.099534684708777e-05, + "loss": 5.0978, + "step": 21800 + }, + { + "epoch": 1.7450792126740278, + "grad_norm": 3.3736915588378906, + "learning_rate": 2.098197571802963e-05, + "loss": 4.8611, + "step": 21810 + }, + { + "epoch": 1.7458793406945112, + "grad_norm": 3.6856744289398193, + "learning_rate": 
2.0968604588971494e-05, + "loss": 4.9403, + "step": 21820 + }, + { + "epoch": 1.7466794687149942, + "grad_norm": 3.847534656524658, + "learning_rate": 2.0955233459913357e-05, + "loss": 5.0023, + "step": 21830 + }, + { + "epoch": 1.7474795967354777, + "grad_norm": 4.812027931213379, + "learning_rate": 2.094186233085522e-05, + "loss": 4.9796, + "step": 21840 + }, + { + "epoch": 1.748279724755961, + "grad_norm": 2.484204053878784, + "learning_rate": 2.0928491201797082e-05, + "loss": 4.9755, + "step": 21850 + }, + { + "epoch": 1.7490798527764442, + "grad_norm": 3.052938938140869, + "learning_rate": 2.091512007273894e-05, + "loss": 5.013, + "step": 21860 + }, + { + "epoch": 1.7498799807969276, + "grad_norm": 3.8069536685943604, + "learning_rate": 2.0901748943680804e-05, + "loss": 5.1675, + "step": 21870 + }, + { + "epoch": 1.7506801088174107, + "grad_norm": 4.167869567871094, + "learning_rate": 2.0888377814622667e-05, + "loss": 5.1041, + "step": 21880 + }, + { + "epoch": 1.7514802368378941, + "grad_norm": 3.1281728744506836, + "learning_rate": 2.087500668556453e-05, + "loss": 5.0955, + "step": 21890 + }, + { + "epoch": 1.7522803648583773, + "grad_norm": 3.4163386821746826, + "learning_rate": 2.0861635556506392e-05, + "loss": 4.976, + "step": 21900 + }, + { + "epoch": 1.7530804928788606, + "grad_norm": 2.665518045425415, + "learning_rate": 2.0848264427448255e-05, + "loss": 4.9827, + "step": 21910 + }, + { + "epoch": 1.753880620899344, + "grad_norm": 3.978625535964966, + "learning_rate": 2.0834893298390117e-05, + "loss": 5.125, + "step": 21920 + }, + { + "epoch": 1.754680748919827, + "grad_norm": 2.702791452407837, + "learning_rate": 2.082152216933198e-05, + "loss": 4.9508, + "step": 21930 + }, + { + "epoch": 1.7554808769403105, + "grad_norm": 2.9301092624664307, + "learning_rate": 2.080815104027384e-05, + "loss": 5.1346, + "step": 21940 + }, + { + "epoch": 1.7562810049607938, + "grad_norm": 3.169142007827759, + "learning_rate": 2.0794779911215702e-05, + "loss": 4.899, + 
"step": 21950 + }, + { + "epoch": 1.757081132981277, + "grad_norm": 4.339301109313965, + "learning_rate": 2.0781408782157565e-05, + "loss": 4.9364, + "step": 21960 + }, + { + "epoch": 1.7578812610017602, + "grad_norm": 3.94887113571167, + "learning_rate": 2.0768037653099427e-05, + "loss": 4.9057, + "step": 21970 + }, + { + "epoch": 1.7586813890222435, + "grad_norm": 2.7177371978759766, + "learning_rate": 2.075466652404129e-05, + "loss": 5.0119, + "step": 21980 + }, + { + "epoch": 1.759481517042727, + "grad_norm": 4.602911472320557, + "learning_rate": 2.0741295394983153e-05, + "loss": 4.8852, + "step": 21990 + }, + { + "epoch": 1.7602816450632102, + "grad_norm": 3.233092784881592, + "learning_rate": 2.0727924265925015e-05, + "loss": 5.0399, + "step": 22000 + }, + { + "epoch": 1.7610817730836934, + "grad_norm": 3.1726016998291016, + "learning_rate": 2.0714553136866878e-05, + "loss": 4.9855, + "step": 22010 + }, + { + "epoch": 1.7618819011041766, + "grad_norm": 8.455097198486328, + "learning_rate": 2.070118200780874e-05, + "loss": 4.8389, + "step": 22020 + }, + { + "epoch": 1.7626820291246599, + "grad_norm": 3.6071341037750244, + "learning_rate": 2.0687810878750604e-05, + "loss": 5.059, + "step": 22030 + }, + { + "epoch": 1.7634821571451433, + "grad_norm": 3.182056188583374, + "learning_rate": 2.0674439749692466e-05, + "loss": 5.0172, + "step": 22040 + }, + { + "epoch": 1.7642822851656264, + "grad_norm": 3.3609728813171387, + "learning_rate": 2.066106862063433e-05, + "loss": 4.8298, + "step": 22050 + }, + { + "epoch": 1.7650824131861098, + "grad_norm": 4.301130771636963, + "learning_rate": 2.064769749157619e-05, + "loss": 5.0654, + "step": 22060 + }, + { + "epoch": 1.765882541206593, + "grad_norm": 3.5085902214050293, + "learning_rate": 2.0634326362518054e-05, + "loss": 5.035, + "step": 22070 + }, + { + "epoch": 1.7666826692270763, + "grad_norm": 3.628776788711548, + "learning_rate": 2.0620955233459917e-05, + "loss": 4.846, + "step": 22080 + }, + { + "epoch": 
1.7674827972475597, + "grad_norm": 2.593838691711426, + "learning_rate": 2.0607584104401776e-05, + "loss": 4.9209, + "step": 22090 + }, + { + "epoch": 1.7682829252680428, + "grad_norm": 4.2121477127075195, + "learning_rate": 2.059421297534364e-05, + "loss": 5.0184, + "step": 22100 + }, + { + "epoch": 1.7690830532885262, + "grad_norm": 2.8298094272613525, + "learning_rate": 2.05808418462855e-05, + "loss": 4.9215, + "step": 22110 + }, + { + "epoch": 1.7698831813090095, + "grad_norm": 4.237027168273926, + "learning_rate": 2.0567470717227364e-05, + "loss": 5.0723, + "step": 22120 + }, + { + "epoch": 1.7706833093294927, + "grad_norm": 3.460895299911499, + "learning_rate": 2.0554099588169227e-05, + "loss": 5.0134, + "step": 22130 + }, + { + "epoch": 1.7714834373499762, + "grad_norm": 2.9256222248077393, + "learning_rate": 2.054072845911109e-05, + "loss": 4.8817, + "step": 22140 + }, + { + "epoch": 1.7722835653704592, + "grad_norm": 3.658893585205078, + "learning_rate": 2.0527357330052952e-05, + "loss": 4.906, + "step": 22150 + }, + { + "epoch": 1.7730836933909426, + "grad_norm": 3.463165521621704, + "learning_rate": 2.0513986200994815e-05, + "loss": 5.0674, + "step": 22160 + }, + { + "epoch": 1.7738838214114259, + "grad_norm": 2.669952154159546, + "learning_rate": 2.0500615071936674e-05, + "loss": 4.9548, + "step": 22170 + }, + { + "epoch": 1.774683949431909, + "grad_norm": 3.712484836578369, + "learning_rate": 2.0487243942878537e-05, + "loss": 5.0874, + "step": 22180 + }, + { + "epoch": 1.7754840774523923, + "grad_norm": 3.4459969997406006, + "learning_rate": 2.04738728138204e-05, + "loss": 4.9384, + "step": 22190 + }, + { + "epoch": 1.7762842054728756, + "grad_norm": 4.165179252624512, + "learning_rate": 2.0460501684762263e-05, + "loss": 5.0259, + "step": 22200 + }, + { + "epoch": 1.777084333493359, + "grad_norm": 3.050100326538086, + "learning_rate": 2.0447130555704125e-05, + "loss": 4.9838, + "step": 22210 + }, + { + "epoch": 1.777884461513842, + "grad_norm": 
3.6571977138519287, + "learning_rate": 2.0433759426645988e-05, + "loss": 4.8971, + "step": 22220 + }, + { + "epoch": 1.7786845895343255, + "grad_norm": 3.6707186698913574, + "learning_rate": 2.042038829758785e-05, + "loss": 5.0419, + "step": 22230 + }, + { + "epoch": 1.7794847175548087, + "grad_norm": 3.24741792678833, + "learning_rate": 2.0407017168529713e-05, + "loss": 5.1247, + "step": 22240 + }, + { + "epoch": 1.780284845575292, + "grad_norm": 4.1845703125, + "learning_rate": 2.0393646039471573e-05, + "loss": 5.1055, + "step": 22250 + }, + { + "epoch": 1.7810849735957754, + "grad_norm": 2.6318318843841553, + "learning_rate": 2.0380274910413435e-05, + "loss": 4.9075, + "step": 22260 + }, + { + "epoch": 1.7818851016162585, + "grad_norm": 3.2743167877197266, + "learning_rate": 2.0366903781355298e-05, + "loss": 5.0425, + "step": 22270 + }, + { + "epoch": 1.782685229636742, + "grad_norm": 2.7158398628234863, + "learning_rate": 2.035353265229716e-05, + "loss": 4.913, + "step": 22280 + }, + { + "epoch": 1.7834853576572252, + "grad_norm": 4.428764343261719, + "learning_rate": 2.0340161523239023e-05, + "loss": 4.9971, + "step": 22290 + }, + { + "epoch": 1.7842854856777084, + "grad_norm": 3.811169385910034, + "learning_rate": 2.0326790394180886e-05, + "loss": 4.9423, + "step": 22300 + }, + { + "epoch": 1.7850856136981919, + "grad_norm": 3.651449680328369, + "learning_rate": 2.031341926512275e-05, + "loss": 5.2026, + "step": 22310 + }, + { + "epoch": 1.7858857417186749, + "grad_norm": 3.165903329849243, + "learning_rate": 2.030004813606461e-05, + "loss": 5.1141, + "step": 22320 + }, + { + "epoch": 1.7866858697391583, + "grad_norm": 2.8548033237457275, + "learning_rate": 2.028667700700647e-05, + "loss": 5.046, + "step": 22330 + }, + { + "epoch": 1.7874859977596416, + "grad_norm": 3.2436907291412354, + "learning_rate": 2.0273305877948333e-05, + "loss": 5.1067, + "step": 22340 + }, + { + "epoch": 1.7882861257801248, + "grad_norm": 3.599581241607666, + "learning_rate": 
2.0259934748890196e-05, + "loss": 5.0487, + "step": 22350 + }, + { + "epoch": 1.7890862538006083, + "grad_norm": 3.722228765487671, + "learning_rate": 2.024656361983206e-05, + "loss": 5.0016, + "step": 22360 + }, + { + "epoch": 1.7898863818210913, + "grad_norm": 3.774880886077881, + "learning_rate": 2.023319249077392e-05, + "loss": 5.0675, + "step": 22370 + }, + { + "epoch": 1.7906865098415747, + "grad_norm": 4.3687968254089355, + "learning_rate": 2.0219821361715784e-05, + "loss": 5.0152, + "step": 22380 + }, + { + "epoch": 1.791486637862058, + "grad_norm": 3.880603790283203, + "learning_rate": 2.0206450232657647e-05, + "loss": 4.9069, + "step": 22390 + }, + { + "epoch": 1.7922867658825412, + "grad_norm": 6.448166847229004, + "learning_rate": 2.0193079103599506e-05, + "loss": 5.0175, + "step": 22400 + }, + { + "epoch": 1.7930868939030244, + "grad_norm": 3.8508150577545166, + "learning_rate": 2.017970797454137e-05, + "loss": 5.1245, + "step": 22410 + }, + { + "epoch": 1.7938870219235077, + "grad_norm": 2.9482107162475586, + "learning_rate": 2.016633684548323e-05, + "loss": 4.9807, + "step": 22420 + }, + { + "epoch": 1.7946871499439911, + "grad_norm": 3.987626314163208, + "learning_rate": 2.0152965716425094e-05, + "loss": 5.1364, + "step": 22430 + }, + { + "epoch": 1.7954872779644742, + "grad_norm": 3.995668649673462, + "learning_rate": 2.0139594587366957e-05, + "loss": 5.0172, + "step": 22440 + }, + { + "epoch": 1.7962874059849576, + "grad_norm": 3.471242904663086, + "learning_rate": 2.012622345830882e-05, + "loss": 5.0859, + "step": 22450 + }, + { + "epoch": 1.7970875340054409, + "grad_norm": 4.026039123535156, + "learning_rate": 2.0112852329250682e-05, + "loss": 5.1441, + "step": 22460 + }, + { + "epoch": 1.797887662025924, + "grad_norm": 5.313076019287109, + "learning_rate": 2.0099481200192545e-05, + "loss": 4.9975, + "step": 22470 + }, + { + "epoch": 1.7986877900464076, + "grad_norm": 3.445133924484253, + "learning_rate": 2.0086110071134408e-05, + "loss": 
5.0569, + "step": 22480 + }, + { + "epoch": 1.7994879180668906, + "grad_norm": 3.028310775756836, + "learning_rate": 2.007273894207627e-05, + "loss": 4.9515, + "step": 22490 + }, + { + "epoch": 1.800288046087374, + "grad_norm": 3.2729883193969727, + "learning_rate": 2.0059367813018133e-05, + "loss": 4.9365, + "step": 22500 + }, + { + "epoch": 1.8010881741078573, + "grad_norm": 2.9513375759124756, + "learning_rate": 2.0045996683959996e-05, + "loss": 4.8909, + "step": 22510 + }, + { + "epoch": 1.8018883021283405, + "grad_norm": 3.4466264247894287, + "learning_rate": 2.003262555490186e-05, + "loss": 4.8748, + "step": 22520 + }, + { + "epoch": 1.802688430148824, + "grad_norm": 3.1651034355163574, + "learning_rate": 2.001925442584372e-05, + "loss": 4.8218, + "step": 22530 + }, + { + "epoch": 1.803488558169307, + "grad_norm": 2.885279893875122, + "learning_rate": 2.0005883296785584e-05, + "loss": 4.8861, + "step": 22540 + }, + { + "epoch": 1.8042886861897904, + "grad_norm": 3.265558958053589, + "learning_rate": 1.9992512167727447e-05, + "loss": 5.1021, + "step": 22550 + }, + { + "epoch": 1.8050888142102737, + "grad_norm": 3.3836588859558105, + "learning_rate": 1.9979141038669306e-05, + "loss": 4.9152, + "step": 22560 + }, + { + "epoch": 1.805888942230757, + "grad_norm": 4.1803693771362305, + "learning_rate": 1.996576990961117e-05, + "loss": 4.9414, + "step": 22570 + }, + { + "epoch": 1.8066890702512401, + "grad_norm": 3.471590995788574, + "learning_rate": 1.995239878055303e-05, + "loss": 4.8972, + "step": 22580 + }, + { + "epoch": 1.8074891982717234, + "grad_norm": 4.42878532409668, + "learning_rate": 1.9939027651494894e-05, + "loss": 5.0472, + "step": 22590 + }, + { + "epoch": 1.8082893262922068, + "grad_norm": 3.460707187652588, + "learning_rate": 1.9925656522436757e-05, + "loss": 5.1091, + "step": 22600 + }, + { + "epoch": 1.80908945431269, + "grad_norm": 2.901390790939331, + "learning_rate": 1.991228539337862e-05, + "loss": 4.9029, + "step": 22610 + }, + { + "epoch": 
1.8098895823331733, + "grad_norm": 3.826354503631592, + "learning_rate": 1.9898914264320482e-05, + "loss": 5.0852, + "step": 22620 + }, + { + "epoch": 1.8106897103536566, + "grad_norm": 3.9841110706329346, + "learning_rate": 1.988554313526234e-05, + "loss": 4.9441, + "step": 22630 + }, + { + "epoch": 1.8114898383741398, + "grad_norm": 3.2412731647491455, + "learning_rate": 1.9872172006204204e-05, + "loss": 4.9013, + "step": 22640 + }, + { + "epoch": 1.8122899663946233, + "grad_norm": 3.6046488285064697, + "learning_rate": 1.9858800877146067e-05, + "loss": 5.0176, + "step": 22650 + }, + { + "epoch": 1.8130900944151063, + "grad_norm": 4.1147260665893555, + "learning_rate": 1.984542974808793e-05, + "loss": 4.8024, + "step": 22660 + }, + { + "epoch": 1.8138902224355897, + "grad_norm": 3.180675983428955, + "learning_rate": 1.9832058619029792e-05, + "loss": 4.9099, + "step": 22670 + }, + { + "epoch": 1.814690350456073, + "grad_norm": 3.9428789615631104, + "learning_rate": 1.9818687489971655e-05, + "loss": 5.003, + "step": 22680 + }, + { + "epoch": 1.8154904784765562, + "grad_norm": 2.911067247390747, + "learning_rate": 1.9805316360913517e-05, + "loss": 4.9009, + "step": 22690 + }, + { + "epoch": 1.8162906064970397, + "grad_norm": 3.8873329162597656, + "learning_rate": 1.979194523185538e-05, + "loss": 5.1122, + "step": 22700 + }, + { + "epoch": 1.8170907345175227, + "grad_norm": 3.5868935585021973, + "learning_rate": 1.977857410279724e-05, + "loss": 4.9064, + "step": 22710 + }, + { + "epoch": 1.8178908625380061, + "grad_norm": 3.8088326454162598, + "learning_rate": 1.9765202973739102e-05, + "loss": 5.0972, + "step": 22720 + }, + { + "epoch": 1.8186909905584894, + "grad_norm": 3.4376535415649414, + "learning_rate": 1.9751831844680965e-05, + "loss": 4.9626, + "step": 22730 + }, + { + "epoch": 1.8194911185789726, + "grad_norm": 3.151939630508423, + "learning_rate": 1.9738460715622827e-05, + "loss": 4.9541, + "step": 22740 + }, + { + "epoch": 1.820291246599456, + "grad_norm": 
4.372435569763184, + "learning_rate": 1.972508958656469e-05, + "loss": 4.9463, + "step": 22750 + }, + { + "epoch": 1.821091374619939, + "grad_norm": 2.9375088214874268, + "learning_rate": 1.9711718457506553e-05, + "loss": 5.019, + "step": 22760 + }, + { + "epoch": 1.8218915026404225, + "grad_norm": 3.458109140396118, + "learning_rate": 1.9698347328448416e-05, + "loss": 5.0701, + "step": 22770 + }, + { + "epoch": 1.8226916306609058, + "grad_norm": 3.90632700920105, + "learning_rate": 1.9684976199390278e-05, + "loss": 4.9235, + "step": 22780 + }, + { + "epoch": 1.823491758681389, + "grad_norm": 3.9509658813476562, + "learning_rate": 1.9671605070332138e-05, + "loss": 4.892, + "step": 22790 + }, + { + "epoch": 1.8242918867018723, + "grad_norm": 3.8089678287506104, + "learning_rate": 1.9658233941274e-05, + "loss": 5.0283, + "step": 22800 + }, + { + "epoch": 1.8250920147223555, + "grad_norm": 3.4844071865081787, + "learning_rate": 1.9644862812215863e-05, + "loss": 4.9961, + "step": 22810 + }, + { + "epoch": 1.825892142742839, + "grad_norm": 2.961293935775757, + "learning_rate": 1.9631491683157726e-05, + "loss": 5.0996, + "step": 22820 + }, + { + "epoch": 1.826692270763322, + "grad_norm": 2.8211441040039062, + "learning_rate": 1.9618120554099588e-05, + "loss": 4.9355, + "step": 22830 + }, + { + "epoch": 1.8274923987838054, + "grad_norm": 4.970695972442627, + "learning_rate": 1.960474942504145e-05, + "loss": 5.0109, + "step": 22840 + }, + { + "epoch": 1.8282925268042887, + "grad_norm": 3.2211780548095703, + "learning_rate": 1.9591378295983314e-05, + "loss": 4.9364, + "step": 22850 + }, + { + "epoch": 1.829092654824772, + "grad_norm": 2.114428758621216, + "learning_rate": 1.9578007166925176e-05, + "loss": 4.9879, + "step": 22860 + }, + { + "epoch": 1.8298927828452554, + "grad_norm": 5.383838176727295, + "learning_rate": 1.956463603786704e-05, + "loss": 5.0305, + "step": 22870 + }, + { + "epoch": 1.8306929108657384, + "grad_norm": 3.4552454948425293, + "learning_rate": 
1.9551264908808902e-05, + "loss": 4.946, + "step": 22880 + }, + { + "epoch": 1.8314930388862218, + "grad_norm": 3.4518730640411377, + "learning_rate": 1.9537893779750764e-05, + "loss": 4.9089, + "step": 22890 + }, + { + "epoch": 1.832293166906705, + "grad_norm": 8.194537162780762, + "learning_rate": 1.9524522650692627e-05, + "loss": 4.8648, + "step": 22900 + }, + { + "epoch": 1.8330932949271883, + "grad_norm": 3.472346782684326, + "learning_rate": 1.951115152163449e-05, + "loss": 5.0056, + "step": 22910 + }, + { + "epoch": 1.8338934229476718, + "grad_norm": 2.3425405025482178, + "learning_rate": 1.9497780392576352e-05, + "loss": 5.0722, + "step": 22920 + }, + { + "epoch": 1.8346935509681548, + "grad_norm": 4.770868301391602, + "learning_rate": 1.9484409263518215e-05, + "loss": 4.8792, + "step": 22930 + }, + { + "epoch": 1.8354936789886382, + "grad_norm": 3.50227689743042, + "learning_rate": 1.9471038134460074e-05, + "loss": 4.9434, + "step": 22940 + }, + { + "epoch": 1.8362938070091215, + "grad_norm": 3.5927786827087402, + "learning_rate": 1.9457667005401937e-05, + "loss": 4.9179, + "step": 22950 + }, + { + "epoch": 1.8370939350296047, + "grad_norm": 4.16779088973999, + "learning_rate": 1.94442958763438e-05, + "loss": 5.1915, + "step": 22960 + }, + { + "epoch": 1.8378940630500882, + "grad_norm": 2.5966103076934814, + "learning_rate": 1.9430924747285663e-05, + "loss": 4.9658, + "step": 22970 + }, + { + "epoch": 1.8386941910705712, + "grad_norm": 2.5739660263061523, + "learning_rate": 1.9417553618227525e-05, + "loss": 4.9782, + "step": 22980 + }, + { + "epoch": 1.8394943190910547, + "grad_norm": 2.8742406368255615, + "learning_rate": 1.9404182489169388e-05, + "loss": 5.0417, + "step": 22990 + }, + { + "epoch": 1.840294447111538, + "grad_norm": 4.191195964813232, + "learning_rate": 1.939081136011125e-05, + "loss": 4.8885, + "step": 23000 + }, + { + "epoch": 1.8410945751320211, + "grad_norm": 3.3833866119384766, + "learning_rate": 1.9377440231053113e-05, + "loss": 
5.1906, + "step": 23010 + }, + { + "epoch": 1.8418947031525044, + "grad_norm": 3.468492031097412, + "learning_rate": 1.9364069101994973e-05, + "loss": 5.0216, + "step": 23020 + }, + { + "epoch": 1.8426948311729876, + "grad_norm": 3.2246339321136475, + "learning_rate": 1.9350697972936835e-05, + "loss": 5.0528, + "step": 23030 + }, + { + "epoch": 1.843494959193471, + "grad_norm": 3.780441999435425, + "learning_rate": 1.9337326843878698e-05, + "loss": 4.9197, + "step": 23040 + }, + { + "epoch": 1.844295087213954, + "grad_norm": 3.0459189414978027, + "learning_rate": 1.932395571482056e-05, + "loss": 4.9636, + "step": 23050 + }, + { + "epoch": 1.8450952152344375, + "grad_norm": 2.835489511489868, + "learning_rate": 1.9310584585762423e-05, + "loss": 5.044, + "step": 23060 + }, + { + "epoch": 1.8458953432549208, + "grad_norm": 2.615612506866455, + "learning_rate": 1.9297213456704286e-05, + "loss": 4.9497, + "step": 23070 + }, + { + "epoch": 1.846695471275404, + "grad_norm": 3.1220602989196777, + "learning_rate": 1.928384232764615e-05, + "loss": 4.8735, + "step": 23080 + }, + { + "epoch": 1.8474955992958875, + "grad_norm": 4.260631561279297, + "learning_rate": 1.9271808311493824e-05, + "loss": 5.0581, + "step": 23090 + }, + { + "epoch": 1.8482957273163705, + "grad_norm": 2.928492546081543, + "learning_rate": 1.9258437182435684e-05, + "loss": 4.9898, + "step": 23100 + }, + { + "epoch": 1.849095855336854, + "grad_norm": 3.321458101272583, + "learning_rate": 1.9245066053377546e-05, + "loss": 4.9585, + "step": 23110 + }, + { + "epoch": 1.8498959833573372, + "grad_norm": 3.4391822814941406, + "learning_rate": 1.923169492431941e-05, + "loss": 5.0762, + "step": 23120 + }, + { + "epoch": 1.8506961113778204, + "grad_norm": 2.8308773040771484, + "learning_rate": 1.9218323795261272e-05, + "loss": 4.9746, + "step": 23130 + }, + { + "epoch": 1.8514962393983039, + "grad_norm": 3.0411789417266846, + "learning_rate": 1.9204952666203135e-05, + "loss": 4.7202, + "step": 23140 + }, + { + 
"epoch": 1.852296367418787, + "grad_norm": 4.386777400970459, + "learning_rate": 1.9191581537144997e-05, + "loss": 5.0321, + "step": 23150 + }, + { + "epoch": 1.8530964954392704, + "grad_norm": 3.381636381149292, + "learning_rate": 1.917821040808686e-05, + "loss": 4.9549, + "step": 23160 + }, + { + "epoch": 1.8538966234597536, + "grad_norm": 2.740494728088379, + "learning_rate": 1.9164839279028723e-05, + "loss": 5.1105, + "step": 23170 + }, + { + "epoch": 1.8546967514802368, + "grad_norm": 2.8650572299957275, + "learning_rate": 1.9151468149970582e-05, + "loss": 4.9549, + "step": 23180 + }, + { + "epoch": 1.85549687950072, + "grad_norm": 2.8108630180358887, + "learning_rate": 1.9138097020912445e-05, + "loss": 4.8654, + "step": 23190 + }, + { + "epoch": 1.8562970075212033, + "grad_norm": 4.914087772369385, + "learning_rate": 1.9124725891854307e-05, + "loss": 4.9375, + "step": 23200 + }, + { + "epoch": 1.8570971355416868, + "grad_norm": 3.421011447906494, + "learning_rate": 1.911135476279617e-05, + "loss": 5.0395, + "step": 23210 + }, + { + "epoch": 1.85789726356217, + "grad_norm": 5.41282844543457, + "learning_rate": 1.9097983633738033e-05, + "loss": 4.9296, + "step": 23220 + }, + { + "epoch": 1.8586973915826532, + "grad_norm": 5.05392599105835, + "learning_rate": 1.9084612504679895e-05, + "loss": 5.1467, + "step": 23230 + }, + { + "epoch": 1.8594975196031365, + "grad_norm": 2.838409185409546, + "learning_rate": 1.9071241375621758e-05, + "loss": 4.7497, + "step": 23240 + }, + { + "epoch": 1.8602976476236197, + "grad_norm": 3.0737788677215576, + "learning_rate": 1.905787024656362e-05, + "loss": 4.9982, + "step": 23250 + }, + { + "epoch": 1.8610977756441032, + "grad_norm": 2.8832297325134277, + "learning_rate": 1.9044499117505483e-05, + "loss": 5.0327, + "step": 23260 + }, + { + "epoch": 1.8618979036645862, + "grad_norm": 3.0281708240509033, + "learning_rate": 1.9031127988447346e-05, + "loss": 5.0735, + "step": 23270 + }, + { + "epoch": 1.8626980316850696, + 
"grad_norm": 2.8153724670410156, + "learning_rate": 1.901775685938921e-05, + "loss": 4.97, + "step": 23280 + }, + { + "epoch": 1.8634981597055529, + "grad_norm": 3.8780734539031982, + "learning_rate": 1.900438573033107e-05, + "loss": 5.1535, + "step": 23290 + }, + { + "epoch": 1.8642982877260361, + "grad_norm": 3.875718832015991, + "learning_rate": 1.8991014601272934e-05, + "loss": 5.0427, + "step": 23300 + }, + { + "epoch": 1.8650984157465196, + "grad_norm": 5.313499927520752, + "learning_rate": 1.8977643472214797e-05, + "loss": 5.0147, + "step": 23310 + }, + { + "epoch": 1.8658985437670026, + "grad_norm": 3.9303500652313232, + "learning_rate": 1.896427234315666e-05, + "loss": 4.9529, + "step": 23320 + }, + { + "epoch": 1.866698671787486, + "grad_norm": 5.089270114898682, + "learning_rate": 1.895090121409852e-05, + "loss": 4.7841, + "step": 23330 + }, + { + "epoch": 1.8674987998079693, + "grad_norm": 4.434947490692139, + "learning_rate": 1.893753008504038e-05, + "loss": 4.9213, + "step": 23340 + }, + { + "epoch": 1.8682989278284525, + "grad_norm": 3.5199437141418457, + "learning_rate": 1.8924158955982244e-05, + "loss": 4.9473, + "step": 23350 + }, + { + "epoch": 1.869099055848936, + "grad_norm": 2.435863494873047, + "learning_rate": 1.8910787826924107e-05, + "loss": 4.9539, + "step": 23360 + }, + { + "epoch": 1.869899183869419, + "grad_norm": 3.309080123901367, + "learning_rate": 1.889741669786597e-05, + "loss": 5.1178, + "step": 23370 + }, + { + "epoch": 1.8706993118899025, + "grad_norm": 3.9439151287078857, + "learning_rate": 1.8884045568807832e-05, + "loss": 4.8227, + "step": 23380 + }, + { + "epoch": 1.8714994399103857, + "grad_norm": 4.685158729553223, + "learning_rate": 1.8870674439749695e-05, + "loss": 4.9227, + "step": 23390 + }, + { + "epoch": 1.872299567930869, + "grad_norm": 3.316544532775879, + "learning_rate": 1.8857303310691558e-05, + "loss": 4.9866, + "step": 23400 + }, + { + "epoch": 1.8730996959513522, + "grad_norm": 2.861067056655884, + 
"learning_rate": 1.8843932181633417e-05, + "loss": 4.9267, + "step": 23410 + }, + { + "epoch": 1.8738998239718354, + "grad_norm": 3.334657907485962, + "learning_rate": 1.883056105257528e-05, + "loss": 5.0008, + "step": 23420 + }, + { + "epoch": 1.8746999519923189, + "grad_norm": 3.105860948562622, + "learning_rate": 1.8817189923517142e-05, + "loss": 4.9593, + "step": 23430 + }, + { + "epoch": 1.8755000800128019, + "grad_norm": 4.091304779052734, + "learning_rate": 1.8803818794459005e-05, + "loss": 4.9262, + "step": 23440 + }, + { + "epoch": 1.8763002080332853, + "grad_norm": 2.4841034412384033, + "learning_rate": 1.8790447665400868e-05, + "loss": 4.9754, + "step": 23450 + }, + { + "epoch": 1.8771003360537686, + "grad_norm": 3.671037435531616, + "learning_rate": 1.877707653634273e-05, + "loss": 4.9462, + "step": 23460 + }, + { + "epoch": 1.8779004640742518, + "grad_norm": 3.1614885330200195, + "learning_rate": 1.8763705407284593e-05, + "loss": 5.0635, + "step": 23470 + }, + { + "epoch": 1.8787005920947353, + "grad_norm": 3.2696330547332764, + "learning_rate": 1.8750334278226456e-05, + "loss": 4.9501, + "step": 23480 + }, + { + "epoch": 1.8795007201152183, + "grad_norm": 4.031539440155029, + "learning_rate": 1.8736963149168315e-05, + "loss": 4.9381, + "step": 23490 + }, + { + "epoch": 1.8803008481357018, + "grad_norm": 3.9160051345825195, + "learning_rate": 1.8723592020110178e-05, + "loss": 5.0243, + "step": 23500 + }, + { + "epoch": 1.881100976156185, + "grad_norm": 4.479321479797363, + "learning_rate": 1.871022089105204e-05, + "loss": 4.9724, + "step": 23510 + }, + { + "epoch": 1.8819011041766682, + "grad_norm": 3.626620054244995, + "learning_rate": 1.8696849761993903e-05, + "loss": 4.9047, + "step": 23520 + }, + { + "epoch": 1.8827012321971517, + "grad_norm": 3.6235501766204834, + "learning_rate": 1.8683478632935766e-05, + "loss": 4.9176, + "step": 23530 + }, + { + "epoch": 1.8835013602176347, + "grad_norm": 3.4153671264648438, + "learning_rate": 
1.867010750387763e-05, + "loss": 5.1204, + "step": 23540 + }, + { + "epoch": 1.8843014882381182, + "grad_norm": 2.90492844581604, + "learning_rate": 1.865673637481949e-05, + "loss": 4.9843, + "step": 23550 + }, + { + "epoch": 1.8851016162586014, + "grad_norm": 4.847935199737549, + "learning_rate": 1.864336524576135e-05, + "loss": 4.9955, + "step": 23560 + }, + { + "epoch": 1.8859017442790846, + "grad_norm": 3.3774425983428955, + "learning_rate": 1.8629994116703213e-05, + "loss": 5.0434, + "step": 23570 + }, + { + "epoch": 1.886701872299568, + "grad_norm": 4.005410671234131, + "learning_rate": 1.8616622987645076e-05, + "loss": 4.943, + "step": 23580 + }, + { + "epoch": 1.887502000320051, + "grad_norm": 3.4165306091308594, + "learning_rate": 1.860325185858694e-05, + "loss": 4.7749, + "step": 23590 + }, + { + "epoch": 1.8883021283405346, + "grad_norm": 4.466346263885498, + "learning_rate": 1.85898807295288e-05, + "loss": 4.953, + "step": 23600 + }, + { + "epoch": 1.8891022563610178, + "grad_norm": 3.477444887161255, + "learning_rate": 1.8576509600470664e-05, + "loss": 5.0208, + "step": 23610 + }, + { + "epoch": 1.889902384381501, + "grad_norm": 3.3577370643615723, + "learning_rate": 1.8563138471412527e-05, + "loss": 4.9861, + "step": 23620 + }, + { + "epoch": 1.8907025124019843, + "grad_norm": 5.305810451507568, + "learning_rate": 1.854976734235439e-05, + "loss": 5.2137, + "step": 23630 + }, + { + "epoch": 1.8915026404224675, + "grad_norm": 2.8031234741210938, + "learning_rate": 1.853639621329625e-05, + "loss": 5.0322, + "step": 23640 + }, + { + "epoch": 1.892302768442951, + "grad_norm": 2.6856045722961426, + "learning_rate": 1.852302508423811e-05, + "loss": 4.7417, + "step": 23650 + }, + { + "epoch": 1.893102896463434, + "grad_norm": 3.518064498901367, + "learning_rate": 1.8509653955179974e-05, + "loss": 4.9908, + "step": 23660 + }, + { + "epoch": 1.8939030244839175, + "grad_norm": 4.442662239074707, + "learning_rate": 1.8496282826121837e-05, + "loss": 5.0058, + 
"step": 23670 + }, + { + "epoch": 1.8947031525044007, + "grad_norm": 3.661250352859497, + "learning_rate": 1.84829116970637e-05, + "loss": 4.9123, + "step": 23680 + }, + { + "epoch": 1.895503280524884, + "grad_norm": 2.6517558097839355, + "learning_rate": 1.8469540568005562e-05, + "loss": 4.9644, + "step": 23690 + }, + { + "epoch": 1.8963034085453674, + "grad_norm": 2.9907848834991455, + "learning_rate": 1.8456169438947425e-05, + "loss": 4.8955, + "step": 23700 + }, + { + "epoch": 1.8971035365658504, + "grad_norm": 3.4989070892333984, + "learning_rate": 1.8442798309889288e-05, + "loss": 4.9782, + "step": 23710 + }, + { + "epoch": 1.8979036645863339, + "grad_norm": 3.2629289627075195, + "learning_rate": 1.842942718083115e-05, + "loss": 5.073, + "step": 23720 + }, + { + "epoch": 1.898703792606817, + "grad_norm": 2.9857161045074463, + "learning_rate": 1.8416056051773013e-05, + "loss": 5.022, + "step": 23730 + }, + { + "epoch": 1.8995039206273003, + "grad_norm": 3.6350457668304443, + "learning_rate": 1.8402684922714876e-05, + "loss": 5.0102, + "step": 23740 + }, + { + "epoch": 1.9003040486477838, + "grad_norm": 3.987959146499634, + "learning_rate": 1.838931379365674e-05, + "loss": 4.8519, + "step": 23750 + }, + { + "epoch": 1.9011041766682668, + "grad_norm": 2.2430574893951416, + "learning_rate": 1.83759426645986e-05, + "loss": 4.9962, + "step": 23760 + }, + { + "epoch": 1.9019043046887503, + "grad_norm": 2.8868260383605957, + "learning_rate": 1.8362571535540464e-05, + "loss": 5.116, + "step": 23770 + }, + { + "epoch": 1.9027044327092335, + "grad_norm": 3.6265523433685303, + "learning_rate": 1.8349200406482326e-05, + "loss": 4.9706, + "step": 23780 + }, + { + "epoch": 1.9035045607297167, + "grad_norm": 4.022701740264893, + "learning_rate": 1.8335829277424186e-05, + "loss": 4.9531, + "step": 23790 + }, + { + "epoch": 1.9043046887502, + "grad_norm": 3.127108573913574, + "learning_rate": 1.832245814836605e-05, + "loss": 5.0577, + "step": 23800 + }, + { + "epoch": 
1.9051048167706832, + "grad_norm": 6.9320502281188965, + "learning_rate": 1.830908701930791e-05, + "loss": 4.8681, + "step": 23810 + }, + { + "epoch": 1.9059049447911667, + "grad_norm": 4.856078624725342, + "learning_rate": 1.8295715890249774e-05, + "loss": 5.0294, + "step": 23820 + }, + { + "epoch": 1.90670507281165, + "grad_norm": 3.244516372680664, + "learning_rate": 1.8282344761191636e-05, + "loss": 4.9191, + "step": 23830 + }, + { + "epoch": 1.9075052008321332, + "grad_norm": 3.1606297492980957, + "learning_rate": 1.82689736321335e-05, + "loss": 5.0632, + "step": 23840 + }, + { + "epoch": 1.9083053288526164, + "grad_norm": 4.155543804168701, + "learning_rate": 1.8255602503075362e-05, + "loss": 5.1398, + "step": 23850 + }, + { + "epoch": 1.9091054568730996, + "grad_norm": 2.770430088043213, + "learning_rate": 1.8242231374017225e-05, + "loss": 5.0403, + "step": 23860 + }, + { + "epoch": 1.909905584893583, + "grad_norm": 3.0609514713287354, + "learning_rate": 1.8228860244959084e-05, + "loss": 4.8498, + "step": 23870 + }, + { + "epoch": 1.910705712914066, + "grad_norm": 3.4689247608184814, + "learning_rate": 1.8215489115900947e-05, + "loss": 4.9944, + "step": 23880 + }, + { + "epoch": 1.9115058409345496, + "grad_norm": 3.3922059535980225, + "learning_rate": 1.820211798684281e-05, + "loss": 4.7982, + "step": 23890 + }, + { + "epoch": 1.9123059689550328, + "grad_norm": 3.3366963863372803, + "learning_rate": 1.8188746857784672e-05, + "loss": 4.9941, + "step": 23900 + }, + { + "epoch": 1.913106096975516, + "grad_norm": 2.8962085247039795, + "learning_rate": 1.8175375728726535e-05, + "loss": 4.7124, + "step": 23910 + }, + { + "epoch": 1.9139062249959995, + "grad_norm": 6.0768141746521, + "learning_rate": 1.8162004599668397e-05, + "loss": 4.9149, + "step": 23920 + }, + { + "epoch": 1.9147063530164825, + "grad_norm": 3.3171348571777344, + "learning_rate": 1.814863347061026e-05, + "loss": 4.9691, + "step": 23930 + }, + { + "epoch": 1.915506481036966, + "grad_norm": 
4.5769453048706055, + "learning_rate": 1.8135262341552123e-05, + "loss": 4.8855, + "step": 23940 + }, + { + "epoch": 1.9163066090574492, + "grad_norm": 3.704608917236328, + "learning_rate": 1.8121891212493982e-05, + "loss": 5.0219, + "step": 23950 + }, + { + "epoch": 1.9171067370779324, + "grad_norm": 4.647409439086914, + "learning_rate": 1.8108520083435845e-05, + "loss": 4.9864, + "step": 23960 + }, + { + "epoch": 1.917906865098416, + "grad_norm": 3.0767157077789307, + "learning_rate": 1.8095148954377707e-05, + "loss": 4.9873, + "step": 23970 + }, + { + "epoch": 1.918706993118899, + "grad_norm": 3.1951663494110107, + "learning_rate": 1.808177782531957e-05, + "loss": 4.9565, + "step": 23980 + }, + { + "epoch": 1.9195071211393824, + "grad_norm": 3.0078985691070557, + "learning_rate": 1.8068406696261433e-05, + "loss": 5.0643, + "step": 23990 + }, + { + "epoch": 1.9203072491598656, + "grad_norm": 2.8039395809173584, + "learning_rate": 1.8055035567203295e-05, + "loss": 4.968, + "step": 24000 + }, + { + "epoch": 1.9211073771803489, + "grad_norm": 2.70816969871521, + "learning_rate": 1.8041664438145158e-05, + "loss": 5.0067, + "step": 24010 + }, + { + "epoch": 1.921907505200832, + "grad_norm": 3.6637299060821533, + "learning_rate": 1.802829330908702e-05, + "loss": 5.0332, + "step": 24020 + }, + { + "epoch": 1.9227076332213153, + "grad_norm": 2.8274829387664795, + "learning_rate": 1.801492218002888e-05, + "loss": 4.8953, + "step": 24030 + }, + { + "epoch": 1.9235077612417988, + "grad_norm": 3.530215263366699, + "learning_rate": 1.8001551050970743e-05, + "loss": 4.9168, + "step": 24040 + }, + { + "epoch": 1.9243078892622818, + "grad_norm": 2.753448724746704, + "learning_rate": 1.7988179921912605e-05, + "loss": 5.016, + "step": 24050 + }, + { + "epoch": 1.9251080172827653, + "grad_norm": 3.4737961292266846, + "learning_rate": 1.7974808792854468e-05, + "loss": 4.9456, + "step": 24060 + }, + { + "epoch": 1.9259081453032485, + "grad_norm": 3.040010452270508, + "learning_rate": 
1.796143766379633e-05, + "loss": 4.903, + "step": 24070 + }, + { + "epoch": 1.9267082733237317, + "grad_norm": 2.961254835128784, + "learning_rate": 1.7948066534738194e-05, + "loss": 5.0527, + "step": 24080 + }, + { + "epoch": 1.9275084013442152, + "grad_norm": 2.690537929534912, + "learning_rate": 1.7934695405680056e-05, + "loss": 4.8845, + "step": 24090 + }, + { + "epoch": 1.9283085293646982, + "grad_norm": 3.9988303184509277, + "learning_rate": 1.792132427662192e-05, + "loss": 5.0803, + "step": 24100 + }, + { + "epoch": 1.9291086573851817, + "grad_norm": 2.8897204399108887, + "learning_rate": 1.790795314756378e-05, + "loss": 4.9407, + "step": 24110 + }, + { + "epoch": 1.929908785405665, + "grad_norm": 4.699467182159424, + "learning_rate": 1.7894582018505644e-05, + "loss": 4.9493, + "step": 24120 + }, + { + "epoch": 1.9307089134261481, + "grad_norm": 4.01737117767334, + "learning_rate": 1.7881210889447507e-05, + "loss": 4.9465, + "step": 24130 + }, + { + "epoch": 1.9315090414466316, + "grad_norm": 2.703599214553833, + "learning_rate": 1.786783976038937e-05, + "loss": 4.9857, + "step": 24140 + }, + { + "epoch": 1.9323091694671146, + "grad_norm": 2.5545291900634766, + "learning_rate": 1.7854468631331232e-05, + "loss": 5.0033, + "step": 24150 + }, + { + "epoch": 1.933109297487598, + "grad_norm": 3.572033166885376, + "learning_rate": 1.7841097502273095e-05, + "loss": 4.9381, + "step": 24160 + }, + { + "epoch": 1.9339094255080813, + "grad_norm": 4.481420516967773, + "learning_rate": 1.7827726373214958e-05, + "loss": 4.976, + "step": 24170 + }, + { + "epoch": 1.9347095535285646, + "grad_norm": 8.394909858703613, + "learning_rate": 1.7814355244156817e-05, + "loss": 4.871, + "step": 24180 + }, + { + "epoch": 1.935509681549048, + "grad_norm": 3.418012857437134, + "learning_rate": 1.780098411509868e-05, + "loss": 4.8412, + "step": 24190 + }, + { + "epoch": 1.936309809569531, + "grad_norm": 4.226028919219971, + "learning_rate": 1.7787612986040542e-05, + "loss": 4.9214, + 
"step": 24200 + }, + { + "epoch": 1.9371099375900145, + "grad_norm": 3.3171331882476807, + "learning_rate": 1.7774241856982405e-05, + "loss": 4.8912, + "step": 24210 + }, + { + "epoch": 1.9379100656104977, + "grad_norm": 2.7133543491363525, + "learning_rate": 1.7760870727924268e-05, + "loss": 5.067, + "step": 24220 + }, + { + "epoch": 1.938710193630981, + "grad_norm": 3.8669393062591553, + "learning_rate": 1.774749959886613e-05, + "loss": 4.9646, + "step": 24230 + }, + { + "epoch": 1.9395103216514642, + "grad_norm": 3.915174722671509, + "learning_rate": 1.7734128469807993e-05, + "loss": 4.9525, + "step": 24240 + }, + { + "epoch": 1.9403104496719474, + "grad_norm": 4.278127193450928, + "learning_rate": 1.7720757340749856e-05, + "loss": 4.9979, + "step": 24250 + }, + { + "epoch": 1.941110577692431, + "grad_norm": 4.306387901306152, + "learning_rate": 1.7707386211691715e-05, + "loss": 4.8818, + "step": 24260 + }, + { + "epoch": 1.941910705712914, + "grad_norm": 3.726982831954956, + "learning_rate": 1.7694015082633578e-05, + "loss": 5.0455, + "step": 24270 + }, + { + "epoch": 1.9427108337333974, + "grad_norm": 4.664205551147461, + "learning_rate": 1.768064395357544e-05, + "loss": 5.043, + "step": 24280 + }, + { + "epoch": 1.9435109617538806, + "grad_norm": 3.069760799407959, + "learning_rate": 1.7667272824517303e-05, + "loss": 4.9116, + "step": 24290 + }, + { + "epoch": 1.9443110897743638, + "grad_norm": 3.658348798751831, + "learning_rate": 1.7653901695459166e-05, + "loss": 4.858, + "step": 24300 + }, + { + "epoch": 1.9451112177948473, + "grad_norm": 2.518824338912964, + "learning_rate": 1.764053056640103e-05, + "loss": 4.8353, + "step": 24310 + }, + { + "epoch": 1.9459113458153303, + "grad_norm": 3.897017478942871, + "learning_rate": 1.762715943734289e-05, + "loss": 4.9834, + "step": 24320 + }, + { + "epoch": 1.9467114738358138, + "grad_norm": 3.2303273677825928, + "learning_rate": 1.7613788308284754e-05, + "loss": 5.0684, + "step": 24330 + }, + { + "epoch": 
1.947511601856297, + "grad_norm": 3.314673662185669, + "learning_rate": 1.7600417179226613e-05, + "loss": 4.9663, + "step": 24340 + }, + { + "epoch": 1.9483117298767803, + "grad_norm": 2.891897201538086, + "learning_rate": 1.7587046050168476e-05, + "loss": 4.8986, + "step": 24350 + }, + { + "epoch": 1.9491118578972637, + "grad_norm": 2.976445436477661, + "learning_rate": 1.757367492111034e-05, + "loss": 5.0788, + "step": 24360 + }, + { + "epoch": 1.9499119859177467, + "grad_norm": 2.883258819580078, + "learning_rate": 1.75603037920522e-05, + "loss": 5.03, + "step": 24370 + }, + { + "epoch": 1.9507121139382302, + "grad_norm": 3.258368968963623, + "learning_rate": 1.7546932662994064e-05, + "loss": 4.9691, + "step": 24380 + }, + { + "epoch": 1.9515122419587134, + "grad_norm": 3.8020787239074707, + "learning_rate": 1.7533561533935927e-05, + "loss": 4.9491, + "step": 24390 + }, + { + "epoch": 1.9523123699791967, + "grad_norm": 3.3602609634399414, + "learning_rate": 1.752019040487779e-05, + "loss": 4.8515, + "step": 24400 + }, + { + "epoch": 1.95311249799968, + "grad_norm": 3.7602756023406982, + "learning_rate": 1.750681927581965e-05, + "loss": 4.9146, + "step": 24410 + }, + { + "epoch": 1.9539126260201631, + "grad_norm": 3.219118356704712, + "learning_rate": 1.749344814676151e-05, + "loss": 4.9882, + "step": 24420 + }, + { + "epoch": 1.9547127540406466, + "grad_norm": 3.1614081859588623, + "learning_rate": 1.7480077017703374e-05, + "loss": 4.9193, + "step": 24430 + }, + { + "epoch": 1.9555128820611298, + "grad_norm": 3.2397539615631104, + "learning_rate": 1.7466705888645237e-05, + "loss": 5.0602, + "step": 24440 + }, + { + "epoch": 1.956313010081613, + "grad_norm": 2.708376169204712, + "learning_rate": 1.74533347595871e-05, + "loss": 4.8835, + "step": 24450 + }, + { + "epoch": 1.9571131381020963, + "grad_norm": 2.535634994506836, + "learning_rate": 1.7439963630528962e-05, + "loss": 4.9571, + "step": 24460 + }, + { + "epoch": 1.9579132661225795, + "grad_norm": 
3.6022346019744873, + "learning_rate": 1.7426592501470825e-05, + "loss": 4.8711, + "step": 24470 + }, + { + "epoch": 1.958713394143063, + "grad_norm": 3.2954986095428467, + "learning_rate": 1.7413221372412688e-05, + "loss": 4.8985, + "step": 24480 + }, + { + "epoch": 1.959513522163546, + "grad_norm": 4.4332122802734375, + "learning_rate": 1.7399850243354547e-05, + "loss": 4.9637, + "step": 24490 + }, + { + "epoch": 1.9603136501840295, + "grad_norm": 3.2086029052734375, + "learning_rate": 1.738647911429641e-05, + "loss": 4.9817, + "step": 24500 + }, + { + "epoch": 1.9611137782045127, + "grad_norm": 3.447162389755249, + "learning_rate": 1.7373107985238272e-05, + "loss": 4.8359, + "step": 24510 + }, + { + "epoch": 1.961913906224996, + "grad_norm": 3.8578414916992188, + "learning_rate": 1.7359736856180135e-05, + "loss": 4.9379, + "step": 24520 + }, + { + "epoch": 1.9627140342454794, + "grad_norm": 3.037017822265625, + "learning_rate": 1.7346365727121998e-05, + "loss": 5.1682, + "step": 24530 + }, + { + "epoch": 1.9635141622659624, + "grad_norm": 2.9559504985809326, + "learning_rate": 1.733299459806386e-05, + "loss": 4.8014, + "step": 24540 + }, + { + "epoch": 1.9643142902864459, + "grad_norm": 4.131083011627197, + "learning_rate": 1.7319623469005723e-05, + "loss": 4.9633, + "step": 24550 + }, + { + "epoch": 1.9651144183069291, + "grad_norm": 3.3252639770507812, + "learning_rate": 1.7306252339947586e-05, + "loss": 5.0054, + "step": 24560 + }, + { + "epoch": 1.9659145463274124, + "grad_norm": 3.904116153717041, + "learning_rate": 1.729288121088945e-05, + "loss": 4.971, + "step": 24570 + }, + { + "epoch": 1.9667146743478958, + "grad_norm": 3.152641773223877, + "learning_rate": 1.727951008183131e-05, + "loss": 5.1016, + "step": 24580 + }, + { + "epoch": 1.9675148023683788, + "grad_norm": 3.344860315322876, + "learning_rate": 1.7266138952773174e-05, + "loss": 4.9248, + "step": 24590 + }, + { + "epoch": 1.9683149303888623, + "grad_norm": 2.9340274333953857, + 
"learning_rate": 1.7252767823715036e-05, + "loss": 4.987, + "step": 24600 + }, + { + "epoch": 1.9691150584093455, + "grad_norm": 2.496817111968994, + "learning_rate": 1.72393966946569e-05, + "loss": 5.031, + "step": 24610 + }, + { + "epoch": 1.9699151864298288, + "grad_norm": 3.044074296951294, + "learning_rate": 1.7226025565598762e-05, + "loss": 4.9457, + "step": 24620 + }, + { + "epoch": 1.970715314450312, + "grad_norm": 4.735547065734863, + "learning_rate": 1.7212654436540625e-05, + "loss": 4.8686, + "step": 24630 + }, + { + "epoch": 1.9715154424707952, + "grad_norm": 3.4390697479248047, + "learning_rate": 1.7199283307482484e-05, + "loss": 4.9262, + "step": 24640 + }, + { + "epoch": 1.9723155704912787, + "grad_norm": 3.429409980773926, + "learning_rate": 1.7185912178424347e-05, + "loss": 5.0453, + "step": 24650 + }, + { + "epoch": 1.9731156985117617, + "grad_norm": 5.1396942138671875, + "learning_rate": 1.717254104936621e-05, + "loss": 4.8638, + "step": 24660 + }, + { + "epoch": 1.9739158265322452, + "grad_norm": 3.099233388900757, + "learning_rate": 1.7159169920308072e-05, + "loss": 5.0777, + "step": 24670 + }, + { + "epoch": 1.9747159545527284, + "grad_norm": 6.756525993347168, + "learning_rate": 1.7145798791249935e-05, + "loss": 4.9537, + "step": 24680 + }, + { + "epoch": 1.9755160825732117, + "grad_norm": 2.5224523544311523, + "learning_rate": 1.7132427662191797e-05, + "loss": 5.0601, + "step": 24690 + }, + { + "epoch": 1.976316210593695, + "grad_norm": 2.7391388416290283, + "learning_rate": 1.711905653313366e-05, + "loss": 4.8926, + "step": 24700 + }, + { + "epoch": 1.9771163386141781, + "grad_norm": 3.1466548442840576, + "learning_rate": 1.7105685404075523e-05, + "loss": 4.9819, + "step": 24710 + }, + { + "epoch": 1.9779164666346616, + "grad_norm": 3.471501588821411, + "learning_rate": 1.7092314275017382e-05, + "loss": 5.0699, + "step": 24720 + }, + { + "epoch": 1.9787165946551448, + "grad_norm": 4.275609970092773, + "learning_rate": 
1.7078943145959245e-05, + "loss": 4.7845, + "step": 24730 + }, + { + "epoch": 1.979516722675628, + "grad_norm": 3.1587014198303223, + "learning_rate": 1.7065572016901107e-05, + "loss": 5.0001, + "step": 24740 + }, + { + "epoch": 1.9803168506961115, + "grad_norm": 5.504805088043213, + "learning_rate": 1.705220088784297e-05, + "loss": 4.8489, + "step": 24750 + }, + { + "epoch": 1.9811169787165945, + "grad_norm": 4.1532206535339355, + "learning_rate": 1.7038829758784833e-05, + "loss": 4.9888, + "step": 24760 + }, + { + "epoch": 1.981917106737078, + "grad_norm": 3.094951629638672, + "learning_rate": 1.7025458629726695e-05, + "loss": 4.9468, + "step": 24770 + }, + { + "epoch": 1.9827172347575612, + "grad_norm": 5.671091556549072, + "learning_rate": 1.7012087500668558e-05, + "loss": 5.0144, + "step": 24780 + }, + { + "epoch": 1.9835173627780445, + "grad_norm": 4.130462646484375, + "learning_rate": 1.699871637161042e-05, + "loss": 4.9417, + "step": 24790 + }, + { + "epoch": 1.984317490798528, + "grad_norm": 6.027336597442627, + "learning_rate": 1.698534524255228e-05, + "loss": 4.8084, + "step": 24800 + }, + { + "epoch": 1.985117618819011, + "grad_norm": 2.759535312652588, + "learning_rate": 1.6971974113494143e-05, + "loss": 4.962, + "step": 24810 + }, + { + "epoch": 1.9859177468394944, + "grad_norm": 2.916520833969116, + "learning_rate": 1.6958602984436006e-05, + "loss": 4.9052, + "step": 24820 + }, + { + "epoch": 1.9867178748599776, + "grad_norm": 3.448692560195923, + "learning_rate": 1.6945231855377868e-05, + "loss": 4.8362, + "step": 24830 + }, + { + "epoch": 1.9875180028804609, + "grad_norm": 2.9183404445648193, + "learning_rate": 1.693186072631973e-05, + "loss": 4.9522, + "step": 24840 + }, + { + "epoch": 1.9883181309009441, + "grad_norm": 2.444122076034546, + "learning_rate": 1.6918489597261594e-05, + "loss": 5.0336, + "step": 24850 + }, + { + "epoch": 1.9891182589214274, + "grad_norm": 5.753563404083252, + "learning_rate": 1.6905118468203456e-05, + "loss": 4.9037, 
+ "step": 24860 + }, + { + "epoch": 1.9899183869419108, + "grad_norm": 2.641191244125366, + "learning_rate": 1.689174733914532e-05, + "loss": 4.795, + "step": 24870 + }, + { + "epoch": 1.9907185149623938, + "grad_norm": 2.918086290359497, + "learning_rate": 1.6878376210087178e-05, + "loss": 4.901, + "step": 24880 + }, + { + "epoch": 1.9915186429828773, + "grad_norm": 3.7362258434295654, + "learning_rate": 1.686500508102904e-05, + "loss": 4.9758, + "step": 24890 + }, + { + "epoch": 1.9923187710033605, + "grad_norm": 3.5491786003112793, + "learning_rate": 1.6851633951970904e-05, + "loss": 5.126, + "step": 24900 + }, + { + "epoch": 1.9931188990238438, + "grad_norm": 2.8364782333374023, + "learning_rate": 1.6838262822912766e-05, + "loss": 5.0067, + "step": 24910 + }, + { + "epoch": 1.9939190270443272, + "grad_norm": 4.282176971435547, + "learning_rate": 1.682489169385463e-05, + "loss": 4.8921, + "step": 24920 + }, + { + "epoch": 1.9947191550648102, + "grad_norm": 4.440709590911865, + "learning_rate": 1.6811520564796492e-05, + "loss": 5.0284, + "step": 24930 + }, + { + "epoch": 1.9955192830852937, + "grad_norm": 3.450624704360962, + "learning_rate": 1.6798149435738354e-05, + "loss": 5.0158, + "step": 24940 + }, + { + "epoch": 1.996319411105777, + "grad_norm": 3.278850793838501, + "learning_rate": 1.6784778306680217e-05, + "loss": 5.0562, + "step": 24950 + }, + { + "epoch": 1.9971195391262602, + "grad_norm": 3.931243658065796, + "learning_rate": 1.677140717762208e-05, + "loss": 4.966, + "step": 24960 + }, + { + "epoch": 1.9979196671467436, + "grad_norm": 4.377293586730957, + "learning_rate": 1.6758036048563942e-05, + "loss": 4.8362, + "step": 24970 + }, + { + "epoch": 1.9987197951672266, + "grad_norm": 3.4111695289611816, + "learning_rate": 1.6744664919505805e-05, + "loss": 5.1388, + "step": 24980 + }, + { + "epoch": 1.99951992318771, + "grad_norm": 3.131072521209717, + "learning_rate": 1.6731293790447668e-05, + "loss": 4.8877, + "step": 24990 + }, + { + "epoch": 
2.000320051208193, + "grad_norm": 3.912254571914673, + "learning_rate": 1.671792266138953e-05, + "loss": 4.8321, + "step": 25000 + }, + { + "epoch": 2.0011201792286766, + "grad_norm": 3.7595908641815186, + "learning_rate": 1.6704551532331393e-05, + "loss": 4.338, + "step": 25010 + }, + { + "epoch": 2.00192030724916, + "grad_norm": 5.618625640869141, + "learning_rate": 1.6691180403273256e-05, + "loss": 4.3945, + "step": 25020 + }, + { + "epoch": 2.002720435269643, + "grad_norm": 5.2287468910217285, + "learning_rate": 1.6677809274215115e-05, + "loss": 4.2806, + "step": 25030 + }, + { + "epoch": 2.0035205632901265, + "grad_norm": 3.5416719913482666, + "learning_rate": 1.6664438145156978e-05, + "loss": 4.3418, + "step": 25040 + }, + { + "epoch": 2.0043206913106095, + "grad_norm": 4.5456109046936035, + "learning_rate": 1.665106701609884e-05, + "loss": 4.2207, + "step": 25050 + }, + { + "epoch": 2.005120819331093, + "grad_norm": 4.272172927856445, + "learning_rate": 1.6637695887040703e-05, + "loss": 4.2139, + "step": 25060 + }, + { + "epoch": 2.0059209473515764, + "grad_norm": 4.0079121589660645, + "learning_rate": 1.6624324757982566e-05, + "loss": 4.4569, + "step": 25070 + }, + { + "epoch": 2.0067210753720595, + "grad_norm": 5.050939083099365, + "learning_rate": 1.661095362892443e-05, + "loss": 4.4224, + "step": 25080 + }, + { + "epoch": 2.007521203392543, + "grad_norm": 3.5307321548461914, + "learning_rate": 1.659758249986629e-05, + "loss": 4.307, + "step": 25090 + }, + { + "epoch": 2.008321331413026, + "grad_norm": 4.467302322387695, + "learning_rate": 1.6584211370808154e-05, + "loss": 4.304, + "step": 25100 + }, + { + "epoch": 2.0091214594335094, + "grad_norm": 5.906796455383301, + "learning_rate": 1.6570840241750013e-05, + "loss": 4.2925, + "step": 25110 + }, + { + "epoch": 2.009921587453993, + "grad_norm": 4.336019039154053, + "learning_rate": 1.6557469112691876e-05, + "loss": 4.1098, + "step": 25120 + }, + { + "epoch": 2.010721715474476, + "grad_norm": 
6.186092376708984, + "learning_rate": 1.654409798363374e-05, + "loss": 4.384, + "step": 25130 + }, + { + "epoch": 2.0115218434949593, + "grad_norm": 6.390085220336914, + "learning_rate": 1.65307268545756e-05, + "loss": 4.1767, + "step": 25140 + }, + { + "epoch": 2.0123219715154423, + "grad_norm": 5.528529167175293, + "learning_rate": 1.6517355725517464e-05, + "loss": 4.2967, + "step": 25150 + }, + { + "epoch": 2.013122099535926, + "grad_norm": 4.196529865264893, + "learning_rate": 1.6503984596459327e-05, + "loss": 4.3129, + "step": 25160 + }, + { + "epoch": 2.013922227556409, + "grad_norm": 4.266825199127197, + "learning_rate": 1.649061346740119e-05, + "loss": 4.1665, + "step": 25170 + }, + { + "epoch": 2.0147223555768923, + "grad_norm": 3.9628491401672363, + "learning_rate": 1.647724233834305e-05, + "loss": 4.2982, + "step": 25180 + }, + { + "epoch": 2.0155224835973757, + "grad_norm": 5.902552604675293, + "learning_rate": 1.646387120928491e-05, + "loss": 4.2788, + "step": 25190 + }, + { + "epoch": 2.0163226116178588, + "grad_norm": 4.918710708618164, + "learning_rate": 1.6450500080226774e-05, + "loss": 4.4017, + "step": 25200 + }, + { + "epoch": 2.017122739638342, + "grad_norm": 5.401872158050537, + "learning_rate": 1.6437128951168637e-05, + "loss": 4.3384, + "step": 25210 + }, + { + "epoch": 2.0179228676588252, + "grad_norm": 4.330984592437744, + "learning_rate": 1.64237578221105e-05, + "loss": 4.2743, + "step": 25220 + }, + { + "epoch": 2.0187229956793087, + "grad_norm": 4.99106502532959, + "learning_rate": 1.6410386693052362e-05, + "loss": 4.3562, + "step": 25230 + }, + { + "epoch": 2.019523123699792, + "grad_norm": 5.204268455505371, + "learning_rate": 1.6397015563994225e-05, + "loss": 4.325, + "step": 25240 + }, + { + "epoch": 2.020323251720275, + "grad_norm": 4.139738082885742, + "learning_rate": 1.6383644434936088e-05, + "loss": 4.3247, + "step": 25250 + }, + { + "epoch": 2.0211233797407586, + "grad_norm": 4.763280868530273, + "learning_rate": 
1.6370273305877947e-05, + "loss": 4.2754, + "step": 25260 + }, + { + "epoch": 2.0219235077612416, + "grad_norm": 4.396899223327637, + "learning_rate": 1.635690217681981e-05, + "loss": 4.4056, + "step": 25270 + }, + { + "epoch": 2.022723635781725, + "grad_norm": 5.1739702224731445, + "learning_rate": 1.6343531047761672e-05, + "loss": 4.2894, + "step": 25280 + }, + { + "epoch": 2.0235237638022086, + "grad_norm": 3.864838123321533, + "learning_rate": 1.6330159918703535e-05, + "loss": 4.4537, + "step": 25290 + }, + { + "epoch": 2.0243238918226916, + "grad_norm": 5.35354471206665, + "learning_rate": 1.6316788789645398e-05, + "loss": 4.2356, + "step": 25300 + }, + { + "epoch": 2.025124019843175, + "grad_norm": 7.268294811248779, + "learning_rate": 1.630341766058726e-05, + "loss": 4.3507, + "step": 25310 + }, + { + "epoch": 2.025924147863658, + "grad_norm": 3.9637956619262695, + "learning_rate": 1.6290046531529123e-05, + "loss": 4.3009, + "step": 25320 + }, + { + "epoch": 2.0267242758841415, + "grad_norm": 4.4494476318359375, + "learning_rate": 1.6276675402470986e-05, + "loss": 4.1836, + "step": 25330 + }, + { + "epoch": 2.027524403904625, + "grad_norm": 4.388992786407471, + "learning_rate": 1.6263304273412845e-05, + "loss": 4.3256, + "step": 25340 + }, + { + "epoch": 2.028324531925108, + "grad_norm": 6.115511417388916, + "learning_rate": 1.6249933144354708e-05, + "loss": 4.3195, + "step": 25350 + }, + { + "epoch": 2.0291246599455914, + "grad_norm": 5.367356300354004, + "learning_rate": 1.623656201529657e-05, + "loss": 4.333, + "step": 25360 + }, + { + "epoch": 2.0299247879660745, + "grad_norm": 6.696788787841797, + "learning_rate": 1.6223190886238433e-05, + "loss": 4.1472, + "step": 25370 + }, + { + "epoch": 2.030724915986558, + "grad_norm": 5.665947914123535, + "learning_rate": 1.6209819757180296e-05, + "loss": 4.4066, + "step": 25380 + }, + { + "epoch": 2.031525044007041, + "grad_norm": 5.398012638092041, + "learning_rate": 1.619644862812216e-05, + "loss": 4.2199, + 
"step": 25390 + }, + { + "epoch": 2.0323251720275244, + "grad_norm": 6.494447708129883, + "learning_rate": 1.618307749906402e-05, + "loss": 4.1575, + "step": 25400 + }, + { + "epoch": 2.033125300048008, + "grad_norm": 4.272920608520508, + "learning_rate": 1.6169706370005884e-05, + "loss": 4.2159, + "step": 25410 + }, + { + "epoch": 2.033925428068491, + "grad_norm": 5.143256187438965, + "learning_rate": 1.6156335240947747e-05, + "loss": 4.1872, + "step": 25420 + }, + { + "epoch": 2.0347255560889743, + "grad_norm": 6.310397624969482, + "learning_rate": 1.614296411188961e-05, + "loss": 4.3852, + "step": 25430 + }, + { + "epoch": 2.0355256841094573, + "grad_norm": 5.364697456359863, + "learning_rate": 1.6129592982831472e-05, + "loss": 4.3888, + "step": 25440 + }, + { + "epoch": 2.036325812129941, + "grad_norm": 5.257568836212158, + "learning_rate": 1.6116221853773335e-05, + "loss": 4.3191, + "step": 25450 + }, + { + "epoch": 2.0371259401504243, + "grad_norm": 5.236495494842529, + "learning_rate": 1.6102850724715197e-05, + "loss": 4.3468, + "step": 25460 + }, + { + "epoch": 2.0379260681709073, + "grad_norm": 5.136487007141113, + "learning_rate": 1.608947959565706e-05, + "loss": 4.3643, + "step": 25470 + }, + { + "epoch": 2.0387261961913907, + "grad_norm": 3.6016764640808105, + "learning_rate": 1.6076108466598923e-05, + "loss": 4.3133, + "step": 25480 + }, + { + "epoch": 2.0395263242118737, + "grad_norm": 4.143081188201904, + "learning_rate": 1.6062737337540782e-05, + "loss": 4.3392, + "step": 25490 + }, + { + "epoch": 2.040326452232357, + "grad_norm": 3.9485390186309814, + "learning_rate": 1.6049366208482645e-05, + "loss": 4.3585, + "step": 25500 + }, + { + "epoch": 2.0411265802528407, + "grad_norm": 4.107684135437012, + "learning_rate": 1.6035995079424507e-05, + "loss": 4.1508, + "step": 25510 + }, + { + "epoch": 2.0419267082733237, + "grad_norm": 3.8906235694885254, + "learning_rate": 1.602262395036637e-05, + "loss": 4.144, + "step": 25520 + }, + { + "epoch": 
2.042726836293807, + "grad_norm": 8.17663860321045, + "learning_rate": 1.6009252821308233e-05, + "loss": 4.2195, + "step": 25530 + }, + { + "epoch": 2.04352696431429, + "grad_norm": 4.338802337646484, + "learning_rate": 1.5995881692250095e-05, + "loss": 4.2329, + "step": 25540 + }, + { + "epoch": 2.0443270923347736, + "grad_norm": 4.052145481109619, + "learning_rate": 1.5982510563191958e-05, + "loss": 4.3109, + "step": 25550 + }, + { + "epoch": 2.0451272203552566, + "grad_norm": 3.725970506668091, + "learning_rate": 1.596913943413382e-05, + "loss": 4.2197, + "step": 25560 + }, + { + "epoch": 2.04592734837574, + "grad_norm": 5.759598731994629, + "learning_rate": 1.595576830507568e-05, + "loss": 4.2164, + "step": 25570 + }, + { + "epoch": 2.0467274763962235, + "grad_norm": 5.7644195556640625, + "learning_rate": 1.5942397176017543e-05, + "loss": 4.4158, + "step": 25580 + }, + { + "epoch": 2.0475276044167066, + "grad_norm": 4.668453693389893, + "learning_rate": 1.5929026046959406e-05, + "loss": 4.2267, + "step": 25590 + }, + { + "epoch": 2.04832773243719, + "grad_norm": 11.40067195892334, + "learning_rate": 1.5915654917901268e-05, + "loss": 4.4523, + "step": 25600 + }, + { + "epoch": 2.049127860457673, + "grad_norm": 5.45416784286499, + "learning_rate": 1.590228378884313e-05, + "loss": 4.1557, + "step": 25610 + }, + { + "epoch": 2.0499279884781565, + "grad_norm": 5.021010875701904, + "learning_rate": 1.5888912659784994e-05, + "loss": 4.3983, + "step": 25620 + }, + { + "epoch": 2.05072811649864, + "grad_norm": 5.092507362365723, + "learning_rate": 1.5875541530726856e-05, + "loss": 4.2009, + "step": 25630 + }, + { + "epoch": 2.051528244519123, + "grad_norm": 4.925893306732178, + "learning_rate": 1.586217040166872e-05, + "loss": 4.1259, + "step": 25640 + }, + { + "epoch": 2.0523283725396064, + "grad_norm": 5.131783485412598, + "learning_rate": 1.5848799272610578e-05, + "loss": 4.1493, + "step": 25650 + }, + { + "epoch": 2.0531285005600894, + "grad_norm": 
5.167172431945801, + "learning_rate": 1.583542814355244e-05, + "loss": 4.3849, + "step": 25660 + }, + { + "epoch": 2.053928628580573, + "grad_norm": 6.500699043273926, + "learning_rate": 1.5822057014494304e-05, + "loss": 4.1778, + "step": 25670 + }, + { + "epoch": 2.0547287566010564, + "grad_norm": 4.588797569274902, + "learning_rate": 1.5808685885436166e-05, + "loss": 4.3639, + "step": 25680 + }, + { + "epoch": 2.0555288846215394, + "grad_norm": 4.2404608726501465, + "learning_rate": 1.579531475637803e-05, + "loss": 4.2917, + "step": 25690 + }, + { + "epoch": 2.056329012642023, + "grad_norm": 4.761908054351807, + "learning_rate": 1.5781943627319892e-05, + "loss": 4.312, + "step": 25700 + }, + { + "epoch": 2.057129140662506, + "grad_norm": 5.473426818847656, + "learning_rate": 1.5768572498261754e-05, + "loss": 4.172, + "step": 25710 + }, + { + "epoch": 2.0579292686829893, + "grad_norm": 4.701712608337402, + "learning_rate": 1.5755201369203617e-05, + "loss": 4.3831, + "step": 25720 + }, + { + "epoch": 2.0587293967034728, + "grad_norm": 9.382383346557617, + "learning_rate": 1.5741830240145476e-05, + "loss": 4.182, + "step": 25730 + }, + { + "epoch": 2.059529524723956, + "grad_norm": 4.000535011291504, + "learning_rate": 1.572845911108734e-05, + "loss": 4.2129, + "step": 25740 + }, + { + "epoch": 2.0603296527444392, + "grad_norm": 4.71032190322876, + "learning_rate": 1.5715087982029202e-05, + "loss": 4.2976, + "step": 25750 + }, + { + "epoch": 2.0611297807649223, + "grad_norm": 4.48236083984375, + "learning_rate": 1.5701716852971064e-05, + "loss": 4.342, + "step": 25760 + }, + { + "epoch": 2.0619299087854057, + "grad_norm": 4.62367057800293, + "learning_rate": 1.5688345723912927e-05, + "loss": 4.256, + "step": 25770 + }, + { + "epoch": 2.0627300368058887, + "grad_norm": 3.801255464553833, + "learning_rate": 1.567497459485479e-05, + "loss": 4.4418, + "step": 25780 + }, + { + "epoch": 2.063530164826372, + "grad_norm": 3.6294422149658203, + "learning_rate": 
1.5661603465796653e-05, + "loss": 4.2888, + "step": 25790 + }, + { + "epoch": 2.0643302928468557, + "grad_norm": 4.3946146965026855, + "learning_rate": 1.5648232336738515e-05, + "loss": 4.2986, + "step": 25800 + }, + { + "epoch": 2.0651304208673387, + "grad_norm": 4.36204195022583, + "learning_rate": 1.5634861207680378e-05, + "loss": 4.2825, + "step": 25810 + }, + { + "epoch": 2.065930548887822, + "grad_norm": 4.553360462188721, + "learning_rate": 1.562149007862224e-05, + "loss": 4.3057, + "step": 25820 + }, + { + "epoch": 2.066730676908305, + "grad_norm": 5.6123127937316895, + "learning_rate": 1.5608118949564103e-05, + "loss": 4.4922, + "step": 25830 + }, + { + "epoch": 2.0675308049287886, + "grad_norm": 4.838746070861816, + "learning_rate": 1.5594747820505966e-05, + "loss": 4.4108, + "step": 25840 + }, + { + "epoch": 2.068330932949272, + "grad_norm": 5.99254035949707, + "learning_rate": 1.558137669144783e-05, + "loss": 4.3313, + "step": 25850 + }, + { + "epoch": 2.069131060969755, + "grad_norm": 5.4797282218933105, + "learning_rate": 1.556800556238969e-05, + "loss": 4.2516, + "step": 25860 + }, + { + "epoch": 2.0699311889902385, + "grad_norm": 5.283844947814941, + "learning_rate": 1.5554634433331554e-05, + "loss": 4.4497, + "step": 25870 + }, + { + "epoch": 2.0707313170107216, + "grad_norm": 4.948397636413574, + "learning_rate": 1.5541263304273413e-05, + "loss": 4.0877, + "step": 25880 + }, + { + "epoch": 2.071531445031205, + "grad_norm": 7.391055107116699, + "learning_rate": 1.5527892175215276e-05, + "loss": 4.3646, + "step": 25890 + }, + { + "epoch": 2.0723315730516885, + "grad_norm": 6.029944896697998, + "learning_rate": 1.551452104615714e-05, + "loss": 4.3319, + "step": 25900 + }, + { + "epoch": 2.0731317010721715, + "grad_norm": 5.694515228271484, + "learning_rate": 1.5501149917099e-05, + "loss": 4.3011, + "step": 25910 + }, + { + "epoch": 2.073931829092655, + "grad_norm": 5.351522922515869, + "learning_rate": 1.5487778788040864e-05, + "loss": 4.3508, + 
"step": 25920 + }, + { + "epoch": 2.074731957113138, + "grad_norm": 5.074141025543213, + "learning_rate": 1.5474407658982727e-05, + "loss": 4.3033, + "step": 25930 + }, + { + "epoch": 2.0755320851336214, + "grad_norm": 4.946030139923096, + "learning_rate": 1.546103652992459e-05, + "loss": 4.4583, + "step": 25940 + }, + { + "epoch": 2.076332213154105, + "grad_norm": 4.760500431060791, + "learning_rate": 1.5447665400866452e-05, + "loss": 4.2749, + "step": 25950 + }, + { + "epoch": 2.077132341174588, + "grad_norm": 5.357514381408691, + "learning_rate": 1.543429427180831e-05, + "loss": 4.3748, + "step": 25960 + }, + { + "epoch": 2.0779324691950714, + "grad_norm": 4.862049579620361, + "learning_rate": 1.5420923142750174e-05, + "loss": 4.2431, + "step": 25970 + }, + { + "epoch": 2.0787325972155544, + "grad_norm": 4.293335914611816, + "learning_rate": 1.5407552013692037e-05, + "loss": 4.2634, + "step": 25980 + }, + { + "epoch": 2.079532725236038, + "grad_norm": 4.8585615158081055, + "learning_rate": 1.53941808846339e-05, + "loss": 4.3626, + "step": 25990 + }, + { + "epoch": 2.080332853256521, + "grad_norm": 5.967334270477295, + "learning_rate": 1.5380809755575762e-05, + "loss": 4.2984, + "step": 26000 + }, + { + "epoch": 2.0811329812770043, + "grad_norm": 3.997610569000244, + "learning_rate": 1.5367438626517625e-05, + "loss": 4.2662, + "step": 26010 + }, + { + "epoch": 2.0819331092974878, + "grad_norm": 5.521271228790283, + "learning_rate": 1.5354067497459488e-05, + "loss": 4.3059, + "step": 26020 + }, + { + "epoch": 2.0827332373179708, + "grad_norm": 5.361555576324463, + "learning_rate": 1.5340696368401347e-05, + "loss": 4.2308, + "step": 26030 + }, + { + "epoch": 2.0835333653384542, + "grad_norm": 4.794193267822266, + "learning_rate": 1.532732523934321e-05, + "loss": 4.4665, + "step": 26040 + }, + { + "epoch": 2.0843334933589373, + "grad_norm": 4.499879360198975, + "learning_rate": 1.5313954110285072e-05, + "loss": 4.2128, + "step": 26050 + }, + { + "epoch": 
2.0851336213794207, + "grad_norm": 3.574612855911255, + "learning_rate": 1.5300582981226935e-05, + "loss": 4.3812, + "step": 26060 + }, + { + "epoch": 2.085933749399904, + "grad_norm": 5.214561939239502, + "learning_rate": 1.5287211852168798e-05, + "loss": 4.5072, + "step": 26070 + }, + { + "epoch": 2.086733877420387, + "grad_norm": 6.188739776611328, + "learning_rate": 1.527384072311066e-05, + "loss": 4.4135, + "step": 26080 + }, + { + "epoch": 2.0875340054408706, + "grad_norm": 6.990460395812988, + "learning_rate": 1.5260469594052523e-05, + "loss": 4.3509, + "step": 26090 + }, + { + "epoch": 2.0883341334613537, + "grad_norm": 4.942378520965576, + "learning_rate": 1.5247098464994386e-05, + "loss": 4.3602, + "step": 26100 + }, + { + "epoch": 2.089134261481837, + "grad_norm": 6.065731048583984, + "learning_rate": 1.5233727335936245e-05, + "loss": 4.1068, + "step": 26110 + }, + { + "epoch": 2.0899343895023206, + "grad_norm": 4.916247844696045, + "learning_rate": 1.5220356206878108e-05, + "loss": 4.3243, + "step": 26120 + }, + { + "epoch": 2.0907345175228036, + "grad_norm": 6.229096412658691, + "learning_rate": 1.520698507781997e-05, + "loss": 4.2303, + "step": 26130 + }, + { + "epoch": 2.091534645543287, + "grad_norm": 5.115452289581299, + "learning_rate": 1.5193613948761833e-05, + "loss": 4.3319, + "step": 26140 + }, + { + "epoch": 2.09233477356377, + "grad_norm": 6.276767253875732, + "learning_rate": 1.5180242819703696e-05, + "loss": 4.3341, + "step": 26150 + }, + { + "epoch": 2.0931349015842535, + "grad_norm": 5.382528781890869, + "learning_rate": 1.5166871690645559e-05, + "loss": 4.3723, + "step": 26160 + }, + { + "epoch": 2.0939350296047365, + "grad_norm": 8.183712005615234, + "learning_rate": 1.5153500561587421e-05, + "loss": 4.2733, + "step": 26170 + }, + { + "epoch": 2.09473515762522, + "grad_norm": 4.3973188400268555, + "learning_rate": 1.5140129432529284e-05, + "loss": 4.1724, + "step": 26180 + }, + { + "epoch": 2.0955352856457035, + "grad_norm": 
11.1422758102417, + "learning_rate": 1.5126758303471145e-05, + "loss": 4.2879, + "step": 26190 + }, + { + "epoch": 2.0963354136661865, + "grad_norm": 3.9524261951446533, + "learning_rate": 1.5113387174413008e-05, + "loss": 4.1583, + "step": 26200 + }, + { + "epoch": 2.09713554168667, + "grad_norm": 5.151445388793945, + "learning_rate": 1.510001604535487e-05, + "loss": 4.3369, + "step": 26210 + }, + { + "epoch": 2.097935669707153, + "grad_norm": 4.194479465484619, + "learning_rate": 1.5086644916296733e-05, + "loss": 4.2749, + "step": 26220 + }, + { + "epoch": 2.0987357977276364, + "grad_norm": 3.648646831512451, + "learning_rate": 1.5073273787238596e-05, + "loss": 4.1944, + "step": 26230 + }, + { + "epoch": 2.09953592574812, + "grad_norm": 4.388803005218506, + "learning_rate": 1.5059902658180458e-05, + "loss": 4.336, + "step": 26240 + }, + { + "epoch": 2.100336053768603, + "grad_norm": 8.618120193481445, + "learning_rate": 1.5046531529122321e-05, + "loss": 4.2613, + "step": 26250 + }, + { + "epoch": 2.1011361817890863, + "grad_norm": 4.906309604644775, + "learning_rate": 1.5033160400064184e-05, + "loss": 4.3029, + "step": 26260 + }, + { + "epoch": 2.1019363098095694, + "grad_norm": 4.446979522705078, + "learning_rate": 1.5019789271006043e-05, + "loss": 4.3164, + "step": 26270 + }, + { + "epoch": 2.102736437830053, + "grad_norm": 4.349876403808594, + "learning_rate": 1.5006418141947906e-05, + "loss": 4.3869, + "step": 26280 + }, + { + "epoch": 2.1035365658505363, + "grad_norm": 5.089364528656006, + "learning_rate": 1.4993047012889768e-05, + "loss": 4.4643, + "step": 26290 + }, + { + "epoch": 2.1043366938710193, + "grad_norm": 5.051955699920654, + "learning_rate": 1.4979675883831631e-05, + "loss": 4.4284, + "step": 26300 + }, + { + "epoch": 2.1051368218915028, + "grad_norm": 6.135965824127197, + "learning_rate": 1.4966304754773494e-05, + "loss": 4.2545, + "step": 26310 + }, + { + "epoch": 2.1059369499119858, + "grad_norm": 5.15330696105957, + "learning_rate": 
1.4952933625715357e-05, + "loss": 4.0856, + "step": 26320 + }, + { + "epoch": 2.1067370779324692, + "grad_norm": 5.134254455566406, + "learning_rate": 1.493956249665722e-05, + "loss": 4.2384, + "step": 26330 + }, + { + "epoch": 2.1075372059529527, + "grad_norm": 4.56715726852417, + "learning_rate": 1.492619136759908e-05, + "loss": 4.2916, + "step": 26340 + }, + { + "epoch": 2.1083373339734357, + "grad_norm": 5.437539100646973, + "learning_rate": 1.4912820238540943e-05, + "loss": 4.1559, + "step": 26350 + }, + { + "epoch": 2.109137461993919, + "grad_norm": 4.780087471008301, + "learning_rate": 1.4900786222388619e-05, + "loss": 4.1329, + "step": 26360 + }, + { + "epoch": 2.109937590014402, + "grad_norm": 4.702531337738037, + "learning_rate": 1.4887415093330481e-05, + "loss": 4.3365, + "step": 26370 + }, + { + "epoch": 2.1107377180348856, + "grad_norm": 4.454277515411377, + "learning_rate": 1.4874043964272344e-05, + "loss": 4.2059, + "step": 26380 + }, + { + "epoch": 2.1115378460553687, + "grad_norm": 4.2128119468688965, + "learning_rate": 1.4860672835214207e-05, + "loss": 4.0916, + "step": 26390 + }, + { + "epoch": 2.112337974075852, + "grad_norm": 4.9969635009765625, + "learning_rate": 1.484730170615607e-05, + "loss": 4.3129, + "step": 26400 + }, + { + "epoch": 2.1131381020963356, + "grad_norm": 5.207634449005127, + "learning_rate": 1.4833930577097932e-05, + "loss": 4.2242, + "step": 26410 + }, + { + "epoch": 2.1139382301168186, + "grad_norm": 5.739307403564453, + "learning_rate": 1.4820559448039791e-05, + "loss": 4.3439, + "step": 26420 + }, + { + "epoch": 2.114738358137302, + "grad_norm": 4.130275726318359, + "learning_rate": 1.4807188318981654e-05, + "loss": 4.3128, + "step": 26430 + }, + { + "epoch": 2.115538486157785, + "grad_norm": 4.299551486968994, + "learning_rate": 1.4793817189923517e-05, + "loss": 4.3764, + "step": 26440 + }, + { + "epoch": 2.1163386141782685, + "grad_norm": 5.259521484375, + "learning_rate": 1.478044606086538e-05, + "loss": 4.2741, + 
"step": 26450 + }, + { + "epoch": 2.117138742198752, + "grad_norm": 5.917020797729492, + "learning_rate": 1.4767074931807242e-05, + "loss": 4.2622, + "step": 26460 + }, + { + "epoch": 2.117938870219235, + "grad_norm": 5.523458957672119, + "learning_rate": 1.4753703802749105e-05, + "loss": 4.2634, + "step": 26470 + }, + { + "epoch": 2.1187389982397185, + "grad_norm": 5.1210432052612305, + "learning_rate": 1.4740332673690968e-05, + "loss": 4.3752, + "step": 26480 + }, + { + "epoch": 2.1195391262602015, + "grad_norm": 5.014133930206299, + "learning_rate": 1.472696154463283e-05, + "loss": 4.3112, + "step": 26490 + }, + { + "epoch": 2.120339254280685, + "grad_norm": 4.23324728012085, + "learning_rate": 1.4713590415574691e-05, + "loss": 4.3547, + "step": 26500 + }, + { + "epoch": 2.1211393823011684, + "grad_norm": 4.404748439788818, + "learning_rate": 1.4700219286516554e-05, + "loss": 4.4991, + "step": 26510 + }, + { + "epoch": 2.1219395103216514, + "grad_norm": 7.926307201385498, + "learning_rate": 1.4686848157458417e-05, + "loss": 4.2694, + "step": 26520 + }, + { + "epoch": 2.122739638342135, + "grad_norm": 4.4926347732543945, + "learning_rate": 1.467347702840028e-05, + "loss": 4.2697, + "step": 26530 + }, + { + "epoch": 2.123539766362618, + "grad_norm": 5.678550720214844, + "learning_rate": 1.4660105899342142e-05, + "loss": 4.3183, + "step": 26540 + }, + { + "epoch": 2.1243398943831013, + "grad_norm": 5.447049617767334, + "learning_rate": 1.4646734770284005e-05, + "loss": 4.2823, + "step": 26550 + }, + { + "epoch": 2.125140022403585, + "grad_norm": 3.123251438140869, + "learning_rate": 1.4633363641225867e-05, + "loss": 4.2201, + "step": 26560 + }, + { + "epoch": 2.125940150424068, + "grad_norm": 3.6877589225769043, + "learning_rate": 1.461999251216773e-05, + "loss": 4.289, + "step": 26570 + }, + { + "epoch": 2.1267402784445513, + "grad_norm": 5.2759785652160645, + "learning_rate": 1.460662138310959e-05, + "loss": 4.4292, + "step": 26580 + }, + { + "epoch": 
2.1275404064650343, + "grad_norm": 4.95992374420166, + "learning_rate": 1.4593250254051452e-05, + "loss": 4.3569, + "step": 26590 + }, + { + "epoch": 2.1283405344855177, + "grad_norm": 4.725872993469238, + "learning_rate": 1.4579879124993315e-05, + "loss": 4.37, + "step": 26600 + }, + { + "epoch": 2.1291406625060008, + "grad_norm": 5.375117301940918, + "learning_rate": 1.4566507995935177e-05, + "loss": 4.2949, + "step": 26610 + }, + { + "epoch": 2.129940790526484, + "grad_norm": 5.185641765594482, + "learning_rate": 1.455313686687704e-05, + "loss": 4.2391, + "step": 26620 + }, + { + "epoch": 2.1307409185469677, + "grad_norm": 3.6917166709899902, + "learning_rate": 1.4539765737818903e-05, + "loss": 4.1506, + "step": 26630 + }, + { + "epoch": 2.1315410465674507, + "grad_norm": 5.494895935058594, + "learning_rate": 1.4526394608760765e-05, + "loss": 4.4124, + "step": 26640 + }, + { + "epoch": 2.132341174587934, + "grad_norm": 5.010682106018066, + "learning_rate": 1.4513023479702625e-05, + "loss": 4.3642, + "step": 26650 + }, + { + "epoch": 2.133141302608417, + "grad_norm": 4.535843372344971, + "learning_rate": 1.4499652350644487e-05, + "loss": 4.2799, + "step": 26660 + }, + { + "epoch": 2.1339414306289006, + "grad_norm": 3.600165367126465, + "learning_rate": 1.448628122158635e-05, + "loss": 4.2767, + "step": 26670 + }, + { + "epoch": 2.134741558649384, + "grad_norm": 5.112745761871338, + "learning_rate": 1.4472910092528213e-05, + "loss": 4.3118, + "step": 26680 + }, + { + "epoch": 2.135541686669867, + "grad_norm": 7.475447654724121, + "learning_rate": 1.4459538963470076e-05, + "loss": 4.3916, + "step": 26690 + }, + { + "epoch": 2.1363418146903506, + "grad_norm": 5.247001647949219, + "learning_rate": 1.4446167834411938e-05, + "loss": 4.2402, + "step": 26700 + }, + { + "epoch": 2.1371419427108336, + "grad_norm": 7.609944820404053, + "learning_rate": 1.4432796705353801e-05, + "loss": 4.2037, + "step": 26710 + }, + { + "epoch": 2.137942070731317, + "grad_norm": 
4.795637130737305, + "learning_rate": 1.4419425576295664e-05, + "loss": 4.3401, + "step": 26720 + }, + { + "epoch": 2.1387421987518005, + "grad_norm": 5.049854755401611, + "learning_rate": 1.4406054447237525e-05, + "loss": 4.3105, + "step": 26730 + }, + { + "epoch": 2.1395423267722835, + "grad_norm": 4.4695143699646, + "learning_rate": 1.4392683318179387e-05, + "loss": 4.3286, + "step": 26740 + }, + { + "epoch": 2.140342454792767, + "grad_norm": 5.833003997802734, + "learning_rate": 1.437931218912125e-05, + "loss": 4.386, + "step": 26750 + }, + { + "epoch": 2.14114258281325, + "grad_norm": 5.397075653076172, + "learning_rate": 1.4365941060063113e-05, + "loss": 4.3105, + "step": 26760 + }, + { + "epoch": 2.1419427108337334, + "grad_norm": 4.232978820800781, + "learning_rate": 1.4352569931004975e-05, + "loss": 4.2622, + "step": 26770 + }, + { + "epoch": 2.1427428388542165, + "grad_norm": 6.420408725738525, + "learning_rate": 1.4339198801946838e-05, + "loss": 4.2835, + "step": 26780 + }, + { + "epoch": 2.1435429668747, + "grad_norm": 6.694440841674805, + "learning_rate": 1.43258276728887e-05, + "loss": 4.345, + "step": 26790 + }, + { + "epoch": 2.1443430948951834, + "grad_norm": 6.900730133056641, + "learning_rate": 1.4312456543830563e-05, + "loss": 4.1415, + "step": 26800 + }, + { + "epoch": 2.1451432229156664, + "grad_norm": 6.370269775390625, + "learning_rate": 1.4299085414772423e-05, + "loss": 4.3092, + "step": 26810 + }, + { + "epoch": 2.14594335093615, + "grad_norm": 5.234679222106934, + "learning_rate": 1.4285714285714285e-05, + "loss": 4.456, + "step": 26820 + }, + { + "epoch": 2.146743478956633, + "grad_norm": 4.147063732147217, + "learning_rate": 1.4272343156656148e-05, + "loss": 4.1961, + "step": 26830 + }, + { + "epoch": 2.1475436069771163, + "grad_norm": 4.461815357208252, + "learning_rate": 1.425897202759801e-05, + "loss": 4.158, + "step": 26840 + }, + { + "epoch": 2.1483437349976, + "grad_norm": 7.251457691192627, + "learning_rate": 
1.4245600898539874e-05, + "loss": 4.3905, + "step": 26850 + }, + { + "epoch": 2.149143863018083, + "grad_norm": 4.718042850494385, + "learning_rate": 1.4232229769481736e-05, + "loss": 4.3418, + "step": 26860 + }, + { + "epoch": 2.1499439910385663, + "grad_norm": 4.637687683105469, + "learning_rate": 1.4218858640423599e-05, + "loss": 4.3702, + "step": 26870 + }, + { + "epoch": 2.1507441190590493, + "grad_norm": 6.43583869934082, + "learning_rate": 1.4205487511365462e-05, + "loss": 4.3164, + "step": 26880 + }, + { + "epoch": 2.1515442470795327, + "grad_norm": 5.462271213531494, + "learning_rate": 1.4192116382307321e-05, + "loss": 4.3193, + "step": 26890 + }, + { + "epoch": 2.152344375100016, + "grad_norm": 4.406450271606445, + "learning_rate": 1.4178745253249184e-05, + "loss": 4.3865, + "step": 26900 + }, + { + "epoch": 2.153144503120499, + "grad_norm": 6.0672383308410645, + "learning_rate": 1.4165374124191046e-05, + "loss": 4.2768, + "step": 26910 + }, + { + "epoch": 2.1539446311409827, + "grad_norm": 4.165776252746582, + "learning_rate": 1.4152002995132909e-05, + "loss": 4.1918, + "step": 26920 + }, + { + "epoch": 2.1547447591614657, + "grad_norm": 15.013633728027344, + "learning_rate": 1.4138631866074772e-05, + "loss": 4.2488, + "step": 26930 + }, + { + "epoch": 2.155544887181949, + "grad_norm": 5.367081165313721, + "learning_rate": 1.4125260737016634e-05, + "loss": 4.2885, + "step": 26940 + }, + { + "epoch": 2.156345015202432, + "grad_norm": 3.9611103534698486, + "learning_rate": 1.4111889607958497e-05, + "loss": 4.2249, + "step": 26950 + }, + { + "epoch": 2.1571451432229156, + "grad_norm": 4.482510089874268, + "learning_rate": 1.4098518478900358e-05, + "loss": 4.3155, + "step": 26960 + }, + { + "epoch": 2.157945271243399, + "grad_norm": 4.127138137817383, + "learning_rate": 1.408514734984222e-05, + "loss": 4.3791, + "step": 26970 + }, + { + "epoch": 2.158745399263882, + "grad_norm": 3.858367919921875, + "learning_rate": 1.4071776220784083e-05, + "loss": 4.3788, 
+ "step": 26980 + }, + { + "epoch": 2.1595455272843656, + "grad_norm": 5.6368184089660645, + "learning_rate": 1.4058405091725946e-05, + "loss": 4.2714, + "step": 26990 + }, + { + "epoch": 2.160345655304849, + "grad_norm": 5.943724632263184, + "learning_rate": 1.4045033962667809e-05, + "loss": 4.3902, + "step": 27000 + }, + { + "epoch": 2.161145783325332, + "grad_norm": 5.1518683433532715, + "learning_rate": 1.4031662833609671e-05, + "loss": 4.2325, + "step": 27010 + }, + { + "epoch": 2.1619459113458155, + "grad_norm": 3.714491605758667, + "learning_rate": 1.4018291704551534e-05, + "loss": 4.3165, + "step": 27020 + }, + { + "epoch": 2.1627460393662985, + "grad_norm": 4.722376823425293, + "learning_rate": 1.4004920575493397e-05, + "loss": 4.3397, + "step": 27030 + }, + { + "epoch": 2.163546167386782, + "grad_norm": 4.137203693389893, + "learning_rate": 1.3991549446435256e-05, + "loss": 4.0377, + "step": 27040 + }, + { + "epoch": 2.164346295407265, + "grad_norm": 6.193617820739746, + "learning_rate": 1.3978178317377119e-05, + "loss": 4.2832, + "step": 27050 + }, + { + "epoch": 2.1651464234277484, + "grad_norm": 4.346227169036865, + "learning_rate": 1.3964807188318982e-05, + "loss": 4.2071, + "step": 27060 + }, + { + "epoch": 2.165946551448232, + "grad_norm": 5.1168928146362305, + "learning_rate": 1.3951436059260844e-05, + "loss": 4.4359, + "step": 27070 + }, + { + "epoch": 2.166746679468715, + "grad_norm": 6.076730728149414, + "learning_rate": 1.3938064930202707e-05, + "loss": 4.2271, + "step": 27080 + }, + { + "epoch": 2.1675468074891984, + "grad_norm": 4.987486839294434, + "learning_rate": 1.392469380114457e-05, + "loss": 4.1962, + "step": 27090 + }, + { + "epoch": 2.1683469355096814, + "grad_norm": 4.582950592041016, + "learning_rate": 1.3911322672086432e-05, + "loss": 4.1916, + "step": 27100 + }, + { + "epoch": 2.169147063530165, + "grad_norm": 4.27584981918335, + "learning_rate": 1.3897951543028295e-05, + "loss": 4.3356, + "step": 27110 + }, + { + "epoch": 
2.1699471915506483, + "grad_norm": 4.555540084838867, + "learning_rate": 1.3884580413970156e-05, + "loss": 4.3487, + "step": 27120 + }, + { + "epoch": 2.1707473195711313, + "grad_norm": 7.551029205322266, + "learning_rate": 1.3871209284912019e-05, + "loss": 4.3706, + "step": 27130 + }, + { + "epoch": 2.1715474475916148, + "grad_norm": 6.390832901000977, + "learning_rate": 1.3857838155853881e-05, + "loss": 4.2307, + "step": 27140 + }, + { + "epoch": 2.172347575612098, + "grad_norm": 5.295320510864258, + "learning_rate": 1.3844467026795744e-05, + "loss": 4.3262, + "step": 27150 + }, + { + "epoch": 2.1731477036325813, + "grad_norm": 4.816108703613281, + "learning_rate": 1.3831095897737607e-05, + "loss": 4.4674, + "step": 27160 + }, + { + "epoch": 2.1739478316530647, + "grad_norm": 7.950437545776367, + "learning_rate": 1.381772476867947e-05, + "loss": 4.1512, + "step": 27170 + }, + { + "epoch": 2.1747479596735477, + "grad_norm": 6.867645740509033, + "learning_rate": 1.3804353639621332e-05, + "loss": 4.3138, + "step": 27180 + }, + { + "epoch": 2.175548087694031, + "grad_norm": 5.51462459564209, + "learning_rate": 1.3790982510563191e-05, + "loss": 4.1856, + "step": 27190 + }, + { + "epoch": 2.176348215714514, + "grad_norm": 11.651870727539062, + "learning_rate": 1.3777611381505054e-05, + "loss": 4.3696, + "step": 27200 + }, + { + "epoch": 2.1771483437349977, + "grad_norm": 5.805605411529541, + "learning_rate": 1.3764240252446917e-05, + "loss": 4.376, + "step": 27210 + }, + { + "epoch": 2.1779484717554807, + "grad_norm": 4.33229398727417, + "learning_rate": 1.375086912338878e-05, + "loss": 4.1416, + "step": 27220 + }, + { + "epoch": 2.178748599775964, + "grad_norm": 5.466543674468994, + "learning_rate": 1.3737497994330642e-05, + "loss": 4.416, + "step": 27230 + }, + { + "epoch": 2.1795487277964476, + "grad_norm": 4.547317028045654, + "learning_rate": 1.3724126865272505e-05, + "loss": 4.4502, + "step": 27240 + }, + { + "epoch": 2.1803488558169306, + "grad_norm": 
6.405552864074707, + "learning_rate": 1.3710755736214368e-05, + "loss": 4.2897, + "step": 27250 + }, + { + "epoch": 2.181148983837414, + "grad_norm": 4.219295024871826, + "learning_rate": 1.369738460715623e-05, + "loss": 4.1738, + "step": 27260 + }, + { + "epoch": 2.181949111857897, + "grad_norm": 3.7601661682128906, + "learning_rate": 1.368401347809809e-05, + "loss": 4.3072, + "step": 27270 + }, + { + "epoch": 2.1827492398783805, + "grad_norm": 6.388236045837402, + "learning_rate": 1.3670642349039952e-05, + "loss": 4.2357, + "step": 27280 + }, + { + "epoch": 2.183549367898864, + "grad_norm": 4.893552780151367, + "learning_rate": 1.3657271219981815e-05, + "loss": 4.2591, + "step": 27290 + }, + { + "epoch": 2.184349495919347, + "grad_norm": 9.936212539672852, + "learning_rate": 1.3643900090923678e-05, + "loss": 4.3157, + "step": 27300 + }, + { + "epoch": 2.1851496239398305, + "grad_norm": 4.956419944763184, + "learning_rate": 1.363052896186554e-05, + "loss": 4.2788, + "step": 27310 + }, + { + "epoch": 2.1859497519603135, + "grad_norm": 6.442319869995117, + "learning_rate": 1.3617157832807403e-05, + "loss": 4.3374, + "step": 27320 + }, + { + "epoch": 2.186749879980797, + "grad_norm": 5.18787956237793, + "learning_rate": 1.3603786703749266e-05, + "loss": 4.2136, + "step": 27330 + }, + { + "epoch": 2.1875500080012804, + "grad_norm": 4.710720062255859, + "learning_rate": 1.3590415574691128e-05, + "loss": 4.3571, + "step": 27340 + }, + { + "epoch": 2.1883501360217634, + "grad_norm": 7.41901159286499, + "learning_rate": 1.357704444563299e-05, + "loss": 4.2718, + "step": 27350 + }, + { + "epoch": 2.189150264042247, + "grad_norm": 4.362005710601807, + "learning_rate": 1.3563673316574852e-05, + "loss": 4.3566, + "step": 27360 + }, + { + "epoch": 2.18995039206273, + "grad_norm": 4.403487205505371, + "learning_rate": 1.3550302187516715e-05, + "loss": 4.3139, + "step": 27370 + }, + { + "epoch": 2.1907505200832134, + "grad_norm": 8.87827205657959, + "learning_rate": 
1.3536931058458577e-05, + "loss": 4.099, + "step": 27380 + }, + { + "epoch": 2.1915506481036964, + "grad_norm": 4.843017578125, + "learning_rate": 1.352355992940044e-05, + "loss": 4.2981, + "step": 27390 + }, + { + "epoch": 2.19235077612418, + "grad_norm": 8.040793418884277, + "learning_rate": 1.3510188800342303e-05, + "loss": 4.3916, + "step": 27400 + }, + { + "epoch": 2.1931509041446633, + "grad_norm": 5.31903076171875, + "learning_rate": 1.3496817671284166e-05, + "loss": 4.3502, + "step": 27410 + }, + { + "epoch": 2.1939510321651463, + "grad_norm": 5.527568817138672, + "learning_rate": 1.3483446542226028e-05, + "loss": 4.3331, + "step": 27420 + }, + { + "epoch": 2.1947511601856298, + "grad_norm": 4.029179573059082, + "learning_rate": 1.3470075413167888e-05, + "loss": 4.2993, + "step": 27430 + }, + { + "epoch": 2.195551288206113, + "grad_norm": 5.164677619934082, + "learning_rate": 1.345670428410975e-05, + "loss": 4.3953, + "step": 27440 + }, + { + "epoch": 2.1963514162265962, + "grad_norm": 5.0720720291137695, + "learning_rate": 1.3443333155051613e-05, + "loss": 4.4189, + "step": 27450 + }, + { + "epoch": 2.1971515442470797, + "grad_norm": 4.484434127807617, + "learning_rate": 1.3429962025993476e-05, + "loss": 4.3077, + "step": 27460 + }, + { + "epoch": 2.1979516722675627, + "grad_norm": 4.3660969734191895, + "learning_rate": 1.3416590896935338e-05, + "loss": 4.1555, + "step": 27470 + }, + { + "epoch": 2.198751800288046, + "grad_norm": 4.788058757781982, + "learning_rate": 1.3403219767877201e-05, + "loss": 4.2902, + "step": 27480 + }, + { + "epoch": 2.199551928308529, + "grad_norm": 6.046142578125, + "learning_rate": 1.3389848638819064e-05, + "loss": 4.3144, + "step": 27490 + }, + { + "epoch": 2.2003520563290127, + "grad_norm": 4.384228229522705, + "learning_rate": 1.3376477509760923e-05, + "loss": 4.4575, + "step": 27500 + }, + { + "epoch": 2.201152184349496, + "grad_norm": 6.044991493225098, + "learning_rate": 1.3363106380702786e-05, + "loss": 4.2463, + 
"step": 27510 + }, + { + "epoch": 2.201952312369979, + "grad_norm": 5.55415678024292, + "learning_rate": 1.3349735251644648e-05, + "loss": 4.2905, + "step": 27520 + }, + { + "epoch": 2.2027524403904626, + "grad_norm": 6.360270977020264, + "learning_rate": 1.3336364122586511e-05, + "loss": 4.3698, + "step": 27530 + }, + { + "epoch": 2.2035525684109456, + "grad_norm": 5.214306354522705, + "learning_rate": 1.3322992993528374e-05, + "loss": 4.2608, + "step": 27540 + }, + { + "epoch": 2.204352696431429, + "grad_norm": 5.412326335906982, + "learning_rate": 1.3309621864470236e-05, + "loss": 4.4204, + "step": 27550 + }, + { + "epoch": 2.205152824451912, + "grad_norm": 4.402900218963623, + "learning_rate": 1.3296250735412099e-05, + "loss": 4.3345, + "step": 27560 + }, + { + "epoch": 2.2059529524723955, + "grad_norm": 4.055781364440918, + "learning_rate": 1.3282879606353962e-05, + "loss": 4.2738, + "step": 27570 + }, + { + "epoch": 2.206753080492879, + "grad_norm": 4.639005661010742, + "learning_rate": 1.3269508477295823e-05, + "loss": 4.3226, + "step": 27580 + }, + { + "epoch": 2.207553208513362, + "grad_norm": 6.268497467041016, + "learning_rate": 1.3256137348237685e-05, + "loss": 4.2283, + "step": 27590 + }, + { + "epoch": 2.2083533365338455, + "grad_norm": 5.508733749389648, + "learning_rate": 1.3242766219179548e-05, + "loss": 4.3479, + "step": 27600 + }, + { + "epoch": 2.209153464554329, + "grad_norm": 6.806684970855713, + "learning_rate": 1.322939509012141e-05, + "loss": 4.2675, + "step": 27610 + }, + { + "epoch": 2.209953592574812, + "grad_norm": 7.854143142700195, + "learning_rate": 1.3216023961063274e-05, + "loss": 4.289, + "step": 27620 + }, + { + "epoch": 2.2107537205952954, + "grad_norm": 4.771146297454834, + "learning_rate": 1.3202652832005136e-05, + "loss": 4.3741, + "step": 27630 + }, + { + "epoch": 2.2115538486157784, + "grad_norm": 4.5379838943481445, + "learning_rate": 1.3189281702946999e-05, + "loss": 4.1667, + "step": 27640 + }, + { + "epoch": 
2.212353976636262, + "grad_norm": 8.35084056854248, + "learning_rate": 1.3175910573888862e-05, + "loss": 4.3594, + "step": 27650 + }, + { + "epoch": 2.213154104656745, + "grad_norm": 5.7083306312561035, + "learning_rate": 1.3162539444830721e-05, + "loss": 4.3591, + "step": 27660 + }, + { + "epoch": 2.2139542326772284, + "grad_norm": 5.427682399749756, + "learning_rate": 1.3149168315772584e-05, + "loss": 4.2273, + "step": 27670 + }, + { + "epoch": 2.214754360697712, + "grad_norm": 3.5514488220214844, + "learning_rate": 1.3135797186714446e-05, + "loss": 4.1346, + "step": 27680 + }, + { + "epoch": 2.215554488718195, + "grad_norm": 6.326035022735596, + "learning_rate": 1.3122426057656309e-05, + "loss": 4.3059, + "step": 27690 + }, + { + "epoch": 2.2163546167386783, + "grad_norm": 3.6098668575286865, + "learning_rate": 1.3109054928598172e-05, + "loss": 4.318, + "step": 27700 + }, + { + "epoch": 2.2171547447591613, + "grad_norm": 5.207823276519775, + "learning_rate": 1.3095683799540034e-05, + "loss": 4.3012, + "step": 27710 + }, + { + "epoch": 2.2179548727796448, + "grad_norm": 3.878981590270996, + "learning_rate": 1.3082312670481897e-05, + "loss": 4.208, + "step": 27720 + }, + { + "epoch": 2.218755000800128, + "grad_norm": 4.142792701721191, + "learning_rate": 1.306894154142376e-05, + "loss": 4.2057, + "step": 27730 + }, + { + "epoch": 2.2195551288206112, + "grad_norm": 4.735614776611328, + "learning_rate": 1.3055570412365619e-05, + "loss": 4.3577, + "step": 27740 + }, + { + "epoch": 2.2203552568410947, + "grad_norm": 4.719352722167969, + "learning_rate": 1.3042199283307482e-05, + "loss": 4.2049, + "step": 27750 + }, + { + "epoch": 2.2211553848615777, + "grad_norm": 5.648311138153076, + "learning_rate": 1.3028828154249344e-05, + "loss": 4.2577, + "step": 27760 + }, + { + "epoch": 2.221955512882061, + "grad_norm": 4.707512855529785, + "learning_rate": 1.3015457025191207e-05, + "loss": 4.2289, + "step": 27770 + }, + { + "epoch": 2.2227556409025446, + "grad_norm": 
6.615943431854248, + "learning_rate": 1.300208589613307e-05, + "loss": 4.3439, + "step": 27780 + }, + { + "epoch": 2.2235557689230276, + "grad_norm": 11.488637924194336, + "learning_rate": 1.2988714767074932e-05, + "loss": 4.4672, + "step": 27790 + }, + { + "epoch": 2.224355896943511, + "grad_norm": 4.287919044494629, + "learning_rate": 1.2975343638016795e-05, + "loss": 4.6242, + "step": 27800 + }, + { + "epoch": 2.225156024963994, + "grad_norm": 4.993651866912842, + "learning_rate": 1.2961972508958656e-05, + "loss": 4.3635, + "step": 27810 + }, + { + "epoch": 2.2259561529844776, + "grad_norm": 7.000411510467529, + "learning_rate": 1.2948601379900519e-05, + "loss": 4.092, + "step": 27820 + }, + { + "epoch": 2.2267562810049606, + "grad_norm": 13.637682914733887, + "learning_rate": 1.2935230250842382e-05, + "loss": 4.4024, + "step": 27830 + }, + { + "epoch": 2.227556409025444, + "grad_norm": 5.699049472808838, + "learning_rate": 1.2921859121784244e-05, + "loss": 4.2697, + "step": 27840 + }, + { + "epoch": 2.2283565370459275, + "grad_norm": 5.741790771484375, + "learning_rate": 1.2908487992726107e-05, + "loss": 4.2566, + "step": 27850 + }, + { + "epoch": 2.2291566650664105, + "grad_norm": 8.050752639770508, + "learning_rate": 1.289511686366797e-05, + "loss": 4.3565, + "step": 27860 + }, + { + "epoch": 2.229956793086894, + "grad_norm": 4.524864196777344, + "learning_rate": 1.2881745734609832e-05, + "loss": 4.3523, + "step": 27870 + }, + { + "epoch": 2.230756921107377, + "grad_norm": 5.576345920562744, + "learning_rate": 1.2868374605551695e-05, + "loss": 4.2655, + "step": 27880 + }, + { + "epoch": 2.2315570491278605, + "grad_norm": 7.234230041503906, + "learning_rate": 1.2855003476493554e-05, + "loss": 4.224, + "step": 27890 + }, + { + "epoch": 2.232357177148344, + "grad_norm": 3.625093698501587, + "learning_rate": 1.2841632347435417e-05, + "loss": 4.314, + "step": 27900 + }, + { + "epoch": 2.233157305168827, + "grad_norm": 5.142899990081787, + "learning_rate": 
1.282826121837728e-05, + "loss": 4.2523, + "step": 27910 + }, + { + "epoch": 2.2339574331893104, + "grad_norm": 5.890714168548584, + "learning_rate": 1.2814890089319142e-05, + "loss": 4.408, + "step": 27920 + }, + { + "epoch": 2.2347575612097934, + "grad_norm": 6.055865287780762, + "learning_rate": 1.2801518960261005e-05, + "loss": 4.2961, + "step": 27930 + }, + { + "epoch": 2.235557689230277, + "grad_norm": 12.97319507598877, + "learning_rate": 1.2788147831202868e-05, + "loss": 4.2993, + "step": 27940 + }, + { + "epoch": 2.2363578172507603, + "grad_norm": 5.342799186706543, + "learning_rate": 1.277477670214473e-05, + "loss": 4.224, + "step": 27950 + }, + { + "epoch": 2.2371579452712433, + "grad_norm": 3.9940850734710693, + "learning_rate": 1.2761405573086593e-05, + "loss": 4.2813, + "step": 27960 + }, + { + "epoch": 2.237958073291727, + "grad_norm": 3.819955587387085, + "learning_rate": 1.2748034444028452e-05, + "loss": 4.2159, + "step": 27970 + }, + { + "epoch": 2.23875820131221, + "grad_norm": 6.087779998779297, + "learning_rate": 1.2734663314970315e-05, + "loss": 4.2637, + "step": 27980 + }, + { + "epoch": 2.2395583293326933, + "grad_norm": 5.827731132507324, + "learning_rate": 1.2721292185912178e-05, + "loss": 4.391, + "step": 27990 + }, + { + "epoch": 2.2403584573531763, + "grad_norm": 5.9532389640808105, + "learning_rate": 1.270792105685404e-05, + "loss": 4.2314, + "step": 28000 + }, + { + "epoch": 2.2403584573531763, + "eval_loss": 5.650319576263428, + "eval_runtime": 17.4181, + "eval_samples_per_second": 2.296, + "eval_steps_per_second": 0.287, + "step": 28000 + }, + { + "epoch": 2.2411585853736598, + "grad_norm": 4.932950973510742, + "learning_rate": 1.2694549927795903e-05, + "loss": 4.3077, + "step": 28010 + }, + { + "epoch": 2.241958713394143, + "grad_norm": 6.454247951507568, + "learning_rate": 1.2681178798737766e-05, + "loss": 4.2879, + "step": 28020 + }, + { + "epoch": 2.2427588414146262, + "grad_norm": 6.555384159088135, + "learning_rate": 
1.2667807669679629e-05, + "loss": 4.1309, + "step": 28030 + }, + { + "epoch": 2.2435589694351097, + "grad_norm": 5.856935501098633, + "learning_rate": 1.265443654062149e-05, + "loss": 4.1817, + "step": 28040 + }, + { + "epoch": 2.2443590974555927, + "grad_norm": 6.825374603271484, + "learning_rate": 1.2641065411563352e-05, + "loss": 4.2748, + "step": 28050 + }, + { + "epoch": 2.245159225476076, + "grad_norm": 4.445517063140869, + "learning_rate": 1.2627694282505215e-05, + "loss": 4.1561, + "step": 28060 + }, + { + "epoch": 2.2459593534965596, + "grad_norm": 5.488250255584717, + "learning_rate": 1.2614323153447078e-05, + "loss": 4.1993, + "step": 28070 + }, + { + "epoch": 2.2467594815170426, + "grad_norm": 8.348297119140625, + "learning_rate": 1.260095202438894e-05, + "loss": 4.1527, + "step": 28080 + }, + { + "epoch": 2.247559609537526, + "grad_norm": 4.551492214202881, + "learning_rate": 1.2587580895330803e-05, + "loss": 4.2166, + "step": 28090 + }, + { + "epoch": 2.248359737558009, + "grad_norm": 4.065390110015869, + "learning_rate": 1.2574209766272666e-05, + "loss": 4.3189, + "step": 28100 + }, + { + "epoch": 2.2491598655784926, + "grad_norm": 4.235226154327393, + "learning_rate": 1.2560838637214528e-05, + "loss": 4.2793, + "step": 28110 + }, + { + "epoch": 2.249959993598976, + "grad_norm": 4.71818733215332, + "learning_rate": 1.2547467508156388e-05, + "loss": 4.3372, + "step": 28120 + }, + { + "epoch": 2.250760121619459, + "grad_norm": 5.000930309295654, + "learning_rate": 1.253409637909825e-05, + "loss": 4.227, + "step": 28130 + }, + { + "epoch": 2.2515602496399425, + "grad_norm": 4.730818748474121, + "learning_rate": 1.2520725250040113e-05, + "loss": 4.3551, + "step": 28140 + }, + { + "epoch": 2.2523603776604255, + "grad_norm": 5.300826549530029, + "learning_rate": 1.2507354120981976e-05, + "loss": 4.2116, + "step": 28150 + }, + { + "epoch": 2.253160505680909, + "grad_norm": 4.391541957855225, + "learning_rate": 1.2493982991923838e-05, + "loss": 4.3452, + 
"step": 28160 + }, + { + "epoch": 2.253960633701392, + "grad_norm": 7.184095859527588, + "learning_rate": 1.2480611862865701e-05, + "loss": 4.0692, + "step": 28170 + }, + { + "epoch": 2.2547607617218755, + "grad_norm": 6.555243968963623, + "learning_rate": 1.2467240733807562e-05, + "loss": 4.2418, + "step": 28180 + }, + { + "epoch": 2.255560889742359, + "grad_norm": 5.414023399353027, + "learning_rate": 1.2453869604749425e-05, + "loss": 4.2476, + "step": 28190 + }, + { + "epoch": 2.256361017762842, + "grad_norm": 5.8872599601745605, + "learning_rate": 1.2440498475691288e-05, + "loss": 4.3209, + "step": 28200 + }, + { + "epoch": 2.2571611457833254, + "grad_norm": 5.530123233795166, + "learning_rate": 1.242712734663315e-05, + "loss": 4.2192, + "step": 28210 + }, + { + "epoch": 2.257961273803809, + "grad_norm": 4.799288749694824, + "learning_rate": 1.2413756217575013e-05, + "loss": 4.1957, + "step": 28220 + }, + { + "epoch": 2.258761401824292, + "grad_norm": 5.001644134521484, + "learning_rate": 1.2400385088516876e-05, + "loss": 4.3231, + "step": 28230 + }, + { + "epoch": 2.2595615298447753, + "grad_norm": 6.378586292266846, + "learning_rate": 1.2387013959458738e-05, + "loss": 4.3512, + "step": 28240 + }, + { + "epoch": 2.2603616578652583, + "grad_norm": 4.091723918914795, + "learning_rate": 1.2373642830400601e-05, + "loss": 4.4076, + "step": 28250 + }, + { + "epoch": 2.261161785885742, + "grad_norm": 4.692946434020996, + "learning_rate": 1.2360271701342462e-05, + "loss": 4.34, + "step": 28260 + }, + { + "epoch": 2.261961913906225, + "grad_norm": 4.702794075012207, + "learning_rate": 1.2346900572284325e-05, + "loss": 4.3236, + "step": 28270 + }, + { + "epoch": 2.2627620419267083, + "grad_norm": 6.910373210906982, + "learning_rate": 1.2333529443226187e-05, + "loss": 4.2764, + "step": 28280 + }, + { + "epoch": 2.2635621699471917, + "grad_norm": 4.597626209259033, + "learning_rate": 1.232015831416805e-05, + "loss": 4.3461, + "step": 28290 + }, + { + "epoch": 
2.2643622979676747, + "grad_norm": 6.760199546813965, + "learning_rate": 1.2306787185109911e-05, + "loss": 4.2325, + "step": 28300 + }, + { + "epoch": 2.265162425988158, + "grad_norm": 6.140738010406494, + "learning_rate": 1.2293416056051774e-05, + "loss": 4.2288, + "step": 28310 + }, + { + "epoch": 2.265962554008641, + "grad_norm": 5.178011894226074, + "learning_rate": 1.2280044926993636e-05, + "loss": 4.364, + "step": 28320 + }, + { + "epoch": 2.2667626820291247, + "grad_norm": 6.677469253540039, + "learning_rate": 1.2266673797935497e-05, + "loss": 4.3654, + "step": 28330 + }, + { + "epoch": 2.2675628100496077, + "grad_norm": 7.555559158325195, + "learning_rate": 1.225330266887736e-05, + "loss": 4.1391, + "step": 28340 + }, + { + "epoch": 2.268362938070091, + "grad_norm": 5.3249406814575195, + "learning_rate": 1.2239931539819223e-05, + "loss": 4.2865, + "step": 28350 + }, + { + "epoch": 2.2691630660905746, + "grad_norm": 6.221409320831299, + "learning_rate": 1.2226560410761085e-05, + "loss": 4.4498, + "step": 28360 + }, + { + "epoch": 2.2699631941110576, + "grad_norm": 6.600496292114258, + "learning_rate": 1.2213189281702946e-05, + "loss": 4.2898, + "step": 28370 + }, + { + "epoch": 2.270763322131541, + "grad_norm": 6.635727405548096, + "learning_rate": 1.219981815264481e-05, + "loss": 4.1712, + "step": 28380 + }, + { + "epoch": 2.2715634501520245, + "grad_norm": 6.427636623382568, + "learning_rate": 1.2186447023586672e-05, + "loss": 4.316, + "step": 28390 + }, + { + "epoch": 2.2723635781725076, + "grad_norm": 5.909119606018066, + "learning_rate": 1.2173075894528535e-05, + "loss": 4.3188, + "step": 28400 + }, + { + "epoch": 2.273163706192991, + "grad_norm": 4.973991870880127, + "learning_rate": 1.2159704765470396e-05, + "loss": 4.1295, + "step": 28410 + }, + { + "epoch": 2.273963834213474, + "grad_norm": 5.875889301300049, + "learning_rate": 1.2146333636412258e-05, + "loss": 4.1011, + "step": 28420 + }, + { + "epoch": 2.2747639622339575, + "grad_norm": 
6.347286224365234, + "learning_rate": 1.2132962507354121e-05, + "loss": 4.3025, + "step": 28430 + }, + { + "epoch": 2.2755640902544405, + "grad_norm": 9.685184478759766, + "learning_rate": 1.2119591378295984e-05, + "loss": 4.2504, + "step": 28440 + }, + { + "epoch": 2.276364218274924, + "grad_norm": 3.5592947006225586, + "learning_rate": 1.2106220249237846e-05, + "loss": 4.0199, + "step": 28450 + }, + { + "epoch": 2.2771643462954074, + "grad_norm": 5.19354248046875, + "learning_rate": 1.2092849120179709e-05, + "loss": 4.3501, + "step": 28460 + }, + { + "epoch": 2.2779644743158904, + "grad_norm": 5.163251876831055, + "learning_rate": 1.2079477991121572e-05, + "loss": 4.2562, + "step": 28470 + }, + { + "epoch": 2.278764602336374, + "grad_norm": 3.630056381225586, + "learning_rate": 1.2067443974969247e-05, + "loss": 4.3743, + "step": 28480 + }, + { + "epoch": 2.279564730356857, + "grad_norm": 3.2357378005981445, + "learning_rate": 1.2054072845911108e-05, + "loss": 4.333, + "step": 28490 + }, + { + "epoch": 2.2803648583773404, + "grad_norm": 5.15466833114624, + "learning_rate": 1.2040701716852971e-05, + "loss": 4.295, + "step": 28500 + }, + { + "epoch": 2.281164986397824, + "grad_norm": 6.575782775878906, + "learning_rate": 1.2027330587794834e-05, + "loss": 4.1019, + "step": 28510 + }, + { + "epoch": 2.281965114418307, + "grad_norm": 4.305944442749023, + "learning_rate": 1.2013959458736697e-05, + "loss": 4.2613, + "step": 28520 + }, + { + "epoch": 2.2827652424387903, + "grad_norm": 6.759955406188965, + "learning_rate": 1.2000588329678558e-05, + "loss": 4.324, + "step": 28530 + }, + { + "epoch": 2.2835653704592733, + "grad_norm": 4.603118419647217, + "learning_rate": 1.198721720062042e-05, + "loss": 4.2917, + "step": 28540 + }, + { + "epoch": 2.284365498479757, + "grad_norm": 5.116035461425781, + "learning_rate": 1.1973846071562283e-05, + "loss": 4.0603, + "step": 28550 + }, + { + "epoch": 2.2851656265002402, + "grad_norm": 4.8761796951293945, + "learning_rate": 
1.1960474942504146e-05, + "loss": 4.2702, + "step": 28560 + }, + { + "epoch": 2.2859657545207233, + "grad_norm": 7.16318941116333, + "learning_rate": 1.1947103813446008e-05, + "loss": 4.2756, + "step": 28570 + }, + { + "epoch": 2.2867658825412067, + "grad_norm": 8.105825424194336, + "learning_rate": 1.1933732684387871e-05, + "loss": 4.2905, + "step": 28580 + }, + { + "epoch": 2.2875660105616897, + "grad_norm": 6.250253200531006, + "learning_rate": 1.1920361555329734e-05, + "loss": 4.2202, + "step": 28590 + }, + { + "epoch": 2.288366138582173, + "grad_norm": 4.617362022399902, + "learning_rate": 1.1906990426271596e-05, + "loss": 4.0861, + "step": 28600 + }, + { + "epoch": 2.289166266602656, + "grad_norm": 5.895457744598389, + "learning_rate": 1.1893619297213457e-05, + "loss": 4.0952, + "step": 28610 + }, + { + "epoch": 2.2899663946231397, + "grad_norm": 4.059090614318848, + "learning_rate": 1.188024816815532e-05, + "loss": 4.1865, + "step": 28620 + }, + { + "epoch": 2.290766522643623, + "grad_norm": 8.624724388122559, + "learning_rate": 1.1866877039097183e-05, + "loss": 4.2946, + "step": 28630 + }, + { + "epoch": 2.291566650664106, + "grad_norm": 6.121615409851074, + "learning_rate": 1.1853505910039044e-05, + "loss": 4.1917, + "step": 28640 + }, + { + "epoch": 2.2923667786845896, + "grad_norm": 6.071402549743652, + "learning_rate": 1.1840134780980906e-05, + "loss": 4.3638, + "step": 28650 + }, + { + "epoch": 2.293166906705073, + "grad_norm": 4.712641716003418, + "learning_rate": 1.1826763651922769e-05, + "loss": 4.1598, + "step": 28660 + }, + { + "epoch": 2.293967034725556, + "grad_norm": 5.36449670791626, + "learning_rate": 1.1813392522864632e-05, + "loss": 4.2292, + "step": 28670 + }, + { + "epoch": 2.2947671627460395, + "grad_norm": 5.73878812789917, + "learning_rate": 1.1800021393806493e-05, + "loss": 4.362, + "step": 28680 + }, + { + "epoch": 2.2955672907665226, + "grad_norm": 4.448238372802734, + "learning_rate": 1.1786650264748355e-05, + "loss": 4.1564, + 
"step": 28690 + }, + { + "epoch": 2.296367418787006, + "grad_norm": 5.033682823181152, + "learning_rate": 1.1773279135690218e-05, + "loss": 4.147, + "step": 28700 + }, + { + "epoch": 2.297167546807489, + "grad_norm": 4.061682224273682, + "learning_rate": 1.175990800663208e-05, + "loss": 4.4025, + "step": 28710 + }, + { + "epoch": 2.2979676748279725, + "grad_norm": 5.760485649108887, + "learning_rate": 1.1746536877573942e-05, + "loss": 4.3773, + "step": 28720 + }, + { + "epoch": 2.298767802848456, + "grad_norm": 8.28410816192627, + "learning_rate": 1.1733165748515805e-05, + "loss": 4.1615, + "step": 28730 + }, + { + "epoch": 2.299567930868939, + "grad_norm": 4.579817771911621, + "learning_rate": 1.1719794619457667e-05, + "loss": 4.365, + "step": 28740 + }, + { + "epoch": 2.3003680588894224, + "grad_norm": 5.556467533111572, + "learning_rate": 1.170642349039953e-05, + "loss": 4.36, + "step": 28750 + }, + { + "epoch": 2.3011681869099054, + "grad_norm": 4.739517688751221, + "learning_rate": 1.1693052361341391e-05, + "loss": 4.3933, + "step": 28760 + }, + { + "epoch": 2.301968314930389, + "grad_norm": 4.348276138305664, + "learning_rate": 1.1679681232283254e-05, + "loss": 4.0874, + "step": 28770 + }, + { + "epoch": 2.302768442950872, + "grad_norm": 4.935428142547607, + "learning_rate": 1.1666310103225116e-05, + "loss": 4.3363, + "step": 28780 + }, + { + "epoch": 2.3035685709713554, + "grad_norm": 3.923457145690918, + "learning_rate": 1.1652938974166979e-05, + "loss": 4.206, + "step": 28790 + }, + { + "epoch": 2.304368698991839, + "grad_norm": 4.47625732421875, + "learning_rate": 1.1639567845108842e-05, + "loss": 4.3173, + "step": 28800 + }, + { + "epoch": 2.305168827012322, + "grad_norm": 5.407947540283203, + "learning_rate": 1.1626196716050704e-05, + "loss": 4.3247, + "step": 28810 + }, + { + "epoch": 2.3059689550328053, + "grad_norm": 4.727847099304199, + "learning_rate": 1.1612825586992567e-05, + "loss": 4.366, + "step": 28820 + }, + { + "epoch": 2.3067690830532888, 
+ "grad_norm": 5.98025369644165, + "learning_rate": 1.159945445793443e-05, + "loss": 4.2372, + "step": 28830 + }, + { + "epoch": 2.3075692110737718, + "grad_norm": 4.812393665313721, + "learning_rate": 1.158608332887629e-05, + "loss": 4.3197, + "step": 28840 + }, + { + "epoch": 2.3083693390942552, + "grad_norm": 6.968140125274658, + "learning_rate": 1.1572712199818153e-05, + "loss": 4.4518, + "step": 28850 + }, + { + "epoch": 2.3091694671147383, + "grad_norm": 5.6051740646362305, + "learning_rate": 1.1559341070760016e-05, + "loss": 4.1207, + "step": 28860 + }, + { + "epoch": 2.3099695951352217, + "grad_norm": 4.6692705154418945, + "learning_rate": 1.1545969941701879e-05, + "loss": 4.1852, + "step": 28870 + }, + { + "epoch": 2.3107697231557047, + "grad_norm": 5.183732032775879, + "learning_rate": 1.153259881264374e-05, + "loss": 4.167, + "step": 28880 + }, + { + "epoch": 2.311569851176188, + "grad_norm": 4.867135524749756, + "learning_rate": 1.1519227683585602e-05, + "loss": 4.148, + "step": 28890 + }, + { + "epoch": 2.3123699791966716, + "grad_norm": 4.534615993499756, + "learning_rate": 1.1505856554527465e-05, + "loss": 4.2532, + "step": 28900 + }, + { + "epoch": 2.3131701072171547, + "grad_norm": 5.32534122467041, + "learning_rate": 1.1492485425469326e-05, + "loss": 4.4451, + "step": 28910 + }, + { + "epoch": 2.313970235237638, + "grad_norm": 6.864644527435303, + "learning_rate": 1.1479114296411189e-05, + "loss": 4.345, + "step": 28920 + }, + { + "epoch": 2.314770363258121, + "grad_norm": 4.056292533874512, + "learning_rate": 1.1465743167353052e-05, + "loss": 4.1855, + "step": 28930 + }, + { + "epoch": 2.3155704912786046, + "grad_norm": 4.13967227935791, + "learning_rate": 1.1452372038294914e-05, + "loss": 4.2302, + "step": 28940 + }, + { + "epoch": 2.3163706192990876, + "grad_norm": 4.996280670166016, + "learning_rate": 1.1439000909236775e-05, + "loss": 4.3012, + "step": 28950 + }, + { + "epoch": 2.317170747319571, + "grad_norm": 5.1281914710998535, + 
"learning_rate": 1.1425629780178638e-05, + "loss": 4.2522, + "step": 28960 + }, + { + "epoch": 2.3179708753400545, + "grad_norm": 8.366037368774414, + "learning_rate": 1.14122586511205e-05, + "loss": 4.4064, + "step": 28970 + }, + { + "epoch": 2.3187710033605375, + "grad_norm": 4.212780475616455, + "learning_rate": 1.1398887522062363e-05, + "loss": 4.1981, + "step": 28980 + }, + { + "epoch": 2.319571131381021, + "grad_norm": 7.184803485870361, + "learning_rate": 1.1385516393004226e-05, + "loss": 4.3587, + "step": 28990 + }, + { + "epoch": 2.3203712594015045, + "grad_norm": 5.358102321624756, + "learning_rate": 1.1372145263946089e-05, + "loss": 4.2411, + "step": 29000 + }, + { + "epoch": 2.3211713874219875, + "grad_norm": 4.48073673248291, + "learning_rate": 1.1358774134887951e-05, + "loss": 4.2713, + "step": 29010 + }, + { + "epoch": 2.321971515442471, + "grad_norm": 4.86694860458374, + "learning_rate": 1.1345403005829814e-05, + "loss": 4.2218, + "step": 29020 + }, + { + "epoch": 2.322771643462954, + "grad_norm": 5.006281852722168, + "learning_rate": 1.1332031876771675e-05, + "loss": 4.1286, + "step": 29030 + }, + { + "epoch": 2.3235717714834374, + "grad_norm": 4.683011531829834, + "learning_rate": 1.1318660747713538e-05, + "loss": 4.1421, + "step": 29040 + }, + { + "epoch": 2.3243718995039204, + "grad_norm": 9.335220336914062, + "learning_rate": 1.13052896186554e-05, + "loss": 4.0164, + "step": 29050 + }, + { + "epoch": 2.325172027524404, + "grad_norm": 5.779821872711182, + "learning_rate": 1.1291918489597263e-05, + "loss": 4.1787, + "step": 29060 + }, + { + "epoch": 2.3259721555448873, + "grad_norm": 4.714453220367432, + "learning_rate": 1.1278547360539124e-05, + "loss": 4.2722, + "step": 29070 + }, + { + "epoch": 2.3267722835653704, + "grad_norm": 8.025455474853516, + "learning_rate": 1.1265176231480987e-05, + "loss": 4.4028, + "step": 29080 + }, + { + "epoch": 2.327572411585854, + "grad_norm": 5.3493194580078125, + "learning_rate": 1.125180510242285e-05, + 
"loss": 4.2032, + "step": 29090 + }, + { + "epoch": 2.328372539606337, + "grad_norm": 4.934419631958008, + "learning_rate": 1.1238433973364712e-05, + "loss": 4.2815, + "step": 29100 + }, + { + "epoch": 2.3291726676268203, + "grad_norm": 5.229598045349121, + "learning_rate": 1.1225062844306573e-05, + "loss": 4.3688, + "step": 29110 + }, + { + "epoch": 2.3299727956473038, + "grad_norm": 6.556520938873291, + "learning_rate": 1.1211691715248436e-05, + "loss": 4.2369, + "step": 29120 + }, + { + "epoch": 2.3307729236677868, + "grad_norm": 4.490777015686035, + "learning_rate": 1.1198320586190299e-05, + "loss": 4.2573, + "step": 29130 + }, + { + "epoch": 2.3315730516882702, + "grad_norm": 5.96286678314209, + "learning_rate": 1.1184949457132161e-05, + "loss": 4.1455, + "step": 29140 + }, + { + "epoch": 2.3323731797087532, + "grad_norm": 6.92281436920166, + "learning_rate": 1.1171578328074022e-05, + "loss": 4.185, + "step": 29150 + }, + { + "epoch": 2.3331733077292367, + "grad_norm": 5.416003704071045, + "learning_rate": 1.1158207199015885e-05, + "loss": 4.2704, + "step": 29160 + }, + { + "epoch": 2.33397343574972, + "grad_norm": 3.974001407623291, + "learning_rate": 1.1144836069957748e-05, + "loss": 4.2986, + "step": 29170 + }, + { + "epoch": 2.334773563770203, + "grad_norm": 5.721689701080322, + "learning_rate": 1.113146494089961e-05, + "loss": 4.1773, + "step": 29180 + }, + { + "epoch": 2.3355736917906866, + "grad_norm": 4.181761741638184, + "learning_rate": 1.1118093811841471e-05, + "loss": 4.3206, + "step": 29190 + }, + { + "epoch": 2.3363738198111696, + "grad_norm": 5.180452823638916, + "learning_rate": 1.1104722682783334e-05, + "loss": 4.3047, + "step": 29200 + }, + { + "epoch": 2.337173947831653, + "grad_norm": 3.443024158477783, + "learning_rate": 1.1091351553725197e-05, + "loss": 4.3859, + "step": 29210 + }, + { + "epoch": 2.337974075852136, + "grad_norm": 5.066608905792236, + "learning_rate": 1.107798042466706e-05, + "loss": 4.2045, + "step": 29220 + }, + { + 
"epoch": 2.3387742038726196, + "grad_norm": 4.342000484466553, + "learning_rate": 1.1064609295608922e-05, + "loss": 4.1469, + "step": 29230 + }, + { + "epoch": 2.339574331893103, + "grad_norm": 8.228534698486328, + "learning_rate": 1.1051238166550785e-05, + "loss": 4.3781, + "step": 29240 + }, + { + "epoch": 2.340374459913586, + "grad_norm": 4.797886371612549, + "learning_rate": 1.1037867037492647e-05, + "loss": 4.489, + "step": 29250 + }, + { + "epoch": 2.3411745879340695, + "grad_norm": 5.288513660430908, + "learning_rate": 1.1024495908434508e-05, + "loss": 4.4058, + "step": 29260 + }, + { + "epoch": 2.341974715954553, + "grad_norm": 4.347620010375977, + "learning_rate": 1.1011124779376371e-05, + "loss": 4.2193, + "step": 29270 + }, + { + "epoch": 2.342774843975036, + "grad_norm": 6.2110466957092285, + "learning_rate": 1.0997753650318234e-05, + "loss": 4.1746, + "step": 29280 + }, + { + "epoch": 2.3435749719955195, + "grad_norm": 6.015201568603516, + "learning_rate": 1.0984382521260097e-05, + "loss": 4.3359, + "step": 29290 + }, + { + "epoch": 2.3443751000160025, + "grad_norm": 4.607378005981445, + "learning_rate": 1.0971011392201958e-05, + "loss": 4.3533, + "step": 29300 + }, + { + "epoch": 2.345175228036486, + "grad_norm": 4.885862827301025, + "learning_rate": 1.095764026314382e-05, + "loss": 4.2704, + "step": 29310 + }, + { + "epoch": 2.345975356056969, + "grad_norm": 3.8569679260253906, + "learning_rate": 1.0944269134085683e-05, + "loss": 4.3441, + "step": 29320 + }, + { + "epoch": 2.3467754840774524, + "grad_norm": 3.915543794631958, + "learning_rate": 1.0930898005027546e-05, + "loss": 4.3674, + "step": 29330 + }, + { + "epoch": 2.347575612097936, + "grad_norm": 5.9416375160217285, + "learning_rate": 1.0917526875969407e-05, + "loss": 4.2987, + "step": 29340 + }, + { + "epoch": 2.348375740118419, + "grad_norm": 4.978701114654541, + "learning_rate": 1.090415574691127e-05, + "loss": 4.1645, + "step": 29350 + }, + { + "epoch": 2.3491758681389023, + "grad_norm": 
4.31766939163208, + "learning_rate": 1.0890784617853132e-05, + "loss": 4.4458, + "step": 29360 + }, + { + "epoch": 2.3499759961593853, + "grad_norm": 7.840646266937256, + "learning_rate": 1.0877413488794995e-05, + "loss": 4.2837, + "step": 29370 + }, + { + "epoch": 2.350776124179869, + "grad_norm": 6.027830600738525, + "learning_rate": 1.0864042359736856e-05, + "loss": 4.0439, + "step": 29380 + }, + { + "epoch": 2.351576252200352, + "grad_norm": 5.188173294067383, + "learning_rate": 1.0850671230678718e-05, + "loss": 4.2065, + "step": 29390 + }, + { + "epoch": 2.3523763802208353, + "grad_norm": 4.097430229187012, + "learning_rate": 1.0837300101620581e-05, + "loss": 4.344, + "step": 29400 + }, + { + "epoch": 2.3531765082413187, + "grad_norm": 9.700491905212402, + "learning_rate": 1.0823928972562444e-05, + "loss": 4.162, + "step": 29410 + }, + { + "epoch": 2.3539766362618018, + "grad_norm": 4.775502681732178, + "learning_rate": 1.0810557843504306e-05, + "loss": 4.142, + "step": 29420 + }, + { + "epoch": 2.354776764282285, + "grad_norm": 4.824921131134033, + "learning_rate": 1.0797186714446169e-05, + "loss": 4.3454, + "step": 29430 + }, + { + "epoch": 2.3555768923027687, + "grad_norm": 6.839511394500732, + "learning_rate": 1.0783815585388032e-05, + "loss": 4.3381, + "step": 29440 + }, + { + "epoch": 2.3563770203232517, + "grad_norm": 6.512791156768799, + "learning_rate": 1.0770444456329895e-05, + "loss": 4.0973, + "step": 29450 + }, + { + "epoch": 2.357177148343735, + "grad_norm": 5.187317848205566, + "learning_rate": 1.0757073327271755e-05, + "loss": 4.1051, + "step": 29460 + }, + { + "epoch": 2.357977276364218, + "grad_norm": 4.162054061889648, + "learning_rate": 1.0743702198213618e-05, + "loss": 4.2611, + "step": 29470 + }, + { + "epoch": 2.3587774043847016, + "grad_norm": 5.449061870574951, + "learning_rate": 1.0730331069155481e-05, + "loss": 4.1903, + "step": 29480 + }, + { + "epoch": 2.3595775324051846, + "grad_norm": 4.118699550628662, + "learning_rate": 
1.0716959940097342e-05, + "loss": 4.2631, + "step": 29490 + }, + { + "epoch": 2.360377660425668, + "grad_norm": 7.31171178817749, + "learning_rate": 1.0703588811039205e-05, + "loss": 4.2629, + "step": 29500 + }, + { + "epoch": 2.3611777884461516, + "grad_norm": 5.051353931427002, + "learning_rate": 1.0690217681981067e-05, + "loss": 4.1623, + "step": 29510 + }, + { + "epoch": 2.3619779164666346, + "grad_norm": 6.965731620788574, + "learning_rate": 1.067684655292293e-05, + "loss": 4.295, + "step": 29520 + }, + { + "epoch": 2.362778044487118, + "grad_norm": 5.027985095977783, + "learning_rate": 1.0663475423864791e-05, + "loss": 4.1316, + "step": 29530 + }, + { + "epoch": 2.363578172507601, + "grad_norm": 4.405726432800293, + "learning_rate": 1.0650104294806654e-05, + "loss": 4.3583, + "step": 29540 + }, + { + "epoch": 2.3643783005280845, + "grad_norm": 5.971738815307617, + "learning_rate": 1.0636733165748516e-05, + "loss": 4.2283, + "step": 29550 + }, + { + "epoch": 2.3651784285485675, + "grad_norm": 6.3857269287109375, + "learning_rate": 1.0623362036690379e-05, + "loss": 4.2043, + "step": 29560 + }, + { + "epoch": 2.365978556569051, + "grad_norm": 7.309497833251953, + "learning_rate": 1.060999090763224e-05, + "loss": 4.274, + "step": 29570 + }, + { + "epoch": 2.3667786845895344, + "grad_norm": 5.590941905975342, + "learning_rate": 1.0596619778574103e-05, + "loss": 4.3008, + "step": 29580 + }, + { + "epoch": 2.3675788126100175, + "grad_norm": 8.809772491455078, + "learning_rate": 1.0583248649515965e-05, + "loss": 4.1339, + "step": 29590 + }, + { + "epoch": 2.368378940630501, + "grad_norm": 3.935534954071045, + "learning_rate": 1.0569877520457828e-05, + "loss": 4.3561, + "step": 29600 + }, + { + "epoch": 2.3691790686509844, + "grad_norm": 6.036637783050537, + "learning_rate": 1.0556506391399689e-05, + "loss": 4.1984, + "step": 29610 + }, + { + "epoch": 2.3699791966714674, + "grad_norm": 5.820141315460205, + "learning_rate": 1.0543135262341552e-05, + "loss": 4.211, + 
"step": 29620 + }, + { + "epoch": 2.370779324691951, + "grad_norm": 6.568235397338867, + "learning_rate": 1.0529764133283414e-05, + "loss": 4.3565, + "step": 29630 + }, + { + "epoch": 2.371579452712434, + "grad_norm": 4.8180060386657715, + "learning_rate": 1.0516393004225277e-05, + "loss": 4.3178, + "step": 29640 + }, + { + "epoch": 2.3723795807329173, + "grad_norm": 6.591854572296143, + "learning_rate": 1.050302187516714e-05, + "loss": 4.3688, + "step": 29650 + }, + { + "epoch": 2.3731797087534003, + "grad_norm": 4.703768730163574, + "learning_rate": 1.0489650746109003e-05, + "loss": 4.3742, + "step": 29660 + }, + { + "epoch": 2.373979836773884, + "grad_norm": 5.566431045532227, + "learning_rate": 1.0476279617050865e-05, + "loss": 4.2257, + "step": 29670 + }, + { + "epoch": 2.3747799647943673, + "grad_norm": 4.521890640258789, + "learning_rate": 1.0462908487992728e-05, + "loss": 4.4083, + "step": 29680 + }, + { + "epoch": 2.3755800928148503, + "grad_norm": 4.3846025466918945, + "learning_rate": 1.0449537358934589e-05, + "loss": 4.1587, + "step": 29690 + }, + { + "epoch": 2.3763802208353337, + "grad_norm": 4.914963245391846, + "learning_rate": 1.0436166229876452e-05, + "loss": 4.2169, + "step": 29700 + }, + { + "epoch": 2.3771803488558167, + "grad_norm": 4.025776386260986, + "learning_rate": 1.0422795100818314e-05, + "loss": 4.3142, + "step": 29710 + }, + { + "epoch": 2.3779804768763, + "grad_norm": 4.935368537902832, + "learning_rate": 1.0409423971760177e-05, + "loss": 4.328, + "step": 29720 + }, + { + "epoch": 2.3787806048967837, + "grad_norm": 4.549187660217285, + "learning_rate": 1.0396052842702038e-05, + "loss": 4.2335, + "step": 29730 + }, + { + "epoch": 2.3795807329172667, + "grad_norm": 4.583531856536865, + "learning_rate": 1.03826817136439e-05, + "loss": 4.2718, + "step": 29740 + }, + { + "epoch": 2.38038086093775, + "grad_norm": 5.972599983215332, + "learning_rate": 1.0369310584585763e-05, + "loss": 4.279, + "step": 29750 + }, + { + "epoch": 
2.381180988958233, + "grad_norm": 4.461510181427002, + "learning_rate": 1.0355939455527624e-05, + "loss": 4.3608, + "step": 29760 + }, + { + "epoch": 2.3819811169787166, + "grad_norm": 5.027605056762695, + "learning_rate": 1.0342568326469487e-05, + "loss": 4.3147, + "step": 29770 + }, + { + "epoch": 2.3827812449992, + "grad_norm": 5.157301425933838, + "learning_rate": 1.032919719741135e-05, + "loss": 4.3418, + "step": 29780 + }, + { + "epoch": 2.383581373019683, + "grad_norm": 6.807774543762207, + "learning_rate": 1.0315826068353212e-05, + "loss": 4.1424, + "step": 29790 + }, + { + "epoch": 2.3843815010401666, + "grad_norm": 4.789431095123291, + "learning_rate": 1.0302454939295073e-05, + "loss": 4.2169, + "step": 29800 + }, + { + "epoch": 2.3851816290606496, + "grad_norm": 4.129378318786621, + "learning_rate": 1.0289083810236936e-05, + "loss": 4.1229, + "step": 29810 + }, + { + "epoch": 2.385981757081133, + "grad_norm": 5.426341533660889, + "learning_rate": 1.0275712681178799e-05, + "loss": 4.2576, + "step": 29820 + }, + { + "epoch": 2.386781885101616, + "grad_norm": 4.135334491729736, + "learning_rate": 1.0262341552120661e-05, + "loss": 4.2543, + "step": 29830 + }, + { + "epoch": 2.3875820131220995, + "grad_norm": 5.468907833099365, + "learning_rate": 1.0248970423062522e-05, + "loss": 4.4257, + "step": 29840 + }, + { + "epoch": 2.388382141142583, + "grad_norm": 4.926033020019531, + "learning_rate": 1.0235599294004385e-05, + "loss": 4.3505, + "step": 29850 + }, + { + "epoch": 2.389182269163066, + "grad_norm": 6.9714155197143555, + "learning_rate": 1.0222228164946248e-05, + "loss": 4.2705, + "step": 29860 + }, + { + "epoch": 2.3899823971835494, + "grad_norm": 5.102124214172363, + "learning_rate": 1.020885703588811e-05, + "loss": 4.2783, + "step": 29870 + }, + { + "epoch": 2.390782525204033, + "grad_norm": 4.479608535766602, + "learning_rate": 1.0195485906829973e-05, + "loss": 4.3488, + "step": 29880 + }, + { + "epoch": 2.391582653224516, + "grad_norm": 
8.478421211242676, + "learning_rate": 1.0182114777771836e-05, + "loss": 4.1135, + "step": 29890 + }, + { + "epoch": 2.3923827812449994, + "grad_norm": 8.210994720458984, + "learning_rate": 1.0168743648713699e-05, + "loss": 4.2423, + "step": 29900 + }, + { + "epoch": 2.3931829092654824, + "grad_norm": 5.55629825592041, + "learning_rate": 1.0155372519655561e-05, + "loss": 4.3846, + "step": 29910 + }, + { + "epoch": 2.393983037285966, + "grad_norm": 4.0907487869262695, + "learning_rate": 1.0142001390597422e-05, + "loss": 4.117, + "step": 29920 + }, + { + "epoch": 2.394783165306449, + "grad_norm": 4.413903713226318, + "learning_rate": 1.0128630261539285e-05, + "loss": 4.4148, + "step": 29930 + }, + { + "epoch": 2.3955832933269323, + "grad_norm": 5.763768672943115, + "learning_rate": 1.0115259132481148e-05, + "loss": 4.3248, + "step": 29940 + }, + { + "epoch": 2.3963834213474158, + "grad_norm": 4.842255115509033, + "learning_rate": 1.010188800342301e-05, + "loss": 4.1066, + "step": 29950 + }, + { + "epoch": 2.397183549367899, + "grad_norm": 5.915595054626465, + "learning_rate": 1.0088516874364871e-05, + "loss": 4.2386, + "step": 29960 + }, + { + "epoch": 2.3979836773883823, + "grad_norm": 6.0812201499938965, + "learning_rate": 1.0075145745306734e-05, + "loss": 4.2461, + "step": 29970 + }, + { + "epoch": 2.3987838054088653, + "grad_norm": 5.185386657714844, + "learning_rate": 1.0061774616248597e-05, + "loss": 4.0738, + "step": 29980 + }, + { + "epoch": 2.3995839334293487, + "grad_norm": 6.472476482391357, + "learning_rate": 1.004840348719046e-05, + "loss": 4.3379, + "step": 29990 + }, + { + "epoch": 2.4003840614498317, + "grad_norm": 5.108096599578857, + "learning_rate": 1.003503235813232e-05, + "loss": 4.1048, + "step": 30000 + }, + { + "epoch": 2.401184189470315, + "grad_norm": 6.5041890144348145, + "learning_rate": 1.0021661229074183e-05, + "loss": 4.2413, + "step": 30010 + }, + { + "epoch": 2.4019843174907987, + "grad_norm": 3.8431293964385986, + "learning_rate": 
1.0008290100016046e-05, + "loss": 4.1759, + "step": 30020 + }, + { + "epoch": 2.4027844455112817, + "grad_norm": 4.535530090332031, + "learning_rate": 9.994918970957909e-06, + "loss": 4.1232, + "step": 30030 + }, + { + "epoch": 2.403584573531765, + "grad_norm": 7.020683288574219, + "learning_rate": 9.98154784189977e-06, + "loss": 4.1428, + "step": 30040 + }, + { + "epoch": 2.4043847015522486, + "grad_norm": 5.312134742736816, + "learning_rate": 9.968176712841632e-06, + "loss": 4.2044, + "step": 30050 + }, + { + "epoch": 2.4051848295727316, + "grad_norm": 5.777432918548584, + "learning_rate": 9.954805583783495e-06, + "loss": 4.3428, + "step": 30060 + }, + { + "epoch": 2.405984957593215, + "grad_norm": 6.370398044586182, + "learning_rate": 9.941434454725358e-06, + "loss": 4.2981, + "step": 30070 + }, + { + "epoch": 2.406785085613698, + "grad_norm": 4.892792701721191, + "learning_rate": 9.92806332566722e-06, + "loss": 4.3618, + "step": 30080 + }, + { + "epoch": 2.4075852136341815, + "grad_norm": 5.481409072875977, + "learning_rate": 9.914692196609083e-06, + "loss": 4.2313, + "step": 30090 + }, + { + "epoch": 2.4083853416546646, + "grad_norm": 7.531737804412842, + "learning_rate": 9.901321067550946e-06, + "loss": 4.1667, + "step": 30100 + }, + { + "epoch": 2.409185469675148, + "grad_norm": 4.554537296295166, + "learning_rate": 9.887949938492807e-06, + "loss": 4.328, + "step": 30110 + }, + { + "epoch": 2.4099855976956315, + "grad_norm": 7.280032157897949, + "learning_rate": 9.87457880943467e-06, + "loss": 4.3348, + "step": 30120 + }, + { + "epoch": 2.4107857257161145, + "grad_norm": 5.1274824142456055, + "learning_rate": 9.861207680376532e-06, + "loss": 4.2058, + "step": 30130 + }, + { + "epoch": 2.411585853736598, + "grad_norm": 5.370729923248291, + "learning_rate": 9.847836551318395e-06, + "loss": 4.1759, + "step": 30140 + }, + { + "epoch": 2.412385981757081, + "grad_norm": 5.877347946166992, + "learning_rate": 9.834465422260256e-06, + "loss": 4.3684, + "step": 30150 
+ }, + { + "epoch": 2.4131861097775644, + "grad_norm": 3.648756742477417, + "learning_rate": 9.821094293202118e-06, + "loss": 4.0863, + "step": 30160 + }, + { + "epoch": 2.4139862377980474, + "grad_norm": 6.443056106567383, + "learning_rate": 9.807723164143981e-06, + "loss": 4.2268, + "step": 30170 + }, + { + "epoch": 2.414786365818531, + "grad_norm": 5.324306488037109, + "learning_rate": 9.794352035085844e-06, + "loss": 4.1204, + "step": 30180 + }, + { + "epoch": 2.4155864938390144, + "grad_norm": 6.442959308624268, + "learning_rate": 9.780980906027705e-06, + "loss": 4.1933, + "step": 30190 + }, + { + "epoch": 2.4163866218594974, + "grad_norm": 5.393689155578613, + "learning_rate": 9.767609776969567e-06, + "loss": 4.2285, + "step": 30200 + }, + { + "epoch": 2.417186749879981, + "grad_norm": 5.3242573738098145, + "learning_rate": 9.75423864791143e-06, + "loss": 4.24, + "step": 30210 + }, + { + "epoch": 2.4179868779004643, + "grad_norm": 4.855564594268799, + "learning_rate": 9.740867518853293e-06, + "loss": 4.2122, + "step": 30220 + }, + { + "epoch": 2.4187870059209473, + "grad_norm": 7.359550952911377, + "learning_rate": 9.727496389795154e-06, + "loss": 4.2153, + "step": 30230 + }, + { + "epoch": 2.4195871339414308, + "grad_norm": 6.255536079406738, + "learning_rate": 9.714125260737017e-06, + "loss": 4.243, + "step": 30240 + }, + { + "epoch": 2.420387261961914, + "grad_norm": 11.542984962463379, + "learning_rate": 9.70075413167888e-06, + "loss": 4.1648, + "step": 30250 + }, + { + "epoch": 2.4211873899823972, + "grad_norm": 3.8510329723358154, + "learning_rate": 9.687383002620742e-06, + "loss": 4.2724, + "step": 30260 + }, + { + "epoch": 2.4219875180028803, + "grad_norm": 6.371697425842285, + "learning_rate": 9.674011873562603e-06, + "loss": 4.3379, + "step": 30270 + }, + { + "epoch": 2.4227876460233637, + "grad_norm": 5.638033866882324, + "learning_rate": 9.660640744504466e-06, + "loss": 4.243, + "step": 30280 + }, + { + "epoch": 2.423587774043847, + "grad_norm": 
4.4373698234558105, + "learning_rate": 9.647269615446328e-06, + "loss": 4.0961, + "step": 30290 + }, + { + "epoch": 2.42438790206433, + "grad_norm": 5.150527000427246, + "learning_rate": 9.633898486388191e-06, + "loss": 4.4039, + "step": 30300 + }, + { + "epoch": 2.4251880300848137, + "grad_norm": 3.810324192047119, + "learning_rate": 9.620527357330054e-06, + "loss": 4.4131, + "step": 30310 + }, + { + "epoch": 2.4259881581052967, + "grad_norm": 4.635262489318848, + "learning_rate": 9.607156228271916e-06, + "loss": 4.3309, + "step": 30320 + }, + { + "epoch": 2.42678828612578, + "grad_norm": 4.7242021560668945, + "learning_rate": 9.593785099213779e-06, + "loss": 4.4903, + "step": 30330 + }, + { + "epoch": 2.4275884141462636, + "grad_norm": 6.102695941925049, + "learning_rate": 9.58041397015564e-06, + "loss": 4.1923, + "step": 30340 + }, + { + "epoch": 2.4283885421667466, + "grad_norm": 7.34291934967041, + "learning_rate": 9.567042841097503e-06, + "loss": 4.2971, + "step": 30350 + }, + { + "epoch": 2.42918867018723, + "grad_norm": 5.011170864105225, + "learning_rate": 9.553671712039365e-06, + "loss": 4.3888, + "step": 30360 + }, + { + "epoch": 2.429988798207713, + "grad_norm": 5.15313720703125, + "learning_rate": 9.540300582981228e-06, + "loss": 4.0506, + "step": 30370 + }, + { + "epoch": 2.4307889262281965, + "grad_norm": 3.9554715156555176, + "learning_rate": 9.526929453923089e-06, + "loss": 4.3425, + "step": 30380 + }, + { + "epoch": 2.43158905424868, + "grad_norm": 7.2298479080200195, + "learning_rate": 9.513558324864952e-06, + "loss": 4.4475, + "step": 30390 + }, + { + "epoch": 2.432389182269163, + "grad_norm": 3.8345696926116943, + "learning_rate": 9.500187195806814e-06, + "loss": 4.0772, + "step": 30400 + }, + { + "epoch": 2.4331893102896465, + "grad_norm": 5.277257442474365, + "learning_rate": 9.486816066748677e-06, + "loss": 4.214, + "step": 30410 + }, + { + "epoch": 2.4339894383101295, + "grad_norm": 4.5606513023376465, + "learning_rate": 
9.473444937690538e-06, + "loss": 4.2904, + "step": 30420 + }, + { + "epoch": 2.434789566330613, + "grad_norm": 6.693448543548584, + "learning_rate": 9.460073808632401e-06, + "loss": 4.3922, + "step": 30430 + }, + { + "epoch": 2.435589694351096, + "grad_norm": 6.090394973754883, + "learning_rate": 9.446702679574264e-06, + "loss": 4.1605, + "step": 30440 + }, + { + "epoch": 2.4363898223715794, + "grad_norm": 5.3264479637146, + "learning_rate": 9.433331550516126e-06, + "loss": 4.3312, + "step": 30450 + }, + { + "epoch": 2.437189950392063, + "grad_norm": 5.852473258972168, + "learning_rate": 9.419960421457987e-06, + "loss": 4.1236, + "step": 30460 + }, + { + "epoch": 2.437990078412546, + "grad_norm": 7.659939765930176, + "learning_rate": 9.40658929239985e-06, + "loss": 4.5643, + "step": 30470 + }, + { + "epoch": 2.4387902064330294, + "grad_norm": 3.983794927597046, + "learning_rate": 9.393218163341713e-06, + "loss": 4.2261, + "step": 30480 + }, + { + "epoch": 2.439590334453513, + "grad_norm": 5.556244373321533, + "learning_rate": 9.379847034283575e-06, + "loss": 4.3164, + "step": 30490 + }, + { + "epoch": 2.440390462473996, + "grad_norm": 4.4613213539123535, + "learning_rate": 9.366475905225438e-06, + "loss": 4.2227, + "step": 30500 + }, + { + "epoch": 2.4411905904944793, + "grad_norm": 4.627000331878662, + "learning_rate": 9.3531047761673e-06, + "loss": 4.442, + "step": 30510 + }, + { + "epoch": 2.4419907185149623, + "grad_norm": 8.667407989501953, + "learning_rate": 9.339733647109163e-06, + "loss": 4.1635, + "step": 30520 + }, + { + "epoch": 2.4427908465354458, + "grad_norm": 10.679652214050293, + "learning_rate": 9.326362518051026e-06, + "loss": 4.3699, + "step": 30530 + }, + { + "epoch": 2.4435909745559288, + "grad_norm": 7.4805006980896, + "learning_rate": 9.312991388992887e-06, + "loss": 4.2092, + "step": 30540 + }, + { + "epoch": 2.4443911025764122, + "grad_norm": 4.780892372131348, + "learning_rate": 9.29962025993475e-06, + "loss": 4.3487, + "step": 30550 + }, 
+ { + "epoch": 2.4451912305968957, + "grad_norm": 5.3561930656433105, + "learning_rate": 9.286249130876612e-06, + "loss": 4.1617, + "step": 30560 + }, + { + "epoch": 2.4459913586173787, + "grad_norm": 4.493958950042725, + "learning_rate": 9.272878001818475e-06, + "loss": 4.2219, + "step": 30570 + }, + { + "epoch": 2.446791486637862, + "grad_norm": 4.278980255126953, + "learning_rate": 9.259506872760336e-06, + "loss": 4.1333, + "step": 30580 + }, + { + "epoch": 2.447591614658345, + "grad_norm": 4.631113052368164, + "learning_rate": 9.246135743702199e-06, + "loss": 4.2754, + "step": 30590 + }, + { + "epoch": 2.4483917426788286, + "grad_norm": 5.248873233795166, + "learning_rate": 9.232764614644062e-06, + "loss": 4.2683, + "step": 30600 + }, + { + "epoch": 2.4491918706993117, + "grad_norm": 3.8190808296203613, + "learning_rate": 9.219393485585923e-06, + "loss": 4.3059, + "step": 30610 + }, + { + "epoch": 2.449991998719795, + "grad_norm": 6.609463214874268, + "learning_rate": 9.206022356527785e-06, + "loss": 4.3925, + "step": 30620 + }, + { + "epoch": 2.4507921267402786, + "grad_norm": 5.2939581871032715, + "learning_rate": 9.192651227469648e-06, + "loss": 4.3248, + "step": 30630 + }, + { + "epoch": 2.4515922547607616, + "grad_norm": 8.84293270111084, + "learning_rate": 9.17928009841151e-06, + "loss": 4.2664, + "step": 30640 + }, + { + "epoch": 2.452392382781245, + "grad_norm": 6.027024269104004, + "learning_rate": 9.165908969353372e-06, + "loss": 4.2791, + "step": 30650 + }, + { + "epoch": 2.4531925108017285, + "grad_norm": 6.528242588043213, + "learning_rate": 9.152537840295234e-06, + "loss": 4.1843, + "step": 30660 + }, + { + "epoch": 2.4539926388222115, + "grad_norm": 6.670463562011719, + "learning_rate": 9.139166711237097e-06, + "loss": 4.0791, + "step": 30670 + }, + { + "epoch": 2.454792766842695, + "grad_norm": 4.609775066375732, + "learning_rate": 9.12579558217896e-06, + "loss": 4.1602, + "step": 30680 + }, + { + "epoch": 2.455592894863178, + "grad_norm": 
5.768373012542725, + "learning_rate": 9.11242445312082e-06, + "loss": 4.1335, + "step": 30690 + }, + { + "epoch": 2.4563930228836615, + "grad_norm": 5.554232120513916, + "learning_rate": 9.099053324062683e-06, + "loss": 4.1415, + "step": 30700 + }, + { + "epoch": 2.4571931509041445, + "grad_norm": 5.334545612335205, + "learning_rate": 9.085682195004546e-06, + "loss": 4.2514, + "step": 30710 + }, + { + "epoch": 2.457993278924628, + "grad_norm": 11.706725120544434, + "learning_rate": 9.072311065946409e-06, + "loss": 4.3661, + "step": 30720 + }, + { + "epoch": 2.4587934069451114, + "grad_norm": 9.615589141845703, + "learning_rate": 9.058939936888271e-06, + "loss": 4.1575, + "step": 30730 + }, + { + "epoch": 2.4595935349655944, + "grad_norm": 8.476457595825195, + "learning_rate": 9.045568807830134e-06, + "loss": 4.3892, + "step": 30740 + }, + { + "epoch": 2.460393662986078, + "grad_norm": 8.876557350158691, + "learning_rate": 9.032197678771997e-06, + "loss": 4.3411, + "step": 30750 + }, + { + "epoch": 2.461193791006561, + "grad_norm": 4.643357276916504, + "learning_rate": 9.01882654971386e-06, + "loss": 4.344, + "step": 30760 + }, + { + "epoch": 2.4619939190270443, + "grad_norm": 4.706933975219727, + "learning_rate": 9.00545542065572e-06, + "loss": 4.3931, + "step": 30770 + }, + { + "epoch": 2.4627940470475274, + "grad_norm": 3.6565020084381104, + "learning_rate": 8.992084291597583e-06, + "loss": 4.2277, + "step": 30780 + }, + { + "epoch": 2.463594175068011, + "grad_norm": 6.164667129516602, + "learning_rate": 8.978713162539446e-06, + "loss": 4.217, + "step": 30790 + }, + { + "epoch": 2.4643943030884943, + "grad_norm": 5.391159534454346, + "learning_rate": 8.965342033481309e-06, + "loss": 4.3037, + "step": 30800 + }, + { + "epoch": 2.4651944311089773, + "grad_norm": 4.463956832885742, + "learning_rate": 8.95197090442317e-06, + "loss": 4.174, + "step": 30810 + }, + { + "epoch": 2.4659945591294608, + "grad_norm": 5.862019062042236, + "learning_rate": 
8.938599775365032e-06, + "loss": 4.152, + "step": 30820 + }, + { + "epoch": 2.466794687149944, + "grad_norm": 6.314411163330078, + "learning_rate": 8.925228646306895e-06, + "loss": 4.1766, + "step": 30830 + }, + { + "epoch": 2.4675948151704272, + "grad_norm": 5.068113803863525, + "learning_rate": 8.911857517248758e-06, + "loss": 4.3446, + "step": 30840 + }, + { + "epoch": 2.4683949431909107, + "grad_norm": 4.690631866455078, + "learning_rate": 8.898486388190619e-06, + "loss": 4.3606, + "step": 30850 + }, + { + "epoch": 2.4691950712113937, + "grad_norm": 10.336904525756836, + "learning_rate": 8.885115259132481e-06, + "loss": 4.2696, + "step": 30860 + }, + { + "epoch": 2.469995199231877, + "grad_norm": 4.266592502593994, + "learning_rate": 8.871744130074344e-06, + "loss": 4.3406, + "step": 30870 + }, + { + "epoch": 2.47079532725236, + "grad_norm": 6.125695705413818, + "learning_rate": 8.858373001016205e-06, + "loss": 4.2, + "step": 30880 + }, + { + "epoch": 2.4715954552728436, + "grad_norm": 8.705606460571289, + "learning_rate": 8.845001871958068e-06, + "loss": 4.3633, + "step": 30890 + }, + { + "epoch": 2.472395583293327, + "grad_norm": 3.629751205444336, + "learning_rate": 8.83163074289993e-06, + "loss": 4.2029, + "step": 30900 + }, + { + "epoch": 2.47319571131381, + "grad_norm": 5.424594879150391, + "learning_rate": 8.818259613841793e-06, + "loss": 4.3171, + "step": 30910 + }, + { + "epoch": 2.4739958393342936, + "grad_norm": 5.205273628234863, + "learning_rate": 8.804888484783656e-06, + "loss": 4.2902, + "step": 30920 + }, + { + "epoch": 2.4747959673547766, + "grad_norm": 5.293666839599609, + "learning_rate": 8.791517355725518e-06, + "loss": 4.2665, + "step": 30930 + }, + { + "epoch": 2.47559609537526, + "grad_norm": 7.430743217468262, + "learning_rate": 8.778146226667381e-06, + "loss": 4.227, + "step": 30940 + }, + { + "epoch": 2.4763962233957435, + "grad_norm": 7.585148811340332, + "learning_rate": 8.764775097609244e-06, + "loss": 4.1007, + "step": 30950 + }, + 
{ + "epoch": 2.4771963514162265, + "grad_norm": 3.8104374408721924, + "learning_rate": 8.751403968551105e-06, + "loss": 4.2632, + "step": 30960 + }, + { + "epoch": 2.47799647943671, + "grad_norm": 4.126226902008057, + "learning_rate": 8.738032839492967e-06, + "loss": 4.2082, + "step": 30970 + }, + { + "epoch": 2.478796607457193, + "grad_norm": 4.376924514770508, + "learning_rate": 8.72466171043483e-06, + "loss": 4.2911, + "step": 30980 + }, + { + "epoch": 2.4795967354776765, + "grad_norm": 5.41331148147583, + "learning_rate": 8.711290581376693e-06, + "loss": 4.1285, + "step": 30990 + }, + { + "epoch": 2.48039686349816, + "grad_norm": 5.507528305053711, + "learning_rate": 8.697919452318554e-06, + "loss": 4.3046, + "step": 31000 + }, + { + "epoch": 2.481196991518643, + "grad_norm": 5.433472156524658, + "learning_rate": 8.684548323260417e-06, + "loss": 4.2791, + "step": 31010 + }, + { + "epoch": 2.4819971195391264, + "grad_norm": 5.576103210449219, + "learning_rate": 8.67117719420228e-06, + "loss": 4.3817, + "step": 31020 + }, + { + "epoch": 2.4827972475596094, + "grad_norm": 8.479930877685547, + "learning_rate": 8.659143178049955e-06, + "loss": 4.4041, + "step": 31030 + }, + { + "epoch": 2.483597375580093, + "grad_norm": 10.624326705932617, + "learning_rate": 8.645772048991816e-06, + "loss": 4.2272, + "step": 31040 + }, + { + "epoch": 2.484397503600576, + "grad_norm": 9.005964279174805, + "learning_rate": 8.632400919933679e-06, + "loss": 4.2243, + "step": 31050 + }, + { + "epoch": 2.4851976316210593, + "grad_norm": 5.739156723022461, + "learning_rate": 8.619029790875541e-06, + "loss": 4.2756, + "step": 31060 + }, + { + "epoch": 2.485997759641543, + "grad_norm": 4.491320610046387, + "learning_rate": 8.605658661817404e-06, + "loss": 4.307, + "step": 31070 + }, + { + "epoch": 2.486797887662026, + "grad_norm": 5.620110988616943, + "learning_rate": 8.592287532759267e-06, + "loss": 4.1882, + "step": 31080 + }, + { + "epoch": 2.4875980156825093, + "grad_norm": 
7.674624443054199, + "learning_rate": 8.57891640370113e-06, + "loss": 4.2893, + "step": 31090 + }, + { + "epoch": 2.4883981437029927, + "grad_norm": 7.919490337371826, + "learning_rate": 8.565545274642992e-06, + "loss": 4.2795, + "step": 31100 + }, + { + "epoch": 2.4891982717234757, + "grad_norm": 5.725222110748291, + "learning_rate": 8.552174145584855e-06, + "loss": 4.2809, + "step": 31110 + }, + { + "epoch": 2.489998399743959, + "grad_norm": 5.071161270141602, + "learning_rate": 8.538803016526716e-06, + "loss": 4.0492, + "step": 31120 + }, + { + "epoch": 2.490798527764442, + "grad_norm": 6.060359954833984, + "learning_rate": 8.525431887468579e-06, + "loss": 4.2231, + "step": 31130 + }, + { + "epoch": 2.4915986557849257, + "grad_norm": 5.7602081298828125, + "learning_rate": 8.512060758410441e-06, + "loss": 4.0903, + "step": 31140 + }, + { + "epoch": 2.4923987838054087, + "grad_norm": 6.505451679229736, + "learning_rate": 8.498689629352304e-06, + "loss": 4.3469, + "step": 31150 + }, + { + "epoch": 2.493198911825892, + "grad_norm": 4.702768802642822, + "learning_rate": 8.485318500294165e-06, + "loss": 4.2681, + "step": 31160 + }, + { + "epoch": 2.4939990398463756, + "grad_norm": 5.579276084899902, + "learning_rate": 8.471947371236028e-06, + "loss": 4.1579, + "step": 31170 + }, + { + "epoch": 2.4947991678668586, + "grad_norm": 4.7463788986206055, + "learning_rate": 8.45857624217789e-06, + "loss": 4.3411, + "step": 31180 + }, + { + "epoch": 2.495599295887342, + "grad_norm": 5.336600303649902, + "learning_rate": 8.445205113119751e-06, + "loss": 4.3368, + "step": 31190 + }, + { + "epoch": 2.496399423907825, + "grad_norm": 5.681230545043945, + "learning_rate": 8.431833984061614e-06, + "loss": 4.2342, + "step": 31200 + }, + { + "epoch": 2.4971995519283086, + "grad_norm": 5.4162139892578125, + "learning_rate": 8.418462855003477e-06, + "loss": 4.2021, + "step": 31210 + }, + { + "epoch": 2.4979996799487916, + "grad_norm": 4.396571636199951, + "learning_rate": 
8.40509172594534e-06, + "loss": 4.1137, + "step": 31220 + }, + { + "epoch": 2.498799807969275, + "grad_norm": 5.792074203491211, + "learning_rate": 8.3917205968872e-06, + "loss": 4.1966, + "step": 31230 + }, + { + "epoch": 2.4995999359897585, + "grad_norm": 4.996471881866455, + "learning_rate": 8.378349467829063e-06, + "loss": 4.2879, + "step": 31240 + }, + { + "epoch": 2.5004000640102415, + "grad_norm": 5.074433326721191, + "learning_rate": 8.364978338770926e-06, + "loss": 4.1484, + "step": 31250 + }, + { + "epoch": 2.501200192030725, + "grad_norm": 4.65183162689209, + "learning_rate": 8.351607209712788e-06, + "loss": 4.3771, + "step": 31260 + }, + { + "epoch": 2.5020003200512084, + "grad_norm": 5.808924198150635, + "learning_rate": 8.338236080654651e-06, + "loss": 4.205, + "step": 31270 + }, + { + "epoch": 2.5028004480716914, + "grad_norm": 7.044647216796875, + "learning_rate": 8.324864951596514e-06, + "loss": 4.2596, + "step": 31280 + }, + { + "epoch": 2.503600576092175, + "grad_norm": 7.707614898681641, + "learning_rate": 8.311493822538376e-06, + "loss": 4.1315, + "step": 31290 + }, + { + "epoch": 2.504400704112658, + "grad_norm": 4.458408355712891, + "learning_rate": 8.29812269348024e-06, + "loss": 4.3516, + "step": 31300 + }, + { + "epoch": 2.5052008321331414, + "grad_norm": 5.028105735778809, + "learning_rate": 8.2847515644221e-06, + "loss": 4.2033, + "step": 31310 + }, + { + "epoch": 2.5060009601536244, + "grad_norm": 5.130579471588135, + "learning_rate": 8.271380435363963e-06, + "loss": 4.2853, + "step": 31320 + }, + { + "epoch": 2.506801088174108, + "grad_norm": 7.856816291809082, + "learning_rate": 8.258009306305826e-06, + "loss": 4.0916, + "step": 31330 + }, + { + "epoch": 2.5076012161945913, + "grad_norm": 5.169328212738037, + "learning_rate": 8.244638177247688e-06, + "loss": 4.2018, + "step": 31340 + }, + { + "epoch": 2.5084013442150743, + "grad_norm": 6.61760139465332, + "learning_rate": 8.23126704818955e-06, + "loss": 4.1828, + "step": 31350 + }, + 
{ + "epoch": 2.509201472235558, + "grad_norm": 4.724680423736572, + "learning_rate": 8.217895919131412e-06, + "loss": 4.0726, + "step": 31360 + }, + { + "epoch": 2.5100016002560412, + "grad_norm": 5.668628215789795, + "learning_rate": 8.204524790073275e-06, + "loss": 4.2675, + "step": 31370 + }, + { + "epoch": 2.5108017282765243, + "grad_norm": 4.8627142906188965, + "learning_rate": 8.191153661015137e-06, + "loss": 4.2985, + "step": 31380 + }, + { + "epoch": 2.5116018562970073, + "grad_norm": 5.198115825653076, + "learning_rate": 8.177782531956998e-06, + "loss": 4.3581, + "step": 31390 + }, + { + "epoch": 2.5124019843174907, + "grad_norm": 4.514540195465088, + "learning_rate": 8.164411402898861e-06, + "loss": 4.3062, + "step": 31400 + }, + { + "epoch": 2.513202112337974, + "grad_norm": 7.25886344909668, + "learning_rate": 8.151040273840724e-06, + "loss": 4.2991, + "step": 31410 + }, + { + "epoch": 2.514002240358457, + "grad_norm": 5.974603176116943, + "learning_rate": 8.137669144782586e-06, + "loss": 4.1022, + "step": 31420 + }, + { + "epoch": 2.5148023683789407, + "grad_norm": 5.448121547698975, + "learning_rate": 8.124298015724447e-06, + "loss": 4.1075, + "step": 31430 + }, + { + "epoch": 2.515602496399424, + "grad_norm": 5.210214614868164, + "learning_rate": 8.11092688666631e-06, + "loss": 4.2649, + "step": 31440 + }, + { + "epoch": 2.516402624419907, + "grad_norm": 11.993659019470215, + "learning_rate": 8.097555757608173e-06, + "loss": 4.2198, + "step": 31450 + }, + { + "epoch": 2.5172027524403906, + "grad_norm": 4.927745819091797, + "learning_rate": 8.084184628550035e-06, + "loss": 4.0223, + "step": 31460 + }, + { + "epoch": 2.5180028804608736, + "grad_norm": 4.729024887084961, + "learning_rate": 8.070813499491896e-06, + "loss": 4.2895, + "step": 31470 + }, + { + "epoch": 2.518803008481357, + "grad_norm": 5.384552955627441, + "learning_rate": 8.057442370433759e-06, + "loss": 4.3527, + "step": 31480 + }, + { + "epoch": 2.51960313650184, + "grad_norm": 
8.20038890838623, + "learning_rate": 8.044071241375622e-06, + "loss": 4.2892, + "step": 31490 + }, + { + "epoch": 2.5204032645223235, + "grad_norm": 4.21329927444458, + "learning_rate": 8.030700112317484e-06, + "loss": 4.4118, + "step": 31500 + }, + { + "epoch": 2.521203392542807, + "grad_norm": 5.070785999298096, + "learning_rate": 8.017328983259347e-06, + "loss": 4.4495, + "step": 31510 + }, + { + "epoch": 2.52200352056329, + "grad_norm": 5.975208759307861, + "learning_rate": 8.00395785420121e-06, + "loss": 4.2316, + "step": 31520 + }, + { + "epoch": 2.5228036485837735, + "grad_norm": 4.3103718757629395, + "learning_rate": 7.990586725143073e-06, + "loss": 4.3446, + "step": 31530 + }, + { + "epoch": 2.523603776604257, + "grad_norm": 4.407105922698975, + "learning_rate": 7.977215596084934e-06, + "loss": 4.387, + "step": 31540 + }, + { + "epoch": 2.52440390462474, + "grad_norm": 6.030669212341309, + "learning_rate": 7.963844467026796e-06, + "loss": 4.4376, + "step": 31550 + }, + { + "epoch": 2.525204032645223, + "grad_norm": 5.338545799255371, + "learning_rate": 7.950473337968659e-06, + "loss": 4.2574, + "step": 31560 + }, + { + "epoch": 2.5260041606657064, + "grad_norm": 7.339935779571533, + "learning_rate": 7.937102208910522e-06, + "loss": 4.186, + "step": 31570 + }, + { + "epoch": 2.52680428868619, + "grad_norm": 5.970847129821777, + "learning_rate": 7.923731079852383e-06, + "loss": 4.3115, + "step": 31580 + }, + { + "epoch": 2.527604416706673, + "grad_norm": 4.5242228507995605, + "learning_rate": 7.910359950794245e-06, + "loss": 4.4381, + "step": 31590 + }, + { + "epoch": 2.5284045447271564, + "grad_norm": 7.288176536560059, + "learning_rate": 7.896988821736108e-06, + "loss": 4.1498, + "step": 31600 + }, + { + "epoch": 2.52920467274764, + "grad_norm": 4.609799861907959, + "learning_rate": 7.88361769267797e-06, + "loss": 4.3975, + "step": 31610 + }, + { + "epoch": 2.530004800768123, + "grad_norm": 6.066311359405518, + "learning_rate": 7.870246563619832e-06, + 
"loss": 4.2106, + "step": 31620 + }, + { + "epoch": 2.5308049287886063, + "grad_norm": 4.771033763885498, + "learning_rate": 7.856875434561694e-06, + "loss": 4.4468, + "step": 31630 + }, + { + "epoch": 2.5316050568090893, + "grad_norm": 5.995556831359863, + "learning_rate": 7.843504305503557e-06, + "loss": 4.2434, + "step": 31640 + }, + { + "epoch": 2.5324051848295728, + "grad_norm": 6.122812747955322, + "learning_rate": 7.83013317644542e-06, + "loss": 4.2819, + "step": 31650 + }, + { + "epoch": 2.533205312850056, + "grad_norm": 6.691094875335693, + "learning_rate": 7.81676204738728e-06, + "loss": 4.3078, + "step": 31660 + }, + { + "epoch": 2.5340054408705392, + "grad_norm": 5.067676544189453, + "learning_rate": 7.803390918329143e-06, + "loss": 4.1885, + "step": 31670 + }, + { + "epoch": 2.5348055688910227, + "grad_norm": 5.308253288269043, + "learning_rate": 7.790019789271006e-06, + "loss": 4.1887, + "step": 31680 + }, + { + "epoch": 2.5356056969115057, + "grad_norm": 5.868967533111572, + "learning_rate": 7.776648660212869e-06, + "loss": 4.1599, + "step": 31690 + }, + { + "epoch": 2.536405824931989, + "grad_norm": 5.12509298324585, + "learning_rate": 7.76327753115473e-06, + "loss": 4.1866, + "step": 31700 + }, + { + "epoch": 2.5372059529524726, + "grad_norm": 7.411617279052734, + "learning_rate": 7.749906402096593e-06, + "loss": 4.3881, + "step": 31710 + }, + { + "epoch": 2.5380060809729557, + "grad_norm": 5.138408660888672, + "learning_rate": 7.736535273038455e-06, + "loss": 4.2416, + "step": 31720 + }, + { + "epoch": 2.5388062089934387, + "grad_norm": 4.619532108306885, + "learning_rate": 7.723164143980318e-06, + "loss": 4.361, + "step": 31730 + }, + { + "epoch": 2.539606337013922, + "grad_norm": 4.554988861083984, + "learning_rate": 7.70979301492218e-06, + "loss": 4.2662, + "step": 31740 + }, + { + "epoch": 2.5404064650344056, + "grad_norm": 5.840234279632568, + "learning_rate": 7.696421885864043e-06, + "loss": 4.1966, + "step": 31750 + }, + { + "epoch": 
2.5412065930548886, + "grad_norm": 4.803789138793945, + "learning_rate": 7.683050756805906e-06, + "loss": 4.1967, + "step": 31760 + }, + { + "epoch": 2.542006721075372, + "grad_norm": 7.9466142654418945, + "learning_rate": 7.669679627747767e-06, + "loss": 4.3066, + "step": 31770 + }, + { + "epoch": 2.5428068490958555, + "grad_norm": 5.5534844398498535, + "learning_rate": 7.65630849868963e-06, + "loss": 4.2559, + "step": 31780 + }, + { + "epoch": 2.5436069771163385, + "grad_norm": 4.615208148956299, + "learning_rate": 7.642937369631492e-06, + "loss": 4.0991, + "step": 31790 + }, + { + "epoch": 2.544407105136822, + "grad_norm": 5.248322486877441, + "learning_rate": 7.629566240573355e-06, + "loss": 4.2412, + "step": 31800 + }, + { + "epoch": 2.545207233157305, + "grad_norm": 5.644094467163086, + "learning_rate": 7.616195111515216e-06, + "loss": 4.1779, + "step": 31810 + }, + { + "epoch": 2.5460073611777885, + "grad_norm": 5.409252643585205, + "learning_rate": 7.602823982457079e-06, + "loss": 4.4094, + "step": 31820 + }, + { + "epoch": 2.5468074891982715, + "grad_norm": 6.40919303894043, + "learning_rate": 7.589452853398941e-06, + "loss": 4.1346, + "step": 31830 + }, + { + "epoch": 2.547607617218755, + "grad_norm": 4.60876989364624, + "learning_rate": 7.576081724340804e-06, + "loss": 4.2401, + "step": 31840 + }, + { + "epoch": 2.5484077452392384, + "grad_norm": 4.963344097137451, + "learning_rate": 7.562710595282665e-06, + "loss": 4.396, + "step": 31850 + }, + { + "epoch": 2.5492078732597214, + "grad_norm": 8.271472930908203, + "learning_rate": 7.549339466224528e-06, + "loss": 4.3553, + "step": 31860 + }, + { + "epoch": 2.550008001280205, + "grad_norm": 5.118110179901123, + "learning_rate": 7.5359683371663905e-06, + "loss": 4.1426, + "step": 31870 + }, + { + "epoch": 2.5508081293006883, + "grad_norm": 6.315212249755859, + "learning_rate": 7.522597208108253e-06, + "loss": 4.132, + "step": 31880 + }, + { + "epoch": 2.5516082573211714, + "grad_norm": 6.116096019744873, + 
"learning_rate": 7.509226079050115e-06, + "loss": 4.1163, + "step": 31890 + }, + { + "epoch": 2.552408385341655, + "grad_norm": 7.4716877937316895, + "learning_rate": 7.495854949991978e-06, + "loss": 4.3314, + "step": 31900 + }, + { + "epoch": 2.553208513362138, + "grad_norm": 6.010276794433594, + "learning_rate": 7.48248382093384e-06, + "loss": 4.3779, + "step": 31910 + }, + { + "epoch": 2.5540086413826213, + "grad_norm": 5.290060043334961, + "learning_rate": 7.469112691875703e-06, + "loss": 4.3454, + "step": 31920 + }, + { + "epoch": 2.5548087694031043, + "grad_norm": 4.201476097106934, + "learning_rate": 7.455741562817564e-06, + "loss": 4.3305, + "step": 31930 + }, + { + "epoch": 2.5556088974235878, + "grad_norm": 4.023552894592285, + "learning_rate": 7.442370433759427e-06, + "loss": 4.2372, + "step": 31940 + }, + { + "epoch": 2.5564090254440712, + "grad_norm": 6.457052707672119, + "learning_rate": 7.4289993047012894e-06, + "loss": 4.3504, + "step": 31950 + }, + { + "epoch": 2.5572091534645542, + "grad_norm": 5.600893497467041, + "learning_rate": 7.415628175643152e-06, + "loss": 4.2232, + "step": 31960 + }, + { + "epoch": 2.5580092814850377, + "grad_norm": 7.067571640014648, + "learning_rate": 7.402257046585014e-06, + "loss": 4.3225, + "step": 31970 + }, + { + "epoch": 2.558809409505521, + "grad_norm": 5.7097249031066895, + "learning_rate": 7.388885917526877e-06, + "loss": 4.3295, + "step": 31980 + }, + { + "epoch": 2.559609537526004, + "grad_norm": 4.277981758117676, + "learning_rate": 7.375514788468739e-06, + "loss": 4.1644, + "step": 31990 + }, + { + "epoch": 2.560409665546487, + "grad_norm": 5.122984886169434, + "learning_rate": 7.362143659410602e-06, + "loss": 4.0891, + "step": 32000 + }, + { + "epoch": 2.5612097935669706, + "grad_norm": 4.904813289642334, + "learning_rate": 7.348772530352463e-06, + "loss": 4.3021, + "step": 32010 + }, + { + "epoch": 2.562009921587454, + "grad_norm": 5.248583793640137, + "learning_rate": 7.335401401294326e-06, + "loss": 
4.2706, + "step": 32020 + }, + { + "epoch": 2.562810049607937, + "grad_norm": 4.642624855041504, + "learning_rate": 7.3220302722361884e-06, + "loss": 4.3708, + "step": 32030 + }, + { + "epoch": 2.5636101776284206, + "grad_norm": 4.165192127227783, + "learning_rate": 7.3086591431780494e-06, + "loss": 4.2576, + "step": 32040 + }, + { + "epoch": 2.564410305648904, + "grad_norm": 4.46279239654541, + "learning_rate": 7.295288014119912e-06, + "loss": 4.3745, + "step": 32050 + }, + { + "epoch": 2.565210433669387, + "grad_norm": 6.376815319061279, + "learning_rate": 7.281916885061775e-06, + "loss": 4.2703, + "step": 32060 + }, + { + "epoch": 2.5660105616898705, + "grad_norm": 5.364983081817627, + "learning_rate": 7.2685457560036375e-06, + "loss": 4.2359, + "step": 32070 + }, + { + "epoch": 2.5668106897103535, + "grad_norm": 5.545655727386475, + "learning_rate": 7.255174626945499e-06, + "loss": 4.3481, + "step": 32080 + }, + { + "epoch": 2.567610817730837, + "grad_norm": 6.107922077178955, + "learning_rate": 7.241803497887362e-06, + "loss": 4.2213, + "step": 32090 + }, + { + "epoch": 2.56841094575132, + "grad_norm": 8.90227222442627, + "learning_rate": 7.228432368829225e-06, + "loss": 4.0856, + "step": 32100 + }, + { + "epoch": 2.5692110737718035, + "grad_norm": 4.109879493713379, + "learning_rate": 7.215061239771087e-06, + "loss": 4.232, + "step": 32110 + }, + { + "epoch": 2.570011201792287, + "grad_norm": 4.951588153839111, + "learning_rate": 7.201690110712948e-06, + "loss": 4.2644, + "step": 32120 + }, + { + "epoch": 2.57081132981277, + "grad_norm": 6.807476043701172, + "learning_rate": 7.188318981654811e-06, + "loss": 4.2228, + "step": 32130 + }, + { + "epoch": 2.5716114578332534, + "grad_norm": 4.3869547843933105, + "learning_rate": 7.174947852596674e-06, + "loss": 4.2833, + "step": 32140 + }, + { + "epoch": 2.572411585853737, + "grad_norm": 7.943798542022705, + "learning_rate": 7.1615767235385365e-06, + "loss": 4.1522, + "step": 32150 + }, + { + "epoch": 
2.57321171387422, + "grad_norm": 5.016458034515381, + "learning_rate": 7.1482055944803975e-06, + "loss": 4.276, + "step": 32160 + }, + { + "epoch": 2.574011841894703, + "grad_norm": 5.562156677246094, + "learning_rate": 7.13483446542226e-06, + "loss": 4.2845, + "step": 32170 + }, + { + "epoch": 2.5748119699151863, + "grad_norm": 4.856637477874756, + "learning_rate": 7.121463336364123e-06, + "loss": 4.2354, + "step": 32180 + }, + { + "epoch": 2.57561209793567, + "grad_norm": 4.221564769744873, + "learning_rate": 7.1080922073059855e-06, + "loss": 4.4084, + "step": 32190 + }, + { + "epoch": 2.576412225956153, + "grad_norm": 5.096288681030273, + "learning_rate": 7.094721078247847e-06, + "loss": 4.2814, + "step": 32200 + }, + { + "epoch": 2.5772123539766363, + "grad_norm": 6.132599353790283, + "learning_rate": 7.08134994918971e-06, + "loss": 4.1574, + "step": 32210 + }, + { + "epoch": 2.5780124819971197, + "grad_norm": 5.005707740783691, + "learning_rate": 7.067978820131573e-06, + "loss": 4.1658, + "step": 32220 + }, + { + "epoch": 2.5788126100176028, + "grad_norm": 4.944260597229004, + "learning_rate": 7.0546076910734355e-06, + "loss": 4.3081, + "step": 32230 + }, + { + "epoch": 2.579612738038086, + "grad_norm": 8.648292541503906, + "learning_rate": 7.0412365620152964e-06, + "loss": 4.2806, + "step": 32240 + }, + { + "epoch": 2.5804128660585692, + "grad_norm": 6.548149108886719, + "learning_rate": 7.027865432957159e-06, + "loss": 4.2142, + "step": 32250 + }, + { + "epoch": 2.5812129940790527, + "grad_norm": 4.361201286315918, + "learning_rate": 7.014494303899022e-06, + "loss": 4.1742, + "step": 32260 + }, + { + "epoch": 2.5820131220995357, + "grad_norm": 5.768430709838867, + "learning_rate": 7.0011231748408845e-06, + "loss": 4.2943, + "step": 32270 + }, + { + "epoch": 2.582813250120019, + "grad_norm": 6.053584575653076, + "learning_rate": 6.9877520457827455e-06, + "loss": 4.2239, + "step": 32280 + }, + { + "epoch": 2.5836133781405026, + "grad_norm": 7.139733791351318, 
+ "learning_rate": 6.974380916724608e-06, + "loss": 4.4575, + "step": 32290 + }, + { + "epoch": 2.5844135061609856, + "grad_norm": 4.825145721435547, + "learning_rate": 6.961009787666471e-06, + "loss": 4.2602, + "step": 32300 + }, + { + "epoch": 2.585213634181469, + "grad_norm": 4.957446575164795, + "learning_rate": 6.947638658608334e-06, + "loss": 4.2541, + "step": 32310 + }, + { + "epoch": 2.5860137622019526, + "grad_norm": 4.746156215667725, + "learning_rate": 6.9342675295501954e-06, + "loss": 4.1876, + "step": 32320 + }, + { + "epoch": 2.5868138902224356, + "grad_norm": 6.32666015625, + "learning_rate": 6.920896400492058e-06, + "loss": 4.2707, + "step": 32330 + }, + { + "epoch": 2.5876140182429186, + "grad_norm": 7.385421276092529, + "learning_rate": 6.907525271433921e-06, + "loss": 4.3383, + "step": 32340 + }, + { + "epoch": 2.588414146263402, + "grad_norm": 5.54195499420166, + "learning_rate": 6.894154142375782e-06, + "loss": 4.2667, + "step": 32350 + }, + { + "epoch": 2.5892142742838855, + "grad_norm": 5.738757133483887, + "learning_rate": 6.8807830133176445e-06, + "loss": 4.2666, + "step": 32360 + }, + { + "epoch": 2.5900144023043685, + "grad_norm": 4.932640552520752, + "learning_rate": 6.867411884259507e-06, + "loss": 4.0864, + "step": 32370 + }, + { + "epoch": 2.590814530324852, + "grad_norm": 6.3347320556640625, + "learning_rate": 6.85404075520137e-06, + "loss": 4.2319, + "step": 32380 + }, + { + "epoch": 2.5916146583453354, + "grad_norm": 6.726009368896484, + "learning_rate": 6.840669626143231e-06, + "loss": 4.2479, + "step": 32390 + }, + { + "epoch": 2.5924147863658185, + "grad_norm": 5.636048316955566, + "learning_rate": 6.8272984970850936e-06, + "loss": 4.2169, + "step": 32400 + }, + { + "epoch": 2.593214914386302, + "grad_norm": 5.036827564239502, + "learning_rate": 6.813927368026956e-06, + "loss": 4.1135, + "step": 32410 + }, + { + "epoch": 2.594015042406785, + "grad_norm": 9.839925765991211, + "learning_rate": 6.800556238968819e-06, + "loss": 
4.5051, + "step": 32420 + }, + { + "epoch": 2.5948151704272684, + "grad_norm": 6.498042583465576, + "learning_rate": 6.787185109910681e-06, + "loss": 4.3001, + "step": 32430 + }, + { + "epoch": 2.5956152984477514, + "grad_norm": 5.901638031005859, + "learning_rate": 6.7738139808525435e-06, + "loss": 4.2694, + "step": 32440 + }, + { + "epoch": 2.596415426468235, + "grad_norm": 4.632312774658203, + "learning_rate": 6.760442851794406e-06, + "loss": 4.2341, + "step": 32450 + }, + { + "epoch": 2.5972155544887183, + "grad_norm": 6.943007469177246, + "learning_rate": 6.747071722736269e-06, + "loss": 4.2814, + "step": 32460 + }, + { + "epoch": 2.5980156825092013, + "grad_norm": 5.386791229248047, + "learning_rate": 6.73370059367813e-06, + "loss": 4.2518, + "step": 32470 + }, + { + "epoch": 2.598815810529685, + "grad_norm": 5.245943069458008, + "learning_rate": 6.7203294646199925e-06, + "loss": 4.3079, + "step": 32480 + }, + { + "epoch": 2.5996159385501683, + "grad_norm": 4.878678798675537, + "learning_rate": 6.706958335561855e-06, + "loss": 4.3576, + "step": 32490 + }, + { + "epoch": 2.6004160665706513, + "grad_norm": 6.896281719207764, + "learning_rate": 6.693587206503718e-06, + "loss": 3.9788, + "step": 32500 + }, + { + "epoch": 2.6012161945911347, + "grad_norm": 11.621658325195312, + "learning_rate": 6.68021607744558e-06, + "loss": 4.3443, + "step": 32510 + }, + { + "epoch": 2.6020163226116177, + "grad_norm": 5.696441173553467, + "learning_rate": 6.6668449483874425e-06, + "loss": 4.2309, + "step": 32520 + }, + { + "epoch": 2.602816450632101, + "grad_norm": 6.503956317901611, + "learning_rate": 6.653473819329305e-06, + "loss": 4.195, + "step": 32530 + }, + { + "epoch": 2.6036165786525842, + "grad_norm": 5.1271209716796875, + "learning_rate": 6.640102690271168e-06, + "loss": 4.1775, + "step": 32540 + }, + { + "epoch": 2.6044167066730677, + "grad_norm": 7.013203144073486, + "learning_rate": 6.626731561213029e-06, + "loss": 4.2677, + "step": 32550 + }, + { + "epoch": 
2.605216834693551, + "grad_norm": 5.267161846160889, + "learning_rate": 6.6133604321548915e-06, + "loss": 4.1407, + "step": 32560 + }, + { + "epoch": 2.606016962714034, + "grad_norm": 5.86568546295166, + "learning_rate": 6.599989303096754e-06, + "loss": 4.3156, + "step": 32570 + }, + { + "epoch": 2.6068170907345176, + "grad_norm": 12.335358619689941, + "learning_rate": 6.586618174038617e-06, + "loss": 4.2735, + "step": 32580 + }, + { + "epoch": 2.607617218755001, + "grad_norm": 6.064377784729004, + "learning_rate": 6.573247044980478e-06, + "loss": 4.2603, + "step": 32590 + }, + { + "epoch": 2.608417346775484, + "grad_norm": 4.735530376434326, + "learning_rate": 6.559875915922341e-06, + "loss": 4.2768, + "step": 32600 + }, + { + "epoch": 2.609217474795967, + "grad_norm": 5.105945587158203, + "learning_rate": 6.546504786864203e-06, + "loss": 4.2271, + "step": 32610 + }, + { + "epoch": 2.6100176028164506, + "grad_norm": 4.72859001159668, + "learning_rate": 6.533133657806065e-06, + "loss": 4.4265, + "step": 32620 + }, + { + "epoch": 2.610817730836934, + "grad_norm": 5.76402473449707, + "learning_rate": 6.519762528747928e-06, + "loss": 4.2542, + "step": 32630 + }, + { + "epoch": 2.611617858857417, + "grad_norm": 4.846867084503174, + "learning_rate": 6.5063913996897905e-06, + "loss": 4.1611, + "step": 32640 + }, + { + "epoch": 2.6124179868779005, + "grad_norm": 3.9136717319488525, + "learning_rate": 6.493020270631653e-06, + "loss": 4.1242, + "step": 32650 + }, + { + "epoch": 2.613218114898384, + "grad_norm": 5.419270992279053, + "learning_rate": 6.479649141573514e-06, + "loss": 4.1072, + "step": 32660 + }, + { + "epoch": 2.614018242918867, + "grad_norm": 4.859950542449951, + "learning_rate": 6.466278012515377e-06, + "loss": 4.3008, + "step": 32670 + }, + { + "epoch": 2.6148183709393504, + "grad_norm": 4.817934989929199, + "learning_rate": 6.4529068834572396e-06, + "loss": 4.2979, + "step": 32680 + }, + { + "epoch": 2.6156184989598334, + "grad_norm": 6.053598403930664, + 
"learning_rate": 6.439535754399102e-06, + "loss": 4.0841, + "step": 32690 + }, + { + "epoch": 2.616418626980317, + "grad_norm": 6.537989139556885, + "learning_rate": 6.426164625340963e-06, + "loss": 4.2682, + "step": 32700 + }, + { + "epoch": 2.6172187550008, + "grad_norm": 6.115545749664307, + "learning_rate": 6.412793496282826e-06, + "loss": 4.3778, + "step": 32710 + }, + { + "epoch": 2.6180188830212834, + "grad_norm": 9.145062446594238, + "learning_rate": 6.399422367224689e-06, + "loss": 4.3598, + "step": 32720 + }, + { + "epoch": 2.618819011041767, + "grad_norm": 4.901673316955566, + "learning_rate": 6.386051238166551e-06, + "loss": 4.3805, + "step": 32730 + }, + { + "epoch": 2.61961913906225, + "grad_norm": 5.381241798400879, + "learning_rate": 6.372680109108413e-06, + "loss": 4.2268, + "step": 32740 + }, + { + "epoch": 2.6204192670827333, + "grad_norm": 4.830419063568115, + "learning_rate": 6.359308980050276e-06, + "loss": 4.2856, + "step": 32750 + }, + { + "epoch": 2.6212193951032168, + "grad_norm": 5.5708794593811035, + "learning_rate": 6.3459378509921385e-06, + "loss": 4.2111, + "step": 32760 + }, + { + "epoch": 2.6220195231237, + "grad_norm": 5.884067535400391, + "learning_rate": 6.332566721934001e-06, + "loss": 4.3577, + "step": 32770 + }, + { + "epoch": 2.622819651144183, + "grad_norm": 5.066050052642822, + "learning_rate": 6.319195592875862e-06, + "loss": 4.2514, + "step": 32780 + }, + { + "epoch": 2.6236197791646663, + "grad_norm": 4.707382678985596, + "learning_rate": 6.305824463817725e-06, + "loss": 4.1726, + "step": 32790 + }, + { + "epoch": 2.6244199071851497, + "grad_norm": 5.376534938812256, + "learning_rate": 6.292453334759588e-06, + "loss": 4.2471, + "step": 32800 + }, + { + "epoch": 2.6252200352056327, + "grad_norm": 4.097368240356445, + "learning_rate": 6.27908220570145e-06, + "loss": 4.1882, + "step": 32810 + }, + { + "epoch": 2.626020163226116, + "grad_norm": 8.935079574584961, + "learning_rate": 6.265711076643311e-06, + "loss": 4.3797, + 
"step": 32820 + }, + { + "epoch": 2.6268202912465997, + "grad_norm": 7.076402187347412, + "learning_rate": 6.252339947585174e-06, + "loss": 4.1752, + "step": 32830 + }, + { + "epoch": 2.6276204192670827, + "grad_norm": 6.128974914550781, + "learning_rate": 6.238968818527037e-06, + "loss": 4.2087, + "step": 32840 + }, + { + "epoch": 2.628420547287566, + "grad_norm": 5.931004047393799, + "learning_rate": 6.225597689468899e-06, + "loss": 4.2863, + "step": 32850 + }, + { + "epoch": 2.629220675308049, + "grad_norm": 5.642322540283203, + "learning_rate": 6.212226560410762e-06, + "loss": 4.3657, + "step": 32860 + }, + { + "epoch": 2.6300208033285326, + "grad_norm": 5.371090888977051, + "learning_rate": 6.198855431352624e-06, + "loss": 3.9877, + "step": 32870 + }, + { + "epoch": 2.6308209313490156, + "grad_norm": 10.01162052154541, + "learning_rate": 6.185484302294487e-06, + "loss": 4.2464, + "step": 32880 + }, + { + "epoch": 2.631621059369499, + "grad_norm": 5.889012336730957, + "learning_rate": 6.1721131732363484e-06, + "loss": 4.3251, + "step": 32890 + }, + { + "epoch": 2.6324211873899825, + "grad_norm": 5.746575832366943, + "learning_rate": 6.15874204417821e-06, + "loss": 4.2681, + "step": 32900 + }, + { + "epoch": 2.6332213154104656, + "grad_norm": 5.833990573883057, + "learning_rate": 6.145370915120073e-06, + "loss": 4.2323, + "step": 32910 + }, + { + "epoch": 2.634021443430949, + "grad_norm": 6.475826740264893, + "learning_rate": 6.131999786061935e-06, + "loss": 4.177, + "step": 32920 + }, + { + "epoch": 2.6348215714514325, + "grad_norm": 6.790261268615723, + "learning_rate": 6.1186286570037975e-06, + "loss": 4.1992, + "step": 32930 + }, + { + "epoch": 2.6356216994719155, + "grad_norm": 6.625553131103516, + "learning_rate": 6.105257527945659e-06, + "loss": 4.2821, + "step": 32940 + }, + { + "epoch": 2.6364218274923985, + "grad_norm": 4.441702365875244, + "learning_rate": 6.091886398887522e-06, + "loss": 4.3954, + "step": 32950 + }, + { + "epoch": 2.637221955512882, 
+ "grad_norm": 8.516060829162598, + "learning_rate": 6.078515269829385e-06, + "loss": 4.4075, + "step": 32960 + }, + { + "epoch": 2.6380220835333654, + "grad_norm": 7.188780784606934, + "learning_rate": 6.065144140771247e-06, + "loss": 4.1722, + "step": 32970 + }, + { + "epoch": 2.6388222115538484, + "grad_norm": 4.861379146575928, + "learning_rate": 6.051773011713109e-06, + "loss": 4.3142, + "step": 32980 + }, + { + "epoch": 2.639622339574332, + "grad_norm": 8.562806129455566, + "learning_rate": 6.038401882654972e-06, + "loss": 4.4365, + "step": 32990 + }, + { + "epoch": 2.6404224675948154, + "grad_norm": 4.433754920959473, + "learning_rate": 6.025030753596834e-06, + "loss": 4.2044, + "step": 33000 + }, + { + "epoch": 2.6412225956152984, + "grad_norm": 5.652956485748291, + "learning_rate": 6.0116596245386965e-06, + "loss": 4.2038, + "step": 33010 + }, + { + "epoch": 2.642022723635782, + "grad_norm": 5.2881693840026855, + "learning_rate": 5.998288495480558e-06, + "loss": 4.2475, + "step": 33020 + }, + { + "epoch": 2.642822851656265, + "grad_norm": 6.064772129058838, + "learning_rate": 5.984917366422421e-06, + "loss": 4.3771, + "step": 33030 + }, + { + "epoch": 2.6436229796767483, + "grad_norm": 4.478077411651611, + "learning_rate": 5.971546237364283e-06, + "loss": 4.2836, + "step": 33040 + }, + { + "epoch": 2.6444231076972313, + "grad_norm": 6.518775939941406, + "learning_rate": 5.9581751083061455e-06, + "loss": 4.2132, + "step": 33050 + }, + { + "epoch": 2.645223235717715, + "grad_norm": 5.864172458648682, + "learning_rate": 5.944803979248008e-06, + "loss": 4.1413, + "step": 33060 + }, + { + "epoch": 2.6460233637381982, + "grad_norm": 6.359694480895996, + "learning_rate": 5.931432850189871e-06, + "loss": 4.012, + "step": 33070 + }, + { + "epoch": 2.6468234917586813, + "grad_norm": 4.7483439445495605, + "learning_rate": 5.918061721131733e-06, + "loss": 4.2365, + "step": 33080 + }, + { + "epoch": 2.6476236197791647, + "grad_norm": 6.524518013000488, + 
"learning_rate": 5.9046905920735955e-06, + "loss": 4.2251, + "step": 33090 + }, + { + "epoch": 2.648423747799648, + "grad_norm": 4.645505428314209, + "learning_rate": 5.891319463015457e-06, + "loss": 4.3661, + "step": 33100 + }, + { + "epoch": 2.649223875820131, + "grad_norm": 4.526218414306641, + "learning_rate": 5.87794833395732e-06, + "loss": 4.157, + "step": 33110 + }, + { + "epoch": 2.6500240038406147, + "grad_norm": 6.154730796813965, + "learning_rate": 5.864577204899182e-06, + "loss": 4.1405, + "step": 33120 + }, + { + "epoch": 2.6508241318610977, + "grad_norm": 4.572940349578857, + "learning_rate": 5.8512060758410445e-06, + "loss": 4.3686, + "step": 33130 + }, + { + "epoch": 2.651624259881581, + "grad_norm": 4.475552558898926, + "learning_rate": 5.837834946782906e-06, + "loss": 4.1899, + "step": 33140 + }, + { + "epoch": 2.652424387902064, + "grad_norm": 5.439560890197754, + "learning_rate": 5.824463817724769e-06, + "loss": 4.278, + "step": 33150 + }, + { + "epoch": 2.6532245159225476, + "grad_norm": 5.904727458953857, + "learning_rate": 5.811092688666631e-06, + "loss": 4.1599, + "step": 33160 + }, + { + "epoch": 2.654024643943031, + "grad_norm": 7.036167621612549, + "learning_rate": 5.797721559608494e-06, + "loss": 4.2771, + "step": 33170 + }, + { + "epoch": 2.654824771963514, + "grad_norm": 5.601536273956299, + "learning_rate": 5.785687543456169e-06, + "loss": 4.316, + "step": 33180 + }, + { + "epoch": 2.6556248999839975, + "grad_norm": 5.479313373565674, + "learning_rate": 5.772316414398032e-06, + "loss": 4.2582, + "step": 33190 + }, + { + "epoch": 2.656425028004481, + "grad_norm": 7.038112640380859, + "learning_rate": 5.758945285339895e-06, + "loss": 4.3412, + "step": 33200 + }, + { + "epoch": 2.657225156024964, + "grad_norm": 6.468174934387207, + "learning_rate": 5.7455741562817566e-06, + "loss": 4.3545, + "step": 33210 + }, + { + "epoch": 2.658025284045447, + "grad_norm": 7.697492599487305, + "learning_rate": 5.732203027223619e-06, + "loss": 4.4066, + 
"step": 33220 + }, + { + "epoch": 2.6588254120659305, + "grad_norm": 4.525124549865723, + "learning_rate": 5.718831898165481e-06, + "loss": 4.3206, + "step": 33230 + }, + { + "epoch": 2.659625540086414, + "grad_norm": 3.5019888877868652, + "learning_rate": 5.705460769107344e-06, + "loss": 4.1181, + "step": 33240 + }, + { + "epoch": 2.660425668106897, + "grad_norm": 5.208031177520752, + "learning_rate": 5.692089640049206e-06, + "loss": 4.3428, + "step": 33250 + }, + { + "epoch": 2.6612257961273804, + "grad_norm": 4.360513687133789, + "learning_rate": 5.678718510991068e-06, + "loss": 4.2993, + "step": 33260 + }, + { + "epoch": 2.662025924147864, + "grad_norm": 5.4428911209106445, + "learning_rate": 5.66534738193293e-06, + "loss": 4.3044, + "step": 33270 + }, + { + "epoch": 2.662826052168347, + "grad_norm": 5.437986373901367, + "learning_rate": 5.651976252874793e-06, + "loss": 4.1053, + "step": 33280 + }, + { + "epoch": 2.6636261801888304, + "grad_norm": 6.016399383544922, + "learning_rate": 5.6386051238166555e-06, + "loss": 4.0533, + "step": 33290 + }, + { + "epoch": 2.6644263082093134, + "grad_norm": 4.479848861694336, + "learning_rate": 5.625233994758518e-06, + "loss": 4.278, + "step": 33300 + }, + { + "epoch": 2.665226436229797, + "grad_norm": 3.8998403549194336, + "learning_rate": 5.61186286570038e-06, + "loss": 4.1033, + "step": 33310 + }, + { + "epoch": 2.66602656425028, + "grad_norm": 5.68388032913208, + "learning_rate": 5.598491736642243e-06, + "loss": 4.3475, + "step": 33320 + }, + { + "epoch": 2.6668266922707633, + "grad_norm": 5.672200679779053, + "learning_rate": 5.585120607584105e-06, + "loss": 4.2638, + "step": 33330 + }, + { + "epoch": 2.6676268202912468, + "grad_norm": 5.235283851623535, + "learning_rate": 5.571749478525967e-06, + "loss": 4.1448, + "step": 33340 + }, + { + "epoch": 2.6684269483117298, + "grad_norm": 5.4079108238220215, + "learning_rate": 5.558378349467829e-06, + "loss": 3.9993, + "step": 33350 + }, + { + "epoch": 2.6692270763322132, + 
"grad_norm": 9.00804328918457, + "learning_rate": 5.545007220409692e-06, + "loss": 4.296, + "step": 33360 + }, + { + "epoch": 2.6700272043526967, + "grad_norm": 4.167604923248291, + "learning_rate": 5.531636091351554e-06, + "loss": 4.1637, + "step": 33370 + }, + { + "epoch": 2.6708273323731797, + "grad_norm": 4.981602668762207, + "learning_rate": 5.518264962293416e-06, + "loss": 4.3314, + "step": 33380 + }, + { + "epoch": 2.6716274603936627, + "grad_norm": 4.367321968078613, + "learning_rate": 5.504893833235278e-06, + "loss": 4.2848, + "step": 33390 + }, + { + "epoch": 2.672427588414146, + "grad_norm": 5.457118988037109, + "learning_rate": 5.491522704177141e-06, + "loss": 4.2957, + "step": 33400 + }, + { + "epoch": 2.6732277164346296, + "grad_norm": 5.582805156707764, + "learning_rate": 5.478151575119004e-06, + "loss": 4.2366, + "step": 33410 + }, + { + "epoch": 2.6740278444551127, + "grad_norm": 6.88631010055542, + "learning_rate": 5.464780446060866e-06, + "loss": 4.1834, + "step": 33420 + }, + { + "epoch": 2.674827972475596, + "grad_norm": 6.5617218017578125, + "learning_rate": 5.451409317002728e-06, + "loss": 4.3329, + "step": 33430 + }, + { + "epoch": 2.6756281004960796, + "grad_norm": 5.843389511108398, + "learning_rate": 5.438038187944591e-06, + "loss": 4.3819, + "step": 33440 + }, + { + "epoch": 2.6764282285165626, + "grad_norm": 5.388167381286621, + "learning_rate": 5.424667058886453e-06, + "loss": 4.2512, + "step": 33450 + }, + { + "epoch": 2.677228356537046, + "grad_norm": 8.47705078125, + "learning_rate": 5.411295929828315e-06, + "loss": 4.1728, + "step": 33460 + }, + { + "epoch": 2.678028484557529, + "grad_norm": 17.4747371673584, + "learning_rate": 5.397924800770177e-06, + "loss": 4.4086, + "step": 33470 + }, + { + "epoch": 2.6788286125780125, + "grad_norm": 5.968992710113525, + "learning_rate": 5.384553671712039e-06, + "loss": 4.2446, + "step": 33480 + }, + { + "epoch": 2.6796287405984955, + "grad_norm": 5.716801166534424, + "learning_rate": 
5.371182542653902e-06, + "loss": 4.3397, + "step": 33490 + }, + { + "epoch": 2.680428868618979, + "grad_norm": 5.442111968994141, + "learning_rate": 5.3578114135957636e-06, + "loss": 4.2471, + "step": 33500 + }, + { + "epoch": 2.6812289966394625, + "grad_norm": 6.218289852142334, + "learning_rate": 5.344440284537626e-06, + "loss": 4.3326, + "step": 33510 + }, + { + "epoch": 2.6820291246599455, + "grad_norm": 5.563192367553711, + "learning_rate": 5.331069155479489e-06, + "loss": 4.1598, + "step": 33520 + }, + { + "epoch": 2.682829252680429, + "grad_norm": 6.493220329284668, + "learning_rate": 5.317698026421352e-06, + "loss": 4.0064, + "step": 33530 + }, + { + "epoch": 2.6836293807009124, + "grad_norm": 6.705641269683838, + "learning_rate": 5.3043268973632135e-06, + "loss": 4.286, + "step": 33540 + }, + { + "epoch": 2.6844295087213954, + "grad_norm": 8.996630668640137, + "learning_rate": 5.290955768305076e-06, + "loss": 4.0485, + "step": 33550 + }, + { + "epoch": 2.685229636741879, + "grad_norm": 5.042446136474609, + "learning_rate": 5.277584639246938e-06, + "loss": 4.4114, + "step": 33560 + }, + { + "epoch": 2.686029764762362, + "grad_norm": 5.418910026550293, + "learning_rate": 5.264213510188801e-06, + "loss": 4.1548, + "step": 33570 + }, + { + "epoch": 2.6868298927828453, + "grad_norm": 4.982604026794434, + "learning_rate": 5.2508423811306625e-06, + "loss": 4.1292, + "step": 33580 + }, + { + "epoch": 2.6876300208033284, + "grad_norm": 5.434889316558838, + "learning_rate": 5.237471252072525e-06, + "loss": 4.4752, + "step": 33590 + }, + { + "epoch": 2.688430148823812, + "grad_norm": 5.765676021575928, + "learning_rate": 5.224100123014387e-06, + "loss": 4.2641, + "step": 33600 + }, + { + "epoch": 2.6892302768442953, + "grad_norm": 5.155886650085449, + "learning_rate": 5.21072899395625e-06, + "loss": 4.0127, + "step": 33610 + }, + { + "epoch": 2.6900304048647783, + "grad_norm": 8.582798957824707, + "learning_rate": 5.1973578648981125e-06, + "loss": 4.1382, + "step": 
33620 + }, + { + "epoch": 2.6908305328852617, + "grad_norm": 7.404249668121338, + "learning_rate": 5.183986735839975e-06, + "loss": 4.2623, + "step": 33630 + }, + { + "epoch": 2.6916306609057448, + "grad_norm": 9.338781356811523, + "learning_rate": 5.170615606781837e-06, + "loss": 4.1845, + "step": 33640 + }, + { + "epoch": 2.6924307889262282, + "grad_norm": 6.720228672027588, + "learning_rate": 5.1572444777237e-06, + "loss": 4.2549, + "step": 33650 + }, + { + "epoch": 2.6932309169467112, + "grad_norm": 6.17422342300415, + "learning_rate": 5.1438733486655615e-06, + "loss": 4.3159, + "step": 33660 + }, + { + "epoch": 2.6940310449671947, + "grad_norm": 5.542844772338867, + "learning_rate": 5.130502219607424e-06, + "loss": 4.3158, + "step": 33670 + }, + { + "epoch": 2.694831172987678, + "grad_norm": 4.788525104522705, + "learning_rate": 5.117131090549286e-06, + "loss": 4.3005, + "step": 33680 + }, + { + "epoch": 2.695631301008161, + "grad_norm": 5.047336578369141, + "learning_rate": 5.103759961491149e-06, + "loss": 4.0872, + "step": 33690 + }, + { + "epoch": 2.6964314290286446, + "grad_norm": 9.078147888183594, + "learning_rate": 5.090388832433011e-06, + "loss": 4.3044, + "step": 33700 + }, + { + "epoch": 2.697231557049128, + "grad_norm": 6.686065673828125, + "learning_rate": 5.077017703374873e-06, + "loss": 4.1941, + "step": 33710 + }, + { + "epoch": 2.698031685069611, + "grad_norm": 5.865580081939697, + "learning_rate": 5.063646574316735e-06, + "loss": 4.2317, + "step": 33720 + }, + { + "epoch": 2.6988318130900946, + "grad_norm": 4.9534173011779785, + "learning_rate": 5.050275445258598e-06, + "loss": 4.2982, + "step": 33730 + }, + { + "epoch": 2.6996319411105776, + "grad_norm": 4.907919406890869, + "learning_rate": 5.0369043162004605e-06, + "loss": 4.4078, + "step": 33740 + }, + { + "epoch": 2.700432069131061, + "grad_norm": 6.2328267097473145, + "learning_rate": 5.023533187142323e-06, + "loss": 4.1612, + "step": 33750 + }, + { + "epoch": 2.701232197151544, + 
"grad_norm": 5.902498722076416, + "learning_rate": 5.010162058084185e-06, + "loss": 4.0897, + "step": 33760 + }, + { + "epoch": 2.7020323251720275, + "grad_norm": 4.614875793457031, + "learning_rate": 4.996790929026047e-06, + "loss": 4.1132, + "step": 33770 + }, + { + "epoch": 2.702832453192511, + "grad_norm": 8.119933128356934, + "learning_rate": 4.9834197999679096e-06, + "loss": 3.9534, + "step": 33780 + }, + { + "epoch": 2.703632581212994, + "grad_norm": 6.0485148429870605, + "learning_rate": 4.970048670909771e-06, + "loss": 4.3585, + "step": 33790 + }, + { + "epoch": 2.7044327092334774, + "grad_norm": 6.55008602142334, + "learning_rate": 4.956677541851634e-06, + "loss": 4.2445, + "step": 33800 + }, + { + "epoch": 2.705232837253961, + "grad_norm": 4.953001976013184, + "learning_rate": 4.943306412793496e-06, + "loss": 4.2733, + "step": 33810 + }, + { + "epoch": 2.706032965274444, + "grad_norm": 6.0347490310668945, + "learning_rate": 4.929935283735359e-06, + "loss": 4.2542, + "step": 33820 + }, + { + "epoch": 2.706833093294927, + "grad_norm": 4.666191101074219, + "learning_rate": 4.916564154677221e-06, + "loss": 4.2326, + "step": 33830 + }, + { + "epoch": 2.7076332213154104, + "grad_norm": 4.969473361968994, + "learning_rate": 4.903193025619084e-06, + "loss": 4.0745, + "step": 33840 + }, + { + "epoch": 2.708433349335894, + "grad_norm": 5.747929096221924, + "learning_rate": 4.889821896560946e-06, + "loss": 4.1663, + "step": 33850 + }, + { + "epoch": 2.709233477356377, + "grad_norm": 4.519825458526611, + "learning_rate": 4.8764507675028085e-06, + "loss": 4.1346, + "step": 33860 + }, + { + "epoch": 2.7100336053768603, + "grad_norm": 5.24179220199585, + "learning_rate": 4.86307963844467e-06, + "loss": 4.165, + "step": 33870 + }, + { + "epoch": 2.710833733397344, + "grad_norm": 5.994459629058838, + "learning_rate": 4.849708509386533e-06, + "loss": 4.2889, + "step": 33880 + }, + { + "epoch": 2.711633861417827, + "grad_norm": 5.778345584869385, + "learning_rate": 
4.836337380328395e-06, + "loss": 4.1544, + "step": 33890 + }, + { + "epoch": 2.7124339894383103, + "grad_norm": 7.161036968231201, + "learning_rate": 4.822966251270258e-06, + "loss": 4.1862, + "step": 33900 + }, + { + "epoch": 2.7132341174587933, + "grad_norm": 7.405507564544678, + "learning_rate": 4.8095951222121195e-06, + "loss": 4.1942, + "step": 33910 + }, + { + "epoch": 2.7140342454792767, + "grad_norm": 5.971241474151611, + "learning_rate": 4.796223993153982e-06, + "loss": 4.2161, + "step": 33920 + }, + { + "epoch": 2.7148343734997598, + "grad_norm": 5.452059268951416, + "learning_rate": 4.782852864095844e-06, + "loss": 4.1921, + "step": 33930 + }, + { + "epoch": 2.715634501520243, + "grad_norm": 4.6873345375061035, + "learning_rate": 4.769481735037707e-06, + "loss": 4.2156, + "step": 33940 + }, + { + "epoch": 2.7164346295407267, + "grad_norm": 4.376823902130127, + "learning_rate": 4.756110605979569e-06, + "loss": 4.1594, + "step": 33950 + }, + { + "epoch": 2.7172347575612097, + "grad_norm": 5.478845119476318, + "learning_rate": 4.742739476921432e-06, + "loss": 4.2715, + "step": 33960 + }, + { + "epoch": 2.718034885581693, + "grad_norm": 8.683806419372559, + "learning_rate": 4.729368347863294e-06, + "loss": 4.2901, + "step": 33970 + }, + { + "epoch": 2.7188350136021766, + "grad_norm": 5.288990497589111, + "learning_rate": 4.715997218805157e-06, + "loss": 4.2991, + "step": 33980 + }, + { + "epoch": 2.7196351416226596, + "grad_norm": 6.266578197479248, + "learning_rate": 4.7026260897470184e-06, + "loss": 4.255, + "step": 33990 + }, + { + "epoch": 2.7204352696431426, + "grad_norm": 6.196168422698975, + "learning_rate": 4.689254960688881e-06, + "loss": 4.3106, + "step": 34000 + }, + { + "epoch": 2.721235397663626, + "grad_norm": 5.192313194274902, + "learning_rate": 4.675883831630743e-06, + "loss": 4.2563, + "step": 34010 + }, + { + "epoch": 2.7220355256841096, + "grad_norm": 5.5003886222839355, + "learning_rate": 4.662512702572606e-06, + "loss": 4.4209, + 
"step": 34020 + }, + { + "epoch": 2.7228356537045926, + "grad_norm": 5.434267997741699, + "learning_rate": 4.6491415735144675e-06, + "loss": 4.2888, + "step": 34030 + }, + { + "epoch": 2.723635781725076, + "grad_norm": 8.187822341918945, + "learning_rate": 4.635770444456329e-06, + "loss": 4.2338, + "step": 34040 + }, + { + "epoch": 2.7244359097455595, + "grad_norm": 5.527400970458984, + "learning_rate": 4.622399315398192e-06, + "loss": 4.0227, + "step": 34050 + }, + { + "epoch": 2.7252360377660425, + "grad_norm": 7.845839977264404, + "learning_rate": 4.609028186340055e-06, + "loss": 4.2575, + "step": 34060 + }, + { + "epoch": 2.726036165786526, + "grad_norm": 4.837810039520264, + "learning_rate": 4.595657057281917e-06, + "loss": 4.1485, + "step": 34070 + }, + { + "epoch": 2.726836293807009, + "grad_norm": 7.224013328552246, + "learning_rate": 4.582285928223779e-06, + "loss": 4.2924, + "step": 34080 + }, + { + "epoch": 2.7276364218274924, + "grad_norm": 6.373143196105957, + "learning_rate": 4.568914799165642e-06, + "loss": 4.3085, + "step": 34090 + }, + { + "epoch": 2.7284365498479755, + "grad_norm": 4.7438764572143555, + "learning_rate": 4.555543670107504e-06, + "loss": 4.1749, + "step": 34100 + }, + { + "epoch": 2.729236677868459, + "grad_norm": 4.517533779144287, + "learning_rate": 4.5421725410493665e-06, + "loss": 4.3599, + "step": 34110 + }, + { + "epoch": 2.7300368058889424, + "grad_norm": 6.462946891784668, + "learning_rate": 4.528801411991228e-06, + "loss": 4.2767, + "step": 34120 + }, + { + "epoch": 2.7308369339094254, + "grad_norm": 4.755046367645264, + "learning_rate": 4.515430282933091e-06, + "loss": 4.2932, + "step": 34130 + }, + { + "epoch": 2.731637061929909, + "grad_norm": 5.173582077026367, + "learning_rate": 4.502059153874953e-06, + "loss": 4.4344, + "step": 34140 + }, + { + "epoch": 2.7324371899503923, + "grad_norm": 7.195948600769043, + "learning_rate": 4.4886880248168156e-06, + "loss": 4.1337, + "step": 34150 + }, + { + "epoch": 
2.7332373179708753, + "grad_norm": 4.898958206176758, + "learning_rate": 4.475316895758678e-06, + "loss": 4.2497, + "step": 34160 + }, + { + "epoch": 2.734037445991359, + "grad_norm": 6.266502857208252, + "learning_rate": 4.461945766700541e-06, + "loss": 4.0755, + "step": 34170 + }, + { + "epoch": 2.734837574011842, + "grad_norm": 12.440402030944824, + "learning_rate": 4.448574637642403e-06, + "loss": 4.2299, + "step": 34180 + }, + { + "epoch": 2.7356377020323253, + "grad_norm": 5.204728126525879, + "learning_rate": 4.4352035085842655e-06, + "loss": 4.2285, + "step": 34190 + }, + { + "epoch": 2.7364378300528083, + "grad_norm": 4.905936241149902, + "learning_rate": 4.421832379526127e-06, + "loss": 4.2483, + "step": 34200 + }, + { + "epoch": 2.7372379580732917, + "grad_norm": 4.211105823516846, + "learning_rate": 4.40846125046799e-06, + "loss": 4.0633, + "step": 34210 + }, + { + "epoch": 2.738038086093775, + "grad_norm": 4.4559807777404785, + "learning_rate": 4.395090121409852e-06, + "loss": 4.1973, + "step": 34220 + }, + { + "epoch": 2.738838214114258, + "grad_norm": 7.232518196105957, + "learning_rate": 4.3817189923517145e-06, + "loss": 4.0555, + "step": 34230 + }, + { + "epoch": 2.7396383421347417, + "grad_norm": 4.614950656890869, + "learning_rate": 4.368347863293576e-06, + "loss": 4.2235, + "step": 34240 + }, + { + "epoch": 2.7404384701552247, + "grad_norm": 6.457540035247803, + "learning_rate": 4.354976734235439e-06, + "loss": 4.238, + "step": 34250 + }, + { + "epoch": 2.741238598175708, + "grad_norm": 6.911721706390381, + "learning_rate": 4.341605605177301e-06, + "loss": 4.1752, + "step": 34260 + }, + { + "epoch": 2.742038726196191, + "grad_norm": 5.3381876945495605, + "learning_rate": 4.328234476119164e-06, + "loss": 4.3043, + "step": 34270 + }, + { + "epoch": 2.7428388542166746, + "grad_norm": 4.276921272277832, + "learning_rate": 4.314863347061026e-06, + "loss": 4.2093, + "step": 34280 + }, + { + "epoch": 2.743638982237158, + "grad_norm": 6.417922496795654, 
+ "learning_rate": 4.301492218002889e-06, + "loss": 4.2126, + "step": 34290 + }, + { + "epoch": 2.744439110257641, + "grad_norm": 6.303336143493652, + "learning_rate": 4.288121088944751e-06, + "loss": 4.2688, + "step": 34300 + }, + { + "epoch": 2.7452392382781245, + "grad_norm": 6.443734645843506, + "learning_rate": 4.2747499598866135e-06, + "loss": 4.3859, + "step": 34310 + }, + { + "epoch": 2.746039366298608, + "grad_norm": 5.473753452301025, + "learning_rate": 4.261378830828475e-06, + "loss": 4.2519, + "step": 34320 + }, + { + "epoch": 2.746839494319091, + "grad_norm": 5.459244728088379, + "learning_rate": 4.248007701770337e-06, + "loss": 4.3614, + "step": 34330 + }, + { + "epoch": 2.7476396223395745, + "grad_norm": 6.8252434730529785, + "learning_rate": 4.2346365727122e-06, + "loss": 4.1491, + "step": 34340 + }, + { + "epoch": 2.7484397503600575, + "grad_norm": 4.675537586212158, + "learning_rate": 4.221265443654062e-06, + "loss": 4.3317, + "step": 34350 + }, + { + "epoch": 2.749239878380541, + "grad_norm": 4.900448322296143, + "learning_rate": 4.207894314595924e-06, + "loss": 4.2671, + "step": 34360 + }, + { + "epoch": 2.750040006401024, + "grad_norm": 4.971939563751221, + "learning_rate": 4.194523185537787e-06, + "loss": 4.1103, + "step": 34370 + }, + { + "epoch": 2.7508401344215074, + "grad_norm": 4.37131929397583, + "learning_rate": 4.18115205647965e-06, + "loss": 4.2015, + "step": 34380 + }, + { + "epoch": 2.751640262441991, + "grad_norm": 4.4726786613464355, + "learning_rate": 4.167780927421512e-06, + "loss": 4.3048, + "step": 34390 + }, + { + "epoch": 2.752440390462474, + "grad_norm": 4.819699764251709, + "learning_rate": 4.154409798363374e-06, + "loss": 4.0875, + "step": 34400 + }, + { + "epoch": 2.7532405184829574, + "grad_norm": 4.992520332336426, + "learning_rate": 4.141038669305236e-06, + "loss": 4.22, + "step": 34410 + }, + { + "epoch": 2.754040646503441, + "grad_norm": 7.037467002868652, + "learning_rate": 4.127667540247099e-06, + "loss": 4.0536, 
+ "step": 34420 + }, + { + "epoch": 2.754840774523924, + "grad_norm": 4.956068515777588, + "learning_rate": 4.114296411188961e-06, + "loss": 4.2732, + "step": 34430 + }, + { + "epoch": 2.755640902544407, + "grad_norm": 5.513150691986084, + "learning_rate": 4.100925282130823e-06, + "loss": 4.2847, + "step": 34440 + }, + { + "epoch": 2.7564410305648903, + "grad_norm": 7.297191143035889, + "learning_rate": 4.087554153072685e-06, + "loss": 4.2398, + "step": 34450 + }, + { + "epoch": 2.7572411585853738, + "grad_norm": 4.674797058105469, + "learning_rate": 4.074183024014548e-06, + "loss": 4.1731, + "step": 34460 + }, + { + "epoch": 2.758041286605857, + "grad_norm": 4.875251293182373, + "learning_rate": 4.06081189495641e-06, + "loss": 4.2261, + "step": 34470 + }, + { + "epoch": 2.7588414146263402, + "grad_norm": 6.054131507873535, + "learning_rate": 4.0474407658982725e-06, + "loss": 4.1148, + "step": 34480 + }, + { + "epoch": 2.7596415426468237, + "grad_norm": 6.8574910163879395, + "learning_rate": 4.034069636840135e-06, + "loss": 4.3257, + "step": 34490 + }, + { + "epoch": 2.7604416706673067, + "grad_norm": 3.8812949657440186, + "learning_rate": 4.020698507781998e-06, + "loss": 4.3107, + "step": 34500 + }, + { + "epoch": 2.76124179868779, + "grad_norm": 4.484575271606445, + "learning_rate": 4.00732737872386e-06, + "loss": 4.1445, + "step": 34510 + }, + { + "epoch": 2.762041926708273, + "grad_norm": 4.373636722564697, + "learning_rate": 3.993956249665722e-06, + "loss": 4.1372, + "step": 34520 + }, + { + "epoch": 2.7628420547287567, + "grad_norm": 5.4754509925842285, + "learning_rate": 3.980585120607584e-06, + "loss": 4.1681, + "step": 34530 + }, + { + "epoch": 2.7636421827492397, + "grad_norm": 16.14682960510254, + "learning_rate": 3.967213991549447e-06, + "loss": 4.1908, + "step": 34540 + }, + { + "epoch": 2.764442310769723, + "grad_norm": 6.3839592933654785, + "learning_rate": 3.953842862491309e-06, + "loss": 4.1729, + "step": 34550 + }, + { + "epoch": 
2.7652424387902066, + "grad_norm": 4.453866004943848, + "learning_rate": 3.9404717334331714e-06, + "loss": 4.149, + "step": 34560 + }, + { + "epoch": 2.7660425668106896, + "grad_norm": 6.7042951583862305, + "learning_rate": 3.927100604375033e-06, + "loss": 4.1094, + "step": 34570 + }, + { + "epoch": 2.766842694831173, + "grad_norm": 5.805509567260742, + "learning_rate": 3.913729475316896e-06, + "loss": 4.1464, + "step": 34580 + }, + { + "epoch": 2.7676428228516565, + "grad_norm": 6.5062360763549805, + "learning_rate": 3.900358346258759e-06, + "loss": 4.1876, + "step": 34590 + }, + { + "epoch": 2.7684429508721395, + "grad_norm": 6.451037406921387, + "learning_rate": 3.886987217200621e-06, + "loss": 4.2076, + "step": 34600 + }, + { + "epoch": 2.7692430788926226, + "grad_norm": 4.453220844268799, + "learning_rate": 3.873616088142483e-06, + "loss": 4.0763, + "step": 34610 + }, + { + "epoch": 2.770043206913106, + "grad_norm": 5.057280540466309, + "learning_rate": 3.860244959084345e-06, + "loss": 4.2001, + "step": 34620 + }, + { + "epoch": 2.7708433349335895, + "grad_norm": 4.674801349639893, + "learning_rate": 3.846873830026208e-06, + "loss": 4.4211, + "step": 34630 + }, + { + "epoch": 2.7716434629540725, + "grad_norm": 4.858605861663818, + "learning_rate": 3.83350270096807e-06, + "loss": 4.1275, + "step": 34640 + }, + { + "epoch": 2.772443590974556, + "grad_norm": 6.631062030792236, + "learning_rate": 3.820131571909932e-06, + "loss": 4.342, + "step": 34650 + }, + { + "epoch": 2.7732437189950394, + "grad_norm": 7.900754928588867, + "learning_rate": 3.806760442851794e-06, + "loss": 4.1309, + "step": 34660 + }, + { + "epoch": 2.7740438470155224, + "grad_norm": 6.944363117218018, + "learning_rate": 3.793389313793657e-06, + "loss": 4.3148, + "step": 34670 + }, + { + "epoch": 2.774843975036006, + "grad_norm": 9.841028213500977, + "learning_rate": 3.780018184735519e-06, + "loss": 4.2127, + "step": 34680 + }, + { + "epoch": 2.775644103056489, + "grad_norm": 5.778743743896484, 
+ "learning_rate": 3.7666470556773818e-06, + "loss": 4.2897, + "step": 34690 + }, + { + "epoch": 2.7764442310769724, + "grad_norm": 7.016035079956055, + "learning_rate": 3.7532759266192436e-06, + "loss": 4.0521, + "step": 34700 + }, + { + "epoch": 2.7772443590974554, + "grad_norm": 5.496110916137695, + "learning_rate": 3.7399047975611063e-06, + "loss": 4.378, + "step": 34710 + }, + { + "epoch": 2.778044487117939, + "grad_norm": 4.283048629760742, + "learning_rate": 3.7265336685029686e-06, + "loss": 4.2144, + "step": 34720 + }, + { + "epoch": 2.7788446151384223, + "grad_norm": 4.840237617492676, + "learning_rate": 3.7131625394448312e-06, + "loss": 4.0939, + "step": 34730 + }, + { + "epoch": 2.7796447431589053, + "grad_norm": 6.175449371337891, + "learning_rate": 3.699791410386693e-06, + "loss": 4.1317, + "step": 34740 + }, + { + "epoch": 2.7804448711793888, + "grad_norm": 7.649020195007324, + "learning_rate": 3.6864202813285558e-06, + "loss": 4.0876, + "step": 34750 + }, + { + "epoch": 2.7812449991998722, + "grad_norm": 4.315440654754639, + "learning_rate": 3.6730491522704176e-06, + "loss": 4.3091, + "step": 34760 + }, + { + "epoch": 2.7820451272203552, + "grad_norm": 7.573521137237549, + "learning_rate": 3.6596780232122803e-06, + "loss": 4.2258, + "step": 34770 + }, + { + "epoch": 2.7828452552408387, + "grad_norm": 5.323004722595215, + "learning_rate": 3.6463068941541426e-06, + "loss": 4.3041, + "step": 34780 + }, + { + "epoch": 2.7836453832613217, + "grad_norm": 5.814512729644775, + "learning_rate": 3.6329357650960053e-06, + "loss": 4.121, + "step": 34790 + }, + { + "epoch": 2.784445511281805, + "grad_norm": 5.472853183746338, + "learning_rate": 3.619564636037867e-06, + "loss": 4.2736, + "step": 34800 + }, + { + "epoch": 2.785245639302288, + "grad_norm": 6.29842472076416, + "learning_rate": 3.60619350697973e-06, + "loss": 4.1963, + "step": 34810 + }, + { + "epoch": 2.7860457673227716, + "grad_norm": 4.491445064544678, + "learning_rate": 3.5928223779215916e-06, + 
"loss": 4.2659, + "step": 34820 + }, + { + "epoch": 2.786845895343255, + "grad_norm": 7.308801651000977, + "learning_rate": 3.5794512488634543e-06, + "loss": 4.2923, + "step": 34830 + }, + { + "epoch": 2.787646023363738, + "grad_norm": 5.32773494720459, + "learning_rate": 3.5660801198053166e-06, + "loss": 4.2347, + "step": 34840 + }, + { + "epoch": 2.7884461513842216, + "grad_norm": 5.688913822174072, + "learning_rate": 3.5527089907471793e-06, + "loss": 4.1971, + "step": 34850 + }, + { + "epoch": 2.7892462794047046, + "grad_norm": 5.6290740966796875, + "learning_rate": 3.539337861689041e-06, + "loss": 4.1935, + "step": 34860 + }, + { + "epoch": 2.790046407425188, + "grad_norm": 4.931374549865723, + "learning_rate": 3.525966732630904e-06, + "loss": 4.1234, + "step": 34870 + }, + { + "epoch": 2.790846535445671, + "grad_norm": 6.678748607635498, + "learning_rate": 3.5125956035727657e-06, + "loss": 4.2878, + "step": 34880 + }, + { + "epoch": 2.7916466634661545, + "grad_norm": 4.9022626876831055, + "learning_rate": 3.499224474514628e-06, + "loss": 4.1224, + "step": 34890 + }, + { + "epoch": 2.792446791486638, + "grad_norm": 6.170747756958008, + "learning_rate": 3.4858533454564906e-06, + "loss": 4.2608, + "step": 34900 + }, + { + "epoch": 2.793246919507121, + "grad_norm": 4.918048858642578, + "learning_rate": 3.4724822163983525e-06, + "loss": 4.1567, + "step": 34910 + }, + { + "epoch": 2.7940470475276045, + "grad_norm": 7.252740383148193, + "learning_rate": 3.459111087340215e-06, + "loss": 4.1902, + "step": 34920 + }, + { + "epoch": 2.794847175548088, + "grad_norm": 5.8041157722473145, + "learning_rate": 3.445739958282077e-06, + "loss": 4.2221, + "step": 34930 + }, + { + "epoch": 2.795647303568571, + "grad_norm": 6.8014020919799805, + "learning_rate": 3.4323688292239397e-06, + "loss": 4.118, + "step": 34940 + }, + { + "epoch": 2.7964474315890544, + "grad_norm": 6.862493991851807, + "learning_rate": 3.418997700165802e-06, + "loss": 4.2246, + "step": 34950 + }, + { + 
"epoch": 2.7972475596095374, + "grad_norm": 3.999680995941162, + "learning_rate": 3.4056265711076646e-06, + "loss": 4.2414, + "step": 34960 + }, + { + "epoch": 2.798047687630021, + "grad_norm": 5.764650821685791, + "learning_rate": 3.3922554420495265e-06, + "loss": 4.1001, + "step": 34970 + }, + { + "epoch": 2.798847815650504, + "grad_norm": 4.690990447998047, + "learning_rate": 3.378884312991389e-06, + "loss": 4.2748, + "step": 34980 + }, + { + "epoch": 2.7996479436709873, + "grad_norm": 5.3460259437561035, + "learning_rate": 3.3655131839332514e-06, + "loss": 4.245, + "step": 34990 + }, + { + "epoch": 2.800448071691471, + "grad_norm": 3.6493239402770996, + "learning_rate": 3.352142054875114e-06, + "loss": 4.2913, + "step": 35000 + }, + { + "epoch": 2.800448071691471, + "eval_loss": 5.665900230407715, + "eval_runtime": 17.279, + "eval_samples_per_second": 2.315, + "eval_steps_per_second": 0.289, + "step": 35000 + }, + { + "epoch": 2.801248199711954, + "grad_norm": 8.602275848388672, + "learning_rate": 3.338770925816976e-06, + "loss": 4.2604, + "step": 35010 + }, + { + "epoch": 2.8020483277324373, + "grad_norm": 5.3672356605529785, + "learning_rate": 3.3253997967588387e-06, + "loss": 4.1385, + "step": 35020 + }, + { + "epoch": 2.8028484557529207, + "grad_norm": 9.254061698913574, + "learning_rate": 3.3120286677007005e-06, + "loss": 4.3666, + "step": 35030 + }, + { + "epoch": 2.8036485837734038, + "grad_norm": 5.195075988769531, + "learning_rate": 3.298657538642563e-06, + "loss": 4.0343, + "step": 35040 + }, + { + "epoch": 2.8044487117938868, + "grad_norm": 5.531332969665527, + "learning_rate": 3.2852864095844255e-06, + "loss": 4.4317, + "step": 35050 + }, + { + "epoch": 2.8052488398143702, + "grad_norm": 5.089931964874268, + "learning_rate": 3.271915280526288e-06, + "loss": 4.2116, + "step": 35060 + }, + { + "epoch": 2.8060489678348537, + "grad_norm": 7.625751495361328, + "learning_rate": 3.25854415146815e-06, + "loss": 4.0876, + "step": 35070 + }, + { + "epoch": 
2.8068490958553367, + "grad_norm": 4.376953601837158, + "learning_rate": 3.2451730224100127e-06, + "loss": 4.2479, + "step": 35080 + }, + { + "epoch": 2.80764922387582, + "grad_norm": 4.922122001647949, + "learning_rate": 3.2318018933518745e-06, + "loss": 4.3492, + "step": 35090 + }, + { + "epoch": 2.8084493518963036, + "grad_norm": 5.147998332977295, + "learning_rate": 3.2184307642937372e-06, + "loss": 4.0972, + "step": 35100 + }, + { + "epoch": 2.8092494799167866, + "grad_norm": 5.979716777801514, + "learning_rate": 3.2050596352355995e-06, + "loss": 4.4772, + "step": 35110 + }, + { + "epoch": 2.81004960793727, + "grad_norm": 5.627559185028076, + "learning_rate": 3.191688506177462e-06, + "loss": 4.3933, + "step": 35120 + }, + { + "epoch": 2.810849735957753, + "grad_norm": 4.930499076843262, + "learning_rate": 3.178317377119324e-06, + "loss": 4.2395, + "step": 35130 + }, + { + "epoch": 2.8116498639782366, + "grad_norm": 9.172618865966797, + "learning_rate": 3.1649462480611867e-06, + "loss": 4.1436, + "step": 35140 + }, + { + "epoch": 2.8124499919987196, + "grad_norm": 5.223548412322998, + "learning_rate": 3.1515751190030486e-06, + "loss": 4.3852, + "step": 35150 + }, + { + "epoch": 2.813250120019203, + "grad_norm": 5.304259300231934, + "learning_rate": 3.1382039899449113e-06, + "loss": 4.297, + "step": 35160 + }, + { + "epoch": 2.8140502480396865, + "grad_norm": 5.6694655418396, + "learning_rate": 3.1248328608867735e-06, + "loss": 4.3311, + "step": 35170 + }, + { + "epoch": 2.8148503760601695, + "grad_norm": 4.797468185424805, + "learning_rate": 3.1114617318286358e-06, + "loss": 4.3337, + "step": 35180 + }, + { + "epoch": 2.815650504080653, + "grad_norm": 5.934649467468262, + "learning_rate": 3.098090602770498e-06, + "loss": 4.3562, + "step": 35190 + }, + { + "epoch": 2.8164506321011364, + "grad_norm": 7.17952823638916, + "learning_rate": 3.0847194737123603e-06, + "loss": 4.2319, + "step": 35200 + }, + { + "epoch": 2.8172507601216195, + "grad_norm": 
5.0281081199646, + "learning_rate": 3.071348344654223e-06, + "loss": 4.2155, + "step": 35210 + }, + { + "epoch": 2.8180508881421025, + "grad_norm": 4.19122314453125, + "learning_rate": 3.0579772155960853e-06, + "loss": 4.2386, + "step": 35220 + }, + { + "epoch": 2.818851016162586, + "grad_norm": 4.930305480957031, + "learning_rate": 3.0446060865379475e-06, + "loss": 4.3804, + "step": 35230 + }, + { + "epoch": 2.8196511441830694, + "grad_norm": 5.851728916168213, + "learning_rate": 3.03123495747981e-06, + "loss": 4.2732, + "step": 35240 + }, + { + "epoch": 2.8204512722035524, + "grad_norm": 6.313448905944824, + "learning_rate": 3.017863828421672e-06, + "loss": 4.2819, + "step": 35250 + }, + { + "epoch": 2.821251400224036, + "grad_norm": 6.813796520233154, + "learning_rate": 3.0044926993635343e-06, + "loss": 4.078, + "step": 35260 + }, + { + "epoch": 2.8220515282445193, + "grad_norm": 11.20564079284668, + "learning_rate": 2.991121570305397e-06, + "loss": 3.9932, + "step": 35270 + }, + { + "epoch": 2.8228516562650023, + "grad_norm": 5.3791351318359375, + "learning_rate": 2.9777504412472593e-06, + "loss": 4.2421, + "step": 35280 + }, + { + "epoch": 2.823651784285486, + "grad_norm": 5.504123210906982, + "learning_rate": 2.9643793121891216e-06, + "loss": 4.1563, + "step": 35290 + }, + { + "epoch": 2.824451912305969, + "grad_norm": 5.196000099182129, + "learning_rate": 2.951008183130984e-06, + "loss": 4.3874, + "step": 35300 + }, + { + "epoch": 2.8252520403264523, + "grad_norm": 6.91581916809082, + "learning_rate": 2.9376370540728457e-06, + "loss": 4.246, + "step": 35310 + }, + { + "epoch": 2.8260521683469353, + "grad_norm": 5.90737247467041, + "learning_rate": 2.9242659250147084e-06, + "loss": 4.2119, + "step": 35320 + }, + { + "epoch": 2.8268522963674187, + "grad_norm": 6.170504093170166, + "learning_rate": 2.9108947959565706e-06, + "loss": 4.252, + "step": 35330 + }, + { + "epoch": 2.827652424387902, + "grad_norm": 5.4018354415893555, + "learning_rate": 
2.897523666898433e-06, + "loss": 3.8861, + "step": 35340 + }, + { + "epoch": 2.8284525524083852, + "grad_norm": 6.109228134155273, + "learning_rate": 2.884152537840295e-06, + "loss": 4.347, + "step": 35350 + }, + { + "epoch": 2.8292526804288687, + "grad_norm": 5.722598075866699, + "learning_rate": 2.8707814087821574e-06, + "loss": 4.1786, + "step": 35360 + }, + { + "epoch": 2.830052808449352, + "grad_norm": 4.858404636383057, + "learning_rate": 2.85741027972402e-06, + "loss": 4.2734, + "step": 35370 + }, + { + "epoch": 2.830852936469835, + "grad_norm": 4.960545063018799, + "learning_rate": 2.8440391506658824e-06, + "loss": 4.3257, + "step": 35380 + }, + { + "epoch": 2.8316530644903186, + "grad_norm": 7.317862033843994, + "learning_rate": 2.8306680216077447e-06, + "loss": 4.1959, + "step": 35390 + }, + { + "epoch": 2.8324531925108016, + "grad_norm": 4.587612152099609, + "learning_rate": 2.817296892549607e-06, + "loss": 4.0904, + "step": 35400 + }, + { + "epoch": 2.833253320531285, + "grad_norm": 5.755588531494141, + "learning_rate": 2.803925763491469e-06, + "loss": 4.3367, + "step": 35410 + }, + { + "epoch": 2.834053448551768, + "grad_norm": 4.245760917663574, + "learning_rate": 2.7905546344333315e-06, + "loss": 4.4096, + "step": 35420 + }, + { + "epoch": 2.8348535765722516, + "grad_norm": 4.4361748695373535, + "learning_rate": 2.777183505375194e-06, + "loss": 4.1928, + "step": 35430 + }, + { + "epoch": 2.835653704592735, + "grad_norm": 4.350141525268555, + "learning_rate": 2.7638123763170564e-06, + "loss": 4.2633, + "step": 35440 + }, + { + "epoch": 2.836453832613218, + "grad_norm": 7.448699951171875, + "learning_rate": 2.7504412472589187e-06, + "loss": 4.2588, + "step": 35450 + }, + { + "epoch": 2.8372539606337015, + "grad_norm": 3.8748276233673096, + "learning_rate": 2.737070118200781e-06, + "loss": 4.1945, + "step": 35460 + }, + { + "epoch": 2.8380540886541845, + "grad_norm": 4.915309906005859, + "learning_rate": 2.723698989142643e-06, + "loss": 4.2373, + 
"step": 35470 + }, + { + "epoch": 2.838854216674668, + "grad_norm": 7.0673980712890625, + "learning_rate": 2.710327860084506e-06, + "loss": 4.2509, + "step": 35480 + }, + { + "epoch": 2.839654344695151, + "grad_norm": 4.1713547706604, + "learning_rate": 2.696956731026368e-06, + "loss": 4.2477, + "step": 35490 + }, + { + "epoch": 2.8404544727156344, + "grad_norm": 5.206265926361084, + "learning_rate": 2.6835856019682304e-06, + "loss": 4.2303, + "step": 35500 + }, + { + "epoch": 2.841254600736118, + "grad_norm": 8.962867736816406, + "learning_rate": 2.6702144729100927e-06, + "loss": 4.3039, + "step": 35510 + }, + { + "epoch": 2.842054728756601, + "grad_norm": 7.5982985496521, + "learning_rate": 2.656843343851955e-06, + "loss": 4.1855, + "step": 35520 + }, + { + "epoch": 2.8428548567770844, + "grad_norm": 3.994152069091797, + "learning_rate": 2.6434722147938172e-06, + "loss": 4.1927, + "step": 35530 + }, + { + "epoch": 2.843654984797568, + "grad_norm": Infinity, + "learning_rate": 2.6314381986414934e-06, + "loss": 4.3871, + "step": 35540 + }, + { + "epoch": 2.844455112818051, + "grad_norm": 5.288627624511719, + "learning_rate": 2.6180670695833557e-06, + "loss": 4.0285, + "step": 35550 + }, + { + "epoch": 2.8452552408385343, + "grad_norm": 4.253042221069336, + "learning_rate": 2.6046959405252184e-06, + "loss": 4.2831, + "step": 35560 + }, + { + "epoch": 2.8460553688590173, + "grad_norm": 4.621375560760498, + "learning_rate": 2.5913248114670806e-06, + "loss": 4.2246, + "step": 35570 + }, + { + "epoch": 2.846855496879501, + "grad_norm": 5.700193881988525, + "learning_rate": 2.577953682408943e-06, + "loss": 4.0086, + "step": 35580 + }, + { + "epoch": 2.847655624899984, + "grad_norm": 4.042226791381836, + "learning_rate": 2.564582553350805e-06, + "loss": 4.1566, + "step": 35590 + }, + { + "epoch": 2.8484557529204673, + "grad_norm": 5.7738776206970215, + "learning_rate": 2.5512114242926674e-06, + "loss": 4.1277, + "step": 35600 + }, + { + "epoch": 2.8492558809409507, + 
"grad_norm": 5.566997528076172, + "learning_rate": 2.5378402952345297e-06, + "loss": 4.2208, + "step": 35610 + }, + { + "epoch": 2.8500560089614337, + "grad_norm": 6.773410320281982, + "learning_rate": 2.524469166176392e-06, + "loss": 4.1155, + "step": 35620 + }, + { + "epoch": 2.850856136981917, + "grad_norm": 4.425107002258301, + "learning_rate": 2.5110980371182542e-06, + "loss": 4.2836, + "step": 35630 + }, + { + "epoch": 2.8516562650024007, + "grad_norm": 6.652309417724609, + "learning_rate": 2.4977269080601165e-06, + "loss": 4.2207, + "step": 35640 + }, + { + "epoch": 2.8524563930228837, + "grad_norm": 10.535252571105957, + "learning_rate": 2.4843557790019788e-06, + "loss": 4.2426, + "step": 35650 + }, + { + "epoch": 2.8532565210433667, + "grad_norm": 7.071555137634277, + "learning_rate": 2.4709846499438414e-06, + "loss": 4.1919, + "step": 35660 + }, + { + "epoch": 2.85405664906385, + "grad_norm": 6.044496059417725, + "learning_rate": 2.4576135208857037e-06, + "loss": 4.1179, + "step": 35670 + }, + { + "epoch": 2.8548567770843336, + "grad_norm": 4.740421772003174, + "learning_rate": 2.444242391827566e-06, + "loss": 4.3368, + "step": 35680 + }, + { + "epoch": 2.8556569051048166, + "grad_norm": 4.672318935394287, + "learning_rate": 2.4308712627694282e-06, + "loss": 4.2928, + "step": 35690 + }, + { + "epoch": 2.8564570331253, + "grad_norm": 4.998258590698242, + "learning_rate": 2.4175001337112905e-06, + "loss": 4.1935, + "step": 35700 + }, + { + "epoch": 2.8572571611457835, + "grad_norm": 6.342236042022705, + "learning_rate": 2.4041290046531528e-06, + "loss": 4.2918, + "step": 35710 + }, + { + "epoch": 2.8580572891662666, + "grad_norm": 6.538265705108643, + "learning_rate": 2.3907578755950155e-06, + "loss": 4.2393, + "step": 35720 + }, + { + "epoch": 2.85885741718675, + "grad_norm": 5.26876974105835, + "learning_rate": 2.3773867465368777e-06, + "loss": 4.1355, + "step": 35730 + }, + { + "epoch": 2.859657545207233, + "grad_norm": 6.272813320159912, + 
"learning_rate": 2.36401561747874e-06, + "loss": 4.277, + "step": 35740 + }, + { + "epoch": 2.8604576732277165, + "grad_norm": 4.440347671508789, + "learning_rate": 2.3506444884206023e-06, + "loss": 4.2423, + "step": 35750 + }, + { + "epoch": 2.8612578012481995, + "grad_norm": 4.575409889221191, + "learning_rate": 2.3372733593624645e-06, + "loss": 4.2464, + "step": 35760 + }, + { + "epoch": 2.862057929268683, + "grad_norm": 6.975805759429932, + "learning_rate": 2.3239022303043272e-06, + "loss": 4.3602, + "step": 35770 + }, + { + "epoch": 2.8628580572891664, + "grad_norm": 4.794578552246094, + "learning_rate": 2.3105311012461895e-06, + "loss": 4.2069, + "step": 35780 + }, + { + "epoch": 2.8636581853096494, + "grad_norm": 4.724869251251221, + "learning_rate": 2.2971599721880518e-06, + "loss": 4.319, + "step": 35790 + }, + { + "epoch": 2.864458313330133, + "grad_norm": 5.100827217102051, + "learning_rate": 2.283788843129914e-06, + "loss": 4.1811, + "step": 35800 + }, + { + "epoch": 2.8652584413506164, + "grad_norm": 5.946887016296387, + "learning_rate": 2.2704177140717763e-06, + "loss": 4.3413, + "step": 35810 + }, + { + "epoch": 2.8660585693710994, + "grad_norm": 4.958827018737793, + "learning_rate": 2.2570465850136386e-06, + "loss": 4.2534, + "step": 35820 + }, + { + "epoch": 2.8668586973915824, + "grad_norm": 4.055940628051758, + "learning_rate": 2.2436754559555012e-06, + "loss": 4.3207, + "step": 35830 + }, + { + "epoch": 2.867658825412066, + "grad_norm": 5.415339946746826, + "learning_rate": 2.2303043268973635e-06, + "loss": 4.0927, + "step": 35840 + }, + { + "epoch": 2.8684589534325493, + "grad_norm": 11.86763858795166, + "learning_rate": 2.2169331978392258e-06, + "loss": 4.2369, + "step": 35850 + }, + { + "epoch": 2.8692590814530323, + "grad_norm": 5.227077960968018, + "learning_rate": 2.203562068781088e-06, + "loss": 4.0475, + "step": 35860 + }, + { + "epoch": 2.870059209473516, + "grad_norm": 4.520332336425781, + "learning_rate": 2.1901909397229503e-06, + 
"loss": 4.248, + "step": 35870 + }, + { + "epoch": 2.8708593374939992, + "grad_norm": 4.784921169281006, + "learning_rate": 2.1768198106648126e-06, + "loss": 4.061, + "step": 35880 + }, + { + "epoch": 2.8716594655144823, + "grad_norm": 4.605025768280029, + "learning_rate": 2.1634486816066753e-06, + "loss": 4.3272, + "step": 35890 + }, + { + "epoch": 2.8724595935349657, + "grad_norm": 4.685732364654541, + "learning_rate": 2.1500775525485375e-06, + "loss": 4.2693, + "step": 35900 + }, + { + "epoch": 2.8732597215554487, + "grad_norm": 5.242833614349365, + "learning_rate": 2.1367064234903994e-06, + "loss": 4.2223, + "step": 35910 + }, + { + "epoch": 2.874059849575932, + "grad_norm": 7.820953369140625, + "learning_rate": 2.1233352944322616e-06, + "loss": 3.8063, + "step": 35920 + }, + { + "epoch": 2.874859977596415, + "grad_norm": 6.070064067840576, + "learning_rate": 2.1099641653741243e-06, + "loss": 4.1858, + "step": 35930 + }, + { + "epoch": 2.8756601056168987, + "grad_norm": 10.925037384033203, + "learning_rate": 2.0965930363159866e-06, + "loss": 4.2107, + "step": 35940 + }, + { + "epoch": 2.876460233637382, + "grad_norm": 7.051306247711182, + "learning_rate": 2.083221907257849e-06, + "loss": 4.1936, + "step": 35950 + }, + { + "epoch": 2.877260361657865, + "grad_norm": 5.999988079071045, + "learning_rate": 2.069850778199711e-06, + "loss": 4.1867, + "step": 35960 + }, + { + "epoch": 2.8780604896783486, + "grad_norm": 5.072703838348389, + "learning_rate": 2.0564796491415734e-06, + "loss": 4.0283, + "step": 35970 + }, + { + "epoch": 2.878860617698832, + "grad_norm": 6.015980243682861, + "learning_rate": 2.0431085200834357e-06, + "loss": 4.2977, + "step": 35980 + }, + { + "epoch": 2.879660745719315, + "grad_norm": 5.586410999298096, + "learning_rate": 2.0297373910252984e-06, + "loss": 4.0267, + "step": 35990 + }, + { + "epoch": 2.8804608737397985, + "grad_norm": 4.132725238800049, + "learning_rate": 2.0163662619671606e-06, + "loss": 4.3056, + "step": 36000 + }, + { + 
"epoch": 2.8812610017602815, + "grad_norm": 5.6204023361206055, + "learning_rate": 2.002995132909023e-06, + "loss": 4.1451, + "step": 36010 + }, + { + "epoch": 2.882061129780765, + "grad_norm": 5.820326805114746, + "learning_rate": 1.989624003850885e-06, + "loss": 4.2092, + "step": 36020 + }, + { + "epoch": 2.882861257801248, + "grad_norm": 7.098259449005127, + "learning_rate": 1.9762528747927474e-06, + "loss": 4.2706, + "step": 36030 + }, + { + "epoch": 2.8836613858217315, + "grad_norm": 6.010718822479248, + "learning_rate": 1.96288174573461e-06, + "loss": 4.3539, + "step": 36040 + }, + { + "epoch": 2.884461513842215, + "grad_norm": 6.73037052154541, + "learning_rate": 1.9495106166764724e-06, + "loss": 4.3154, + "step": 36050 + }, + { + "epoch": 2.885261641862698, + "grad_norm": 5.936001777648926, + "learning_rate": 1.9361394876183347e-06, + "loss": 4.1449, + "step": 36060 + }, + { + "epoch": 2.8860617698831814, + "grad_norm": 5.640296936035156, + "learning_rate": 1.922768358560197e-06, + "loss": 4.1601, + "step": 36070 + }, + { + "epoch": 2.8868618979036644, + "grad_norm": 6.558215618133545, + "learning_rate": 1.909397229502059e-06, + "loss": 4.1861, + "step": 36080 + }, + { + "epoch": 2.887662025924148, + "grad_norm": 4.897027015686035, + "learning_rate": 1.8960261004439217e-06, + "loss": 4.2773, + "step": 36090 + }, + { + "epoch": 2.888462153944631, + "grad_norm": 6.5792436599731445, + "learning_rate": 1.882654971385784e-06, + "loss": 4.0926, + "step": 36100 + }, + { + "epoch": 2.8892622819651144, + "grad_norm": 5.062023639678955, + "learning_rate": 1.8692838423276464e-06, + "loss": 4.2352, + "step": 36110 + }, + { + "epoch": 2.890062409985598, + "grad_norm": 4.872011661529541, + "learning_rate": 1.8559127132695087e-06, + "loss": 4.1284, + "step": 36120 + }, + { + "epoch": 2.890862538006081, + "grad_norm": 4.747717380523682, + "learning_rate": 1.842541584211371e-06, + "loss": 4.2554, + "step": 36130 + }, + { + "epoch": 2.8916626660265643, + "grad_norm": 
7.434582710266113, + "learning_rate": 1.8291704551532334e-06, + "loss": 4.288, + "step": 36140 + }, + { + "epoch": 2.8924627940470478, + "grad_norm": 4.58209228515625, + "learning_rate": 1.8157993260950957e-06, + "loss": 4.2023, + "step": 36150 + }, + { + "epoch": 2.8932629220675308, + "grad_norm": 5.5834455490112305, + "learning_rate": 1.802428197036958e-06, + "loss": 4.4362, + "step": 36160 + }, + { + "epoch": 2.8940630500880142, + "grad_norm": 7.5915422439575195, + "learning_rate": 1.7890570679788204e-06, + "loss": 4.1086, + "step": 36170 + }, + { + "epoch": 2.8948631781084972, + "grad_norm": 4.346075534820557, + "learning_rate": 1.7756859389206827e-06, + "loss": 4.2259, + "step": 36180 + }, + { + "epoch": 2.8956633061289807, + "grad_norm": 5.605753421783447, + "learning_rate": 1.7623148098625448e-06, + "loss": 4.3252, + "step": 36190 + }, + { + "epoch": 2.8964634341494637, + "grad_norm": 5.893536567687988, + "learning_rate": 1.748943680804407e-06, + "loss": 4.1954, + "step": 36200 + }, + { + "epoch": 2.897263562169947, + "grad_norm": 4.114583492279053, + "learning_rate": 1.7355725517462695e-06, + "loss": 4.1583, + "step": 36210 + }, + { + "epoch": 2.8980636901904306, + "grad_norm": 5.583434581756592, + "learning_rate": 1.7222014226881318e-06, + "loss": 4.2023, + "step": 36220 + }, + { + "epoch": 2.8988638182109137, + "grad_norm": 7.7079973220825195, + "learning_rate": 1.708830293629994e-06, + "loss": 4.335, + "step": 36230 + }, + { + "epoch": 2.899663946231397, + "grad_norm": 6.053271770477295, + "learning_rate": 1.6954591645718565e-06, + "loss": 4.1763, + "step": 36240 + }, + { + "epoch": 2.9004640742518806, + "grad_norm": 6.019364356994629, + "learning_rate": 1.6820880355137188e-06, + "loss": 4.3155, + "step": 36250 + }, + { + "epoch": 2.9012642022723636, + "grad_norm": 4.506904125213623, + "learning_rate": 1.668716906455581e-06, + "loss": 4.191, + "step": 36260 + }, + { + "epoch": 2.9020643302928466, + "grad_norm": 4.384410381317139, + "learning_rate": 
1.6553457773974435e-06, + "loss": 4.2903, + "step": 36270 + }, + { + "epoch": 2.90286445831333, + "grad_norm": 5.195708751678467, + "learning_rate": 1.6419746483393058e-06, + "loss": 4.3516, + "step": 36280 + }, + { + "epoch": 2.9036645863338135, + "grad_norm": 5.231687068939209, + "learning_rate": 1.628603519281168e-06, + "loss": 4.1651, + "step": 36290 + }, + { + "epoch": 2.9044647143542965, + "grad_norm": 5.189396381378174, + "learning_rate": 1.6152323902230305e-06, + "loss": 3.9818, + "step": 36300 + }, + { + "epoch": 2.90526484237478, + "grad_norm": 7.152802467346191, + "learning_rate": 1.6018612611648928e-06, + "loss": 4.1943, + "step": 36310 + }, + { + "epoch": 2.9060649703952635, + "grad_norm": 3.657484769821167, + "learning_rate": 1.588490132106755e-06, + "loss": 4.1238, + "step": 36320 + }, + { + "epoch": 2.9068650984157465, + "grad_norm": 6.751424789428711, + "learning_rate": 1.5751190030486175e-06, + "loss": 4.3217, + "step": 36330 + }, + { + "epoch": 2.90766522643623, + "grad_norm": 6.126858711242676, + "learning_rate": 1.5617478739904798e-06, + "loss": 4.2568, + "step": 36340 + }, + { + "epoch": 2.908465354456713, + "grad_norm": 7.0460638999938965, + "learning_rate": 1.5483767449323423e-06, + "loss": 4.5145, + "step": 36350 + }, + { + "epoch": 2.9092654824771964, + "grad_norm": 5.848211765289307, + "learning_rate": 1.5350056158742046e-06, + "loss": 4.3336, + "step": 36360 + }, + { + "epoch": 2.9100656104976794, + "grad_norm": 3.3985276222229004, + "learning_rate": 1.5216344868160668e-06, + "loss": 4.1685, + "step": 36370 + }, + { + "epoch": 2.910865738518163, + "grad_norm": 4.115097522735596, + "learning_rate": 1.5082633577579293e-06, + "loss": 4.1931, + "step": 36380 + }, + { + "epoch": 2.9116658665386463, + "grad_norm": 4.848307132720947, + "learning_rate": 1.4948922286997916e-06, + "loss": 4.239, + "step": 36390 + }, + { + "epoch": 2.9124659945591294, + "grad_norm": 7.926689147949219, + "learning_rate": 1.4815210996416538e-06, + "loss": 4.1758, + 
"step": 36400 + }, + { + "epoch": 2.913266122579613, + "grad_norm": 4.529385089874268, + "learning_rate": 1.468149970583516e-06, + "loss": 4.3033, + "step": 36410 + }, + { + "epoch": 2.9140662506000963, + "grad_norm": 6.482027530670166, + "learning_rate": 1.4547788415253784e-06, + "loss": 4.228, + "step": 36420 + }, + { + "epoch": 2.9148663786205793, + "grad_norm": 8.809609413146973, + "learning_rate": 1.4414077124672408e-06, + "loss": 4.2323, + "step": 36430 + }, + { + "epoch": 2.9156665066410623, + "grad_norm": 6.071745872497559, + "learning_rate": 1.4280365834091031e-06, + "loss": 4.1434, + "step": 36440 + }, + { + "epoch": 2.9164666346615458, + "grad_norm": 10.558405876159668, + "learning_rate": 1.4146654543509654e-06, + "loss": 4.1691, + "step": 36450 + }, + { + "epoch": 2.9172667626820292, + "grad_norm": 5.350115776062012, + "learning_rate": 1.4012943252928279e-06, + "loss": 4.1962, + "step": 36460 + }, + { + "epoch": 2.9180668907025122, + "grad_norm": 5.859897136688232, + "learning_rate": 1.3879231962346901e-06, + "loss": 4.4039, + "step": 36470 + }, + { + "epoch": 2.9188670187229957, + "grad_norm": 5.368648052215576, + "learning_rate": 1.3745520671765524e-06, + "loss": 4.0889, + "step": 36480 + }, + { + "epoch": 2.919667146743479, + "grad_norm": 6.981392860412598, + "learning_rate": 1.3611809381184149e-06, + "loss": 4.0967, + "step": 36490 + }, + { + "epoch": 2.920467274763962, + "grad_norm": 10.068036079406738, + "learning_rate": 1.3478098090602771e-06, + "loss": 4.2863, + "step": 36500 + }, + { + "epoch": 2.9212674027844456, + "grad_norm": 6.100005149841309, + "learning_rate": 1.3344386800021394e-06, + "loss": 4.3266, + "step": 36510 + }, + { + "epoch": 2.9220675308049286, + "grad_norm": 4.14323616027832, + "learning_rate": 1.3210675509440019e-06, + "loss": 4.319, + "step": 36520 + }, + { + "epoch": 2.922867658825412, + "grad_norm": 5.502248287200928, + "learning_rate": 1.3076964218858641e-06, + "loss": 4.272, + "step": 36530 + }, + { + "epoch": 
2.923667786845895, + "grad_norm": 9.553668022155762, + "learning_rate": 1.2943252928277264e-06, + "loss": 4.2033, + "step": 36540 + }, + { + "epoch": 2.9244679148663786, + "grad_norm": 6.439156532287598, + "learning_rate": 1.2809541637695887e-06, + "loss": 4.1459, + "step": 36550 + }, + { + "epoch": 2.925268042886862, + "grad_norm": 6.06801700592041, + "learning_rate": 1.267583034711451e-06, + "loss": 4.1379, + "step": 36560 + }, + { + "epoch": 2.926068170907345, + "grad_norm": 6.407751083374023, + "learning_rate": 1.2542119056533134e-06, + "loss": 4.2743, + "step": 36570 + }, + { + "epoch": 2.9268682989278285, + "grad_norm": 5.329061985015869, + "learning_rate": 1.2408407765951757e-06, + "loss": 4.2889, + "step": 36580 + }, + { + "epoch": 2.927668426948312, + "grad_norm": 5.764720916748047, + "learning_rate": 1.2274696475370382e-06, + "loss": 4.411, + "step": 36590 + }, + { + "epoch": 2.928468554968795, + "grad_norm": 5.452577590942383, + "learning_rate": 1.2140985184789004e-06, + "loss": 4.2659, + "step": 36600 + }, + { + "epoch": 2.9292686829892784, + "grad_norm": 5.00178337097168, + "learning_rate": 1.2007273894207627e-06, + "loss": 4.4003, + "step": 36610 + }, + { + "epoch": 2.9300688110097615, + "grad_norm": 4.074296474456787, + "learning_rate": 1.1873562603626252e-06, + "loss": 4.2294, + "step": 36620 + }, + { + "epoch": 2.930868939030245, + "grad_norm": 4.745507717132568, + "learning_rate": 1.1739851313044874e-06, + "loss": 4.1708, + "step": 36630 + }, + { + "epoch": 2.931669067050728, + "grad_norm": 5.222048282623291, + "learning_rate": 1.1606140022463497e-06, + "loss": 4.0252, + "step": 36640 + }, + { + "epoch": 2.9324691950712114, + "grad_norm": 5.4653639793396, + "learning_rate": 1.1472428731882122e-06, + "loss": 4.0802, + "step": 36650 + }, + { + "epoch": 2.933269323091695, + "grad_norm": 5.444248676300049, + "learning_rate": 1.1338717441300745e-06, + "loss": 4.194, + "step": 36660 + }, + { + "epoch": 2.934069451112178, + "grad_norm": 
4.424251079559326, + "learning_rate": 1.1205006150719367e-06, + "loss": 4.121, + "step": 36670 + }, + { + "epoch": 2.9348695791326613, + "grad_norm": 5.136073112487793, + "learning_rate": 1.107129486013799e-06, + "loss": 4.1506, + "step": 36680 + }, + { + "epoch": 2.9356697071531443, + "grad_norm": 6.3223876953125, + "learning_rate": 1.0937583569556613e-06, + "loss": 4.2885, + "step": 36690 + }, + { + "epoch": 2.936469835173628, + "grad_norm": 8.229373931884766, + "learning_rate": 1.0803872278975237e-06, + "loss": 4.2838, + "step": 36700 + }, + { + "epoch": 2.937269963194111, + "grad_norm": 4.667328834533691, + "learning_rate": 1.067016098839386e-06, + "loss": 4.1594, + "step": 36710 + }, + { + "epoch": 2.9380700912145943, + "grad_norm": 11.4064302444458, + "learning_rate": 1.0536449697812483e-06, + "loss": 4.1362, + "step": 36720 + }, + { + "epoch": 2.9388702192350777, + "grad_norm": 11.03482437133789, + "learning_rate": 1.0402738407231107e-06, + "loss": 4.1872, + "step": 36730 + }, + { + "epoch": 2.9396703472555608, + "grad_norm": 5.653985977172852, + "learning_rate": 1.026902711664973e-06, + "loss": 4.221, + "step": 36740 + }, + { + "epoch": 2.940470475276044, + "grad_norm": 7.984028339385986, + "learning_rate": 1.0135315826068353e-06, + "loss": 4.1823, + "step": 36750 + }, + { + "epoch": 2.9412706032965277, + "grad_norm": 7.034099578857422, + "learning_rate": 1.0001604535486978e-06, + "loss": 4.2426, + "step": 36760 + }, + { + "epoch": 2.9420707313170107, + "grad_norm": 6.773263931274414, + "learning_rate": 9.8678932449056e-07, + "loss": 4.2967, + "step": 36770 + }, + { + "epoch": 2.942870859337494, + "grad_norm": 4.996553421020508, + "learning_rate": 9.734181954324225e-07, + "loss": 4.2479, + "step": 36780 + }, + { + "epoch": 2.943670987357977, + "grad_norm": 12.89632797241211, + "learning_rate": 9.600470663742848e-07, + "loss": 4.1419, + "step": 36790 + }, + { + "epoch": 2.9444711153784606, + "grad_norm": 4.559993743896484, + "learning_rate": 
9.46675937316147e-07, + "loss": 4.2202, + "step": 36800 + }, + { + "epoch": 2.9452712433989436, + "grad_norm": 5.565128803253174, + "learning_rate": 9.333048082580094e-07, + "loss": 4.3414, + "step": 36810 + }, + { + "epoch": 2.946071371419427, + "grad_norm": 4.633853912353516, + "learning_rate": 9.199336791998716e-07, + "loss": 4.3058, + "step": 36820 + }, + { + "epoch": 2.9468714994399106, + "grad_norm": 6.64238166809082, + "learning_rate": 9.065625501417339e-07, + "loss": 4.1815, + "step": 36830 + }, + { + "epoch": 2.9476716274603936, + "grad_norm": 4.056352615356445, + "learning_rate": 8.931914210835963e-07, + "loss": 4.3363, + "step": 36840 + }, + { + "epoch": 2.948471755480877, + "grad_norm": 5.0011491775512695, + "learning_rate": 8.798202920254586e-07, + "loss": 4.2357, + "step": 36850 + }, + { + "epoch": 2.9492718835013605, + "grad_norm": 6.450883865356445, + "learning_rate": 8.66449162967321e-07, + "loss": 4.2437, + "step": 36860 + }, + { + "epoch": 2.9500720115218435, + "grad_norm": 6.694143295288086, + "learning_rate": 8.530780339091833e-07, + "loss": 4.2564, + "step": 36870 + }, + { + "epoch": 2.9508721395423265, + "grad_norm": 4.477694511413574, + "learning_rate": 8.397069048510457e-07, + "loss": 4.2768, + "step": 36880 + }, + { + "epoch": 2.95167226756281, + "grad_norm": 5.470662593841553, + "learning_rate": 8.26335775792908e-07, + "loss": 4.3017, + "step": 36890 + }, + { + "epoch": 2.9524723955832934, + "grad_norm": 5.221342086791992, + "learning_rate": 8.129646467347703e-07, + "loss": 4.0872, + "step": 36900 + }, + { + "epoch": 2.9532725236037765, + "grad_norm": 6.306137561798096, + "learning_rate": 7.995935176766327e-07, + "loss": 4.2408, + "step": 36910 + }, + { + "epoch": 2.95407265162426, + "grad_norm": 6.915722370147705, + "learning_rate": 7.86222388618495e-07, + "loss": 4.3012, + "step": 36920 + }, + { + "epoch": 2.9548727796447434, + "grad_norm": 4.694388389587402, + "learning_rate": 7.728512595603572e-07, + "loss": 4.2886, + "step": 36930 + 
}, + { + "epoch": 2.9556729076652264, + "grad_norm": 4.7695136070251465, + "learning_rate": 7.594801305022196e-07, + "loss": 4.1293, + "step": 36940 + }, + { + "epoch": 2.95647303568571, + "grad_norm": 8.320331573486328, + "learning_rate": 7.46109001444082e-07, + "loss": 4.2172, + "step": 36950 + }, + { + "epoch": 2.957273163706193, + "grad_norm": 6.202345371246338, + "learning_rate": 7.327378723859443e-07, + "loss": 4.0253, + "step": 36960 + }, + { + "epoch": 2.9580732917266763, + "grad_norm": 8.366705894470215, + "learning_rate": 7.193667433278066e-07, + "loss": 4.1124, + "step": 36970 + }, + { + "epoch": 2.9588734197471593, + "grad_norm": 4.636302471160889, + "learning_rate": 7.05995614269669e-07, + "loss": 4.3183, + "step": 36980 + }, + { + "epoch": 2.959673547767643, + "grad_norm": 5.838320732116699, + "learning_rate": 6.926244852115314e-07, + "loss": 4.2729, + "step": 36990 + }, + { + "epoch": 2.9604736757881263, + "grad_norm": 5.318964004516602, + "learning_rate": 6.792533561533936e-07, + "loss": 4.2532, + "step": 37000 + }, + { + "epoch": 2.9612738038086093, + "grad_norm": 8.026263236999512, + "learning_rate": 6.658822270952559e-07, + "loss": 4.2332, + "step": 37010 + }, + { + "epoch": 2.9620739318290927, + "grad_norm": 5.9107747077941895, + "learning_rate": 6.525110980371183e-07, + "loss": 4.1986, + "step": 37020 + }, + { + "epoch": 2.962874059849576, + "grad_norm": 5.215660095214844, + "learning_rate": 6.391399689789807e-07, + "loss": 4.0658, + "step": 37030 + }, + { + "epoch": 2.963674187870059, + "grad_norm": 7.460134506225586, + "learning_rate": 6.257688399208429e-07, + "loss": 4.264, + "step": 37040 + }, + { + "epoch": 2.964474315890542, + "grad_norm": 7.57894229888916, + "learning_rate": 6.123977108627053e-07, + "loss": 4.0961, + "step": 37050 + }, + { + "epoch": 2.9652744439110257, + "grad_norm": 6.080920219421387, + "learning_rate": 5.990265818045677e-07, + "loss": 4.2651, + "step": 37060 + }, + { + "epoch": 2.966074571931509, + "grad_norm": 
8.295559883117676, + "learning_rate": 5.856554527464299e-07, + "loss": 4.2018, + "step": 37070 + }, + { + "epoch": 2.966874699951992, + "grad_norm": 8.348221778869629, + "learning_rate": 5.722843236882922e-07, + "loss": 4.3621, + "step": 37080 + }, + { + "epoch": 2.9676748279724756, + "grad_norm": 4.557129383087158, + "learning_rate": 5.589131946301546e-07, + "loss": 4.1681, + "step": 37090 + }, + { + "epoch": 2.968474955992959, + "grad_norm": 4.582211017608643, + "learning_rate": 5.455420655720169e-07, + "loss": 4.1408, + "step": 37100 + }, + { + "epoch": 2.969275084013442, + "grad_norm": 4.796433925628662, + "learning_rate": 5.321709365138793e-07, + "loss": 4.2535, + "step": 37110 + }, + { + "epoch": 2.9700752120339255, + "grad_norm": 7.60089111328125, + "learning_rate": 5.187998074557416e-07, + "loss": 4.1283, + "step": 37120 + }, + { + "epoch": 2.9708753400544086, + "grad_norm": 7.079959392547607, + "learning_rate": 5.05428678397604e-07, + "loss": 4.1379, + "step": 37130 + }, + { + "epoch": 2.971675468074892, + "grad_norm": 5.832749843597412, + "learning_rate": 4.920575493394662e-07, + "loss": 4.379, + "step": 37140 + }, + { + "epoch": 2.972475596095375, + "grad_norm": 5.663512229919434, + "learning_rate": 4.786864202813286e-07, + "loss": 4.2176, + "step": 37150 + }, + { + "epoch": 2.9732757241158585, + "grad_norm": 6.955112934112549, + "learning_rate": 4.653152912231909e-07, + "loss": 4.0786, + "step": 37160 + }, + { + "epoch": 2.974075852136342, + "grad_norm": 5.792449951171875, + "learning_rate": 4.5194416216505323e-07, + "loss": 4.1782, + "step": 37170 + }, + { + "epoch": 2.974875980156825, + "grad_norm": 4.567730903625488, + "learning_rate": 4.385730331069156e-07, + "loss": 4.0626, + "step": 37180 + }, + { + "epoch": 2.9756761081773084, + "grad_norm": 5.905571937561035, + "learning_rate": 4.252019040487779e-07, + "loss": 4.3109, + "step": 37190 + }, + { + "epoch": 2.976476236197792, + "grad_norm": 7.822843074798584, + "learning_rate": 
4.1183077499064024e-07, + "loss": 4.1584, + "step": 37200 + }, + { + "epoch": 2.977276364218275, + "grad_norm": 4.660794258117676, + "learning_rate": 3.984596459325025e-07, + "loss": 4.3121, + "step": 37210 + }, + { + "epoch": 2.9780764922387584, + "grad_norm": 4.471508979797363, + "learning_rate": 3.8508851687436493e-07, + "loss": 4.1769, + "step": 37220 + }, + { + "epoch": 2.9788766202592414, + "grad_norm": 6.368688106536865, + "learning_rate": 3.717173878162272e-07, + "loss": 4.2282, + "step": 37230 + }, + { + "epoch": 2.979676748279725, + "grad_norm": 31.647247314453125, + "learning_rate": 3.5834625875808957e-07, + "loss": 4.295, + "step": 37240 + }, + { + "epoch": 2.980476876300208, + "grad_norm": 5.0117950439453125, + "learning_rate": 3.449751296999519e-07, + "loss": 4.2265, + "step": 37250 + }, + { + "epoch": 2.9812770043206913, + "grad_norm": 7.659223556518555, + "learning_rate": 3.316040006418142e-07, + "loss": 4.2627, + "step": 37260 + }, + { + "epoch": 2.9820771323411748, + "grad_norm": 5.416043281555176, + "learning_rate": 3.1823287158367653e-07, + "loss": 4.1306, + "step": 37270 + }, + { + "epoch": 2.982877260361658, + "grad_norm": 5.165368556976318, + "learning_rate": 3.048617425255389e-07, + "loss": 4.3564, + "step": 37280 + }, + { + "epoch": 2.9836773883821412, + "grad_norm": 4.954187870025635, + "learning_rate": 2.914906134674012e-07, + "loss": 4.1988, + "step": 37290 + }, + { + "epoch": 2.9844775164026243, + "grad_norm": 4.00586462020874, + "learning_rate": 2.7811948440926354e-07, + "loss": 4.186, + "step": 37300 + }, + { + "epoch": 2.9852776444231077, + "grad_norm": 4.78800630569458, + "learning_rate": 2.6474835535112586e-07, + "loss": 4.0983, + "step": 37310 + }, + { + "epoch": 2.9860777724435907, + "grad_norm": 6.098124980926514, + "learning_rate": 2.513772262929882e-07, + "loss": 4.335, + "step": 37320 + }, + { + "epoch": 2.986877900464074, + "grad_norm": 8.325749397277832, + "learning_rate": 2.380060972348505e-07, + "loss": 4.1324, + "step": 
37330 + }, + { + "epoch": 2.9876780284845577, + "grad_norm": 4.86344051361084, + "learning_rate": 2.2463496817671285e-07, + "loss": 4.2644, + "step": 37340 + }, + { + "epoch": 2.9884781565050407, + "grad_norm": 6.780296325683594, + "learning_rate": 2.112638391185752e-07, + "loss": 4.3815, + "step": 37350 + }, + { + "epoch": 2.989278284525524, + "grad_norm": 5.512997150421143, + "learning_rate": 1.9789271006043754e-07, + "loss": 4.1606, + "step": 37360 + }, + { + "epoch": 2.9900784125460076, + "grad_norm": 5.321970462799072, + "learning_rate": 1.8452158100229986e-07, + "loss": 4.0704, + "step": 37370 + }, + { + "epoch": 2.9908785405664906, + "grad_norm": 5.405054569244385, + "learning_rate": 1.7115045194416218e-07, + "loss": 4.215, + "step": 37380 + }, + { + "epoch": 2.991678668586974, + "grad_norm": 7.2463274002075195, + "learning_rate": 1.577793228860245e-07, + "loss": 4.2474, + "step": 37390 + }, + { + "epoch": 2.992478796607457, + "grad_norm": 5.009868621826172, + "learning_rate": 1.4440819382788684e-07, + "loss": 4.4354, + "step": 37400 + }, + { + "epoch": 2.9932789246279405, + "grad_norm": 5.4555206298828125, + "learning_rate": 1.3103706476974916e-07, + "loss": 4.3358, + "step": 37410 + }, + { + "epoch": 2.9940790526484236, + "grad_norm": 4.64158296585083, + "learning_rate": 1.176659357116115e-07, + "loss": 4.1267, + "step": 37420 + }, + { + "epoch": 2.994879180668907, + "grad_norm": 5.860431671142578, + "learning_rate": 1.0429480665347382e-07, + "loss": 4.4098, + "step": 37430 + }, + { + "epoch": 2.9956793086893905, + "grad_norm": 5.745077133178711, + "learning_rate": 9.092367759533615e-08, + "loss": 4.255, + "step": 37440 + }, + { + "epoch": 2.9964794367098735, + "grad_norm": 5.062090873718262, + "learning_rate": 7.755254853719848e-08, + "loss": 4.2798, + "step": 37450 + }, + { + "epoch": 2.997279564730357, + "grad_norm": 4.622069358825684, + "learning_rate": 6.418141947906081e-08, + "loss": 4.1738, + "step": 37460 + }, + { + "epoch": 2.9980796927508404, + 
"grad_norm": 5.84435510635376, + "learning_rate": 5.0810290420923146e-08, + "loss": 4.2221, + "step": 37470 + }, + { + "epoch": 2.9988798207713234, + "grad_norm": 4.550410747528076, + "learning_rate": 3.743916136278548e-08, + "loss": 4.3505, + "step": 37480 + }, + { + "epoch": 2.9996799487918064, + "grad_norm": 4.685887813568115, + "learning_rate": 2.4068032304647808e-08, + "loss": 4.0673, + "step": 37490 + }, + { + "epoch": 3.0, + "step": 37494, + "total_flos": 0.0, + "train_loss": 4.986996560858022, + "train_runtime": 84970.2993, + "train_samples_per_second": 7.06, + "train_steps_per_second": 0.441 + } + ], + "logging_steps": 10, + "max_steps": 37494, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 7000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 16, + "trial_name": null, + "trial_params": null +}