{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.1201792286765881,
"eval_steps": 7000,
"global_step": 14000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 9.306169509887695,
"eval_runtime": 10.9126,
"eval_samples_per_second": 3.665,
"eval_steps_per_second": 0.458,
"step": 0
},
{
"epoch": 0.0008001280204832773,
"grad_norm": 8.51533031463623,
"learning_rate": 3.5000000000000004e-06,
"loss": 8.786,
"step": 10
},
{
"epoch": 0.0016002560409665546,
"grad_norm": 10.90935230255127,
"learning_rate": 8.500000000000002e-06,
"loss": 8.3433,
"step": 20
},
{
"epoch": 0.002400384061449832,
"grad_norm": 7.269016265869141,
"learning_rate": 1.3500000000000001e-05,
"loss": 7.549,
"step": 30
},
{
"epoch": 0.003200512081933109,
"grad_norm": 8.790578842163086,
"learning_rate": 1.85e-05,
"loss": 7.2574,
"step": 40
},
{
"epoch": 0.004000640102416387,
"grad_norm": 6.52068567276001,
"learning_rate": 2.35e-05,
"loss": 7.0024,
"step": 50
},
{
"epoch": 0.004800768122899664,
"grad_norm": 6.902959823608398,
"learning_rate": 2.8499999999999998e-05,
"loss": 6.9074,
"step": 60
},
{
"epoch": 0.005600896143382941,
"grad_norm": 5.350945949554443,
"learning_rate": 3.35e-05,
"loss": 6.8765,
"step": 70
},
{
"epoch": 0.006401024163866218,
"grad_norm": 5.928489685058594,
"learning_rate": 3.85e-05,
"loss": 6.5663,
"step": 80
},
{
"epoch": 0.007201152184349496,
"grad_norm": 9.222543716430664,
"learning_rate": 4.35e-05,
"loss": 6.6131,
"step": 90
},
{
"epoch": 0.008001280204832774,
"grad_norm": 6.57027006149292,
"learning_rate": 4.85e-05,
"loss": 6.5829,
"step": 100
},
{
"epoch": 0.00880140822531605,
"grad_norm": 5.280848503112793,
"learning_rate": 4.999064020965931e-05,
"loss": 6.5996,
"step": 110
},
{
"epoch": 0.009601536245799328,
"grad_norm": 5.950971603393555,
"learning_rate": 4.997726908060117e-05,
"loss": 6.6075,
"step": 120
},
{
"epoch": 0.010401664266282605,
"grad_norm": 4.300549507141113,
"learning_rate": 4.996389795154303e-05,
"loss": 6.5074,
"step": 130
},
{
"epoch": 0.011201792286765882,
"grad_norm": 4.824333190917969,
"learning_rate": 4.9950526822484896e-05,
"loss": 6.6072,
"step": 140
},
{
"epoch": 0.01200192030724916,
"grad_norm": 5.4324116706848145,
"learning_rate": 4.993715569342676e-05,
"loss": 6.6183,
"step": 150
},
{
"epoch": 0.012802048327732437,
"grad_norm": 4.087579250335693,
"learning_rate": 4.992378456436862e-05,
"loss": 6.4806,
"step": 160
},
{
"epoch": 0.013602176348215714,
"grad_norm": 7.260207653045654,
"learning_rate": 4.9910413435310484e-05,
"loss": 6.3709,
"step": 170
},
{
"epoch": 0.014402304368698993,
"grad_norm": 4.145061016082764,
"learning_rate": 4.9897042306252346e-05,
"loss": 6.2951,
"step": 180
},
{
"epoch": 0.01520243238918227,
"grad_norm": 3.2026450634002686,
"learning_rate": 4.98836711771942e-05,
"loss": 6.3255,
"step": 190
},
{
"epoch": 0.016002560409665547,
"grad_norm": 3.443145751953125,
"learning_rate": 4.9870300048136065e-05,
"loss": 6.4894,
"step": 200
},
{
"epoch": 0.016802688430148822,
"grad_norm": 5.324231147766113,
"learning_rate": 4.985692891907793e-05,
"loss": 6.4312,
"step": 210
},
{
"epoch": 0.0176028164506321,
"grad_norm": 3.2833452224731445,
"learning_rate": 4.984355779001979e-05,
"loss": 6.513,
"step": 220
},
{
"epoch": 0.018402944471115377,
"grad_norm": 3.8984358310699463,
"learning_rate": 4.983018666096165e-05,
"loss": 6.1683,
"step": 230
},
{
"epoch": 0.019203072491598656,
"grad_norm": 4.183676719665527,
"learning_rate": 4.9816815531903516e-05,
"loss": 6.329,
"step": 240
},
{
"epoch": 0.020003200512081935,
"grad_norm": 3.136693239212036,
"learning_rate": 4.980344440284538e-05,
"loss": 6.466,
"step": 250
},
{
"epoch": 0.02080332853256521,
"grad_norm": 4.185967445373535,
"learning_rate": 4.979007327378724e-05,
"loss": 6.4613,
"step": 260
},
{
"epoch": 0.02160345655304849,
"grad_norm": 3.105653762817383,
"learning_rate": 4.9776702144729104e-05,
"loss": 6.3596,
"step": 270
},
{
"epoch": 0.022403584573531764,
"grad_norm": 3.927561044692993,
"learning_rate": 4.9763331015670967e-05,
"loss": 6.2604,
"step": 280
},
{
"epoch": 0.023203712594015043,
"grad_norm": 3.513439178466797,
"learning_rate": 4.974995988661283e-05,
"loss": 6.2747,
"step": 290
},
{
"epoch": 0.02400384061449832,
"grad_norm": 3.07377290725708,
"learning_rate": 4.973658875755469e-05,
"loss": 6.202,
"step": 300
},
{
"epoch": 0.024803968634981598,
"grad_norm": 3.045619249343872,
"learning_rate": 4.9723217628496555e-05,
"loss": 6.1022,
"step": 310
},
{
"epoch": 0.025604096655464873,
"grad_norm": 3.330648183822632,
"learning_rate": 4.970984649943842e-05,
"loss": 6.1544,
"step": 320
},
{
"epoch": 0.026404224675948152,
"grad_norm": 3.0299668312072754,
"learning_rate": 4.969647537038028e-05,
"loss": 6.3119,
"step": 330
},
{
"epoch": 0.027204352696431428,
"grad_norm": 3.687938928604126,
"learning_rate": 4.9683104241322136e-05,
"loss": 6.333,
"step": 340
},
{
"epoch": 0.028004480716914706,
"grad_norm": 4.0919413566589355,
"learning_rate": 4.9669733112264e-05,
"loss": 6.1711,
"step": 350
},
{
"epoch": 0.028804608737397985,
"grad_norm": 3.1327242851257324,
"learning_rate": 4.965636198320586e-05,
"loss": 6.3365,
"step": 360
},
{
"epoch": 0.02960473675788126,
"grad_norm": 4.531859874725342,
"learning_rate": 4.9642990854147724e-05,
"loss": 6.2121,
"step": 370
},
{
"epoch": 0.03040486477836454,
"grad_norm": 2.522672414779663,
"learning_rate": 4.962961972508959e-05,
"loss": 6.2388,
"step": 380
},
{
"epoch": 0.031204992798847815,
"grad_norm": 5.62153959274292,
"learning_rate": 4.961624859603145e-05,
"loss": 6.168,
"step": 390
},
{
"epoch": 0.032005120819331094,
"grad_norm": 3.522804021835327,
"learning_rate": 4.960287746697331e-05,
"loss": 6.1207,
"step": 400
},
{
"epoch": 0.03280524883981437,
"grad_norm": 7.260324478149414,
"learning_rate": 4.9589506337915175e-05,
"loss": 6.31,
"step": 410
},
{
"epoch": 0.033605376860297645,
"grad_norm": 4.309441566467285,
"learning_rate": 4.957613520885704e-05,
"loss": 6.1107,
"step": 420
},
{
"epoch": 0.034405504880780924,
"grad_norm": 3.2409913539886475,
"learning_rate": 4.95627640797989e-05,
"loss": 6.2082,
"step": 430
},
{
"epoch": 0.0352056329012642,
"grad_norm": 3.9414610862731934,
"learning_rate": 4.954939295074076e-05,
"loss": 6.2102,
"step": 440
},
{
"epoch": 0.03600576092174748,
"grad_norm": 2.441235303878784,
"learning_rate": 4.9536021821682626e-05,
"loss": 6.1023,
"step": 450
},
{
"epoch": 0.036805888942230754,
"grad_norm": 2.997591972351074,
"learning_rate": 4.952265069262449e-05,
"loss": 6.1147,
"step": 460
},
{
"epoch": 0.03760601696271403,
"grad_norm": 3.950436592102051,
"learning_rate": 4.950927956356635e-05,
"loss": 6.0725,
"step": 470
},
{
"epoch": 0.03840614498319731,
"grad_norm": 3.4340896606445312,
"learning_rate": 4.9495908434508214e-05,
"loss": 6.1336,
"step": 480
},
{
"epoch": 0.03920627300368059,
"grad_norm": 3.28839373588562,
"learning_rate": 4.948253730545007e-05,
"loss": 6.1709,
"step": 490
},
{
"epoch": 0.04000640102416387,
"grad_norm": 2.976365566253662,
"learning_rate": 4.946916617639193e-05,
"loss": 6.2074,
"step": 500
},
{
"epoch": 0.04080652904464714,
"grad_norm": 4.156027793884277,
"learning_rate": 4.9455795047333795e-05,
"loss": 6.1694,
"step": 510
},
{
"epoch": 0.04160665706513042,
"grad_norm": 3.4855797290802,
"learning_rate": 4.944242391827566e-05,
"loss": 6.1218,
"step": 520
},
{
"epoch": 0.0424067850856137,
"grad_norm": 4.489185333251953,
"learning_rate": 4.942905278921752e-05,
"loss": 6.1507,
"step": 530
},
{
"epoch": 0.04320691310609698,
"grad_norm": 3.2751166820526123,
"learning_rate": 4.941568166015938e-05,
"loss": 6.1055,
"step": 540
},
{
"epoch": 0.04400704112658025,
"grad_norm": 2.4234585762023926,
"learning_rate": 4.9402310531101246e-05,
"loss": 6.1755,
"step": 550
},
{
"epoch": 0.04480716914706353,
"grad_norm": 3.4436991214752197,
"learning_rate": 4.938893940204311e-05,
"loss": 6.1882,
"step": 560
},
{
"epoch": 0.04560729716754681,
"grad_norm": 3.3731908798217773,
"learning_rate": 4.937556827298497e-05,
"loss": 6.0648,
"step": 570
},
{
"epoch": 0.04640742518803009,
"grad_norm": 3.8733670711517334,
"learning_rate": 4.9362197143926834e-05,
"loss": 6.0621,
"step": 580
},
{
"epoch": 0.04720755320851336,
"grad_norm": 4.126636505126953,
"learning_rate": 4.9348826014868696e-05,
"loss": 6.122,
"step": 590
},
{
"epoch": 0.04800768122899664,
"grad_norm": 3.8605775833129883,
"learning_rate": 4.933545488581056e-05,
"loss": 5.9788,
"step": 600
},
{
"epoch": 0.048807809249479917,
"grad_norm": 2.9509966373443604,
"learning_rate": 4.932208375675242e-05,
"loss": 6.2045,
"step": 610
},
{
"epoch": 0.049607937269963195,
"grad_norm": 4.4266510009765625,
"learning_rate": 4.9308712627694285e-05,
"loss": 5.9981,
"step": 620
},
{
"epoch": 0.050408065290446474,
"grad_norm": 2.79042649269104,
"learning_rate": 4.929534149863615e-05,
"loss": 6.1882,
"step": 630
},
{
"epoch": 0.051208193310929746,
"grad_norm": 2.8986568450927734,
"learning_rate": 4.928197036957801e-05,
"loss": 6.1739,
"step": 640
},
{
"epoch": 0.052008321331413025,
"grad_norm": 4.294217586517334,
"learning_rate": 4.926859924051987e-05,
"loss": 6.0566,
"step": 650
},
{
"epoch": 0.052808449351896304,
"grad_norm": 8.848836898803711,
"learning_rate": 4.9255228111461735e-05,
"loss": 6.2994,
"step": 660
},
{
"epoch": 0.05360857737237958,
"grad_norm": 3.2204337120056152,
"learning_rate": 4.92418569824036e-05,
"loss": 6.0573,
"step": 670
},
{
"epoch": 0.054408705392862855,
"grad_norm": 4.775251865386963,
"learning_rate": 4.922848585334546e-05,
"loss": 5.9764,
"step": 680
},
{
"epoch": 0.055208833413346134,
"grad_norm": 3.5426905155181885,
"learning_rate": 4.921511472428732e-05,
"loss": 6.0402,
"step": 690
},
{
"epoch": 0.05600896143382941,
"grad_norm": 10.72481632232666,
"learning_rate": 4.9201743595229186e-05,
"loss": 6.0024,
"step": 700
},
{
"epoch": 0.05680908945431269,
"grad_norm": 2.441681385040283,
"learning_rate": 4.918837246617105e-05,
"loss": 6.1122,
"step": 710
},
{
"epoch": 0.05760921747479597,
"grad_norm": 3.375319480895996,
"learning_rate": 4.917500133711291e-05,
"loss": 6.058,
"step": 720
},
{
"epoch": 0.05840934549527924,
"grad_norm": 2.821507453918457,
"learning_rate": 4.9161630208054774e-05,
"loss": 6.0586,
"step": 730
},
{
"epoch": 0.05920947351576252,
"grad_norm": 2.8658957481384277,
"learning_rate": 4.914825907899664e-05,
"loss": 6.0115,
"step": 740
},
{
"epoch": 0.0600096015362458,
"grad_norm": 2.239774227142334,
"learning_rate": 4.91348879499385e-05,
"loss": 6.0669,
"step": 750
},
{
"epoch": 0.06080972955672908,
"grad_norm": 3.5249900817871094,
"learning_rate": 4.912151682088036e-05,
"loss": 6.1013,
"step": 760
},
{
"epoch": 0.06160985757721235,
"grad_norm": 2.790356159210205,
"learning_rate": 4.9108145691822225e-05,
"loss": 6.0099,
"step": 770
},
{
"epoch": 0.06240998559769563,
"grad_norm": 3.0729963779449463,
"learning_rate": 4.909477456276409e-05,
"loss": 6.1376,
"step": 780
},
{
"epoch": 0.06321011361817891,
"grad_norm": 2.9490275382995605,
"learning_rate": 4.908140343370595e-05,
"loss": 6.1457,
"step": 790
},
{
"epoch": 0.06401024163866219,
"grad_norm": 2.7475438117980957,
"learning_rate": 4.9068032304647806e-05,
"loss": 6.0041,
"step": 800
},
{
"epoch": 0.06481036965914547,
"grad_norm": 2.755703926086426,
"learning_rate": 4.905466117558967e-05,
"loss": 6.0242,
"step": 810
},
{
"epoch": 0.06561049767962875,
"grad_norm": 2.724515676498413,
"learning_rate": 4.904129004653153e-05,
"loss": 6.1827,
"step": 820
},
{
"epoch": 0.06641062570011202,
"grad_norm": 4.498260974884033,
"learning_rate": 4.9027918917473394e-05,
"loss": 6.0892,
"step": 830
},
{
"epoch": 0.06721075372059529,
"grad_norm": 2.4399070739746094,
"learning_rate": 4.901454778841526e-05,
"loss": 6.0197,
"step": 840
},
{
"epoch": 0.06801088174107857,
"grad_norm": 2.7584304809570312,
"learning_rate": 4.900117665935712e-05,
"loss": 5.9056,
"step": 850
},
{
"epoch": 0.06881100976156185,
"grad_norm": 2.8177144527435303,
"learning_rate": 4.898780553029898e-05,
"loss": 6.1484,
"step": 860
},
{
"epoch": 0.06961113778204513,
"grad_norm": 4.181133270263672,
"learning_rate": 4.8974434401240845e-05,
"loss": 5.9376,
"step": 870
},
{
"epoch": 0.0704112658025284,
"grad_norm": 3.677849769592285,
"learning_rate": 4.896106327218271e-05,
"loss": 6.0403,
"step": 880
},
{
"epoch": 0.07121139382301168,
"grad_norm": 3.1553192138671875,
"learning_rate": 4.894769214312457e-05,
"loss": 6.0488,
"step": 890
},
{
"epoch": 0.07201152184349496,
"grad_norm": 3.2580947875976562,
"learning_rate": 4.893432101406643e-05,
"loss": 6.1002,
"step": 900
},
{
"epoch": 0.07281164986397824,
"grad_norm": 6.328150749206543,
"learning_rate": 4.8920949885008296e-05,
"loss": 6.0225,
"step": 910
},
{
"epoch": 0.07361177788446151,
"grad_norm": 2.7467615604400635,
"learning_rate": 4.890757875595016e-05,
"loss": 5.9622,
"step": 920
},
{
"epoch": 0.07441190590494479,
"grad_norm": 2.86570405960083,
"learning_rate": 4.889420762689202e-05,
"loss": 5.9718,
"step": 930
},
{
"epoch": 0.07521203392542807,
"grad_norm": 2.544917106628418,
"learning_rate": 4.8880836497833884e-05,
"loss": 5.8697,
"step": 940
},
{
"epoch": 0.07601216194591134,
"grad_norm": 2.5245840549468994,
"learning_rate": 4.8867465368775746e-05,
"loss": 5.9973,
"step": 950
},
{
"epoch": 0.07681228996639462,
"grad_norm": 3.6830902099609375,
"learning_rate": 4.88540942397176e-05,
"loss": 5.943,
"step": 960
},
{
"epoch": 0.0776124179868779,
"grad_norm": 2.6643354892730713,
"learning_rate": 4.8840723110659465e-05,
"loss": 5.8958,
"step": 970
},
{
"epoch": 0.07841254600736118,
"grad_norm": 6.4623565673828125,
"learning_rate": 4.882735198160133e-05,
"loss": 6.0236,
"step": 980
},
{
"epoch": 0.07921267402784446,
"grad_norm": 2.186974048614502,
"learning_rate": 4.881398085254319e-05,
"loss": 6.0481,
"step": 990
},
{
"epoch": 0.08001280204832774,
"grad_norm": 2.4983859062194824,
"learning_rate": 4.880060972348505e-05,
"loss": 6.075,
"step": 1000
},
{
"epoch": 0.080812930068811,
"grad_norm": 2.778280258178711,
"learning_rate": 4.8787238594426916e-05,
"loss": 6.0757,
"step": 1010
},
{
"epoch": 0.08161305808929428,
"grad_norm": 2.706965923309326,
"learning_rate": 4.877386746536878e-05,
"loss": 6.1504,
"step": 1020
},
{
"epoch": 0.08241318610977756,
"grad_norm": 3.4069600105285645,
"learning_rate": 4.876049633631064e-05,
"loss": 6.0889,
"step": 1030
},
{
"epoch": 0.08321331413026084,
"grad_norm": 3.179551124572754,
"learning_rate": 4.8747125207252504e-05,
"loss": 6.0057,
"step": 1040
},
{
"epoch": 0.08401344215074412,
"grad_norm": 2.924018383026123,
"learning_rate": 4.873375407819437e-05,
"loss": 5.8406,
"step": 1050
},
{
"epoch": 0.0848135701712274,
"grad_norm": 3.103912115097046,
"learning_rate": 4.872038294913623e-05,
"loss": 6.0351,
"step": 1060
},
{
"epoch": 0.08561369819171068,
"grad_norm": 2.8037219047546387,
"learning_rate": 4.870701182007809e-05,
"loss": 6.0272,
"step": 1070
},
{
"epoch": 0.08641382621219396,
"grad_norm": 2.477062940597534,
"learning_rate": 4.8693640691019955e-05,
"loss": 5.9269,
"step": 1080
},
{
"epoch": 0.08721395423267723,
"grad_norm": 2.748488187789917,
"learning_rate": 4.868026956196182e-05,
"loss": 5.943,
"step": 1090
},
{
"epoch": 0.0880140822531605,
"grad_norm": 3.3991920948028564,
"learning_rate": 4.866689843290368e-05,
"loss": 6.1455,
"step": 1100
},
{
"epoch": 0.08881421027364378,
"grad_norm": 3.208509683609009,
"learning_rate": 4.8653527303845536e-05,
"loss": 5.9746,
"step": 1110
},
{
"epoch": 0.08961433829412706,
"grad_norm": 3.3378469944000244,
"learning_rate": 4.86401561747874e-05,
"loss": 5.9185,
"step": 1120
},
{
"epoch": 0.09041446631461034,
"grad_norm": 2.269606113433838,
"learning_rate": 4.862678504572926e-05,
"loss": 5.9369,
"step": 1130
},
{
"epoch": 0.09121459433509362,
"grad_norm": 2.749335765838623,
"learning_rate": 4.8613413916671124e-05,
"loss": 6.0648,
"step": 1140
},
{
"epoch": 0.0920147223555769,
"grad_norm": 2.821913480758667,
"learning_rate": 4.860004278761299e-05,
"loss": 5.952,
"step": 1150
},
{
"epoch": 0.09281485037606017,
"grad_norm": 2.640990734100342,
"learning_rate": 4.858667165855485e-05,
"loss": 6.0537,
"step": 1160
},
{
"epoch": 0.09361497839654345,
"grad_norm": 3.570896625518799,
"learning_rate": 4.857330052949671e-05,
"loss": 5.7721,
"step": 1170
},
{
"epoch": 0.09441510641702672,
"grad_norm": 3.245318651199341,
"learning_rate": 4.8559929400438575e-05,
"loss": 5.7305,
"step": 1180
},
{
"epoch": 0.09521523443751,
"grad_norm": 4.075076580047607,
"learning_rate": 4.854655827138044e-05,
"loss": 5.974,
"step": 1190
},
{
"epoch": 0.09601536245799328,
"grad_norm": 2.429893732070923,
"learning_rate": 4.85331871423223e-05,
"loss": 5.7828,
"step": 1200
},
{
"epoch": 0.09681549047847655,
"grad_norm": 2.7077040672302246,
"learning_rate": 4.851981601326416e-05,
"loss": 5.9143,
"step": 1210
},
{
"epoch": 0.09761561849895983,
"grad_norm": 2.767918586730957,
"learning_rate": 4.8506444884206026e-05,
"loss": 5.9449,
"step": 1220
},
{
"epoch": 0.09841574651944311,
"grad_norm": 2.4544034004211426,
"learning_rate": 4.849307375514789e-05,
"loss": 6.0034,
"step": 1230
},
{
"epoch": 0.09921587453992639,
"grad_norm": 5.215607643127441,
"learning_rate": 4.847970262608975e-05,
"loss": 5.867,
"step": 1240
},
{
"epoch": 0.10001600256040967,
"grad_norm": 2.7856080532073975,
"learning_rate": 4.8466331497031614e-05,
"loss": 6.0213,
"step": 1250
},
{
"epoch": 0.10081613058089295,
"grad_norm": 2.5528719425201416,
"learning_rate": 4.8452960367973476e-05,
"loss": 5.9634,
"step": 1260
},
{
"epoch": 0.10161625860137621,
"grad_norm": 2.4917409420013428,
"learning_rate": 4.843958923891533e-05,
"loss": 5.887,
"step": 1270
},
{
"epoch": 0.10241638662185949,
"grad_norm": 6.125699520111084,
"learning_rate": 4.8426218109857195e-05,
"loss": 6.1189,
"step": 1280
},
{
"epoch": 0.10321651464234277,
"grad_norm": 2.783156156539917,
"learning_rate": 4.841284698079906e-05,
"loss": 5.9064,
"step": 1290
},
{
"epoch": 0.10401664266282605,
"grad_norm": 3.611070156097412,
"learning_rate": 4.839947585174092e-05,
"loss": 5.9405,
"step": 1300
},
{
"epoch": 0.10481677068330933,
"grad_norm": 4.296909809112549,
"learning_rate": 4.838610472268278e-05,
"loss": 5.9067,
"step": 1310
},
{
"epoch": 0.10561689870379261,
"grad_norm": 2.4273040294647217,
"learning_rate": 4.8372733593624646e-05,
"loss": 5.888,
"step": 1320
},
{
"epoch": 0.10641702672427589,
"grad_norm": 2.6499924659729004,
"learning_rate": 4.835936246456651e-05,
"loss": 5.9683,
"step": 1330
},
{
"epoch": 0.10721715474475917,
"grad_norm": 3.1474297046661377,
"learning_rate": 4.834599133550837e-05,
"loss": 5.8946,
"step": 1340
},
{
"epoch": 0.10801728276524244,
"grad_norm": 3.5050199031829834,
"learning_rate": 4.8332620206450234e-05,
"loss": 5.9179,
"step": 1350
},
{
"epoch": 0.10881741078572571,
"grad_norm": 2.693700075149536,
"learning_rate": 4.8319249077392096e-05,
"loss": 5.7965,
"step": 1360
},
{
"epoch": 0.10961753880620899,
"grad_norm": 2.8202953338623047,
"learning_rate": 4.830587794833396e-05,
"loss": 5.9526,
"step": 1370
},
{
"epoch": 0.11041766682669227,
"grad_norm": 2.514862060546875,
"learning_rate": 4.829250681927582e-05,
"loss": 5.936,
"step": 1380
},
{
"epoch": 0.11121779484717555,
"grad_norm": 3.18804931640625,
"learning_rate": 4.8279135690217685e-05,
"loss": 5.9246,
"step": 1390
},
{
"epoch": 0.11201792286765883,
"grad_norm": 2.77697491645813,
"learning_rate": 4.826576456115955e-05,
"loss": 5.9576,
"step": 1400
},
{
"epoch": 0.1128180508881421,
"grad_norm": 2.762524127960205,
"learning_rate": 4.825239343210141e-05,
"loss": 5.9085,
"step": 1410
},
{
"epoch": 0.11361817890862538,
"grad_norm": 2.4407670497894287,
"learning_rate": 4.8239022303043266e-05,
"loss": 5.9518,
"step": 1420
},
{
"epoch": 0.11441830692910866,
"grad_norm": 3.1036713123321533,
"learning_rate": 4.822565117398513e-05,
"loss": 5.8412,
"step": 1430
},
{
"epoch": 0.11521843494959194,
"grad_norm": 3.319058418273926,
"learning_rate": 4.821228004492699e-05,
"loss": 5.9733,
"step": 1440
},
{
"epoch": 0.1160185629700752,
"grad_norm": 2.13468599319458,
"learning_rate": 4.8198908915868854e-05,
"loss": 5.9193,
"step": 1450
},
{
"epoch": 0.11681869099055849,
"grad_norm": 2.6057028770446777,
"learning_rate": 4.8185537786810717e-05,
"loss": 5.9807,
"step": 1460
},
{
"epoch": 0.11761881901104176,
"grad_norm": 2.7509753704071045,
"learning_rate": 4.817216665775258e-05,
"loss": 5.9534,
"step": 1470
},
{
"epoch": 0.11841894703152504,
"grad_norm": 2.111055850982666,
"learning_rate": 4.815879552869444e-05,
"loss": 5.9207,
"step": 1480
},
{
"epoch": 0.11921907505200832,
"grad_norm": 2.5271990299224854,
"learning_rate": 4.8145424399636305e-05,
"loss": 5.7148,
"step": 1490
},
{
"epoch": 0.1200192030724916,
"grad_norm": 2.814138174057007,
"learning_rate": 4.813205327057817e-05,
"loss": 5.9498,
"step": 1500
},
{
"epoch": 0.12081933109297488,
"grad_norm": 3.449355363845825,
"learning_rate": 4.811868214152003e-05,
"loss": 5.7814,
"step": 1510
},
{
"epoch": 0.12161945911345816,
"grad_norm": 2.813746213912964,
"learning_rate": 4.810531101246189e-05,
"loss": 5.9517,
"step": 1520
},
{
"epoch": 0.12241958713394142,
"grad_norm": 2.529242753982544,
"learning_rate": 4.8091939883403755e-05,
"loss": 5.8227,
"step": 1530
},
{
"epoch": 0.1232197151544247,
"grad_norm": 2.2425034046173096,
"learning_rate": 4.807856875434562e-05,
"loss": 6.1064,
"step": 1540
},
{
"epoch": 0.12401984317490798,
"grad_norm": 2.7732784748077393,
"learning_rate": 4.806519762528748e-05,
"loss": 5.8888,
"step": 1550
},
{
"epoch": 0.12481997119539126,
"grad_norm": 2.5558009147644043,
"learning_rate": 4.8051826496229343e-05,
"loss": 5.8185,
"step": 1560
},
{
"epoch": 0.12562009921587455,
"grad_norm": 2.884411096572876,
"learning_rate": 4.8038455367171206e-05,
"loss": 6.0534,
"step": 1570
},
{
"epoch": 0.12642022723635782,
"grad_norm": 2.5747668743133545,
"learning_rate": 4.802508423811307e-05,
"loss": 5.8186,
"step": 1580
},
{
"epoch": 0.12722035525684108,
"grad_norm": 2.324767827987671,
"learning_rate": 4.801171310905493e-05,
"loss": 5.8642,
"step": 1590
},
{
"epoch": 0.12802048327732438,
"grad_norm": 2.2255160808563232,
"learning_rate": 4.7998341979996794e-05,
"loss": 5.8559,
"step": 1600
},
{
"epoch": 0.12882061129780764,
"grad_norm": 2.97525954246521,
"learning_rate": 4.798497085093866e-05,
"loss": 5.8744,
"step": 1610
},
{
"epoch": 0.12962073931829093,
"grad_norm": 2.23962664604187,
"learning_rate": 4.797159972188052e-05,
"loss": 5.7545,
"step": 1620
},
{
"epoch": 0.1304208673387742,
"grad_norm": 3.6182124614715576,
"learning_rate": 4.795822859282238e-05,
"loss": 5.8872,
"step": 1630
},
{
"epoch": 0.1312209953592575,
"grad_norm": 4.068545341491699,
"learning_rate": 4.7944857463764245e-05,
"loss": 5.9008,
"step": 1640
},
{
"epoch": 0.13202112337974076,
"grad_norm": 3.627082109451294,
"learning_rate": 4.793148633470611e-05,
"loss": 5.8215,
"step": 1650
},
{
"epoch": 0.13282125140022405,
"grad_norm": 3.0080721378326416,
"learning_rate": 4.791811520564797e-05,
"loss": 5.9086,
"step": 1660
},
{
"epoch": 0.13362137942070731,
"grad_norm": 2.5463860034942627,
"learning_rate": 4.790474407658983e-05,
"loss": 5.776,
"step": 1670
},
{
"epoch": 0.13442150744119058,
"grad_norm": 2.212488889694214,
"learning_rate": 4.7891372947531696e-05,
"loss": 6.006,
"step": 1680
},
{
"epoch": 0.13522163546167387,
"grad_norm": 4.147563934326172,
"learning_rate": 4.787800181847356e-05,
"loss": 5.886,
"step": 1690
},
{
"epoch": 0.13602176348215714,
"grad_norm": 2.6021018028259277,
"learning_rate": 4.786463068941542e-05,
"loss": 5.9182,
"step": 1700
},
{
"epoch": 0.13682189150264043,
"grad_norm": 2.3109893798828125,
"learning_rate": 4.7851259560357284e-05,
"loss": 5.8084,
"step": 1710
},
{
"epoch": 0.1376220195231237,
"grad_norm": 2.8678529262542725,
"learning_rate": 4.7837888431299147e-05,
"loss": 6.0363,
"step": 1720
},
{
"epoch": 0.138422147543607,
"grad_norm": 2.1921958923339844,
"learning_rate": 4.7824517302241e-05,
"loss": 5.7667,
"step": 1730
},
{
"epoch": 0.13922227556409025,
"grad_norm": 2.6883316040039062,
"learning_rate": 4.7811146173182865e-05,
"loss": 5.7906,
"step": 1740
},
{
"epoch": 0.14002240358457352,
"grad_norm": 2.4079957008361816,
"learning_rate": 4.779777504412473e-05,
"loss": 5.7698,
"step": 1750
},
{
"epoch": 0.1408225316050568,
"grad_norm": 4.29390287399292,
"learning_rate": 4.778440391506659e-05,
"loss": 5.9639,
"step": 1760
},
{
"epoch": 0.14162265962554008,
"grad_norm": 4.133132457733154,
"learning_rate": 4.777103278600845e-05,
"loss": 6.0901,
"step": 1770
},
{
"epoch": 0.14242278764602337,
"grad_norm": 3.871561288833618,
"learning_rate": 4.7757661656950316e-05,
"loss": 5.7455,
"step": 1780
},
{
"epoch": 0.14322291566650663,
"grad_norm": 4.266111850738525,
"learning_rate": 4.774429052789218e-05,
"loss": 5.9971,
"step": 1790
},
{
"epoch": 0.14402304368698993,
"grad_norm": 2.9000513553619385,
"learning_rate": 4.773091939883404e-05,
"loss": 5.9025,
"step": 1800
},
{
"epoch": 0.1448231717074732,
"grad_norm": 2.549964189529419,
"learning_rate": 4.7717548269775904e-05,
"loss": 5.768,
"step": 1810
},
{
"epoch": 0.14562329972795648,
"grad_norm": 2.2882704734802246,
"learning_rate": 4.770417714071777e-05,
"loss": 6.022,
"step": 1820
},
{
"epoch": 0.14642342774843975,
"grad_norm": 2.6501784324645996,
"learning_rate": 4.769080601165963e-05,
"loss": 5.8539,
"step": 1830
},
{
"epoch": 0.14722355576892301,
"grad_norm": 2.3417108058929443,
"learning_rate": 4.767743488260149e-05,
"loss": 5.7734,
"step": 1840
},
{
"epoch": 0.1480236837894063,
"grad_norm": 2.2151668071746826,
"learning_rate": 4.7664063753543355e-05,
"loss": 5.84,
"step": 1850
},
{
"epoch": 0.14882381180988957,
"grad_norm": 3.114260196685791,
"learning_rate": 4.765069262448522e-05,
"loss": 5.9409,
"step": 1860
},
{
"epoch": 0.14962393983037287,
"grad_norm": 2.4931910037994385,
"learning_rate": 4.763732149542708e-05,
"loss": 5.9396,
"step": 1870
},
{
"epoch": 0.15042406785085613,
"grad_norm": 3.736487865447998,
"learning_rate": 4.7623950366368936e-05,
"loss": 5.7427,
"step": 1880
},
{
"epoch": 0.15122419587133942,
"grad_norm": 4.730785846710205,
"learning_rate": 4.76105792373108e-05,
"loss": 5.9181,
"step": 1890
},
{
"epoch": 0.1520243238918227,
"grad_norm": 2.9264132976531982,
"learning_rate": 4.759720810825266e-05,
"loss": 5.8967,
"step": 1900
},
{
"epoch": 0.15282445191230598,
"grad_norm": 3.2538132667541504,
"learning_rate": 4.7583836979194524e-05,
"loss": 5.8459,
"step": 1910
},
{
"epoch": 0.15362457993278925,
"grad_norm": 2.7208549976348877,
"learning_rate": 4.757046585013639e-05,
"loss": 5.7038,
"step": 1920
},
{
"epoch": 0.1544247079532725,
"grad_norm": 2.7510788440704346,
"learning_rate": 4.755709472107825e-05,
"loss": 5.8524,
"step": 1930
},
{
"epoch": 0.1552248359737558,
"grad_norm": 2.6565892696380615,
"learning_rate": 4.754372359202011e-05,
"loss": 5.6324,
"step": 1940
},
{
"epoch": 0.15602496399423907,
"grad_norm": 2.954798936843872,
"learning_rate": 4.7530352462961975e-05,
"loss": 5.8388,
"step": 1950
},
{
"epoch": 0.15682509201472236,
"grad_norm": 2.291714668273926,
"learning_rate": 4.751698133390384e-05,
"loss": 5.7504,
"step": 1960
},
{
"epoch": 0.15762522003520563,
"grad_norm": 2.1387598514556885,
"learning_rate": 4.75036102048457e-05,
"loss": 5.7556,
"step": 1970
},
{
"epoch": 0.15842534805568892,
"grad_norm": 2.290407180786133,
"learning_rate": 4.749023907578756e-05,
"loss": 5.7089,
"step": 1980
},
{
"epoch": 0.15922547607617218,
"grad_norm": 2.852696657180786,
"learning_rate": 4.7476867946729426e-05,
"loss": 5.8656,
"step": 1990
},
{
"epoch": 0.16002560409665548,
"grad_norm": 2.8190526962280273,
"learning_rate": 4.746349681767129e-05,
"loss": 6.0134,
"step": 2000
},
{
"epoch": 0.16082573211713874,
"grad_norm": 2.705008029937744,
"learning_rate": 4.745012568861315e-05,
"loss": 5.8713,
"step": 2010
},
{
"epoch": 0.161625860137622,
"grad_norm": 3.571394205093384,
"learning_rate": 4.7436754559555014e-05,
"loss": 5.8329,
"step": 2020
},
{
"epoch": 0.1624259881581053,
"grad_norm": 2.687455177307129,
"learning_rate": 4.7423383430496876e-05,
"loss": 5.8355,
"step": 2030
},
{
"epoch": 0.16322611617858857,
"grad_norm": 2.6158690452575684,
"learning_rate": 4.741001230143873e-05,
"loss": 5.6938,
"step": 2040
},
{
"epoch": 0.16402624419907186,
"grad_norm": 2.9657154083251953,
"learning_rate": 4.7396641172380595e-05,
"loss": 5.7514,
"step": 2050
},
{
"epoch": 0.16482637221955512,
"grad_norm": 2.310607433319092,
"learning_rate": 4.738327004332246e-05,
"loss": 5.7397,
"step": 2060
},
{
"epoch": 0.16562650024003842,
"grad_norm": 2.855271339416504,
"learning_rate": 4.736989891426432e-05,
"loss": 5.7645,
"step": 2070
},
{
"epoch": 0.16642662826052168,
"grad_norm": 2.778768301010132,
"learning_rate": 4.735652778520618e-05,
"loss": 5.9582,
"step": 2080
},
{
"epoch": 0.16722675628100497,
"grad_norm": 3.069973945617676,
"learning_rate": 4.7343156656148046e-05,
"loss": 5.8205,
"step": 2090
},
{
"epoch": 0.16802688430148824,
"grad_norm": 3.5799551010131836,
"learning_rate": 4.732978552708991e-05,
"loss": 5.9001,
"step": 2100
},
{
"epoch": 0.1688270123219715,
"grad_norm": 2.556668758392334,
"learning_rate": 4.731641439803177e-05,
"loss": 5.7258,
"step": 2110
},
{
"epoch": 0.1696271403424548,
"grad_norm": 2.7847707271575928,
"learning_rate": 4.7303043268973634e-05,
"loss": 5.9007,
"step": 2120
},
{
"epoch": 0.17042726836293806,
"grad_norm": 4.071508407592773,
"learning_rate": 4.7289672139915496e-05,
"loss": 5.7035,
"step": 2130
},
{
"epoch": 0.17122739638342135,
"grad_norm": 2.6188418865203857,
"learning_rate": 4.727630101085736e-05,
"loss": 5.651,
"step": 2140
},
{
"epoch": 0.17202752440390462,
"grad_norm": 1.952249526977539,
"learning_rate": 4.726292988179922e-05,
"loss": 6.1107,
"step": 2150
},
{
"epoch": 0.1728276524243879,
"grad_norm": 2.299018144607544,
"learning_rate": 4.7249558752741085e-05,
"loss": 5.7609,
"step": 2160
},
{
"epoch": 0.17362778044487118,
"grad_norm": 2.5578439235687256,
"learning_rate": 4.723618762368295e-05,
"loss": 5.792,
"step": 2170
},
{
"epoch": 0.17442790846535447,
"grad_norm": 3.9921529293060303,
"learning_rate": 4.722281649462481e-05,
"loss": 5.7233,
"step": 2180
},
{
"epoch": 0.17522803648583773,
"grad_norm": 2.5521302223205566,
"learning_rate": 4.7209445365566666e-05,
"loss": 5.807,
"step": 2190
},
{
"epoch": 0.176028164506321,
"grad_norm": 2.71401047706604,
"learning_rate": 4.719607423650853e-05,
"loss": 5.6689,
"step": 2200
},
{
"epoch": 0.1768282925268043,
"grad_norm": 3.782607316970825,
"learning_rate": 4.718270310745039e-05,
"loss": 5.734,
"step": 2210
},
{
"epoch": 0.17762842054728756,
"grad_norm": 2.57356333732605,
"learning_rate": 4.7169331978392254e-05,
"loss": 5.8101,
"step": 2220
},
{
"epoch": 0.17842854856777085,
"grad_norm": 2.7005815505981445,
"learning_rate": 4.715596084933412e-05,
"loss": 6.0603,
"step": 2230
},
{
"epoch": 0.17922867658825412,
"grad_norm": 2.081550359725952,
"learning_rate": 4.714258972027598e-05,
"loss": 5.7677,
"step": 2240
},
{
"epoch": 0.1800288046087374,
"grad_norm": 3.6565728187561035,
"learning_rate": 4.712921859121784e-05,
"loss": 5.9672,
"step": 2250
},
{
"epoch": 0.18082893262922067,
"grad_norm": 2.4702320098876953,
"learning_rate": 4.7115847462159705e-05,
"loss": 5.8397,
"step": 2260
},
{
"epoch": 0.18162906064970397,
"grad_norm": 3.335736036300659,
"learning_rate": 4.710247633310157e-05,
"loss": 5.7021,
"step": 2270
},
{
"epoch": 0.18242918867018723,
"grad_norm": 3.3939075469970703,
"learning_rate": 4.708910520404343e-05,
"loss": 5.8464,
"step": 2280
},
{
"epoch": 0.1832293166906705,
"grad_norm": 2.4869279861450195,
"learning_rate": 4.707573407498529e-05,
"loss": 5.6904,
"step": 2290
},
{
"epoch": 0.1840294447111538,
"grad_norm": 2.4240360260009766,
"learning_rate": 4.7062362945927155e-05,
"loss": 5.7227,
"step": 2300
},
{
"epoch": 0.18482957273163705,
"grad_norm": 2.428786039352417,
"learning_rate": 4.704899181686902e-05,
"loss": 5.8295,
"step": 2310
},
{
"epoch": 0.18562970075212035,
"grad_norm": 3.3214187622070312,
"learning_rate": 4.703562068781088e-05,
"loss": 5.8341,
"step": 2320
},
{
"epoch": 0.1864298287726036,
"grad_norm": 3.2146456241607666,
"learning_rate": 4.7022249558752744e-05,
"loss": 5.7217,
"step": 2330
},
{
"epoch": 0.1872299567930869,
"grad_norm": 4.442914009094238,
"learning_rate": 4.7008878429694606e-05,
"loss": 5.9003,
"step": 2340
},
{
"epoch": 0.18803008481357017,
"grad_norm": 1.9268267154693604,
"learning_rate": 4.699550730063646e-05,
"loss": 5.8292,
"step": 2350
},
{
"epoch": 0.18883021283405343,
"grad_norm": 3.130021095275879,
"learning_rate": 4.6982136171578325e-05,
"loss": 5.6864,
"step": 2360
},
{
"epoch": 0.18963034085453673,
"grad_norm": 2.8835690021514893,
"learning_rate": 4.696876504252019e-05,
"loss": 5.829,
"step": 2370
},
{
"epoch": 0.19043046887502,
"grad_norm": 2.4171135425567627,
"learning_rate": 4.695539391346205e-05,
"loss": 5.7972,
"step": 2380
},
{
"epoch": 0.19123059689550329,
"grad_norm": 3.782817840576172,
"learning_rate": 4.694202278440391e-05,
"loss": 5.8497,
"step": 2390
},
{
"epoch": 0.19203072491598655,
"grad_norm": 2.475249767303467,
"learning_rate": 4.6928651655345776e-05,
"loss": 5.9237,
"step": 2400
},
{
"epoch": 0.19283085293646984,
"grad_norm": 2.5809242725372314,
"learning_rate": 4.691528052628764e-05,
"loss": 5.7756,
"step": 2410
},
{
"epoch": 0.1936309809569531,
"grad_norm": 2.6922059059143066,
"learning_rate": 4.69019093972295e-05,
"loss": 5.9326,
"step": 2420
},
{
"epoch": 0.1944311089774364,
"grad_norm": 2.7542431354522705,
"learning_rate": 4.6888538268171364e-05,
"loss": 5.6279,
"step": 2430
},
{
"epoch": 0.19523123699791967,
"grad_norm": 2.4063303470611572,
"learning_rate": 4.6875167139113226e-05,
"loss": 5.91,
"step": 2440
},
{
"epoch": 0.19603136501840293,
"grad_norm": 4.855547904968262,
"learning_rate": 4.686179601005509e-05,
"loss": 5.7286,
"step": 2450
},
{
"epoch": 0.19683149303888622,
"grad_norm": 2.9875595569610596,
"learning_rate": 4.684842488099695e-05,
"loss": 5.8299,
"step": 2460
},
{
"epoch": 0.1976316210593695,
"grad_norm": 4.467639923095703,
"learning_rate": 4.6835053751938814e-05,
"loss": 5.8469,
"step": 2470
},
{
"epoch": 0.19843174907985278,
"grad_norm": 2.2144124507904053,
"learning_rate": 4.682168262288068e-05,
"loss": 5.7871,
"step": 2480
},
{
"epoch": 0.19923187710033605,
"grad_norm": 2.4507012367248535,
"learning_rate": 4.680831149382254e-05,
"loss": 5.7529,
"step": 2490
},
{
"epoch": 0.20003200512081934,
"grad_norm": 2.208648681640625,
"learning_rate": 4.67949403647644e-05,
"loss": 5.7265,
"step": 2500
},
{
"epoch": 0.2008321331413026,
"grad_norm": 2.560302257537842,
"learning_rate": 4.6781569235706265e-05,
"loss": 5.7842,
"step": 2510
},
{
"epoch": 0.2016322611617859,
"grad_norm": 2.354292154312134,
"learning_rate": 4.676819810664813e-05,
"loss": 5.8468,
"step": 2520
},
{
"epoch": 0.20243238918226916,
"grad_norm": 2.9559860229492188,
"learning_rate": 4.675482697758999e-05,
"loss": 5.7003,
"step": 2530
},
{
"epoch": 0.20323251720275243,
"grad_norm": 3.251077651977539,
"learning_rate": 4.674145584853185e-05,
"loss": 5.8129,
"step": 2540
},
{
"epoch": 0.20403264522323572,
"grad_norm": 2.7863471508026123,
"learning_rate": 4.6728084719473716e-05,
"loss": 5.6814,
"step": 2550
},
{
"epoch": 0.20483277324371899,
"grad_norm": 2.9006989002227783,
"learning_rate": 4.671471359041558e-05,
"loss": 5.8292,
"step": 2560
},
{
"epoch": 0.20563290126420228,
"grad_norm": 2.930689573287964,
"learning_rate": 4.670134246135744e-05,
"loss": 5.8825,
"step": 2570
},
{
"epoch": 0.20643302928468554,
"grad_norm": 2.3105032444000244,
"learning_rate": 4.6687971332299304e-05,
"loss": 5.7039,
"step": 2580
},
{
"epoch": 0.20723315730516884,
"grad_norm": 3.1141879558563232,
"learning_rate": 4.667460020324117e-05,
"loss": 5.8692,
"step": 2590
},
{
"epoch": 0.2080332853256521,
"grad_norm": 3.5017199516296387,
"learning_rate": 4.666122907418303e-05,
"loss": 5.7922,
"step": 2600
},
{
"epoch": 0.2088334133461354,
"grad_norm": 2.657975912094116,
"learning_rate": 4.664785794512489e-05,
"loss": 5.7736,
"step": 2610
},
{
"epoch": 0.20963354136661866,
"grad_norm": 3.246952772140503,
"learning_rate": 4.6634486816066755e-05,
"loss": 5.768,
"step": 2620
},
{
"epoch": 0.21043366938710192,
"grad_norm": 6.832335948944092,
"learning_rate": 4.662111568700862e-05,
"loss": 5.6752,
"step": 2630
},
{
"epoch": 0.21123379740758522,
"grad_norm": 3.2479753494262695,
"learning_rate": 4.660774455795048e-05,
"loss": 5.8015,
"step": 2640
},
{
"epoch": 0.21203392542806848,
"grad_norm": 2.809082508087158,
"learning_rate": 4.659437342889234e-05,
"loss": 5.8663,
"step": 2650
},
{
"epoch": 0.21283405344855177,
"grad_norm": 3.7948036193847656,
"learning_rate": 4.65810022998342e-05,
"loss": 5.889,
"step": 2660
},
{
"epoch": 0.21363418146903504,
"grad_norm": 2.836090564727783,
"learning_rate": 4.656763117077606e-05,
"loss": 5.7516,
"step": 2670
},
{
"epoch": 0.21443430948951833,
"grad_norm": 3.0940232276916504,
"learning_rate": 4.6554260041717924e-05,
"loss": 5.7033,
"step": 2680
},
{
"epoch": 0.2152344375100016,
"grad_norm": 2.436757802963257,
"learning_rate": 4.654088891265979e-05,
"loss": 5.746,
"step": 2690
},
{
"epoch": 0.2160345655304849,
"grad_norm": 2.4339609146118164,
"learning_rate": 4.652751778360165e-05,
"loss": 5.828,
"step": 2700
},
{
"epoch": 0.21683469355096816,
"grad_norm": 2.379366874694824,
"learning_rate": 4.651414665454351e-05,
"loss": 5.719,
"step": 2710
},
{
"epoch": 0.21763482157145142,
"grad_norm": 2.1722371578216553,
"learning_rate": 4.6500775525485375e-05,
"loss": 5.7875,
"step": 2720
},
{
"epoch": 0.2184349495919347,
"grad_norm": 3.633279800415039,
"learning_rate": 4.648740439642724e-05,
"loss": 5.802,
"step": 2730
},
{
"epoch": 0.21923507761241798,
"grad_norm": 2.4091219902038574,
"learning_rate": 4.64740332673691e-05,
"loss": 5.8197,
"step": 2740
},
{
"epoch": 0.22003520563290127,
"grad_norm": 2.7289021015167236,
"learning_rate": 4.646066213831096e-05,
"loss": 5.9445,
"step": 2750
},
{
"epoch": 0.22083533365338454,
"grad_norm": 2.376481294631958,
"learning_rate": 4.6447291009252826e-05,
"loss": 5.9943,
"step": 2760
},
{
"epoch": 0.22163546167386783,
"grad_norm": 2.6542563438415527,
"learning_rate": 4.643391988019469e-05,
"loss": 5.6049,
"step": 2770
},
{
"epoch": 0.2224355896943511,
"grad_norm": 2.320472240447998,
"learning_rate": 4.642054875113655e-05,
"loss": 5.7637,
"step": 2780
},
{
"epoch": 0.2232357177148344,
"grad_norm": 2.8923239707946777,
"learning_rate": 4.6407177622078414e-05,
"loss": 5.9666,
"step": 2790
},
{
"epoch": 0.22403584573531765,
"grad_norm": 4.277271270751953,
"learning_rate": 4.6393806493020276e-05,
"loss": 5.8393,
"step": 2800
},
{
"epoch": 0.22483597375580092,
"grad_norm": 2.797428607940674,
"learning_rate": 4.638043536396213e-05,
"loss": 5.759,
"step": 2810
},
{
"epoch": 0.2256361017762842,
"grad_norm": 2.1849517822265625,
"learning_rate": 4.6367064234903995e-05,
"loss": 5.7514,
"step": 2820
},
{
"epoch": 0.22643622979676747,
"grad_norm": 2.8607492446899414,
"learning_rate": 4.635369310584586e-05,
"loss": 5.7545,
"step": 2830
},
{
"epoch": 0.22723635781725077,
"grad_norm": 3.722041130065918,
"learning_rate": 4.634032197678772e-05,
"loss": 5.8011,
"step": 2840
},
{
"epoch": 0.22803648583773403,
"grad_norm": 2.8563833236694336,
"learning_rate": 4.632695084772958e-05,
"loss": 5.8569,
"step": 2850
},
{
"epoch": 0.22883661385821732,
"grad_norm": 3.5724806785583496,
"learning_rate": 4.6313579718671446e-05,
"loss": 5.9649,
"step": 2860
},
{
"epoch": 0.2296367418787006,
"grad_norm": 2.380469560623169,
"learning_rate": 4.630020858961331e-05,
"loss": 5.7467,
"step": 2870
},
{
"epoch": 0.23043686989918388,
"grad_norm": 3.1629838943481445,
"learning_rate": 4.628683746055517e-05,
"loss": 5.642,
"step": 2880
},
{
"epoch": 0.23123699791966715,
"grad_norm": 2.1239373683929443,
"learning_rate": 4.6273466331497034e-05,
"loss": 5.6483,
"step": 2890
},
{
"epoch": 0.2320371259401504,
"grad_norm": 3.049079418182373,
"learning_rate": 4.6260095202438897e-05,
"loss": 5.9736,
"step": 2900
},
{
"epoch": 0.2328372539606337,
"grad_norm": 2.556830406188965,
"learning_rate": 4.624672407338076e-05,
"loss": 5.6037,
"step": 2910
},
{
"epoch": 0.23363738198111697,
"grad_norm": 2.8762035369873047,
"learning_rate": 4.623335294432262e-05,
"loss": 5.6345,
"step": 2920
},
{
"epoch": 0.23443751000160026,
"grad_norm": 2.11167573928833,
"learning_rate": 4.6219981815264485e-05,
"loss": 5.7822,
"step": 2930
},
{
"epoch": 0.23523763802208353,
"grad_norm": 4.623869895935059,
"learning_rate": 4.620661068620635e-05,
"loss": 5.7063,
"step": 2940
},
{
"epoch": 0.23603776604256682,
"grad_norm": 2.4420578479766846,
"learning_rate": 4.619323955714821e-05,
"loss": 5.686,
"step": 2950
},
{
"epoch": 0.2368378940630501,
"grad_norm": 2.6543869972229004,
"learning_rate": 4.617986842809007e-05,
"loss": 5.7802,
"step": 2960
},
{
"epoch": 0.23763802208353338,
"grad_norm": 2.6264312267303467,
"learning_rate": 4.616649729903193e-05,
"loss": 5.6667,
"step": 2970
},
{
"epoch": 0.23843815010401664,
"grad_norm": 2.4579195976257324,
"learning_rate": 4.615312616997379e-05,
"loss": 5.6738,
"step": 2980
},
{
"epoch": 0.2392382781244999,
"grad_norm": 2.299448251724243,
"learning_rate": 4.6139755040915654e-05,
"loss": 5.8622,
"step": 2990
},
{
"epoch": 0.2400384061449832,
"grad_norm": 3.6527328491210938,
"learning_rate": 4.612638391185752e-05,
"loss": 5.6346,
"step": 3000
},
{
"epoch": 0.24083853416546647,
"grad_norm": 2.217876434326172,
"learning_rate": 4.611301278279938e-05,
"loss": 5.7892,
"step": 3010
},
{
"epoch": 0.24163866218594976,
"grad_norm": 3.500544309616089,
"learning_rate": 4.609964165374124e-05,
"loss": 5.8026,
"step": 3020
},
{
"epoch": 0.24243879020643302,
"grad_norm": 3.1694483757019043,
"learning_rate": 4.6086270524683105e-05,
"loss": 5.827,
"step": 3030
},
{
"epoch": 0.24323891822691632,
"grad_norm": 2.899625778198242,
"learning_rate": 4.607289939562497e-05,
"loss": 5.7384,
"step": 3040
},
{
"epoch": 0.24403904624739958,
"grad_norm": 2.8286776542663574,
"learning_rate": 4.605952826656683e-05,
"loss": 5.7629,
"step": 3050
},
{
"epoch": 0.24483917426788285,
"grad_norm": 2.7585489749908447,
"learning_rate": 4.604615713750869e-05,
"loss": 5.7462,
"step": 3060
},
{
"epoch": 0.24563930228836614,
"grad_norm": 2.2017667293548584,
"learning_rate": 4.6032786008450555e-05,
"loss": 5.844,
"step": 3070
},
{
"epoch": 0.2464394303088494,
"grad_norm": 4.679725170135498,
"learning_rate": 4.601941487939242e-05,
"loss": 5.7254,
"step": 3080
},
{
"epoch": 0.2472395583293327,
"grad_norm": 2.923884868621826,
"learning_rate": 4.600604375033428e-05,
"loss": 5.703,
"step": 3090
},
{
"epoch": 0.24803968634981596,
"grad_norm": 2.2205090522766113,
"learning_rate": 4.5992672621276144e-05,
"loss": 5.7185,
"step": 3100
},
{
"epoch": 0.24883981437029926,
"grad_norm": 2.852313280105591,
"learning_rate": 4.5979301492218006e-05,
"loss": 5.5653,
"step": 3110
},
{
"epoch": 0.24963994239078252,
"grad_norm": 2.7683911323547363,
"learning_rate": 4.596593036315986e-05,
"loss": 5.7262,
"step": 3120
},
{
"epoch": 0.2504400704112658,
"grad_norm": 3.1315665245056152,
"learning_rate": 4.5952559234101725e-05,
"loss": 5.7524,
"step": 3130
},
{
"epoch": 0.2512401984317491,
"grad_norm": 2.5233592987060547,
"learning_rate": 4.593918810504359e-05,
"loss": 5.7443,
"step": 3140
},
{
"epoch": 0.25204032645223234,
"grad_norm": 2.3802831172943115,
"learning_rate": 4.592581697598545e-05,
"loss": 5.8091,
"step": 3150
},
{
"epoch": 0.25284045447271564,
"grad_norm": 2.378218412399292,
"learning_rate": 4.591244584692731e-05,
"loss": 5.7741,
"step": 3160
},
{
"epoch": 0.25364058249319893,
"grad_norm": 4.712483882904053,
"learning_rate": 4.5899074717869176e-05,
"loss": 5.8643,
"step": 3170
},
{
"epoch": 0.25444071051368217,
"grad_norm": 2.798752784729004,
"learning_rate": 4.588570358881104e-05,
"loss": 5.7984,
"step": 3180
},
{
"epoch": 0.25524083853416546,
"grad_norm": 2.302037477493286,
"learning_rate": 4.58723324597529e-05,
"loss": 5.6548,
"step": 3190
},
{
"epoch": 0.25604096655464875,
"grad_norm": 2.8621273040771484,
"learning_rate": 4.5858961330694764e-05,
"loss": 5.6875,
"step": 3200
},
{
"epoch": 0.25684109457513205,
"grad_norm": 2.9079480171203613,
"learning_rate": 4.5845590201636626e-05,
"loss": 5.8801,
"step": 3210
},
{
"epoch": 0.2576412225956153,
"grad_norm": 2.9576847553253174,
"learning_rate": 4.583221907257849e-05,
"loss": 5.6646,
"step": 3220
},
{
"epoch": 0.2584413506160986,
"grad_norm": 4.085951805114746,
"learning_rate": 4.581884794352035e-05,
"loss": 5.9078,
"step": 3230
},
{
"epoch": 0.25924147863658187,
"grad_norm": 2.622903347015381,
"learning_rate": 4.5805476814462214e-05,
"loss": 5.6821,
"step": 3240
},
{
"epoch": 0.2600416066570651,
"grad_norm": 1.794255256652832,
"learning_rate": 4.579210568540408e-05,
"loss": 5.751,
"step": 3250
},
{
"epoch": 0.2608417346775484,
"grad_norm": 3.074042558670044,
"learning_rate": 4.577873455634594e-05,
"loss": 5.7864,
"step": 3260
},
{
"epoch": 0.2616418626980317,
"grad_norm": 2.3138844966888428,
"learning_rate": 4.57653634272878e-05,
"loss": 5.693,
"step": 3270
},
{
"epoch": 0.262441990718515,
"grad_norm": 3.8877549171447754,
"learning_rate": 4.5751992298229665e-05,
"loss": 5.7154,
"step": 3280
},
{
"epoch": 0.2632421187389982,
"grad_norm": 2.9623680114746094,
"learning_rate": 4.573862116917153e-05,
"loss": 5.7514,
"step": 3290
},
{
"epoch": 0.2640422467594815,
"grad_norm": 2.840122938156128,
"learning_rate": 4.572525004011339e-05,
"loss": 5.7397,
"step": 3300
},
{
"epoch": 0.2648423747799648,
"grad_norm": 2.9699277877807617,
"learning_rate": 4.571187891105525e-05,
"loss": 5.7626,
"step": 3310
},
{
"epoch": 0.2656425028004481,
"grad_norm": 2.6493773460388184,
"learning_rate": 4.5698507781997116e-05,
"loss": 5.7619,
"step": 3320
},
{
"epoch": 0.26644263082093134,
"grad_norm": 2.283259868621826,
"learning_rate": 4.568513665293898e-05,
"loss": 5.8409,
"step": 3330
},
{
"epoch": 0.26724275884141463,
"grad_norm": 1.9254164695739746,
"learning_rate": 4.567176552388084e-05,
"loss": 5.8218,
"step": 3340
},
{
"epoch": 0.2680428868618979,
"grad_norm": 2.382345676422119,
"learning_rate": 4.5658394394822704e-05,
"loss": 5.6865,
"step": 3350
},
{
"epoch": 0.26884301488238116,
"grad_norm": 2.6039271354675293,
"learning_rate": 4.564502326576457e-05,
"loss": 5.7254,
"step": 3360
},
{
"epoch": 0.26964314290286445,
"grad_norm": 2.0948996543884277,
"learning_rate": 4.563165213670643e-05,
"loss": 5.7589,
"step": 3370
},
{
"epoch": 0.27044327092334774,
"grad_norm": 2.939955711364746,
"learning_rate": 4.561828100764829e-05,
"loss": 5.8298,
"step": 3380
},
{
"epoch": 0.27124339894383104,
"grad_norm": 2.748307466506958,
"learning_rate": 4.5604909878590155e-05,
"loss": 5.8505,
"step": 3390
},
{
"epoch": 0.2720435269643143,
"grad_norm": 2.7122459411621094,
"learning_rate": 4.559153874953202e-05,
"loss": 5.9027,
"step": 3400
},
{
"epoch": 0.27284365498479757,
"grad_norm": 3.6053593158721924,
"learning_rate": 4.557816762047388e-05,
"loss": 5.6746,
"step": 3410
},
{
"epoch": 0.27364378300528086,
"grad_norm": 4.433299541473389,
"learning_rate": 4.556479649141574e-05,
"loss": 5.7713,
"step": 3420
},
{
"epoch": 0.2744439110257641,
"grad_norm": 2.5253539085388184,
"learning_rate": 4.55514253623576e-05,
"loss": 5.8219,
"step": 3430
},
{
"epoch": 0.2752440390462474,
"grad_norm": 4.9358062744140625,
"learning_rate": 4.553805423329946e-05,
"loss": 5.7971,
"step": 3440
},
{
"epoch": 0.2760441670667307,
"grad_norm": 2.6247594356536865,
"learning_rate": 4.5524683104241324e-05,
"loss": 5.1528,
"step": 3450
},
{
"epoch": 0.276844295087214,
"grad_norm": 2.8152048587799072,
"learning_rate": 4.551131197518319e-05,
"loss": 5.7955,
"step": 3460
},
{
"epoch": 0.2776444231076972,
"grad_norm": 2.143275499343872,
"learning_rate": 4.549794084612505e-05,
"loss": 5.6875,
"step": 3470
},
{
"epoch": 0.2784445511281805,
"grad_norm": 2.9896023273468018,
"learning_rate": 4.548456971706691e-05,
"loss": 5.7981,
"step": 3480
},
{
"epoch": 0.2792446791486638,
"grad_norm": 3.5231759548187256,
"learning_rate": 4.5471198588008775e-05,
"loss": 5.7343,
"step": 3490
},
{
"epoch": 0.28004480716914704,
"grad_norm": 2.391721487045288,
"learning_rate": 4.545782745895064e-05,
"loss": 5.6821,
"step": 3500
},
{
"epoch": 0.28084493518963033,
"grad_norm": 2.414992332458496,
"learning_rate": 4.54444563298925e-05,
"loss": 5.7357,
"step": 3510
},
{
"epoch": 0.2816450632101136,
"grad_norm": 2.7502214908599854,
"learning_rate": 4.543108520083436e-05,
"loss": 5.6511,
"step": 3520
},
{
"epoch": 0.2824451912305969,
"grad_norm": 2.1601436138153076,
"learning_rate": 4.5417714071776226e-05,
"loss": 5.6249,
"step": 3530
},
{
"epoch": 0.28324531925108015,
"grad_norm": 2.89013671875,
"learning_rate": 4.540434294271809e-05,
"loss": 5.7583,
"step": 3540
},
{
"epoch": 0.28404544727156344,
"grad_norm": 2.4915778636932373,
"learning_rate": 4.539097181365995e-05,
"loss": 5.6957,
"step": 3550
},
{
"epoch": 0.28484557529204674,
"grad_norm": 5.053386688232422,
"learning_rate": 4.5377600684601814e-05,
"loss": 5.632,
"step": 3560
},
{
"epoch": 0.28564570331253003,
"grad_norm": 2.6207687854766846,
"learning_rate": 4.5364229555543676e-05,
"loss": 5.8514,
"step": 3570
},
{
"epoch": 0.28644583133301327,
"grad_norm": 4.157670497894287,
"learning_rate": 4.535085842648553e-05,
"loss": 5.7608,
"step": 3580
},
{
"epoch": 0.28724595935349656,
"grad_norm": 3.4464797973632812,
"learning_rate": 4.5337487297427395e-05,
"loss": 5.6737,
"step": 3590
},
{
"epoch": 0.28804608737397985,
"grad_norm": 4.255002498626709,
"learning_rate": 4.532411616836926e-05,
"loss": 5.7977,
"step": 3600
},
{
"epoch": 0.2888462153944631,
"grad_norm": 2.7926547527313232,
"learning_rate": 4.531074503931112e-05,
"loss": 5.6891,
"step": 3610
},
{
"epoch": 0.2896463434149464,
"grad_norm": 3.150400400161743,
"learning_rate": 4.529737391025298e-05,
"loss": 5.7931,
"step": 3620
},
{
"epoch": 0.2904464714354297,
"grad_norm": 2.1223199367523193,
"learning_rate": 4.5284002781194846e-05,
"loss": 5.8646,
"step": 3630
},
{
"epoch": 0.29124659945591297,
"grad_norm": 3.950665235519409,
"learning_rate": 4.527063165213671e-05,
"loss": 5.7008,
"step": 3640
},
{
"epoch": 0.2920467274763962,
"grad_norm": 2.995692729949951,
"learning_rate": 4.525726052307857e-05,
"loss": 5.688,
"step": 3650
},
{
"epoch": 0.2928468554968795,
"grad_norm": 2.041736125946045,
"learning_rate": 4.5243889394020434e-05,
"loss": 5.7301,
"step": 3660
},
{
"epoch": 0.2936469835173628,
"grad_norm": 2.541757106781006,
"learning_rate": 4.5230518264962297e-05,
"loss": 5.5606,
"step": 3670
},
{
"epoch": 0.29444711153784603,
"grad_norm": 2.140761613845825,
"learning_rate": 4.521714713590416e-05,
"loss": 5.7671,
"step": 3680
},
{
"epoch": 0.2952472395583293,
"grad_norm": 2.6869146823883057,
"learning_rate": 4.520377600684602e-05,
"loss": 5.6452,
"step": 3690
},
{
"epoch": 0.2960473675788126,
"grad_norm": 3.072376012802124,
"learning_rate": 4.5190404877787885e-05,
"loss": 5.6956,
"step": 3700
},
{
"epoch": 0.2968474955992959,
"grad_norm": 2.5933837890625,
"learning_rate": 4.517703374872975e-05,
"loss": 5.6212,
"step": 3710
},
{
"epoch": 0.29764762361977914,
"grad_norm": 3.0443103313446045,
"learning_rate": 4.516366261967161e-05,
"loss": 5.7849,
"step": 3720
},
{
"epoch": 0.29844775164026244,
"grad_norm": 2.673583745956421,
"learning_rate": 4.515029149061347e-05,
"loss": 5.6186,
"step": 3730
},
{
"epoch": 0.29924787966074573,
"grad_norm": 2.3276283740997314,
"learning_rate": 4.513692036155533e-05,
"loss": 5.9188,
"step": 3740
},
{
"epoch": 0.300048007681229,
"grad_norm": 5.504491329193115,
"learning_rate": 4.512354923249719e-05,
"loss": 5.5676,
"step": 3750
},
{
"epoch": 0.30084813570171226,
"grad_norm": 2.4181482791900635,
"learning_rate": 4.5110178103439054e-05,
"loss": 5.6852,
"step": 3760
},
{
"epoch": 0.30164826372219555,
"grad_norm": 2.2489006519317627,
"learning_rate": 4.509680697438092e-05,
"loss": 5.7003,
"step": 3770
},
{
"epoch": 0.30244839174267885,
"grad_norm": 2.6925253868103027,
"learning_rate": 4.508343584532278e-05,
"loss": 5.8176,
"step": 3780
},
{
"epoch": 0.3032485197631621,
"grad_norm": 2.904318332672119,
"learning_rate": 4.507006471626464e-05,
"loss": 5.6912,
"step": 3790
},
{
"epoch": 0.3040486477836454,
"grad_norm": 3.3189070224761963,
"learning_rate": 4.5056693587206505e-05,
"loss": 5.8706,
"step": 3800
},
{
"epoch": 0.30484877580412867,
"grad_norm": 2.8324170112609863,
"learning_rate": 4.504332245814837e-05,
"loss": 5.8795,
"step": 3810
},
{
"epoch": 0.30564890382461196,
"grad_norm": 3.113417148590088,
"learning_rate": 4.502995132909023e-05,
"loss": 5.8689,
"step": 3820
},
{
"epoch": 0.3064490318450952,
"grad_norm": 2.469269275665283,
"learning_rate": 4.501658020003209e-05,
"loss": 5.7934,
"step": 3830
},
{
"epoch": 0.3072491598655785,
"grad_norm": 2.778571128845215,
"learning_rate": 4.5003209070973956e-05,
"loss": 5.8577,
"step": 3840
},
{
"epoch": 0.3080492878860618,
"grad_norm": 3.4269161224365234,
"learning_rate": 4.498983794191582e-05,
"loss": 5.8378,
"step": 3850
},
{
"epoch": 0.308849415906545,
"grad_norm": 3.417850971221924,
"learning_rate": 4.497646681285768e-05,
"loss": 5.6532,
"step": 3860
},
{
"epoch": 0.3096495439270283,
"grad_norm": 2.389784097671509,
"learning_rate": 4.4963095683799544e-05,
"loss": 5.5454,
"step": 3870
},
{
"epoch": 0.3104496719475116,
"grad_norm": 2.384453296661377,
"learning_rate": 4.4949724554741406e-05,
"loss": 5.8014,
"step": 3880
},
{
"epoch": 0.3112497999679949,
"grad_norm": 1.913668155670166,
"learning_rate": 4.493635342568326e-05,
"loss": 5.6033,
"step": 3890
},
{
"epoch": 0.31204992798847814,
"grad_norm": 3.4930074214935303,
"learning_rate": 4.4922982296625125e-05,
"loss": 5.7649,
"step": 3900
},
{
"epoch": 0.31285005600896143,
"grad_norm": 3.517458200454712,
"learning_rate": 4.490961116756699e-05,
"loss": 5.5635,
"step": 3910
},
{
"epoch": 0.3136501840294447,
"grad_norm": 2.611274480819702,
"learning_rate": 4.489624003850885e-05,
"loss": 5.8121,
"step": 3920
},
{
"epoch": 0.314450312049928,
"grad_norm": 2.373997926712036,
"learning_rate": 4.488286890945071e-05,
"loss": 5.6002,
"step": 3930
},
{
"epoch": 0.31525044007041125,
"grad_norm": 2.554847002029419,
"learning_rate": 4.4869497780392576e-05,
"loss": 5.6432,
"step": 3940
},
{
"epoch": 0.31605056809089455,
"grad_norm": 3.3720595836639404,
"learning_rate": 4.485612665133444e-05,
"loss": 5.5794,
"step": 3950
},
{
"epoch": 0.31685069611137784,
"grad_norm": 2.2308788299560547,
"learning_rate": 4.48427555222763e-05,
"loss": 5.794,
"step": 3960
},
{
"epoch": 0.3176508241318611,
"grad_norm": 2.0659661293029785,
"learning_rate": 4.4829384393218164e-05,
"loss": 5.5383,
"step": 3970
},
{
"epoch": 0.31845095215234437,
"grad_norm": 3.2644894123077393,
"learning_rate": 4.4816013264160026e-05,
"loss": 5.6979,
"step": 3980
},
{
"epoch": 0.31925108017282766,
"grad_norm": 2.3485729694366455,
"learning_rate": 4.480264213510189e-05,
"loss": 5.7214,
"step": 3990
},
{
"epoch": 0.32005120819331095,
"grad_norm": 2.7470600605010986,
"learning_rate": 4.478927100604375e-05,
"loss": 5.6032,
"step": 4000
},
{
"epoch": 0.3208513362137942,
"grad_norm": 2.1622989177703857,
"learning_rate": 4.4775899876985614e-05,
"loss": 5.7976,
"step": 4010
},
{
"epoch": 0.3216514642342775,
"grad_norm": 2.7463905811309814,
"learning_rate": 4.476252874792748e-05,
"loss": 5.7181,
"step": 4020
},
{
"epoch": 0.3224515922547608,
"grad_norm": 3.503662109375,
"learning_rate": 4.474915761886934e-05,
"loss": 5.8092,
"step": 4030
},
{
"epoch": 0.323251720275244,
"grad_norm": 2.6073853969573975,
"learning_rate": 4.47357864898112e-05,
"loss": 5.7876,
"step": 4040
},
{
"epoch": 0.3240518482957273,
"grad_norm": 3.354768991470337,
"learning_rate": 4.472241536075306e-05,
"loss": 5.7741,
"step": 4050
},
{
"epoch": 0.3248519763162106,
"grad_norm": 2.648145914077759,
"learning_rate": 4.470904423169492e-05,
"loss": 5.7522,
"step": 4060
},
{
"epoch": 0.3256521043366939,
"grad_norm": 3.086655378341675,
"learning_rate": 4.4695673102636784e-05,
"loss": 5.81,
"step": 4070
},
{
"epoch": 0.32645223235717713,
"grad_norm": 2.230905771255493,
"learning_rate": 4.4682301973578647e-05,
"loss": 5.8839,
"step": 4080
},
{
"epoch": 0.3272523603776604,
"grad_norm": 2.5391674041748047,
"learning_rate": 4.466893084452051e-05,
"loss": 5.5535,
"step": 4090
},
{
"epoch": 0.3280524883981437,
"grad_norm": 2.7574117183685303,
"learning_rate": 4.465555971546237e-05,
"loss": 5.8275,
"step": 4100
},
{
"epoch": 0.32885261641862695,
"grad_norm": 3.1114678382873535,
"learning_rate": 4.4642188586404235e-05,
"loss": 5.6876,
"step": 4110
},
{
"epoch": 0.32965274443911025,
"grad_norm": 2.404892683029175,
"learning_rate": 4.46288174573461e-05,
"loss": 5.6876,
"step": 4120
},
{
"epoch": 0.33045287245959354,
"grad_norm": 2.590759754180908,
"learning_rate": 4.461544632828796e-05,
"loss": 5.802,
"step": 4130
},
{
"epoch": 0.33125300048007683,
"grad_norm": 2.4358649253845215,
"learning_rate": 4.460207519922982e-05,
"loss": 5.632,
"step": 4140
},
{
"epoch": 0.33205312850056007,
"grad_norm": 3.9567458629608154,
"learning_rate": 4.4588704070171685e-05,
"loss": 5.8761,
"step": 4150
},
{
"epoch": 0.33285325652104336,
"grad_norm": 2.3808743953704834,
"learning_rate": 4.457533294111355e-05,
"loss": 5.6815,
"step": 4160
},
{
"epoch": 0.33365338454152665,
"grad_norm": 2.6527156829833984,
"learning_rate": 4.456196181205541e-05,
"loss": 5.805,
"step": 4170
},
{
"epoch": 0.33445351256200995,
"grad_norm": 2.351062536239624,
"learning_rate": 4.4548590682997273e-05,
"loss": 5.6681,
"step": 4180
},
{
"epoch": 0.3352536405824932,
"grad_norm": 2.3213460445404053,
"learning_rate": 4.4535219553939136e-05,
"loss": 5.6363,
"step": 4190
},
{
"epoch": 0.3360537686029765,
"grad_norm": 1.9470767974853516,
"learning_rate": 4.4521848424881e-05,
"loss": 5.8772,
"step": 4200
},
{
"epoch": 0.33685389662345977,
"grad_norm": 4.303500652313232,
"learning_rate": 4.450847729582286e-05,
"loss": 5.6185,
"step": 4210
},
{
"epoch": 0.337654024643943,
"grad_norm": 2.713275909423828,
"learning_rate": 4.4495106166764724e-05,
"loss": 5.6754,
"step": 4220
},
{
"epoch": 0.3384541526644263,
"grad_norm": 2.34993314743042,
"learning_rate": 4.448173503770659e-05,
"loss": 5.7003,
"step": 4230
},
{
"epoch": 0.3392542806849096,
"grad_norm": 2.276228666305542,
"learning_rate": 4.446836390864845e-05,
"loss": 5.6,
"step": 4240
},
{
"epoch": 0.3400544087053929,
"grad_norm": 2.3635685443878174,
"learning_rate": 4.445499277959031e-05,
"loss": 5.7373,
"step": 4250
},
{
"epoch": 0.3408545367258761,
"grad_norm": 3.100604772567749,
"learning_rate": 4.4441621650532175e-05,
"loss": 5.7354,
"step": 4260
},
{
"epoch": 0.3416546647463594,
"grad_norm": 2.6743876934051514,
"learning_rate": 4.442825052147404e-05,
"loss": 5.7544,
"step": 4270
},
{
"epoch": 0.3424547927668427,
"grad_norm": 2.5783612728118896,
"learning_rate": 4.44148793924159e-05,
"loss": 5.8826,
"step": 4280
},
{
"epoch": 0.34325492078732595,
"grad_norm": 2.8976659774780273,
"learning_rate": 4.440150826335776e-05,
"loss": 5.5418,
"step": 4290
},
{
"epoch": 0.34405504880780924,
"grad_norm": 2.1061089038848877,
"learning_rate": 4.4388137134299626e-05,
"loss": 5.6406,
"step": 4300
},
{
"epoch": 0.34485517682829253,
"grad_norm": 2.1303789615631104,
"learning_rate": 4.437476600524149e-05,
"loss": 5.6491,
"step": 4310
},
{
"epoch": 0.3456553048487758,
"grad_norm": 2.6240499019622803,
"learning_rate": 4.436139487618335e-05,
"loss": 5.7161,
"step": 4320
},
{
"epoch": 0.34645543286925906,
"grad_norm": 2.325155019760132,
"learning_rate": 4.4348023747125214e-05,
"loss": 5.6172,
"step": 4330
},
{
"epoch": 0.34725556088974235,
"grad_norm": 2.8844404220581055,
"learning_rate": 4.4334652618067076e-05,
"loss": 5.7438,
"step": 4340
},
{
"epoch": 0.34805568891022565,
"grad_norm": 2.375324249267578,
"learning_rate": 4.432128148900894e-05,
"loss": 5.8335,
"step": 4350
},
{
"epoch": 0.34885581693070894,
"grad_norm": 2.1572377681732178,
"learning_rate": 4.4307910359950795e-05,
"loss": 5.706,
"step": 4360
},
{
"epoch": 0.3496559449511922,
"grad_norm": 2.5218889713287354,
"learning_rate": 4.429453923089266e-05,
"loss": 5.7487,
"step": 4370
},
{
"epoch": 0.35045607297167547,
"grad_norm": 2.636223554611206,
"learning_rate": 4.428116810183452e-05,
"loss": 5.8327,
"step": 4380
},
{
"epoch": 0.35125620099215876,
"grad_norm": 2.436155080795288,
"learning_rate": 4.426779697277638e-05,
"loss": 5.6895,
"step": 4390
},
{
"epoch": 0.352056329012642,
"grad_norm": 3.4435484409332275,
"learning_rate": 4.4254425843718246e-05,
"loss": 5.6171,
"step": 4400
},
{
"epoch": 0.3528564570331253,
"grad_norm": 2.3990628719329834,
"learning_rate": 4.424105471466011e-05,
"loss": 5.7574,
"step": 4410
},
{
"epoch": 0.3536565850536086,
"grad_norm": 2.544774293899536,
"learning_rate": 4.422768358560197e-05,
"loss": 5.558,
"step": 4420
},
{
"epoch": 0.3544567130740919,
"grad_norm": 2.389491081237793,
"learning_rate": 4.4214312456543834e-05,
"loss": 5.6628,
"step": 4430
},
{
"epoch": 0.3552568410945751,
"grad_norm": 5.203212261199951,
"learning_rate": 4.4200941327485697e-05,
"loss": 5.5403,
"step": 4440
},
{
"epoch": 0.3560569691150584,
"grad_norm": 2.0861873626708984,
"learning_rate": 4.418757019842756e-05,
"loss": 5.625,
"step": 4450
},
{
"epoch": 0.3568570971355417,
"grad_norm": 2.2355470657348633,
"learning_rate": 4.417419906936942e-05,
"loss": 5.614,
"step": 4460
},
{
"epoch": 0.35765722515602494,
"grad_norm": 2.2239274978637695,
"learning_rate": 4.4160827940311285e-05,
"loss": 5.6885,
"step": 4470
},
{
"epoch": 0.35845735317650823,
"grad_norm": 4.571592807769775,
"learning_rate": 4.414745681125315e-05,
"loss": 5.8495,
"step": 4480
},
{
"epoch": 0.3592574811969915,
"grad_norm": 2.6501150131225586,
"learning_rate": 4.413408568219501e-05,
"loss": 5.6158,
"step": 4490
},
{
"epoch": 0.3600576092174748,
"grad_norm": 2.8568902015686035,
"learning_rate": 4.412071455313687e-05,
"loss": 5.6403,
"step": 4500
},
{
"epoch": 0.36085773723795805,
"grad_norm": 2.4179179668426514,
"learning_rate": 4.410734342407873e-05,
"loss": 5.749,
"step": 4510
},
{
"epoch": 0.36165786525844135,
"grad_norm": 2.950491189956665,
"learning_rate": 4.409397229502059e-05,
"loss": 5.7128,
"step": 4520
},
{
"epoch": 0.36245799327892464,
"grad_norm": 3.731049060821533,
"learning_rate": 4.4080601165962454e-05,
"loss": 5.6397,
"step": 4530
},
{
"epoch": 0.36325812129940793,
"grad_norm": 2.255730390548706,
"learning_rate": 4.406723003690432e-05,
"loss": 5.626,
"step": 4540
},
{
"epoch": 0.36405824931989117,
"grad_norm": 2.623455047607422,
"learning_rate": 4.405385890784618e-05,
"loss": 5.6792,
"step": 4550
},
{
"epoch": 0.36485837734037446,
"grad_norm": 2.366481065750122,
"learning_rate": 4.404048777878804e-05,
"loss": 5.5455,
"step": 4560
},
{
"epoch": 0.36565850536085776,
"grad_norm": 2.56351375579834,
"learning_rate": 4.4027116649729905e-05,
"loss": 5.7982,
"step": 4570
},
{
"epoch": 0.366458633381341,
"grad_norm": 2.3203811645507812,
"learning_rate": 4.401374552067177e-05,
"loss": 5.7969,
"step": 4580
},
{
"epoch": 0.3672587614018243,
"grad_norm": 2.3838179111480713,
"learning_rate": 4.400037439161363e-05,
"loss": 5.7484,
"step": 4590
},
{
"epoch": 0.3680588894223076,
"grad_norm": 2.0725440979003906,
"learning_rate": 4.398700326255549e-05,
"loss": 5.8405,
"step": 4600
},
{
"epoch": 0.36885901744279087,
"grad_norm": 3.49495005607605,
"learning_rate": 4.3973632133497356e-05,
"loss": 5.7151,
"step": 4610
},
{
"epoch": 0.3696591454632741,
"grad_norm": 2.643007755279541,
"learning_rate": 4.396026100443922e-05,
"loss": 5.6374,
"step": 4620
},
{
"epoch": 0.3704592734837574,
"grad_norm": 2.282304286956787,
"learning_rate": 4.394688987538108e-05,
"loss": 5.589,
"step": 4630
},
{
"epoch": 0.3712594015042407,
"grad_norm": 2.244058609008789,
"learning_rate": 4.3933518746322944e-05,
"loss": 5.7516,
"step": 4640
},
{
"epoch": 0.37205952952472393,
"grad_norm": 2.44496488571167,
"learning_rate": 4.3920147617264806e-05,
"loss": 5.8393,
"step": 4650
},
{
"epoch": 0.3728596575452072,
"grad_norm": 2.6613078117370605,
"learning_rate": 4.390677648820667e-05,
"loss": 5.6764,
"step": 4660
},
{
"epoch": 0.3736597855656905,
"grad_norm": 3.99092173576355,
"learning_rate": 4.3893405359148525e-05,
"loss": 5.8658,
"step": 4670
},
{
"epoch": 0.3744599135861738,
"grad_norm": 1.6338485479354858,
"learning_rate": 4.388003423009039e-05,
"loss": 5.7527,
"step": 4680
},
{
"epoch": 0.37526004160665705,
"grad_norm": 2.3723371028900146,
"learning_rate": 4.386666310103225e-05,
"loss": 5.7482,
"step": 4690
},
{
"epoch": 0.37606016962714034,
"grad_norm": 2.630424976348877,
"learning_rate": 4.385329197197411e-05,
"loss": 5.7539,
"step": 4700
},
{
"epoch": 0.37686029764762363,
"grad_norm": 2.3873038291931152,
"learning_rate": 4.3839920842915976e-05,
"loss": 5.6729,
"step": 4710
},
{
"epoch": 0.37766042566810687,
"grad_norm": 1.9391748905181885,
"learning_rate": 4.382654971385784e-05,
"loss": 5.6794,
"step": 4720
},
{
"epoch": 0.37846055368859016,
"grad_norm": 2.103975296020508,
"learning_rate": 4.38131785847997e-05,
"loss": 5.5104,
"step": 4730
},
{
"epoch": 0.37926068170907346,
"grad_norm": 3.731184959411621,
"learning_rate": 4.3799807455741564e-05,
"loss": 5.6699,
"step": 4740
},
{
"epoch": 0.38006080972955675,
"grad_norm": 2.881068468093872,
"learning_rate": 4.3786436326683426e-05,
"loss": 5.6394,
"step": 4750
},
{
"epoch": 0.38086093775004,
"grad_norm": 2.5963799953460693,
"learning_rate": 4.377306519762529e-05,
"loss": 5.784,
"step": 4760
},
{
"epoch": 0.3816610657705233,
"grad_norm": 1.9520230293273926,
"learning_rate": 4.375969406856715e-05,
"loss": 5.7608,
"step": 4770
},
{
"epoch": 0.38246119379100657,
"grad_norm": 2.386702537536621,
"learning_rate": 4.374766005241483e-05,
"loss": 5.5725,
"step": 4780
},
{
"epoch": 0.38326132181148986,
"grad_norm": 2.3830511569976807,
"learning_rate": 4.3734288923356694e-05,
"loss": 5.5584,
"step": 4790
},
{
"epoch": 0.3840614498319731,
"grad_norm": 2.1514739990234375,
"learning_rate": 4.3720917794298556e-05,
"loss": 5.6621,
"step": 4800
},
{
"epoch": 0.3848615778524564,
"grad_norm": 2.5376317501068115,
"learning_rate": 4.370754666524042e-05,
"loss": 5.4138,
"step": 4810
},
{
"epoch": 0.3856617058729397,
"grad_norm": 3.425899028778076,
"learning_rate": 4.3694175536182275e-05,
"loss": 5.6478,
"step": 4820
},
{
"epoch": 0.3864618338934229,
"grad_norm": 2.7518632411956787,
"learning_rate": 4.368080440712414e-05,
"loss": 5.6556,
"step": 4830
},
{
"epoch": 0.3872619619139062,
"grad_norm": 3.119227647781372,
"learning_rate": 4.3667433278066e-05,
"loss": 5.7925,
"step": 4840
},
{
"epoch": 0.3880620899343895,
"grad_norm": 3.2664616107940674,
"learning_rate": 4.365406214900786e-05,
"loss": 5.7176,
"step": 4850
},
{
"epoch": 0.3888622179548728,
"grad_norm": 2.5125045776367188,
"learning_rate": 4.3640691019949726e-05,
"loss": 5.6511,
"step": 4860
},
{
"epoch": 0.38966234597535604,
"grad_norm": 2.992112874984741,
"learning_rate": 4.362731989089159e-05,
"loss": 5.6426,
"step": 4870
},
{
"epoch": 0.39046247399583933,
"grad_norm": 4.46783971786499,
"learning_rate": 4.361394876183345e-05,
"loss": 5.736,
"step": 4880
},
{
"epoch": 0.3912626020163226,
"grad_norm": 1.8372838497161865,
"learning_rate": 4.3600577632775314e-05,
"loss": 5.7603,
"step": 4890
},
{
"epoch": 0.39206273003680586,
"grad_norm": 2.1635375022888184,
"learning_rate": 4.3587206503717176e-05,
"loss": 5.6019,
"step": 4900
},
{
"epoch": 0.39286285805728915,
"grad_norm": 2.2425310611724854,
"learning_rate": 4.357383537465904e-05,
"loss": 5.6829,
"step": 4910
},
{
"epoch": 0.39366298607777245,
"grad_norm": 2.408907413482666,
"learning_rate": 4.35604642456009e-05,
"loss": 5.6821,
"step": 4920
},
{
"epoch": 0.39446311409825574,
"grad_norm": 3.012258291244507,
"learning_rate": 4.3547093116542765e-05,
"loss": 5.7503,
"step": 4930
},
{
"epoch": 0.395263242118739,
"grad_norm": 3.187053680419922,
"learning_rate": 4.353372198748463e-05,
"loss": 5.6459,
"step": 4940
},
{
"epoch": 0.39606337013922227,
"grad_norm": 2.7528955936431885,
"learning_rate": 4.352035085842649e-05,
"loss": 5.6386,
"step": 4950
},
{
"epoch": 0.39686349815970556,
"grad_norm": 2.9744699001312256,
"learning_rate": 4.350697972936835e-05,
"loss": 5.5938,
"step": 4960
},
{
"epoch": 0.39766362618018886,
"grad_norm": 2.779604196548462,
"learning_rate": 4.3493608600310215e-05,
"loss": 5.5459,
"step": 4970
},
{
"epoch": 0.3984637542006721,
"grad_norm": 2.9092133045196533,
"learning_rate": 4.348023747125207e-05,
"loss": 5.7695,
"step": 4980
},
{
"epoch": 0.3992638822211554,
"grad_norm": 2.800872802734375,
"learning_rate": 4.3466866342193934e-05,
"loss": 5.6943,
"step": 4990
},
{
"epoch": 0.4000640102416387,
"grad_norm": 3.299595832824707,
"learning_rate": 4.3453495213135797e-05,
"loss": 5.4432,
"step": 5000
},
{
"epoch": 0.4008641382621219,
"grad_norm": 2.2425456047058105,
"learning_rate": 4.344012408407766e-05,
"loss": 5.6688,
"step": 5010
},
{
"epoch": 0.4016642662826052,
"grad_norm": 2.269378423690796,
"learning_rate": 4.342675295501952e-05,
"loss": 5.7713,
"step": 5020
},
{
"epoch": 0.4024643943030885,
"grad_norm": 2.3903868198394775,
"learning_rate": 4.3413381825961385e-05,
"loss": 5.5926,
"step": 5030
},
{
"epoch": 0.4032645223235718,
"grad_norm": 3.267918109893799,
"learning_rate": 4.340001069690325e-05,
"loss": 5.6806,
"step": 5040
},
{
"epoch": 0.40406465034405503,
"grad_norm": 3.2075066566467285,
"learning_rate": 4.338663956784511e-05,
"loss": 5.6582,
"step": 5050
},
{
"epoch": 0.4048647783645383,
"grad_norm": 2.5458226203918457,
"learning_rate": 4.337326843878697e-05,
"loss": 5.6576,
"step": 5060
},
{
"epoch": 0.4056649063850216,
"grad_norm": 2.0331077575683594,
"learning_rate": 4.3359897309728835e-05,
"loss": 5.6725,
"step": 5070
},
{
"epoch": 0.40646503440550485,
"grad_norm": 2.406907796859741,
"learning_rate": 4.33465261806707e-05,
"loss": 5.5168,
"step": 5080
},
{
"epoch": 0.40726516242598815,
"grad_norm": 2.661137580871582,
"learning_rate": 4.333315505161256e-05,
"loss": 5.5953,
"step": 5090
},
{
"epoch": 0.40806529044647144,
"grad_norm": 2.857725143432617,
"learning_rate": 4.3319783922554423e-05,
"loss": 5.6702,
"step": 5100
},
{
"epoch": 0.40886541846695473,
"grad_norm": 2.7894747257232666,
"learning_rate": 4.3306412793496286e-05,
"loss": 5.6228,
"step": 5110
},
{
"epoch": 0.40966554648743797,
"grad_norm": 2.8865861892700195,
"learning_rate": 4.329304166443815e-05,
"loss": 5.6859,
"step": 5120
},
{
"epoch": 0.41046567450792126,
"grad_norm": 2.1493608951568604,
"learning_rate": 4.3279670535380005e-05,
"loss": 5.5516,
"step": 5130
},
{
"epoch": 0.41126580252840456,
"grad_norm": 3.112820863723755,
"learning_rate": 4.326629940632187e-05,
"loss": 5.6409,
"step": 5140
},
{
"epoch": 0.41206593054888785,
"grad_norm": 2.778876543045044,
"learning_rate": 4.325292827726373e-05,
"loss": 5.6948,
"step": 5150
},
{
"epoch": 0.4128660585693711,
"grad_norm": 2.0409047603607178,
"learning_rate": 4.323955714820559e-05,
"loss": 5.5458,
"step": 5160
},
{
"epoch": 0.4136661865898544,
"grad_norm": 3.1058828830718994,
"learning_rate": 4.3226186019147456e-05,
"loss": 5.8437,
"step": 5170
},
{
"epoch": 0.41446631461033767,
"grad_norm": 3.306704044342041,
"learning_rate": 4.321281489008932e-05,
"loss": 5.691,
"step": 5180
},
{
"epoch": 0.4152664426308209,
"grad_norm": 2.9495625495910645,
"learning_rate": 4.319944376103118e-05,
"loss": 5.6364,
"step": 5190
},
{
"epoch": 0.4160665706513042,
"grad_norm": 2.1773974895477295,
"learning_rate": 4.3186072631973044e-05,
"loss": 5.6713,
"step": 5200
},
{
"epoch": 0.4168666986717875,
"grad_norm": 2.0897533893585205,
"learning_rate": 4.3172701502914906e-05,
"loss": 5.6022,
"step": 5210
},
{
"epoch": 0.4176668266922708,
"grad_norm": 2.2131927013397217,
"learning_rate": 4.315933037385677e-05,
"loss": 5.5728,
"step": 5220
},
{
"epoch": 0.418466954712754,
"grad_norm": 2.225728750228882,
"learning_rate": 4.314595924479863e-05,
"loss": 5.5374,
"step": 5230
},
{
"epoch": 0.4192670827332373,
"grad_norm": 2.219791889190674,
"learning_rate": 4.3132588115740494e-05,
"loss": 5.6986,
"step": 5240
},
{
"epoch": 0.4200672107537206,
"grad_norm": 2.720323085784912,
"learning_rate": 4.311921698668236e-05,
"loss": 5.6046,
"step": 5250
},
{
"epoch": 0.42086733877420385,
"grad_norm": 2.4254257678985596,
"learning_rate": 4.310584585762422e-05,
"loss": 5.5566,
"step": 5260
},
{
"epoch": 0.42166746679468714,
"grad_norm": 2.2297472953796387,
"learning_rate": 4.309247472856608e-05,
"loss": 5.7431,
"step": 5270
},
{
"epoch": 0.42246759481517043,
"grad_norm": 2.2767512798309326,
"learning_rate": 4.3079103599507945e-05,
"loss": 5.6661,
"step": 5280
},
{
"epoch": 0.4232677228356537,
"grad_norm": 2.8959579467773438,
"learning_rate": 4.30657324704498e-05,
"loss": 5.6584,
"step": 5290
},
{
"epoch": 0.42406785085613696,
"grad_norm": 2.49867844581604,
"learning_rate": 4.3052361341391664e-05,
"loss": 5.7564,
"step": 5300
},
{
"epoch": 0.42486797887662026,
"grad_norm": 2.1820337772369385,
"learning_rate": 4.3038990212333526e-05,
"loss": 5.6288,
"step": 5310
},
{
"epoch": 0.42566810689710355,
"grad_norm": 2.7174227237701416,
"learning_rate": 4.302561908327539e-05,
"loss": 5.6496,
"step": 5320
},
{
"epoch": 0.42646823491758684,
"grad_norm": 2.7261149883270264,
"learning_rate": 4.301224795421725e-05,
"loss": 5.6557,
"step": 5330
},
{
"epoch": 0.4272683629380701,
"grad_norm": 2.581760883331299,
"learning_rate": 4.2998876825159114e-05,
"loss": 5.604,
"step": 5340
},
{
"epoch": 0.42806849095855337,
"grad_norm": 2.43254017829895,
"learning_rate": 4.298550569610098e-05,
"loss": 5.6041,
"step": 5350
},
{
"epoch": 0.42886861897903666,
"grad_norm": 4.465782165527344,
"learning_rate": 4.297213456704284e-05,
"loss": 5.7158,
"step": 5360
},
{
"epoch": 0.4296687469995199,
"grad_norm": 2.6434614658355713,
"learning_rate": 4.29587634379847e-05,
"loss": 5.6347,
"step": 5370
},
{
"epoch": 0.4304688750200032,
"grad_norm": 2.344190835952759,
"learning_rate": 4.2945392308926565e-05,
"loss": 5.6062,
"step": 5380
},
{
"epoch": 0.4312690030404865,
"grad_norm": 4.311372756958008,
"learning_rate": 4.293202117986843e-05,
"loss": 5.7356,
"step": 5390
},
{
"epoch": 0.4320691310609698,
"grad_norm": 2.8204123973846436,
"learning_rate": 4.291865005081029e-05,
"loss": 5.63,
"step": 5400
},
{
"epoch": 0.432869259081453,
"grad_norm": 3.333059072494507,
"learning_rate": 4.290527892175215e-05,
"loss": 5.5992,
"step": 5410
},
{
"epoch": 0.4336693871019363,
"grad_norm": 2.0647048950195312,
"learning_rate": 4.2891907792694016e-05,
"loss": 5.691,
"step": 5420
},
{
"epoch": 0.4344695151224196,
"grad_norm": 2.5100045204162598,
"learning_rate": 4.287853666363588e-05,
"loss": 5.615,
"step": 5430
},
{
"epoch": 0.43526964314290284,
"grad_norm": 2.6120762825012207,
"learning_rate": 4.286516553457774e-05,
"loss": 5.746,
"step": 5440
},
{
"epoch": 0.43606977116338613,
"grad_norm": 2.2886853218078613,
"learning_rate": 4.2851794405519604e-05,
"loss": 5.6783,
"step": 5450
},
{
"epoch": 0.4368698991838694,
"grad_norm": 2.6724119186401367,
"learning_rate": 4.283842327646147e-05,
"loss": 5.6526,
"step": 5460
},
{
"epoch": 0.4376700272043527,
"grad_norm": 2.2408151626586914,
"learning_rate": 4.282505214740333e-05,
"loss": 5.6314,
"step": 5470
},
{
"epoch": 0.43847015522483596,
"grad_norm": 3.0294084548950195,
"learning_rate": 4.281168101834519e-05,
"loss": 5.6669,
"step": 5480
},
{
"epoch": 0.43927028324531925,
"grad_norm": 2.1664011478424072,
"learning_rate": 4.2798309889287055e-05,
"loss": 5.4856,
"step": 5490
},
{
"epoch": 0.44007041126580254,
"grad_norm": 3.4465417861938477,
"learning_rate": 4.278493876022892e-05,
"loss": 5.5859,
"step": 5500
},
{
"epoch": 0.4408705392862858,
"grad_norm": 2.0116310119628906,
"learning_rate": 4.277156763117078e-05,
"loss": 5.5982,
"step": 5510
},
{
"epoch": 0.44167066730676907,
"grad_norm": 2.578658103942871,
"learning_rate": 4.275819650211264e-05,
"loss": 5.4026,
"step": 5520
},
{
"epoch": 0.44247079532725236,
"grad_norm": 3.1201677322387695,
"learning_rate": 4.2744825373054506e-05,
"loss": 5.7024,
"step": 5530
},
{
"epoch": 0.44327092334773566,
"grad_norm": 2.2246837615966797,
"learning_rate": 4.273145424399637e-05,
"loss": 5.5842,
"step": 5540
},
{
"epoch": 0.4440710513682189,
"grad_norm": 2.1593568325042725,
"learning_rate": 4.271808311493823e-05,
"loss": 5.5099,
"step": 5550
},
{
"epoch": 0.4448711793887022,
"grad_norm": 3.082218885421753,
"learning_rate": 4.2704711985880094e-05,
"loss": 5.5539,
"step": 5560
},
{
"epoch": 0.4456713074091855,
"grad_norm": 3.2272634506225586,
"learning_rate": 4.2691340856821956e-05,
"loss": 5.73,
"step": 5570
},
{
"epoch": 0.4464714354296688,
"grad_norm": 2.301713466644287,
"learning_rate": 4.267796972776382e-05,
"loss": 5.5444,
"step": 5580
},
{
"epoch": 0.447271563450152,
"grad_norm": 3.2985429763793945,
"learning_rate": 4.2664598598705675e-05,
"loss": 5.7499,
"step": 5590
},
{
"epoch": 0.4480716914706353,
"grad_norm": 2.103994607925415,
"learning_rate": 4.265122746964754e-05,
"loss": 5.5627,
"step": 5600
},
{
"epoch": 0.4488718194911186,
"grad_norm": 3.260099172592163,
"learning_rate": 4.26378563405894e-05,
"loss": 5.5692,
"step": 5610
},
{
"epoch": 0.44967194751160183,
"grad_norm": 2.740907907485962,
"learning_rate": 4.262448521153126e-05,
"loss": 5.4984,
"step": 5620
},
{
"epoch": 0.4504720755320851,
"grad_norm": 5.314218997955322,
"learning_rate": 4.2611114082473126e-05,
"loss": 5.5641,
"step": 5630
},
{
"epoch": 0.4512722035525684,
"grad_norm": 3.0524938106536865,
"learning_rate": 4.259774295341499e-05,
"loss": 5.6375,
"step": 5640
},
{
"epoch": 0.4520723315730517,
"grad_norm": 3.57781982421875,
"learning_rate": 4.258437182435685e-05,
"loss": 5.6726,
"step": 5650
},
{
"epoch": 0.45287245959353495,
"grad_norm": 3.094510793685913,
"learning_rate": 4.2571000695298714e-05,
"loss": 5.7328,
"step": 5660
},
{
"epoch": 0.45367258761401824,
"grad_norm": 2.731092929840088,
"learning_rate": 4.2557629566240576e-05,
"loss": 5.6667,
"step": 5670
},
{
"epoch": 0.45447271563450153,
"grad_norm": 3.6701395511627197,
"learning_rate": 4.254425843718244e-05,
"loss": 5.641,
"step": 5680
},
{
"epoch": 0.45527284365498477,
"grad_norm": 1.9017853736877441,
"learning_rate": 4.25308873081243e-05,
"loss": 5.6521,
"step": 5690
},
{
"epoch": 0.45607297167546806,
"grad_norm": 3.2658119201660156,
"learning_rate": 4.2517516179066165e-05,
"loss": 5.6431,
"step": 5700
},
{
"epoch": 0.45687309969595136,
"grad_norm": 2.227353572845459,
"learning_rate": 4.250414505000803e-05,
"loss": 5.6198,
"step": 5710
},
{
"epoch": 0.45767322771643465,
"grad_norm": 1.7804296016693115,
"learning_rate": 4.249077392094989e-05,
"loss": 5.618,
"step": 5720
},
{
"epoch": 0.4584733557369179,
"grad_norm": 2.9357879161834717,
"learning_rate": 4.247740279189175e-05,
"loss": 5.5222,
"step": 5730
},
{
"epoch": 0.4592734837574012,
"grad_norm": 5.074959754943848,
"learning_rate": 4.2464031662833615e-05,
"loss": 5.7604,
"step": 5740
},
{
"epoch": 0.4600736117778845,
"grad_norm": 2.4961061477661133,
"learning_rate": 4.245066053377547e-05,
"loss": 5.5699,
"step": 5750
},
{
"epoch": 0.46087373979836777,
"grad_norm": 2.636403799057007,
"learning_rate": 4.2437289404717334e-05,
"loss": 5.745,
"step": 5760
},
{
"epoch": 0.461673867818851,
"grad_norm": 2.4829630851745605,
"learning_rate": 4.2423918275659197e-05,
"loss": 5.9779,
"step": 5770
},
{
"epoch": 0.4624739958393343,
"grad_norm": 2.389112710952759,
"learning_rate": 4.241054714660106e-05,
"loss": 5.696,
"step": 5780
},
{
"epoch": 0.4632741238598176,
"grad_norm": 2.3053462505340576,
"learning_rate": 4.239717601754292e-05,
"loss": 5.6567,
"step": 5790
},
{
"epoch": 0.4640742518803008,
"grad_norm": 2.9635446071624756,
"learning_rate": 4.2383804888484785e-05,
"loss": 5.7643,
"step": 5800
},
{
"epoch": 0.4648743799007841,
"grad_norm": 3.3227570056915283,
"learning_rate": 4.237043375942665e-05,
"loss": 5.5425,
"step": 5810
},
{
"epoch": 0.4656745079212674,
"grad_norm": 3.2959067821502686,
"learning_rate": 4.235706263036851e-05,
"loss": 5.5886,
"step": 5820
},
{
"epoch": 0.4664746359417507,
"grad_norm": 2.497953176498413,
"learning_rate": 4.234369150131037e-05,
"loss": 5.6248,
"step": 5830
},
{
"epoch": 0.46727476396223394,
"grad_norm": 3.5957205295562744,
"learning_rate": 4.2330320372252235e-05,
"loss": 5.5345,
"step": 5840
},
{
"epoch": 0.46807489198271723,
"grad_norm": 2.9113316535949707,
"learning_rate": 4.23169492431941e-05,
"loss": 5.7358,
"step": 5850
},
{
"epoch": 0.4688750200032005,
"grad_norm": 3.8617255687713623,
"learning_rate": 4.230357811413596e-05,
"loss": 5.7451,
"step": 5860
},
{
"epoch": 0.46967514802368376,
"grad_norm": 2.5546538829803467,
"learning_rate": 4.2290206985077824e-05,
"loss": 5.5874,
"step": 5870
},
{
"epoch": 0.47047527604416706,
"grad_norm": 3.7215869426727295,
"learning_rate": 4.2276835856019686e-05,
"loss": 5.5462,
"step": 5880
},
{
"epoch": 0.47127540406465035,
"grad_norm": 3.3122622966766357,
"learning_rate": 4.226346472696155e-05,
"loss": 5.7368,
"step": 5890
},
{
"epoch": 0.47207553208513364,
"grad_norm": 2.3962459564208984,
"learning_rate": 4.2250093597903405e-05,
"loss": 5.7328,
"step": 5900
},
{
"epoch": 0.4728756601056169,
"grad_norm": 2.497668504714966,
"learning_rate": 4.223672246884527e-05,
"loss": 5.7063,
"step": 5910
},
{
"epoch": 0.4736757881261002,
"grad_norm": 2.301725387573242,
"learning_rate": 4.222335133978713e-05,
"loss": 5.6029,
"step": 5920
},
{
"epoch": 0.47447591614658347,
"grad_norm": 3.840155839920044,
"learning_rate": 4.220998021072899e-05,
"loss": 5.825,
"step": 5930
},
{
"epoch": 0.47527604416706676,
"grad_norm": 3.1776278018951416,
"learning_rate": 4.2196609081670856e-05,
"loss": 5.6421,
"step": 5940
},
{
"epoch": 0.47607617218755,
"grad_norm": 2.1823127269744873,
"learning_rate": 4.218323795261272e-05,
"loss": 5.7154,
"step": 5950
},
{
"epoch": 0.4768763002080333,
"grad_norm": 2.944390058517456,
"learning_rate": 4.216986682355458e-05,
"loss": 5.5429,
"step": 5960
},
{
"epoch": 0.4776764282285166,
"grad_norm": 2.035430431365967,
"learning_rate": 4.2156495694496444e-05,
"loss": 5.8187,
"step": 5970
},
{
"epoch": 0.4784765562489998,
"grad_norm": 3.167098045349121,
"learning_rate": 4.2143124565438306e-05,
"loss": 5.5891,
"step": 5980
},
{
"epoch": 0.4792766842694831,
"grad_norm": 1.9377233982086182,
"learning_rate": 4.212975343638017e-05,
"loss": 5.7428,
"step": 5990
},
{
"epoch": 0.4800768122899664,
"grad_norm": 2.759096622467041,
"learning_rate": 4.211638230732203e-05,
"loss": 5.5572,
"step": 6000
},
{
"epoch": 0.4808769403104497,
"grad_norm": 2.074033498764038,
"learning_rate": 4.2103011178263894e-05,
"loss": 5.517,
"step": 6010
},
{
"epoch": 0.48167706833093293,
"grad_norm": 2.2866854667663574,
"learning_rate": 4.208964004920576e-05,
"loss": 5.6539,
"step": 6020
},
{
"epoch": 0.4824771963514162,
"grad_norm": 1.9909095764160156,
"learning_rate": 4.207626892014762e-05,
"loss": 5.5532,
"step": 6030
},
{
"epoch": 0.4832773243718995,
"grad_norm": 3.245906114578247,
"learning_rate": 4.206289779108948e-05,
"loss": 5.6797,
"step": 6040
},
{
"epoch": 0.48407745239238276,
"grad_norm": 2.013009786605835,
"learning_rate": 4.2049526662031345e-05,
"loss": 5.6378,
"step": 6050
},
{
"epoch": 0.48487758041286605,
"grad_norm": 2.5478925704956055,
"learning_rate": 4.20361555329732e-05,
"loss": 5.555,
"step": 6060
},
{
"epoch": 0.48567770843334934,
"grad_norm": 3.079225778579712,
"learning_rate": 4.2022784403915064e-05,
"loss": 5.7618,
"step": 6070
},
{
"epoch": 0.48647783645383263,
"grad_norm": 2.2639927864074707,
"learning_rate": 4.2009413274856926e-05,
"loss": 5.8063,
"step": 6080
},
{
"epoch": 0.48727796447431587,
"grad_norm": 4.630524158477783,
"learning_rate": 4.199604214579879e-05,
"loss": 5.6403,
"step": 6090
},
{
"epoch": 0.48807809249479917,
"grad_norm": 3.11018967628479,
"learning_rate": 4.198267101674065e-05,
"loss": 5.7517,
"step": 6100
},
{
"epoch": 0.48887822051528246,
"grad_norm": 8.462982177734375,
"learning_rate": 4.1969299887682515e-05,
"loss": 5.7311,
"step": 6110
},
{
"epoch": 0.4896783485357657,
"grad_norm": 2.418065071105957,
"learning_rate": 4.195592875862438e-05,
"loss": 5.6239,
"step": 6120
},
{
"epoch": 0.490478476556249,
"grad_norm": 2.5452466011047363,
"learning_rate": 4.194255762956624e-05,
"loss": 5.7417,
"step": 6130
},
{
"epoch": 0.4912786045767323,
"grad_norm": 2.986041307449341,
"learning_rate": 4.19291865005081e-05,
"loss": 5.663,
"step": 6140
},
{
"epoch": 0.4920787325972156,
"grad_norm": 2.7642807960510254,
"learning_rate": 4.1915815371449965e-05,
"loss": 5.5379,
"step": 6150
},
{
"epoch": 0.4928788606176988,
"grad_norm": 4.326907157897949,
"learning_rate": 4.190244424239183e-05,
"loss": 5.8058,
"step": 6160
},
{
"epoch": 0.4936789886381821,
"grad_norm": 1.9514706134796143,
"learning_rate": 4.188907311333369e-05,
"loss": 5.7004,
"step": 6170
},
{
"epoch": 0.4944791166586654,
"grad_norm": 2.5721428394317627,
"learning_rate": 4.187570198427555e-05,
"loss": 5.6959,
"step": 6180
},
{
"epoch": 0.4952792446791487,
"grad_norm": 2.6619083881378174,
"learning_rate": 4.1862330855217416e-05,
"loss": 5.7196,
"step": 6190
},
{
"epoch": 0.4960793726996319,
"grad_norm": 2.322341203689575,
"learning_rate": 4.184895972615928e-05,
"loss": 5.5998,
"step": 6200
},
{
"epoch": 0.4968795007201152,
"grad_norm": 2.280777931213379,
"learning_rate": 4.183558859710114e-05,
"loss": 5.5171,
"step": 6210
},
{
"epoch": 0.4976796287405985,
"grad_norm": 1.9774320125579834,
"learning_rate": 4.1822217468043004e-05,
"loss": 5.6368,
"step": 6220
},
{
"epoch": 0.49847975676108175,
"grad_norm": 2.199708938598633,
"learning_rate": 4.180884633898487e-05,
"loss": 5.4638,
"step": 6230
},
{
"epoch": 0.49927988478156504,
"grad_norm": 2.0054879188537598,
"learning_rate": 4.179547520992673e-05,
"loss": 5.4624,
"step": 6240
},
{
"epoch": 0.5000800128020483,
"grad_norm": 2.0623903274536133,
"learning_rate": 4.178210408086859e-05,
"loss": 5.6554,
"step": 6250
},
{
"epoch": 0.5008801408225316,
"grad_norm": 2.5907487869262695,
"learning_rate": 4.1768732951810455e-05,
"loss": 5.4989,
"step": 6260
},
{
"epoch": 0.5016802688430149,
"grad_norm": 2.181987762451172,
"learning_rate": 4.175536182275232e-05,
"loss": 5.624,
"step": 6270
},
{
"epoch": 0.5024803968634982,
"grad_norm": 2.9678001403808594,
"learning_rate": 4.174199069369418e-05,
"loss": 5.6545,
"step": 6280
},
{
"epoch": 0.5032805248839815,
"grad_norm": 5.213638782501221,
"learning_rate": 4.172861956463604e-05,
"loss": 5.7048,
"step": 6290
},
{
"epoch": 0.5040806529044647,
"grad_norm": 2.465900182723999,
"learning_rate": 4.1715248435577906e-05,
"loss": 5.646,
"step": 6300
},
{
"epoch": 0.504880780924948,
"grad_norm": 2.94570255279541,
"learning_rate": 4.170187730651977e-05,
"loss": 5.6274,
"step": 6310
},
{
"epoch": 0.5056809089454313,
"grad_norm": 3.5255651473999023,
"learning_rate": 4.168850617746163e-05,
"loss": 5.5336,
"step": 6320
},
{
"epoch": 0.5064810369659145,
"grad_norm": 2.3499608039855957,
"learning_rate": 4.1675135048403494e-05,
"loss": 5.7768,
"step": 6330
},
{
"epoch": 0.5072811649863979,
"grad_norm": 2.0476951599121094,
"learning_rate": 4.1661763919345356e-05,
"loss": 5.5927,
"step": 6340
},
{
"epoch": 0.5080812930068811,
"grad_norm": 2.4708118438720703,
"learning_rate": 4.164839279028722e-05,
"loss": 5.6458,
"step": 6350
},
{
"epoch": 0.5088814210273643,
"grad_norm": 2.465075731277466,
"learning_rate": 4.163502166122908e-05,
"loss": 5.5744,
"step": 6360
},
{
"epoch": 0.5096815490478477,
"grad_norm": 2.9378490447998047,
"learning_rate": 4.162165053217094e-05,
"loss": 5.6963,
"step": 6370
},
{
"epoch": 0.5104816770683309,
"grad_norm": 2.201359987258911,
"learning_rate": 4.16082794031128e-05,
"loss": 5.613,
"step": 6380
},
{
"epoch": 0.5112818050888142,
"grad_norm": 1.8427401781082153,
"learning_rate": 4.159490827405466e-05,
"loss": 5.5494,
"step": 6390
},
{
"epoch": 0.5120819331092975,
"grad_norm": 1.9969813823699951,
"learning_rate": 4.1581537144996526e-05,
"loss": 5.5783,
"step": 6400
},
{
"epoch": 0.5128820611297807,
"grad_norm": 2.9670321941375732,
"learning_rate": 4.156816601593839e-05,
"loss": 5.7176,
"step": 6410
},
{
"epoch": 0.5136821891502641,
"grad_norm": 2.76875901222229,
"learning_rate": 4.155479488688025e-05,
"loss": 5.5584,
"step": 6420
},
{
"epoch": 0.5144823171707473,
"grad_norm": 3.2874600887298584,
"learning_rate": 4.1541423757822114e-05,
"loss": 5.8726,
"step": 6430
},
{
"epoch": 0.5152824451912306,
"grad_norm": 2.4672482013702393,
"learning_rate": 4.1528052628763977e-05,
"loss": 5.764,
"step": 6440
},
{
"epoch": 0.5160825732117139,
"grad_norm": 3.5424506664276123,
"learning_rate": 4.151468149970584e-05,
"loss": 5.6612,
"step": 6450
},
{
"epoch": 0.5168827012321972,
"grad_norm": 2.7947871685028076,
"learning_rate": 4.15013103706477e-05,
"loss": 5.668,
"step": 6460
},
{
"epoch": 0.5176828292526804,
"grad_norm": 2.624370574951172,
"learning_rate": 4.1487939241589565e-05,
"loss": 5.577,
"step": 6470
},
{
"epoch": 0.5184829572731637,
"grad_norm": 2.276289701461792,
"learning_rate": 4.147456811253143e-05,
"loss": 5.7592,
"step": 6480
},
{
"epoch": 0.519283085293647,
"grad_norm": 2.751945972442627,
"learning_rate": 4.146119698347329e-05,
"loss": 5.6251,
"step": 6490
},
{
"epoch": 0.5200832133141302,
"grad_norm": 2.1990444660186768,
"learning_rate": 4.144782585441515e-05,
"loss": 5.5141,
"step": 6500
},
{
"epoch": 0.5208833413346136,
"grad_norm": 2.732024908065796,
"learning_rate": 4.1434454725357015e-05,
"loss": 5.5938,
"step": 6510
},
{
"epoch": 0.5216834693550968,
"grad_norm": 2.6876533031463623,
"learning_rate": 4.142108359629887e-05,
"loss": 5.7126,
"step": 6520
},
{
"epoch": 0.5224835973755801,
"grad_norm": 2.660323143005371,
"learning_rate": 4.1407712467240734e-05,
"loss": 5.6261,
"step": 6530
},
{
"epoch": 0.5232837253960634,
"grad_norm": 2.567084550857544,
"learning_rate": 4.13943413381826e-05,
"loss": 5.5248,
"step": 6540
},
{
"epoch": 0.5240838534165466,
"grad_norm": 4.317018032073975,
"learning_rate": 4.138097020912446e-05,
"loss": 5.4444,
"step": 6550
},
{
"epoch": 0.52488398143703,
"grad_norm": 2.0361647605895996,
"learning_rate": 4.136759908006632e-05,
"loss": 5.7532,
"step": 6560
},
{
"epoch": 0.5256841094575132,
"grad_norm": 2.0946271419525146,
"learning_rate": 4.1354227951008185e-05,
"loss": 5.6343,
"step": 6570
},
{
"epoch": 0.5264842374779964,
"grad_norm": 3.3724842071533203,
"learning_rate": 4.134085682195005e-05,
"loss": 5.6455,
"step": 6580
},
{
"epoch": 0.5272843654984798,
"grad_norm": 4.078947067260742,
"learning_rate": 4.132748569289191e-05,
"loss": 5.6681,
"step": 6590
},
{
"epoch": 0.528084493518963,
"grad_norm": 4.288105010986328,
"learning_rate": 4.131411456383377e-05,
"loss": 5.7152,
"step": 6600
},
{
"epoch": 0.5288846215394463,
"grad_norm": 2.5208754539489746,
"learning_rate": 4.1300743434775635e-05,
"loss": 5.5715,
"step": 6610
},
{
"epoch": 0.5296847495599296,
"grad_norm": 2.6902217864990234,
"learning_rate": 4.12873723057175e-05,
"loss": 5.4997,
"step": 6620
},
{
"epoch": 0.5304848775804129,
"grad_norm": 2.4580068588256836,
"learning_rate": 4.127400117665936e-05,
"loss": 5.7656,
"step": 6630
},
{
"epoch": 0.5312850056008962,
"grad_norm": 2.5117955207824707,
"learning_rate": 4.1260630047601224e-05,
"loss": 5.6373,
"step": 6640
},
{
"epoch": 0.5320851336213794,
"grad_norm": 2.660921096801758,
"learning_rate": 4.1247258918543086e-05,
"loss": 5.6829,
"step": 6650
},
{
"epoch": 0.5328852616418627,
"grad_norm": 2.4601287841796875,
"learning_rate": 4.123388778948495e-05,
"loss": 5.7702,
"step": 6660
},
{
"epoch": 0.533685389662346,
"grad_norm": 2.9025120735168457,
"learning_rate": 4.122051666042681e-05,
"loss": 5.6374,
"step": 6670
},
{
"epoch": 0.5344855176828293,
"grad_norm": 2.8221569061279297,
"learning_rate": 4.120714553136867e-05,
"loss": 5.5568,
"step": 6680
},
{
"epoch": 0.5352856457033125,
"grad_norm": 2.3035178184509277,
"learning_rate": 4.119377440231053e-05,
"loss": 5.5845,
"step": 6690
},
{
"epoch": 0.5360857737237958,
"grad_norm": 2.0955657958984375,
"learning_rate": 4.118040327325239e-05,
"loss": 5.687,
"step": 6700
},
{
"epoch": 0.5368859017442791,
"grad_norm": 2.530156135559082,
"learning_rate": 4.1167032144194256e-05,
"loss": 5.5772,
"step": 6710
},
{
"epoch": 0.5376860297647623,
"grad_norm": 2.2060387134552,
"learning_rate": 4.115366101513612e-05,
"loss": 5.5964,
"step": 6720
},
{
"epoch": 0.5384861577852457,
"grad_norm": 2.720702886581421,
"learning_rate": 4.114028988607798e-05,
"loss": 5.5432,
"step": 6730
},
{
"epoch": 0.5392862858057289,
"grad_norm": 2.2585232257843018,
"learning_rate": 4.1126918757019844e-05,
"loss": 5.77,
"step": 6740
},
{
"epoch": 0.5400864138262121,
"grad_norm": 2.052316904067993,
"learning_rate": 4.1113547627961706e-05,
"loss": 5.5679,
"step": 6750
},
{
"epoch": 0.5408865418466955,
"grad_norm": 2.772500991821289,
"learning_rate": 4.110017649890357e-05,
"loss": 5.5608,
"step": 6760
},
{
"epoch": 0.5416866698671787,
"grad_norm": 2.158129930496216,
"learning_rate": 4.108680536984543e-05,
"loss": 5.6612,
"step": 6770
},
{
"epoch": 0.5424867978876621,
"grad_norm": 2.874685287475586,
"learning_rate": 4.1073434240787294e-05,
"loss": 5.5999,
"step": 6780
},
{
"epoch": 0.5432869259081453,
"grad_norm": 2.2797632217407227,
"learning_rate": 4.106006311172916e-05,
"loss": 5.7243,
"step": 6790
},
{
"epoch": 0.5440870539286286,
"grad_norm": 2.998309850692749,
"learning_rate": 4.1048029095576836e-05,
"loss": 5.5031,
"step": 6800
},
{
"epoch": 0.5448871819491119,
"grad_norm": 2.8155364990234375,
"learning_rate": 4.10346579665187e-05,
"loss": 5.7631,
"step": 6810
},
{
"epoch": 0.5456873099695951,
"grad_norm": 2.327279806137085,
"learning_rate": 4.102128683746056e-05,
"loss": 5.6293,
"step": 6820
},
{
"epoch": 0.5464874379900784,
"grad_norm": 3.3200621604919434,
"learning_rate": 4.100791570840242e-05,
"loss": 5.717,
"step": 6830
},
{
"epoch": 0.5472875660105617,
"grad_norm": 2.521144390106201,
"learning_rate": 4.099454457934428e-05,
"loss": 5.5705,
"step": 6840
},
{
"epoch": 0.548087694031045,
"grad_norm": 2.7198219299316406,
"learning_rate": 4.098117345028614e-05,
"loss": 5.5931,
"step": 6850
},
{
"epoch": 0.5488878220515282,
"grad_norm": 2.701251268386841,
"learning_rate": 4.0967802321228006e-05,
"loss": 5.4706,
"step": 6860
},
{
"epoch": 0.5496879500720115,
"grad_norm": 2.2789149284362793,
"learning_rate": 4.095443119216987e-05,
"loss": 5.5883,
"step": 6870
},
{
"epoch": 0.5504880780924948,
"grad_norm": 2.8821568489074707,
"learning_rate": 4.094106006311173e-05,
"loss": 5.7525,
"step": 6880
},
{
"epoch": 0.5512882061129781,
"grad_norm": 2.3450064659118652,
"learning_rate": 4.0927688934053594e-05,
"loss": 5.5166,
"step": 6890
},
{
"epoch": 0.5520883341334614,
"grad_norm": 2.639960527420044,
"learning_rate": 4.0914317804995456e-05,
"loss": 5.7001,
"step": 6900
},
{
"epoch": 0.5528884621539446,
"grad_norm": 2.6743710041046143,
"learning_rate": 4.090094667593732e-05,
"loss": 5.7049,
"step": 6910
},
{
"epoch": 0.553688590174428,
"grad_norm": 2.7540199756622314,
"learning_rate": 4.088757554687918e-05,
"loss": 5.5705,
"step": 6920
},
{
"epoch": 0.5544887181949112,
"grad_norm": 3.2703442573547363,
"learning_rate": 4.0874204417821044e-05,
"loss": 5.5585,
"step": 6930
},
{
"epoch": 0.5552888462153944,
"grad_norm": 3.684135913848877,
"learning_rate": 4.086083328876291e-05,
"loss": 5.6561,
"step": 6940
},
{
"epoch": 0.5560889742358778,
"grad_norm": 2.918989896774292,
"learning_rate": 4.084746215970477e-05,
"loss": 5.5171,
"step": 6950
},
{
"epoch": 0.556889102256361,
"grad_norm": 2.5902323722839355,
"learning_rate": 4.083409103064663e-05,
"loss": 5.6703,
"step": 6960
},
{
"epoch": 0.5576892302768442,
"grad_norm": 2.23820161819458,
"learning_rate": 4.0820719901588495e-05,
"loss": 5.7048,
"step": 6970
},
{
"epoch": 0.5584893582973276,
"grad_norm": 2.4339401721954346,
"learning_rate": 4.080734877253036e-05,
"loss": 5.4264,
"step": 6980
},
{
"epoch": 0.5592894863178108,
"grad_norm": 3.3097031116485596,
"learning_rate": 4.0793977643472214e-05,
"loss": 5.5931,
"step": 6990
},
{
"epoch": 0.5600896143382941,
"grad_norm": 2.6903202533721924,
"learning_rate": 4.0780606514414077e-05,
"loss": 5.5349,
"step": 7000
},
{
"epoch": 0.5600896143382941,
"eval_loss": 5.870830535888672,
"eval_runtime": 13.3044,
"eval_samples_per_second": 3.007,
"eval_steps_per_second": 0.376,
"step": 7000
},
{
"epoch": 0.5608897423587774,
"grad_norm": 2.144684314727783,
"learning_rate": 4.076723538535594e-05,
"loss": 5.6295,
"step": 7010
},
{
"epoch": 0.5616898703792607,
"grad_norm": 3.227046489715576,
"learning_rate": 4.07538642562978e-05,
"loss": 5.5506,
"step": 7020
},
{
"epoch": 0.562489998399744,
"grad_norm": 2.7323713302612305,
"learning_rate": 4.0740493127239665e-05,
"loss": 5.5441,
"step": 7030
},
{
"epoch": 0.5632901264202272,
"grad_norm": 2.3682384490966797,
"learning_rate": 4.072712199818153e-05,
"loss": 5.6632,
"step": 7040
},
{
"epoch": 0.5640902544407105,
"grad_norm": 3.006518602371216,
"learning_rate": 4.071375086912339e-05,
"loss": 5.5702,
"step": 7050
},
{
"epoch": 0.5648903824611938,
"grad_norm": 2.554481029510498,
"learning_rate": 4.070037974006525e-05,
"loss": 5.4405,
"step": 7060
},
{
"epoch": 0.5656905104816771,
"grad_norm": 2.2349042892456055,
"learning_rate": 4.0687008611007115e-05,
"loss": 5.5774,
"step": 7070
},
{
"epoch": 0.5664906385021603,
"grad_norm": 2.24906325340271,
"learning_rate": 4.067363748194898e-05,
"loss": 5.6362,
"step": 7080
},
{
"epoch": 0.5672907665226437,
"grad_norm": 2.2345407009124756,
"learning_rate": 4.066026635289084e-05,
"loss": 5.642,
"step": 7090
},
{
"epoch": 0.5680908945431269,
"grad_norm": 3.2273216247558594,
"learning_rate": 4.0646895223832703e-05,
"loss": 5.5204,
"step": 7100
},
{
"epoch": 0.5688910225636101,
"grad_norm": 2.689624071121216,
"learning_rate": 4.0633524094774566e-05,
"loss": 5.5565,
"step": 7110
},
{
"epoch": 0.5696911505840935,
"grad_norm": 3.4473490715026855,
"learning_rate": 4.062015296571643e-05,
"loss": 5.4041,
"step": 7120
},
{
"epoch": 0.5704912786045767,
"grad_norm": 2.528700590133667,
"learning_rate": 4.060678183665829e-05,
"loss": 5.4294,
"step": 7130
},
{
"epoch": 0.5712914066250601,
"grad_norm": 2.6679399013519287,
"learning_rate": 4.059341070760015e-05,
"loss": 5.6018,
"step": 7140
},
{
"epoch": 0.5720915346455433,
"grad_norm": 2.0572123527526855,
"learning_rate": 4.058003957854201e-05,
"loss": 5.6527,
"step": 7150
},
{
"epoch": 0.5728916626660265,
"grad_norm": 2.446279287338257,
"learning_rate": 4.056666844948387e-05,
"loss": 5.5862,
"step": 7160
},
{
"epoch": 0.5736917906865099,
"grad_norm": 2.067232131958008,
"learning_rate": 4.0553297320425735e-05,
"loss": 5.5159,
"step": 7170
},
{
"epoch": 0.5744919187069931,
"grad_norm": 2.225755214691162,
"learning_rate": 4.05399261913676e-05,
"loss": 5.6483,
"step": 7180
},
{
"epoch": 0.5752920467274764,
"grad_norm": 2.3613367080688477,
"learning_rate": 4.052655506230946e-05,
"loss": 5.6226,
"step": 7190
},
{
"epoch": 0.5760921747479597,
"grad_norm": 2.4239625930786133,
"learning_rate": 4.0513183933251324e-05,
"loss": 5.6164,
"step": 7200
},
{
"epoch": 0.5768923027684429,
"grad_norm": 3.5525450706481934,
"learning_rate": 4.0499812804193186e-05,
"loss": 5.4503,
"step": 7210
},
{
"epoch": 0.5776924307889262,
"grad_norm": 2.664311170578003,
"learning_rate": 4.048644167513505e-05,
"loss": 5.5188,
"step": 7220
},
{
"epoch": 0.5784925588094095,
"grad_norm": 2.4020540714263916,
"learning_rate": 4.047307054607691e-05,
"loss": 5.5481,
"step": 7230
},
{
"epoch": 0.5792926868298928,
"grad_norm": 2.256044626235962,
"learning_rate": 4.0459699417018774e-05,
"loss": 5.6097,
"step": 7240
},
{
"epoch": 0.5800928148503761,
"grad_norm": 2.1168150901794434,
"learning_rate": 4.044632828796064e-05,
"loss": 5.5249,
"step": 7250
},
{
"epoch": 0.5808929428708594,
"grad_norm": 2.329375743865967,
"learning_rate": 4.04329571589025e-05,
"loss": 5.504,
"step": 7260
},
{
"epoch": 0.5816930708913426,
"grad_norm": 2.1734092235565186,
"learning_rate": 4.041958602984436e-05,
"loss": 5.5017,
"step": 7270
},
{
"epoch": 0.5824931989118259,
"grad_norm": 3.232649564743042,
"learning_rate": 4.0406214900786225e-05,
"loss": 5.6462,
"step": 7280
},
{
"epoch": 0.5832933269323092,
"grad_norm": 3.140702724456787,
"learning_rate": 4.039284377172809e-05,
"loss": 5.4393,
"step": 7290
},
{
"epoch": 0.5840934549527924,
"grad_norm": 2.284515619277954,
"learning_rate": 4.0379472642669944e-05,
"loss": 5.4891,
"step": 7300
},
{
"epoch": 0.5848935829732758,
"grad_norm": 4.518533706665039,
"learning_rate": 4.0366101513611806e-05,
"loss": 5.7371,
"step": 7310
},
{
"epoch": 0.585693710993759,
"grad_norm": 2.2323620319366455,
"learning_rate": 4.035273038455367e-05,
"loss": 5.6324,
"step": 7320
},
{
"epoch": 0.5864938390142422,
"grad_norm": 3.123394012451172,
"learning_rate": 4.033935925549553e-05,
"loss": 5.6266,
"step": 7330
},
{
"epoch": 0.5872939670347256,
"grad_norm": 2.577545642852783,
"learning_rate": 4.0325988126437394e-05,
"loss": 5.6541,
"step": 7340
},
{
"epoch": 0.5880940950552088,
"grad_norm": 2.8590281009674072,
"learning_rate": 4.031261699737926e-05,
"loss": 5.6927,
"step": 7350
},
{
"epoch": 0.5888942230756921,
"grad_norm": 3.0693793296813965,
"learning_rate": 4.029924586832112e-05,
"loss": 5.5101,
"step": 7360
},
{
"epoch": 0.5896943510961754,
"grad_norm": 2.5813119411468506,
"learning_rate": 4.028587473926298e-05,
"loss": 5.625,
"step": 7370
},
{
"epoch": 0.5904944791166586,
"grad_norm": 2.7804691791534424,
"learning_rate": 4.0272503610204845e-05,
"loss": 5.6264,
"step": 7380
},
{
"epoch": 0.591294607137142,
"grad_norm": 2.4291296005249023,
"learning_rate": 4.025913248114671e-05,
"loss": 5.5024,
"step": 7390
},
{
"epoch": 0.5920947351576252,
"grad_norm": 2.6989386081695557,
"learning_rate": 4.024576135208857e-05,
"loss": 5.4484,
"step": 7400
},
{
"epoch": 0.5928948631781085,
"grad_norm": 2.42767596244812,
"learning_rate": 4.023239022303043e-05,
"loss": 5.5537,
"step": 7410
},
{
"epoch": 0.5936949911985918,
"grad_norm": 2.492577075958252,
"learning_rate": 4.0219019093972296e-05,
"loss": 5.616,
"step": 7420
},
{
"epoch": 0.594495119219075,
"grad_norm": 2.4696478843688965,
"learning_rate": 4.020564796491416e-05,
"loss": 5.62,
"step": 7430
},
{
"epoch": 0.5952952472395583,
"grad_norm": 3.2339985370635986,
"learning_rate": 4.019227683585602e-05,
"loss": 5.485,
"step": 7440
},
{
"epoch": 0.5960953752600416,
"grad_norm": 3.9647512435913086,
"learning_rate": 4.0178905706797884e-05,
"loss": 5.5868,
"step": 7450
},
{
"epoch": 0.5968955032805249,
"grad_norm": 2.36417293548584,
"learning_rate": 4.016553457773975e-05,
"loss": 5.5179,
"step": 7460
},
{
"epoch": 0.5976956313010081,
"grad_norm": 2.1484084129333496,
"learning_rate": 4.015216344868161e-05,
"loss": 5.6915,
"step": 7470
},
{
"epoch": 0.5984957593214915,
"grad_norm": 2.5233757495880127,
"learning_rate": 4.013879231962347e-05,
"loss": 5.4879,
"step": 7480
},
{
"epoch": 0.5992958873419747,
"grad_norm": 3.3730146884918213,
"learning_rate": 4.0125421190565335e-05,
"loss": 5.6531,
"step": 7490
},
{
"epoch": 0.600096015362458,
"grad_norm": 3.0788846015930176,
"learning_rate": 4.01120500615072e-05,
"loss": 5.6058,
"step": 7500
},
{
"epoch": 0.6008961433829413,
"grad_norm": 2.93515944480896,
"learning_rate": 4.009867893244906e-05,
"loss": 5.4777,
"step": 7510
},
{
"epoch": 0.6016962714034245,
"grad_norm": 2.6020236015319824,
"learning_rate": 4.008530780339092e-05,
"loss": 5.6444,
"step": 7520
},
{
"epoch": 0.6024963994239079,
"grad_norm": 2.4522392749786377,
"learning_rate": 4.0071936674332786e-05,
"loss": 5.6157,
"step": 7530
},
{
"epoch": 0.6032965274443911,
"grad_norm": 3.1317343711853027,
"learning_rate": 4.005856554527465e-05,
"loss": 5.5527,
"step": 7540
},
{
"epoch": 0.6040966554648743,
"grad_norm": 2.485154390335083,
"learning_rate": 4.004519441621651e-05,
"loss": 5.6467,
"step": 7550
},
{
"epoch": 0.6048967834853577,
"grad_norm": 2.2032833099365234,
"learning_rate": 4.0031823287158374e-05,
"loss": 5.4957,
"step": 7560
},
{
"epoch": 0.6056969115058409,
"grad_norm": 3.1787898540496826,
"learning_rate": 4.0018452158100236e-05,
"loss": 5.6204,
"step": 7570
},
{
"epoch": 0.6064970395263242,
"grad_norm": 2.9925789833068848,
"learning_rate": 4.00050810290421e-05,
"loss": 5.6732,
"step": 7580
},
{
"epoch": 0.6072971675468075,
"grad_norm": 2.7631521224975586,
"learning_rate": 3.999170989998396e-05,
"loss": 5.6743,
"step": 7590
},
{
"epoch": 0.6080972955672908,
"grad_norm": 2.808265447616577,
"learning_rate": 3.997833877092582e-05,
"loss": 5.5951,
"step": 7600
},
{
"epoch": 0.608897423587774,
"grad_norm": 3.6244983673095703,
"learning_rate": 3.996496764186768e-05,
"loss": 5.5216,
"step": 7610
},
{
"epoch": 0.6096975516082573,
"grad_norm": 2.4245145320892334,
"learning_rate": 3.995159651280954e-05,
"loss": 5.5844,
"step": 7620
},
{
"epoch": 0.6104976796287406,
"grad_norm": 2.2855565547943115,
"learning_rate": 3.9938225383751406e-05,
"loss": 5.5674,
"step": 7630
},
{
"epoch": 0.6112978076492239,
"grad_norm": 2.2801260948181152,
"learning_rate": 3.992485425469327e-05,
"loss": 5.4406,
"step": 7640
},
{
"epoch": 0.6120979356697072,
"grad_norm": 2.0117592811584473,
"learning_rate": 3.991148312563513e-05,
"loss": 5.5463,
"step": 7650
},
{
"epoch": 0.6128980636901904,
"grad_norm": 3.110349655151367,
"learning_rate": 3.9898111996576994e-05,
"loss": 5.6124,
"step": 7660
},
{
"epoch": 0.6136981917106737,
"grad_norm": 2.9789066314697266,
"learning_rate": 3.9884740867518856e-05,
"loss": 5.789,
"step": 7670
},
{
"epoch": 0.614498319731157,
"grad_norm": 2.641871213912964,
"learning_rate": 3.987136973846072e-05,
"loss": 5.4838,
"step": 7680
},
{
"epoch": 0.6152984477516402,
"grad_norm": 3.82928466796875,
"learning_rate": 3.985799860940258e-05,
"loss": 5.7108,
"step": 7690
},
{
"epoch": 0.6160985757721236,
"grad_norm": 3.2533349990844727,
"learning_rate": 3.9844627480344444e-05,
"loss": 5.4167,
"step": 7700
},
{
"epoch": 0.6168987037926068,
"grad_norm": 2.4259872436523438,
"learning_rate": 3.983125635128631e-05,
"loss": 5.5539,
"step": 7710
},
{
"epoch": 0.61769883181309,
"grad_norm": 3.5356359481811523,
"learning_rate": 3.981788522222817e-05,
"loss": 5.4643,
"step": 7720
},
{
"epoch": 0.6184989598335734,
"grad_norm": 2.5774996280670166,
"learning_rate": 3.980451409317003e-05,
"loss": 5.5389,
"step": 7730
},
{
"epoch": 0.6192990878540566,
"grad_norm": 2.3197529315948486,
"learning_rate": 3.9791142964111895e-05,
"loss": 5.5724,
"step": 7740
},
{
"epoch": 0.62009921587454,
"grad_norm": 2.2660646438598633,
"learning_rate": 3.977777183505376e-05,
"loss": 5.5675,
"step": 7750
},
{
"epoch": 0.6208993438950232,
"grad_norm": 2.7596511840820312,
"learning_rate": 3.9764400705995614e-05,
"loss": 5.6168,
"step": 7760
},
{
"epoch": 0.6216994719155065,
"grad_norm": 2.4579806327819824,
"learning_rate": 3.9751029576937477e-05,
"loss": 5.4243,
"step": 7770
},
{
"epoch": 0.6224995999359898,
"grad_norm": 2.7039647102355957,
"learning_rate": 3.973765844787934e-05,
"loss": 5.633,
"step": 7780
},
{
"epoch": 0.623299727956473,
"grad_norm": 2.274777412414551,
"learning_rate": 3.97242873188212e-05,
"loss": 5.5945,
"step": 7790
},
{
"epoch": 0.6240998559769563,
"grad_norm": 2.4263217449188232,
"learning_rate": 3.9710916189763065e-05,
"loss": 5.6763,
"step": 7800
},
{
"epoch": 0.6248999839974396,
"grad_norm": 3.420625686645508,
"learning_rate": 3.969754506070493e-05,
"loss": 5.4884,
"step": 7810
},
{
"epoch": 0.6257001120179229,
"grad_norm": 2.1576149463653564,
"learning_rate": 3.968417393164679e-05,
"loss": 5.6325,
"step": 7820
},
{
"epoch": 0.6265002400384061,
"grad_norm": 2.4189348220825195,
"learning_rate": 3.967080280258865e-05,
"loss": 5.5113,
"step": 7830
},
{
"epoch": 0.6273003680588894,
"grad_norm": 2.533433675765991,
"learning_rate": 3.9657431673530515e-05,
"loss": 5.3743,
"step": 7840
},
{
"epoch": 0.6281004960793727,
"grad_norm": 2.2747883796691895,
"learning_rate": 3.964406054447238e-05,
"loss": 5.4912,
"step": 7850
},
{
"epoch": 0.628900624099856,
"grad_norm": 2.546261787414551,
"learning_rate": 3.963068941541424e-05,
"loss": 5.6571,
"step": 7860
},
{
"epoch": 0.6297007521203393,
"grad_norm": 2.5970914363861084,
"learning_rate": 3.9617318286356103e-05,
"loss": 5.6732,
"step": 7870
},
{
"epoch": 0.6305008801408225,
"grad_norm": 2.956646680831909,
"learning_rate": 3.9603947157297966e-05,
"loss": 5.4769,
"step": 7880
},
{
"epoch": 0.6313010081613059,
"grad_norm": 2.9553463459014893,
"learning_rate": 3.959057602823983e-05,
"loss": 5.4675,
"step": 7890
},
{
"epoch": 0.6321011361817891,
"grad_norm": 2.6471643447875977,
"learning_rate": 3.957720489918169e-05,
"loss": 5.4538,
"step": 7900
},
{
"epoch": 0.6329012642022723,
"grad_norm": 2.847944736480713,
"learning_rate": 3.956383377012355e-05,
"loss": 5.384,
"step": 7910
},
{
"epoch": 0.6337013922227557,
"grad_norm": 3.6218080520629883,
"learning_rate": 3.955046264106541e-05,
"loss": 5.56,
"step": 7920
},
{
"epoch": 0.6345015202432389,
"grad_norm": 2.396426200866699,
"learning_rate": 3.953709151200727e-05,
"loss": 5.6353,
"step": 7930
},
{
"epoch": 0.6353016482637222,
"grad_norm": 2.4465904235839844,
"learning_rate": 3.9523720382949135e-05,
"loss": 5.6698,
"step": 7940
},
{
"epoch": 0.6361017762842055,
"grad_norm": 2.6707208156585693,
"learning_rate": 3.9510349253891e-05,
"loss": 5.4316,
"step": 7950
},
{
"epoch": 0.6369019043046887,
"grad_norm": 2.982117176055908,
"learning_rate": 3.949697812483286e-05,
"loss": 5.6359,
"step": 7960
},
{
"epoch": 0.637702032325172,
"grad_norm": 2.6343331336975098,
"learning_rate": 3.9483606995774724e-05,
"loss": 5.6188,
"step": 7970
},
{
"epoch": 0.6385021603456553,
"grad_norm": 2.290728807449341,
"learning_rate": 3.9470235866716586e-05,
"loss": 5.5824,
"step": 7980
},
{
"epoch": 0.6393022883661386,
"grad_norm": 2.3056259155273438,
"learning_rate": 3.945686473765845e-05,
"loss": 5.5314,
"step": 7990
},
{
"epoch": 0.6401024163866219,
"grad_norm": 2.301790714263916,
"learning_rate": 3.944349360860031e-05,
"loss": 5.497,
"step": 8000
},
{
"epoch": 0.6409025444071051,
"grad_norm": 2.2784414291381836,
"learning_rate": 3.9430122479542174e-05,
"loss": 5.6482,
"step": 8010
},
{
"epoch": 0.6417026724275884,
"grad_norm": 2.3686752319335938,
"learning_rate": 3.941675135048404e-05,
"loss": 5.449,
"step": 8020
},
{
"epoch": 0.6425028004480717,
"grad_norm": 3.0353329181671143,
"learning_rate": 3.94033802214259e-05,
"loss": 5.4544,
"step": 8030
},
{
"epoch": 0.643302928468555,
"grad_norm": 3.035477876663208,
"learning_rate": 3.939000909236776e-05,
"loss": 5.4641,
"step": 8040
},
{
"epoch": 0.6441030564890382,
"grad_norm": 2.6078028678894043,
"learning_rate": 3.9376637963309625e-05,
"loss": 5.6181,
"step": 8050
},
{
"epoch": 0.6449031845095216,
"grad_norm": 2.7835607528686523,
"learning_rate": 3.936326683425149e-05,
"loss": 5.459,
"step": 8060
},
{
"epoch": 0.6457033125300048,
"grad_norm": 2.465331792831421,
"learning_rate": 3.9349895705193344e-05,
"loss": 5.5365,
"step": 8070
},
{
"epoch": 0.646503440550488,
"grad_norm": 2.0666961669921875,
"learning_rate": 3.9336524576135206e-05,
"loss": 5.5158,
"step": 8080
},
{
"epoch": 0.6473035685709714,
"grad_norm": 2.2512967586517334,
"learning_rate": 3.932315344707707e-05,
"loss": 5.4235,
"step": 8090
},
{
"epoch": 0.6481036965914546,
"grad_norm": 2.081125497817993,
"learning_rate": 3.930978231801893e-05,
"loss": 5.4172,
"step": 8100
},
{
"epoch": 0.648903824611938,
"grad_norm": 2.0393776893615723,
"learning_rate": 3.9296411188960794e-05,
"loss": 5.5454,
"step": 8110
},
{
"epoch": 0.6497039526324212,
"grad_norm": 2.671065092086792,
"learning_rate": 3.928304005990266e-05,
"loss": 5.4562,
"step": 8120
},
{
"epoch": 0.6505040806529044,
"grad_norm": 2.3266165256500244,
"learning_rate": 3.926966893084452e-05,
"loss": 5.5839,
"step": 8130
},
{
"epoch": 0.6513042086733878,
"grad_norm": 2.400386333465576,
"learning_rate": 3.925629780178638e-05,
"loss": 5.7815,
"step": 8140
},
{
"epoch": 0.652104336693871,
"grad_norm": 2.3798139095306396,
"learning_rate": 3.9242926672728245e-05,
"loss": 5.5736,
"step": 8150
},
{
"epoch": 0.6529044647143543,
"grad_norm": 2.4090096950531006,
"learning_rate": 3.922955554367011e-05,
"loss": 5.4634,
"step": 8160
},
{
"epoch": 0.6537045927348376,
"grad_norm": 3.5072951316833496,
"learning_rate": 3.921618441461197e-05,
"loss": 5.5608,
"step": 8170
},
{
"epoch": 0.6545047207553208,
"grad_norm": 2.364222526550293,
"learning_rate": 3.920281328555383e-05,
"loss": 5.7275,
"step": 8180
},
{
"epoch": 0.6553048487758041,
"grad_norm": 4.594448566436768,
"learning_rate": 3.9189442156495696e-05,
"loss": 5.7235,
"step": 8190
},
{
"epoch": 0.6561049767962874,
"grad_norm": 3.863098621368408,
"learning_rate": 3.917607102743756e-05,
"loss": 5.5359,
"step": 8200
},
{
"epoch": 0.6569051048167707,
"grad_norm": 3.201704978942871,
"learning_rate": 3.916269989837942e-05,
"loss": 5.645,
"step": 8210
},
{
"epoch": 0.6577052328372539,
"grad_norm": 2.697448492050171,
"learning_rate": 3.9149328769321284e-05,
"loss": 5.523,
"step": 8220
},
{
"epoch": 0.6585053608577373,
"grad_norm": 2.4561972618103027,
"learning_rate": 3.913595764026315e-05,
"loss": 5.734,
"step": 8230
},
{
"epoch": 0.6593054888782205,
"grad_norm": 4.527692794799805,
"learning_rate": 3.912258651120501e-05,
"loss": 5.4594,
"step": 8240
},
{
"epoch": 0.6601056168987038,
"grad_norm": 2.8713691234588623,
"learning_rate": 3.910921538214687e-05,
"loss": 5.7247,
"step": 8250
},
{
"epoch": 0.6609057449191871,
"grad_norm": 2.167921304702759,
"learning_rate": 3.9095844253088735e-05,
"loss": 5.6405,
"step": 8260
},
{
"epoch": 0.6617058729396703,
"grad_norm": 2.8967878818511963,
"learning_rate": 3.90824731240306e-05,
"loss": 5.4989,
"step": 8270
},
{
"epoch": 0.6625060009601537,
"grad_norm": 2.002103090286255,
"learning_rate": 3.906910199497246e-05,
"loss": 5.4434,
"step": 8280
},
{
"epoch": 0.6633061289806369,
"grad_norm": 2.187889575958252,
"learning_rate": 3.905573086591432e-05,
"loss": 5.4078,
"step": 8290
},
{
"epoch": 0.6641062570011201,
"grad_norm": 2.4078755378723145,
"learning_rate": 3.9042359736856186e-05,
"loss": 5.5381,
"step": 8300
},
{
"epoch": 0.6649063850216035,
"grad_norm": 3.071484327316284,
"learning_rate": 3.902898860779805e-05,
"loss": 5.4298,
"step": 8310
},
{
"epoch": 0.6657065130420867,
"grad_norm": 3.8413217067718506,
"learning_rate": 3.901561747873991e-05,
"loss": 5.4844,
"step": 8320
},
{
"epoch": 0.66650664106257,
"grad_norm": 3.0394554138183594,
"learning_rate": 3.9002246349681774e-05,
"loss": 5.5524,
"step": 8330
},
{
"epoch": 0.6673067690830533,
"grad_norm": 2.635354518890381,
"learning_rate": 3.8988875220623636e-05,
"loss": 5.5727,
"step": 8340
},
{
"epoch": 0.6681068971035365,
"grad_norm": 2.2557764053344727,
"learning_rate": 3.89755040915655e-05,
"loss": 5.3455,
"step": 8350
},
{
"epoch": 0.6689070251240199,
"grad_norm": 2.837040662765503,
"learning_rate": 3.896213296250736e-05,
"loss": 5.3729,
"step": 8360
},
{
"epoch": 0.6697071531445031,
"grad_norm": 6.783266067504883,
"learning_rate": 3.8948761833449224e-05,
"loss": 5.4372,
"step": 8370
},
{
"epoch": 0.6705072811649864,
"grad_norm": 2.20611310005188,
"learning_rate": 3.893539070439108e-05,
"loss": 5.4983,
"step": 8380
},
{
"epoch": 0.6713074091854697,
"grad_norm": 2.378692626953125,
"learning_rate": 3.892201957533294e-05,
"loss": 5.6309,
"step": 8390
},
{
"epoch": 0.672107537205953,
"grad_norm": 2.7219278812408447,
"learning_rate": 3.8908648446274806e-05,
"loss": 5.67,
"step": 8400
},
{
"epoch": 0.6729076652264362,
"grad_norm": 2.7383148670196533,
"learning_rate": 3.889527731721667e-05,
"loss": 5.5648,
"step": 8410
},
{
"epoch": 0.6737077932469195,
"grad_norm": 1.882124423980713,
"learning_rate": 3.888190618815853e-05,
"loss": 5.5879,
"step": 8420
},
{
"epoch": 0.6745079212674028,
"grad_norm": 2.5975465774536133,
"learning_rate": 3.8868535059100394e-05,
"loss": 5.5644,
"step": 8430
},
{
"epoch": 0.675308049287886,
"grad_norm": 3.4361534118652344,
"learning_rate": 3.8855163930042256e-05,
"loss": 5.6302,
"step": 8440
},
{
"epoch": 0.6761081773083694,
"grad_norm": 2.241267442703247,
"learning_rate": 3.884179280098412e-05,
"loss": 5.5003,
"step": 8450
},
{
"epoch": 0.6769083053288526,
"grad_norm": 1.9234975576400757,
"learning_rate": 3.882842167192598e-05,
"loss": 5.4739,
"step": 8460
},
{
"epoch": 0.677708433349336,
"grad_norm": 2.05928897857666,
"learning_rate": 3.8815050542867845e-05,
"loss": 5.5566,
"step": 8470
},
{
"epoch": 0.6785085613698192,
"grad_norm": 2.5602006912231445,
"learning_rate": 3.880167941380971e-05,
"loss": 5.6363,
"step": 8480
},
{
"epoch": 0.6793086893903024,
"grad_norm": 2.36325740814209,
"learning_rate": 3.878830828475157e-05,
"loss": 5.4635,
"step": 8490
},
{
"epoch": 0.6801088174107858,
"grad_norm": 3.087769031524658,
"learning_rate": 3.877493715569343e-05,
"loss": 5.5537,
"step": 8500
},
{
"epoch": 0.680908945431269,
"grad_norm": 2.759660482406616,
"learning_rate": 3.8761566026635295e-05,
"loss": 5.5427,
"step": 8510
},
{
"epoch": 0.6817090734517522,
"grad_norm": 2.7726991176605225,
"learning_rate": 3.874819489757716e-05,
"loss": 5.4868,
"step": 8520
},
{
"epoch": 0.6825092014722356,
"grad_norm": 3.408202648162842,
"learning_rate": 3.8734823768519014e-05,
"loss": 5.5416,
"step": 8530
},
{
"epoch": 0.6833093294927188,
"grad_norm": 3.801959753036499,
"learning_rate": 3.8721452639460877e-05,
"loss": 5.5577,
"step": 8540
},
{
"epoch": 0.6841094575132021,
"grad_norm": 2.7447824478149414,
"learning_rate": 3.870808151040274e-05,
"loss": 5.5837,
"step": 8550
},
{
"epoch": 0.6849095855336854,
"grad_norm": 3.7551326751708984,
"learning_rate": 3.86947103813446e-05,
"loss": 5.4772,
"step": 8560
},
{
"epoch": 0.6857097135541687,
"grad_norm": 2.036146640777588,
"learning_rate": 3.8681339252286465e-05,
"loss": 5.659,
"step": 8570
},
{
"epoch": 0.6865098415746519,
"grad_norm": 2.392986536026001,
"learning_rate": 3.866796812322833e-05,
"loss": 5.3913,
"step": 8580
},
{
"epoch": 0.6873099695951352,
"grad_norm": 2.7194063663482666,
"learning_rate": 3.865459699417019e-05,
"loss": 5.418,
"step": 8590
},
{
"epoch": 0.6881100976156185,
"grad_norm": 2.2499608993530273,
"learning_rate": 3.864122586511205e-05,
"loss": 5.4924,
"step": 8600
},
{
"epoch": 0.6889102256361018,
"grad_norm": 3.661318302154541,
"learning_rate": 3.8627854736053915e-05,
"loss": 5.5578,
"step": 8610
},
{
"epoch": 0.6897103536565851,
"grad_norm": 3.076019048690796,
"learning_rate": 3.861448360699578e-05,
"loss": 5.6017,
"step": 8620
},
{
"epoch": 0.6905104816770683,
"grad_norm": 2.133923053741455,
"learning_rate": 3.860111247793764e-05,
"loss": 5.5295,
"step": 8630
},
{
"epoch": 0.6913106096975516,
"grad_norm": 3.3584773540496826,
"learning_rate": 3.8587741348879503e-05,
"loss": 5.4534,
"step": 8640
},
{
"epoch": 0.6921107377180349,
"grad_norm": 2.499058723449707,
"learning_rate": 3.8574370219821366e-05,
"loss": 5.3402,
"step": 8650
},
{
"epoch": 0.6929108657385181,
"grad_norm": 2.5099146366119385,
"learning_rate": 3.856099909076323e-05,
"loss": 5.3765,
"step": 8660
},
{
"epoch": 0.6937109937590015,
"grad_norm": 2.9601941108703613,
"learning_rate": 3.854762796170509e-05,
"loss": 5.5139,
"step": 8670
},
{
"epoch": 0.6945111217794847,
"grad_norm": 3.2487246990203857,
"learning_rate": 3.8534256832646954e-05,
"loss": 5.5665,
"step": 8680
},
{
"epoch": 0.695311249799968,
"grad_norm": 2.8433704376220703,
"learning_rate": 3.852088570358881e-05,
"loss": 5.4445,
"step": 8690
},
{
"epoch": 0.6961113778204513,
"grad_norm": 2.204953670501709,
"learning_rate": 3.850751457453067e-05,
"loss": 5.5415,
"step": 8700
},
{
"epoch": 0.6969115058409345,
"grad_norm": 2.7477571964263916,
"learning_rate": 3.8494143445472536e-05,
"loss": 5.5603,
"step": 8710
},
{
"epoch": 0.6977116338614179,
"grad_norm": 3.2059755325317383,
"learning_rate": 3.84807723164144e-05,
"loss": 5.5524,
"step": 8720
},
{
"epoch": 0.6985117618819011,
"grad_norm": 3.2654213905334473,
"learning_rate": 3.846740118735626e-05,
"loss": 5.5482,
"step": 8730
},
{
"epoch": 0.6993118899023844,
"grad_norm": 2.3536834716796875,
"learning_rate": 3.8454030058298124e-05,
"loss": 5.6251,
"step": 8740
},
{
"epoch": 0.7001120179228677,
"grad_norm": 3.132542371749878,
"learning_rate": 3.8440658929239986e-05,
"loss": 5.762,
"step": 8750
},
{
"epoch": 0.7009121459433509,
"grad_norm": 2.3961470127105713,
"learning_rate": 3.842728780018185e-05,
"loss": 5.4919,
"step": 8760
},
{
"epoch": 0.7017122739638342,
"grad_norm": 1.9365229606628418,
"learning_rate": 3.841391667112371e-05,
"loss": 5.4369,
"step": 8770
},
{
"epoch": 0.7025124019843175,
"grad_norm": 2.227877140045166,
"learning_rate": 3.8400545542065574e-05,
"loss": 5.4361,
"step": 8780
},
{
"epoch": 0.7033125300048008,
"grad_norm": 2.521822452545166,
"learning_rate": 3.838717441300744e-05,
"loss": 5.6763,
"step": 8790
},
{
"epoch": 0.704112658025284,
"grad_norm": 2.4155185222625732,
"learning_rate": 3.83738032839493e-05,
"loss": 5.7041,
"step": 8800
},
{
"epoch": 0.7049127860457673,
"grad_norm": 1.9704358577728271,
"learning_rate": 3.836043215489116e-05,
"loss": 5.5136,
"step": 8810
},
{
"epoch": 0.7057129140662506,
"grad_norm": 3.447098731994629,
"learning_rate": 3.8347061025833025e-05,
"loss": 5.5963,
"step": 8820
},
{
"epoch": 0.7065130420867338,
"grad_norm": 2.0857930183410645,
"learning_rate": 3.833368989677489e-05,
"loss": 5.5328,
"step": 8830
},
{
"epoch": 0.7073131701072172,
"grad_norm": 5.354836940765381,
"learning_rate": 3.8320318767716744e-05,
"loss": 5.561,
"step": 8840
},
{
"epoch": 0.7081132981277004,
"grad_norm": 2.1317214965820312,
"learning_rate": 3.8306947638658606e-05,
"loss": 5.7044,
"step": 8850
},
{
"epoch": 0.7089134261481838,
"grad_norm": 2.163472890853882,
"learning_rate": 3.829357650960047e-05,
"loss": 5.4564,
"step": 8860
},
{
"epoch": 0.709713554168667,
"grad_norm": 2.155075788497925,
"learning_rate": 3.828020538054233e-05,
"loss": 5.5767,
"step": 8870
},
{
"epoch": 0.7105136821891502,
"grad_norm": 2.225407361984253,
"learning_rate": 3.8266834251484194e-05,
"loss": 5.574,
"step": 8880
},
{
"epoch": 0.7113138102096336,
"grad_norm": 2.737126350402832,
"learning_rate": 3.825346312242606e-05,
"loss": 5.5425,
"step": 8890
},
{
"epoch": 0.7121139382301168,
"grad_norm": 3.4771502017974854,
"learning_rate": 3.824009199336792e-05,
"loss": 5.6085,
"step": 8900
},
{
"epoch": 0.7129140662506,
"grad_norm": 3.2826528549194336,
"learning_rate": 3.822672086430978e-05,
"loss": 5.5632,
"step": 8910
},
{
"epoch": 0.7137141942710834,
"grad_norm": 2.4936113357543945,
"learning_rate": 3.8213349735251645e-05,
"loss": 5.4818,
"step": 8920
},
{
"epoch": 0.7145143222915666,
"grad_norm": 3.6719648838043213,
"learning_rate": 3.819997860619351e-05,
"loss": 5.5637,
"step": 8930
},
{
"epoch": 0.7153144503120499,
"grad_norm": 2.7252962589263916,
"learning_rate": 3.818660747713537e-05,
"loss": 5.5623,
"step": 8940
},
{
"epoch": 0.7161145783325332,
"grad_norm": 3.8873820304870605,
"learning_rate": 3.817323634807723e-05,
"loss": 5.5009,
"step": 8950
},
{
"epoch": 0.7169147063530165,
"grad_norm": 2.6248092651367188,
"learning_rate": 3.8159865219019096e-05,
"loss": 5.6683,
"step": 8960
},
{
"epoch": 0.7177148343734998,
"grad_norm": 2.1327767372131348,
"learning_rate": 3.814649408996096e-05,
"loss": 5.373,
"step": 8970
},
{
"epoch": 0.718514962393983,
"grad_norm": 3.1641392707824707,
"learning_rate": 3.813312296090282e-05,
"loss": 5.6192,
"step": 8980
},
{
"epoch": 0.7193150904144663,
"grad_norm": 2.533423662185669,
"learning_rate": 3.811975183184468e-05,
"loss": 5.4736,
"step": 8990
},
{
"epoch": 0.7201152184349496,
"grad_norm": 2.892228841781616,
"learning_rate": 3.810638070278654e-05,
"loss": 5.437,
"step": 9000
},
{
"epoch": 0.7209153464554329,
"grad_norm": 2.295328140258789,
"learning_rate": 3.80930095737284e-05,
"loss": 5.4327,
"step": 9010
},
{
"epoch": 0.7217154744759161,
"grad_norm": 2.4300477504730225,
"learning_rate": 3.8079638444670265e-05,
"loss": 5.6341,
"step": 9020
},
{
"epoch": 0.7225156024963995,
"grad_norm": 4.092593669891357,
"learning_rate": 3.806626731561213e-05,
"loss": 5.5062,
"step": 9030
},
{
"epoch": 0.7233157305168827,
"grad_norm": 2.7330925464630127,
"learning_rate": 3.805289618655399e-05,
"loss": 5.4915,
"step": 9040
},
{
"epoch": 0.7241158585373659,
"grad_norm": 2.0372865200042725,
"learning_rate": 3.8039525057495853e-05,
"loss": 5.5056,
"step": 9050
},
{
"epoch": 0.7249159865578493,
"grad_norm": 2.5585618019104004,
"learning_rate": 3.8026153928437716e-05,
"loss": 5.4614,
"step": 9060
},
{
"epoch": 0.7257161145783325,
"grad_norm": 2.653251886367798,
"learning_rate": 3.801278279937958e-05,
"loss": 5.4437,
"step": 9070
},
{
"epoch": 0.7265162425988159,
"grad_norm": 2.7902703285217285,
"learning_rate": 3.799941167032144e-05,
"loss": 5.4927,
"step": 9080
},
{
"epoch": 0.7273163706192991,
"grad_norm": 3.366363525390625,
"learning_rate": 3.7986040541263304e-05,
"loss": 5.382,
"step": 9090
},
{
"epoch": 0.7281164986397823,
"grad_norm": 2.065732479095459,
"learning_rate": 3.797266941220517e-05,
"loss": 5.5663,
"step": 9100
},
{
"epoch": 0.7289166266602657,
"grad_norm": 3.823241710662842,
"learning_rate": 3.795929828314703e-05,
"loss": 5.4697,
"step": 9110
},
{
"epoch": 0.7297167546807489,
"grad_norm": 2.3972017765045166,
"learning_rate": 3.794592715408889e-05,
"loss": 5.5508,
"step": 9120
},
{
"epoch": 0.7305168827012322,
"grad_norm": 2.4955368041992188,
"learning_rate": 3.7932556025030755e-05,
"loss": 5.5437,
"step": 9130
},
{
"epoch": 0.7313170107217155,
"grad_norm": 5.454606533050537,
"learning_rate": 3.791918489597262e-05,
"loss": 5.4974,
"step": 9140
},
{
"epoch": 0.7321171387421987,
"grad_norm": 2.6541287899017334,
"learning_rate": 3.790581376691448e-05,
"loss": 5.5327,
"step": 9150
},
{
"epoch": 0.732917266762682,
"grad_norm": 2.974902391433716,
"learning_rate": 3.789244263785634e-05,
"loss": 5.5352,
"step": 9160
},
{
"epoch": 0.7337173947831653,
"grad_norm": 7.2000274658203125,
"learning_rate": 3.7879071508798206e-05,
"loss": 5.5946,
"step": 9170
},
{
"epoch": 0.7345175228036486,
"grad_norm": 2.418121576309204,
"learning_rate": 3.786570037974007e-05,
"loss": 5.4985,
"step": 9180
},
{
"epoch": 0.7353176508241318,
"grad_norm": 2.3174428939819336,
"learning_rate": 3.785232925068193e-05,
"loss": 5.6393,
"step": 9190
},
{
"epoch": 0.7361177788446152,
"grad_norm": 2.172489643096924,
"learning_rate": 3.7838958121623794e-05,
"loss": 5.6173,
"step": 9200
},
{
"epoch": 0.7369179068650984,
"grad_norm": 3.9107019901275635,
"learning_rate": 3.7825586992565656e-05,
"loss": 5.4436,
"step": 9210
},
{
"epoch": 0.7377180348855817,
"grad_norm": 2.3483355045318604,
"learning_rate": 3.781221586350752e-05,
"loss": 5.4981,
"step": 9220
},
{
"epoch": 0.738518162906065,
"grad_norm": 3.839348077774048,
"learning_rate": 3.779884473444938e-05,
"loss": 5.5541,
"step": 9230
},
{
"epoch": 0.7393182909265482,
"grad_norm": 1.686996579170227,
"learning_rate": 3.7785473605391245e-05,
"loss": 5.6328,
"step": 9240
},
{
"epoch": 0.7401184189470316,
"grad_norm": 2.7277584075927734,
"learning_rate": 3.777210247633311e-05,
"loss": 5.5787,
"step": 9250
},
{
"epoch": 0.7409185469675148,
"grad_norm": 2.60896635055542,
"learning_rate": 3.775873134727497e-05,
"loss": 5.5082,
"step": 9260
},
{
"epoch": 0.741718674987998,
"grad_norm": 2.957674264907837,
"learning_rate": 3.774669733112264e-05,
"loss": 5.516,
"step": 9270
},
{
"epoch": 0.7425188030084814,
"grad_norm": 2.223433017730713,
"learning_rate": 3.7733326202064505e-05,
"loss": 5.502,
"step": 9280
},
{
"epoch": 0.7433189310289646,
"grad_norm": 2.6075685024261475,
"learning_rate": 3.771995507300637e-05,
"loss": 5.5067,
"step": 9290
},
{
"epoch": 0.7441190590494479,
"grad_norm": 2.6572721004486084,
"learning_rate": 3.7706583943948224e-05,
"loss": 5.6304,
"step": 9300
},
{
"epoch": 0.7449191870699312,
"grad_norm": 2.0563318729400635,
"learning_rate": 3.7693212814890086e-05,
"loss": 5.4974,
"step": 9310
},
{
"epoch": 0.7457193150904144,
"grad_norm": 2.032820463180542,
"learning_rate": 3.767984168583195e-05,
"loss": 5.6016,
"step": 9320
},
{
"epoch": 0.7465194431108978,
"grad_norm": 5.646316051483154,
"learning_rate": 3.766647055677381e-05,
"loss": 5.6661,
"step": 9330
},
{
"epoch": 0.747319571131381,
"grad_norm": 2.5043859481811523,
"learning_rate": 3.7653099427715674e-05,
"loss": 5.6445,
"step": 9340
},
{
"epoch": 0.7481196991518643,
"grad_norm": 2.817434787750244,
"learning_rate": 3.763972829865754e-05,
"loss": 5.3901,
"step": 9350
},
{
"epoch": 0.7489198271723476,
"grad_norm": 2.4041759967803955,
"learning_rate": 3.76263571695994e-05,
"loss": 5.7132,
"step": 9360
},
{
"epoch": 0.7497199551928309,
"grad_norm": 1.8806638717651367,
"learning_rate": 3.761298604054126e-05,
"loss": 5.5203,
"step": 9370
},
{
"epoch": 0.7505200832133141,
"grad_norm": 2.088700532913208,
"learning_rate": 3.7599614911483125e-05,
"loss": 5.4414,
"step": 9380
},
{
"epoch": 0.7513202112337974,
"grad_norm": 2.519188165664673,
"learning_rate": 3.758624378242499e-05,
"loss": 5.4094,
"step": 9390
},
{
"epoch": 0.7521203392542807,
"grad_norm": 4.597784042358398,
"learning_rate": 3.757287265336685e-05,
"loss": 5.6246,
"step": 9400
},
{
"epoch": 0.7529204672747639,
"grad_norm": 2.0422868728637695,
"learning_rate": 3.755950152430871e-05,
"loss": 5.3393,
"step": 9410
},
{
"epoch": 0.7537205952952473,
"grad_norm": 3.0451338291168213,
"learning_rate": 3.7546130395250576e-05,
"loss": 5.618,
"step": 9420
},
{
"epoch": 0.7545207233157305,
"grad_norm": 2.3379099369049072,
"learning_rate": 3.753275926619244e-05,
"loss": 5.4859,
"step": 9430
},
{
"epoch": 0.7553208513362137,
"grad_norm": 2.6721060276031494,
"learning_rate": 3.75193881371343e-05,
"loss": 5.5349,
"step": 9440
},
{
"epoch": 0.7561209793566971,
"grad_norm": 2.495716094970703,
"learning_rate": 3.7506017008076164e-05,
"loss": 5.626,
"step": 9450
},
{
"epoch": 0.7569211073771803,
"grad_norm": 2.9002442359924316,
"learning_rate": 3.749264587901803e-05,
"loss": 5.5438,
"step": 9460
},
{
"epoch": 0.7577212353976637,
"grad_norm": 2.3616931438446045,
"learning_rate": 3.747927474995989e-05,
"loss": 5.6381,
"step": 9470
},
{
"epoch": 0.7585213634181469,
"grad_norm": 2.389329433441162,
"learning_rate": 3.746590362090175e-05,
"loss": 5.4326,
"step": 9480
},
{
"epoch": 0.7593214914386301,
"grad_norm": 2.1870810985565186,
"learning_rate": 3.7452532491843615e-05,
"loss": 5.5129,
"step": 9490
},
{
"epoch": 0.7601216194591135,
"grad_norm": 2.2454891204833984,
"learning_rate": 3.743916136278548e-05,
"loss": 5.3963,
"step": 9500
},
{
"epoch": 0.7609217474795967,
"grad_norm": 2.5803539752960205,
"learning_rate": 3.742579023372734e-05,
"loss": 5.5237,
"step": 9510
},
{
"epoch": 0.76172187550008,
"grad_norm": 2.5508155822753906,
"learning_rate": 3.74124191046692e-05,
"loss": 5.4525,
"step": 9520
},
{
"epoch": 0.7625220035205633,
"grad_norm": 3.693437337875366,
"learning_rate": 3.7399047975611065e-05,
"loss": 5.5101,
"step": 9530
},
{
"epoch": 0.7633221315410466,
"grad_norm": 2.4398484230041504,
"learning_rate": 3.738567684655293e-05,
"loss": 5.5372,
"step": 9540
},
{
"epoch": 0.7641222595615298,
"grad_norm": 2.226680278778076,
"learning_rate": 3.737230571749479e-05,
"loss": 5.3711,
"step": 9550
},
{
"epoch": 0.7649223875820131,
"grad_norm": 2.182704210281372,
"learning_rate": 3.7358934588436654e-05,
"loss": 5.4957,
"step": 9560
},
{
"epoch": 0.7657225156024964,
"grad_norm": 3.145799398422241,
"learning_rate": 3.7345563459378516e-05,
"loss": 5.5411,
"step": 9570
},
{
"epoch": 0.7665226436229797,
"grad_norm": 2.656719923019409,
"learning_rate": 3.733219233032038e-05,
"loss": 5.4737,
"step": 9580
},
{
"epoch": 0.767322771643463,
"grad_norm": 2.2230639457702637,
"learning_rate": 3.731882120126224e-05,
"loss": 5.5192,
"step": 9590
},
{
"epoch": 0.7681228996639462,
"grad_norm": 4.286400318145752,
"learning_rate": 3.7305450072204104e-05,
"loss": 5.6413,
"step": 9600
},
{
"epoch": 0.7689230276844295,
"grad_norm": 2.3106577396392822,
"learning_rate": 3.729207894314596e-05,
"loss": 5.5998,
"step": 9610
},
{
"epoch": 0.7697231557049128,
"grad_norm": 2.7155752182006836,
"learning_rate": 3.727870781408782e-05,
"loss": 5.4494,
"step": 9620
},
{
"epoch": 0.770523283725396,
"grad_norm": 2.082399368286133,
"learning_rate": 3.7265336685029686e-05,
"loss": 5.4897,
"step": 9630
},
{
"epoch": 0.7713234117458794,
"grad_norm": 2.0752410888671875,
"learning_rate": 3.725196555597155e-05,
"loss": 5.537,
"step": 9640
},
{
"epoch": 0.7721235397663626,
"grad_norm": 2.258284091949463,
"learning_rate": 3.723859442691341e-05,
"loss": 5.6481,
"step": 9650
},
{
"epoch": 0.7729236677868458,
"grad_norm": 2.8548264503479004,
"learning_rate": 3.7225223297855274e-05,
"loss": 5.5508,
"step": 9660
},
{
"epoch": 0.7737237958073292,
"grad_norm": 3.375497579574585,
"learning_rate": 3.7211852168797136e-05,
"loss": 5.3847,
"step": 9670
},
{
"epoch": 0.7745239238278124,
"grad_norm": 2.6680548191070557,
"learning_rate": 3.7198481039739e-05,
"loss": 5.3742,
"step": 9680
},
{
"epoch": 0.7753240518482958,
"grad_norm": 2.2915420532226562,
"learning_rate": 3.718510991068086e-05,
"loss": 5.5593,
"step": 9690
},
{
"epoch": 0.776124179868779,
"grad_norm": 3.224327325820923,
"learning_rate": 3.7171738781622724e-05,
"loss": 5.5711,
"step": 9700
},
{
"epoch": 0.7769243078892623,
"grad_norm": 3.025899887084961,
"learning_rate": 3.715836765256459e-05,
"loss": 5.3164,
"step": 9710
},
{
"epoch": 0.7777244359097456,
"grad_norm": 1.9424941539764404,
"learning_rate": 3.714499652350645e-05,
"loss": 5.4804,
"step": 9720
},
{
"epoch": 0.7785245639302288,
"grad_norm": 2.863312005996704,
"learning_rate": 3.713162539444831e-05,
"loss": 5.3353,
"step": 9730
},
{
"epoch": 0.7793246919507121,
"grad_norm": 2.0607283115386963,
"learning_rate": 3.7118254265390175e-05,
"loss": 5.5311,
"step": 9740
},
{
"epoch": 0.7801248199711954,
"grad_norm": 2.225666046142578,
"learning_rate": 3.710488313633204e-05,
"loss": 5.5315,
"step": 9750
},
{
"epoch": 0.7809249479916787,
"grad_norm": 2.1531851291656494,
"learning_rate": 3.70915120072739e-05,
"loss": 5.5311,
"step": 9760
},
{
"epoch": 0.7817250760121619,
"grad_norm": 2.6129846572875977,
"learning_rate": 3.7078140878215756e-05,
"loss": 5.5927,
"step": 9770
},
{
"epoch": 0.7825252040326452,
"grad_norm": 3.1822173595428467,
"learning_rate": 3.706476974915762e-05,
"loss": 5.5403,
"step": 9780
},
{
"epoch": 0.7833253320531285,
"grad_norm": 5.453544616699219,
"learning_rate": 3.705139862009948e-05,
"loss": 5.4393,
"step": 9790
},
{
"epoch": 0.7841254600736117,
"grad_norm": 2.573024272918701,
"learning_rate": 3.7038027491041345e-05,
"loss": 5.5677,
"step": 9800
},
{
"epoch": 0.7849255880940951,
"grad_norm": 2.283381700515747,
"learning_rate": 3.702465636198321e-05,
"loss": 5.3814,
"step": 9810
},
{
"epoch": 0.7857257161145783,
"grad_norm": 3.119277238845825,
"learning_rate": 3.701128523292507e-05,
"loss": 5.5022,
"step": 9820
},
{
"epoch": 0.7865258441350617,
"grad_norm": 5.085709571838379,
"learning_rate": 3.699791410386693e-05,
"loss": 5.5322,
"step": 9830
},
{
"epoch": 0.7873259721555449,
"grad_norm": 2.4339115619659424,
"learning_rate": 3.6984542974808795e-05,
"loss": 5.5885,
"step": 9840
},
{
"epoch": 0.7881261001760281,
"grad_norm": 2.2715206146240234,
"learning_rate": 3.697117184575066e-05,
"loss": 5.4657,
"step": 9850
},
{
"epoch": 0.7889262281965115,
"grad_norm": 2.1434290409088135,
"learning_rate": 3.695780071669252e-05,
"loss": 5.5571,
"step": 9860
},
{
"epoch": 0.7897263562169947,
"grad_norm": 2.235814094543457,
"learning_rate": 3.694442958763438e-05,
"loss": 5.5054,
"step": 9870
},
{
"epoch": 0.790526484237478,
"grad_norm": 4.322607517242432,
"learning_rate": 3.6931058458576246e-05,
"loss": 5.3727,
"step": 9880
},
{
"epoch": 0.7913266122579613,
"grad_norm": 2.0876612663269043,
"learning_rate": 3.691768732951811e-05,
"loss": 5.5682,
"step": 9890
},
{
"epoch": 0.7921267402784445,
"grad_norm": 1.9573509693145752,
"learning_rate": 3.690431620045997e-05,
"loss": 5.4981,
"step": 9900
},
{
"epoch": 0.7929268682989278,
"grad_norm": 2.527776002883911,
"learning_rate": 3.6890945071401834e-05,
"loss": 5.3799,
"step": 9910
},
{
"epoch": 0.7937269963194111,
"grad_norm": 3.043266773223877,
"learning_rate": 3.687757394234369e-05,
"loss": 5.5366,
"step": 9920
},
{
"epoch": 0.7945271243398944,
"grad_norm": 2.502704381942749,
"learning_rate": 3.686420281328555e-05,
"loss": 5.576,
"step": 9930
},
{
"epoch": 0.7953272523603777,
"grad_norm": 2.863032817840576,
"learning_rate": 3.6850831684227415e-05,
"loss": 5.4838,
"step": 9940
},
{
"epoch": 0.796127380380861,
"grad_norm": 2.4610373973846436,
"learning_rate": 3.683746055516928e-05,
"loss": 5.6119,
"step": 9950
},
{
"epoch": 0.7969275084013442,
"grad_norm": 2.193134069442749,
"learning_rate": 3.682408942611114e-05,
"loss": 5.3948,
"step": 9960
},
{
"epoch": 0.7977276364218275,
"grad_norm": 3.6384451389312744,
"learning_rate": 3.6810718297053003e-05,
"loss": 5.5381,
"step": 9970
},
{
"epoch": 0.7985277644423108,
"grad_norm": 2.5201289653778076,
"learning_rate": 3.6797347167994866e-05,
"loss": 5.4386,
"step": 9980
},
{
"epoch": 0.799327892462794,
"grad_norm": 2.3459038734436035,
"learning_rate": 3.678397603893673e-05,
"loss": 5.8173,
"step": 9990
},
{
"epoch": 0.8001280204832774,
"grad_norm": 2.575666904449463,
"learning_rate": 3.677060490987859e-05,
"loss": 5.4436,
"step": 10000
},
{
"epoch": 0.8009281485037606,
"grad_norm": 4.0012712478637695,
"learning_rate": 3.6757233780820454e-05,
"loss": 5.5222,
"step": 10010
},
{
"epoch": 0.8017282765242438,
"grad_norm": 2.3244402408599854,
"learning_rate": 3.674386265176232e-05,
"loss": 5.398,
"step": 10020
},
{
"epoch": 0.8025284045447272,
"grad_norm": 2.2298974990844727,
"learning_rate": 3.673049152270418e-05,
"loss": 5.4749,
"step": 10030
},
{
"epoch": 0.8033285325652104,
"grad_norm": 3.589245080947876,
"learning_rate": 3.671712039364604e-05,
"loss": 5.5091,
"step": 10040
},
{
"epoch": 0.8041286605856938,
"grad_norm": 2.2426655292510986,
"learning_rate": 3.6703749264587905e-05,
"loss": 5.5136,
"step": 10050
},
{
"epoch": 0.804928788606177,
"grad_norm": 2.5258290767669678,
"learning_rate": 3.669037813552977e-05,
"loss": 5.522,
"step": 10060
},
{
"epoch": 0.8057289166266602,
"grad_norm": 3.040107250213623,
"learning_rate": 3.667700700647163e-05,
"loss": 5.5748,
"step": 10070
},
{
"epoch": 0.8065290446471436,
"grad_norm": 2.561196804046631,
"learning_rate": 3.6663635877413486e-05,
"loss": 5.5973,
"step": 10080
},
{
"epoch": 0.8073291726676268,
"grad_norm": 2.4179880619049072,
"learning_rate": 3.665026474835535e-05,
"loss": 5.5915,
"step": 10090
},
{
"epoch": 0.8081293006881101,
"grad_norm": 2.393134593963623,
"learning_rate": 3.663689361929721e-05,
"loss": 5.4809,
"step": 10100
},
{
"epoch": 0.8089294287085934,
"grad_norm": 3.107543468475342,
"learning_rate": 3.6623522490239074e-05,
"loss": 5.6127,
"step": 10110
},
{
"epoch": 0.8097295567290766,
"grad_norm": 2.8467986583709717,
"learning_rate": 3.661015136118094e-05,
"loss": 5.5274,
"step": 10120
},
{
"epoch": 0.8105296847495599,
"grad_norm": 2.49955153465271,
"learning_rate": 3.65967802321228e-05,
"loss": 5.4469,
"step": 10130
},
{
"epoch": 0.8113298127700432,
"grad_norm": 2.817401885986328,
"learning_rate": 3.658340910306466e-05,
"loss": 5.5901,
"step": 10140
},
{
"epoch": 0.8121299407905265,
"grad_norm": 2.284855842590332,
"learning_rate": 3.6570037974006525e-05,
"loss": 5.588,
"step": 10150
},
{
"epoch": 0.8129300688110097,
"grad_norm": 3.13712739944458,
"learning_rate": 3.655666684494839e-05,
"loss": 5.5035,
"step": 10160
},
{
"epoch": 0.8137301968314931,
"grad_norm": 2.7964253425598145,
"learning_rate": 3.654329571589025e-05,
"loss": 5.4622,
"step": 10170
},
{
"epoch": 0.8145303248519763,
"grad_norm": 3.7489845752716064,
"learning_rate": 3.652992458683211e-05,
"loss": 5.6106,
"step": 10180
},
{
"epoch": 0.8153304528724596,
"grad_norm": 2.0697953701019287,
"learning_rate": 3.6516553457773976e-05,
"loss": 5.4128,
"step": 10190
},
{
"epoch": 0.8161305808929429,
"grad_norm": 2.495635986328125,
"learning_rate": 3.650318232871584e-05,
"loss": 5.3183,
"step": 10200
},
{
"epoch": 0.8169307089134261,
"grad_norm": 1.9717586040496826,
"learning_rate": 3.64898111996577e-05,
"loss": 5.4251,
"step": 10210
},
{
"epoch": 0.8177308369339095,
"grad_norm": 2.591371774673462,
"learning_rate": 3.6476440070599564e-05,
"loss": 5.3903,
"step": 10220
},
{
"epoch": 0.8185309649543927,
"grad_norm": 2.9142751693725586,
"learning_rate": 3.646306894154142e-05,
"loss": 5.4119,
"step": 10230
},
{
"epoch": 0.8193310929748759,
"grad_norm": 2.1791203022003174,
"learning_rate": 3.644969781248328e-05,
"loss": 5.5931,
"step": 10240
},
{
"epoch": 0.8201312209953593,
"grad_norm": 2.787339925765991,
"learning_rate": 3.6436326683425145e-05,
"loss": 5.5301,
"step": 10250
},
{
"epoch": 0.8209313490158425,
"grad_norm": 2.722717523574829,
"learning_rate": 3.642295555436701e-05,
"loss": 5.5608,
"step": 10260
},
{
"epoch": 0.8217314770363258,
"grad_norm": 2.937549114227295,
"learning_rate": 3.640958442530887e-05,
"loss": 5.5967,
"step": 10270
},
{
"epoch": 0.8225316050568091,
"grad_norm": 3.0384104251861572,
"learning_rate": 3.639621329625073e-05,
"loss": 5.5901,
"step": 10280
},
{
"epoch": 0.8233317330772923,
"grad_norm": 2.6817758083343506,
"learning_rate": 3.6382842167192596e-05,
"loss": 5.4188,
"step": 10290
},
{
"epoch": 0.8241318610977757,
"grad_norm": 2.6184494495391846,
"learning_rate": 3.636947103813446e-05,
"loss": 5.5194,
"step": 10300
},
{
"epoch": 0.8249319891182589,
"grad_norm": 2.613208293914795,
"learning_rate": 3.635609990907632e-05,
"loss": 5.4968,
"step": 10310
},
{
"epoch": 0.8257321171387422,
"grad_norm": 6.223053932189941,
"learning_rate": 3.6342728780018184e-05,
"loss": 5.3478,
"step": 10320
},
{
"epoch": 0.8265322451592255,
"grad_norm": 3.294417381286621,
"learning_rate": 3.632935765096005e-05,
"loss": 5.5736,
"step": 10330
},
{
"epoch": 0.8273323731797088,
"grad_norm": 2.3347206115722656,
"learning_rate": 3.631598652190191e-05,
"loss": 5.6787,
"step": 10340
},
{
"epoch": 0.828132501200192,
"grad_norm": 3.219491958618164,
"learning_rate": 3.630261539284377e-05,
"loss": 5.5125,
"step": 10350
},
{
"epoch": 0.8289326292206753,
"grad_norm": 2.5759575366973877,
"learning_rate": 3.6289244263785635e-05,
"loss": 5.4405,
"step": 10360
},
{
"epoch": 0.8297327572411586,
"grad_norm": 2.4145963191986084,
"learning_rate": 3.62758731347275e-05,
"loss": 5.479,
"step": 10370
},
{
"epoch": 0.8305328852616418,
"grad_norm": 2.7548952102661133,
"learning_rate": 3.626250200566936e-05,
"loss": 5.5466,
"step": 10380
},
{
"epoch": 0.8313330132821252,
"grad_norm": 1.9488781690597534,
"learning_rate": 3.624913087661122e-05,
"loss": 5.5063,
"step": 10390
},
{
"epoch": 0.8321331413026084,
"grad_norm": 2.648233652114868,
"learning_rate": 3.6235759747553086e-05,
"loss": 5.4158,
"step": 10400
},
{
"epoch": 0.8329332693230916,
"grad_norm": 2.8808720111846924,
"learning_rate": 3.622238861849495e-05,
"loss": 5.5431,
"step": 10410
},
{
"epoch": 0.833733397343575,
"grad_norm": 3.4570131301879883,
"learning_rate": 3.620901748943681e-05,
"loss": 5.4842,
"step": 10420
},
{
"epoch": 0.8345335253640582,
"grad_norm": 4.246754169464111,
"learning_rate": 3.6195646360378674e-05,
"loss": 5.5809,
"step": 10430
},
{
"epoch": 0.8353336533845416,
"grad_norm": 1.8645952939987183,
"learning_rate": 3.6182275231320536e-05,
"loss": 5.4272,
"step": 10440
},
{
"epoch": 0.8361337814050248,
"grad_norm": 3.3832550048828125,
"learning_rate": 3.61689041022624e-05,
"loss": 5.4291,
"step": 10450
},
{
"epoch": 0.836933909425508,
"grad_norm": 2.1454830169677734,
"learning_rate": 3.615553297320426e-05,
"loss": 5.4457,
"step": 10460
},
{
"epoch": 0.8377340374459914,
"grad_norm": 2.9275059700012207,
"learning_rate": 3.6142161844146124e-05,
"loss": 5.3577,
"step": 10470
},
{
"epoch": 0.8385341654664746,
"grad_norm": 2.9177403450012207,
"learning_rate": 3.612879071508799e-05,
"loss": 5.5011,
"step": 10480
},
{
"epoch": 0.8393342934869579,
"grad_norm": 2.9115045070648193,
"learning_rate": 3.611541958602985e-05,
"loss": 5.4961,
"step": 10490
},
{
"epoch": 0.8401344215074412,
"grad_norm": 3.270296335220337,
"learning_rate": 3.610204845697171e-05,
"loss": 5.4651,
"step": 10500
},
{
"epoch": 0.8409345495279245,
"grad_norm": 2.2930686473846436,
"learning_rate": 3.6088677327913575e-05,
"loss": 5.4363,
"step": 10510
},
{
"epoch": 0.8417346775484077,
"grad_norm": 3.168717622756958,
"learning_rate": 3.607530619885544e-05,
"loss": 5.361,
"step": 10520
},
{
"epoch": 0.842534805568891,
"grad_norm": 2.009021759033203,
"learning_rate": 3.60619350697973e-05,
"loss": 5.4435,
"step": 10530
},
{
"epoch": 0.8433349335893743,
"grad_norm": 3.454181432723999,
"learning_rate": 3.6048563940739156e-05,
"loss": 5.4134,
"step": 10540
},
{
"epoch": 0.8441350616098576,
"grad_norm": 2.8601911067962646,
"learning_rate": 3.603519281168102e-05,
"loss": 5.3224,
"step": 10550
},
{
"epoch": 0.8449351896303409,
"grad_norm": 2.612689733505249,
"learning_rate": 3.602182168262288e-05,
"loss": 5.3947,
"step": 10560
},
{
"epoch": 0.8457353176508241,
"grad_norm": 2.813868284225464,
"learning_rate": 3.6008450553564745e-05,
"loss": 5.4598,
"step": 10570
},
{
"epoch": 0.8465354456713075,
"grad_norm": 2.226395606994629,
"learning_rate": 3.599507942450661e-05,
"loss": 5.4401,
"step": 10580
},
{
"epoch": 0.8473355736917907,
"grad_norm": 3.4722280502319336,
"learning_rate": 3.598170829544847e-05,
"loss": 5.4831,
"step": 10590
},
{
"epoch": 0.8481357017122739,
"grad_norm": 3.270322799682617,
"learning_rate": 3.596833716639033e-05,
"loss": 5.6256,
"step": 10600
},
{
"epoch": 0.8489358297327573,
"grad_norm": 1.9735034704208374,
"learning_rate": 3.5954966037332195e-05,
"loss": 5.491,
"step": 10610
},
{
"epoch": 0.8497359577532405,
"grad_norm": 2.9609665870666504,
"learning_rate": 3.594159490827406e-05,
"loss": 5.5421,
"step": 10620
},
{
"epoch": 0.8505360857737237,
"grad_norm": 3.1109185218811035,
"learning_rate": 3.592822377921592e-05,
"loss": 5.5718,
"step": 10630
},
{
"epoch": 0.8513362137942071,
"grad_norm": 2.68784761428833,
"learning_rate": 3.5914852650157783e-05,
"loss": 5.4769,
"step": 10640
},
{
"epoch": 0.8521363418146903,
"grad_norm": 2.2947535514831543,
"learning_rate": 3.5901481521099646e-05,
"loss": 5.4901,
"step": 10650
},
{
"epoch": 0.8529364698351737,
"grad_norm": 1.894142746925354,
"learning_rate": 3.588811039204151e-05,
"loss": 5.5021,
"step": 10660
},
{
"epoch": 0.8537365978556569,
"grad_norm": 2.800260543823242,
"learning_rate": 3.587473926298337e-05,
"loss": 5.6767,
"step": 10670
},
{
"epoch": 0.8545367258761402,
"grad_norm": 3.055172920227051,
"learning_rate": 3.5861368133925234e-05,
"loss": 5.5765,
"step": 10680
},
{
"epoch": 0.8553368538966235,
"grad_norm": 2.3778443336486816,
"learning_rate": 3.58479970048671e-05,
"loss": 5.5377,
"step": 10690
},
{
"epoch": 0.8561369819171067,
"grad_norm": 4.772058486938477,
"learning_rate": 3.583462587580895e-05,
"loss": 5.432,
"step": 10700
},
{
"epoch": 0.85693710993759,
"grad_norm": 1.9563825130462646,
"learning_rate": 3.5821254746750815e-05,
"loss": 5.4832,
"step": 10710
},
{
"epoch": 0.8577372379580733,
"grad_norm": 2.149519205093384,
"learning_rate": 3.580788361769268e-05,
"loss": 5.491,
"step": 10720
},
{
"epoch": 0.8585373659785566,
"grad_norm": 3.5061347484588623,
"learning_rate": 3.579451248863454e-05,
"loss": 5.5747,
"step": 10730
},
{
"epoch": 0.8593374939990398,
"grad_norm": 2.74947452545166,
"learning_rate": 3.5781141359576404e-05,
"loss": 5.3591,
"step": 10740
},
{
"epoch": 0.8601376220195232,
"grad_norm": 2.818753719329834,
"learning_rate": 3.5767770230518266e-05,
"loss": 5.4722,
"step": 10750
},
{
"epoch": 0.8609377500400064,
"grad_norm": 2.7501718997955322,
"learning_rate": 3.575439910146013e-05,
"loss": 5.4531,
"step": 10760
},
{
"epoch": 0.8617378780604896,
"grad_norm": 2.314549207687378,
"learning_rate": 3.574102797240199e-05,
"loss": 5.5488,
"step": 10770
},
{
"epoch": 0.862538006080973,
"grad_norm": 2.583895683288574,
"learning_rate": 3.5727656843343854e-05,
"loss": 5.5101,
"step": 10780
},
{
"epoch": 0.8633381341014562,
"grad_norm": 2.778087854385376,
"learning_rate": 3.571428571428572e-05,
"loss": 5.421,
"step": 10790
},
{
"epoch": 0.8641382621219396,
"grad_norm": 3.679514169692993,
"learning_rate": 3.570091458522758e-05,
"loss": 5.5277,
"step": 10800
},
{
"epoch": 0.8649383901424228,
"grad_norm": 3.3869597911834717,
"learning_rate": 3.568754345616944e-05,
"loss": 5.5185,
"step": 10810
},
{
"epoch": 0.865738518162906,
"grad_norm": 3.1094346046447754,
"learning_rate": 3.5674172327111305e-05,
"loss": 5.396,
"step": 10820
},
{
"epoch": 0.8665386461833894,
"grad_norm": 2.3561792373657227,
"learning_rate": 3.566080119805317e-05,
"loss": 5.5995,
"step": 10830
},
{
"epoch": 0.8673387742038726,
"grad_norm": 2.7533133029937744,
"learning_rate": 3.564743006899503e-05,
"loss": 5.4848,
"step": 10840
},
{
"epoch": 0.8681389022243559,
"grad_norm": 2.923741579055786,
"learning_rate": 3.5634058939936886e-05,
"loss": 5.5549,
"step": 10850
},
{
"epoch": 0.8689390302448392,
"grad_norm": 2.002704381942749,
"learning_rate": 3.562068781087875e-05,
"loss": 5.4354,
"step": 10860
},
{
"epoch": 0.8697391582653224,
"grad_norm": 2.277064085006714,
"learning_rate": 3.560731668182061e-05,
"loss": 5.4404,
"step": 10870
},
{
"epoch": 0.8705392862858057,
"grad_norm": 2.23490047454834,
"learning_rate": 3.5593945552762474e-05,
"loss": 5.7253,
"step": 10880
},
{
"epoch": 0.871339414306289,
"grad_norm": 2.42874813079834,
"learning_rate": 3.558057442370434e-05,
"loss": 5.4351,
"step": 10890
},
{
"epoch": 0.8721395423267723,
"grad_norm": 2.097278118133545,
"learning_rate": 3.55672032946462e-05,
"loss": 5.4772,
"step": 10900
},
{
"epoch": 0.8729396703472556,
"grad_norm": 2.045832395553589,
"learning_rate": 3.555383216558806e-05,
"loss": 5.4132,
"step": 10910
},
{
"epoch": 0.8737397983677389,
"grad_norm": 2.695033550262451,
"learning_rate": 3.5540461036529925e-05,
"loss": 5.3975,
"step": 10920
},
{
"epoch": 0.8745399263882221,
"grad_norm": 2.62748384475708,
"learning_rate": 3.552708990747179e-05,
"loss": 5.5843,
"step": 10930
},
{
"epoch": 0.8753400544087054,
"grad_norm": 2.6703569889068604,
"learning_rate": 3.551371877841365e-05,
"loss": 5.548,
"step": 10940
},
{
"epoch": 0.8761401824291887,
"grad_norm": 2.7184908390045166,
"learning_rate": 3.550034764935551e-05,
"loss": 5.4833,
"step": 10950
},
{
"epoch": 0.8769403104496719,
"grad_norm": 2.6194417476654053,
"learning_rate": 3.5486976520297376e-05,
"loss": 5.3647,
"step": 10960
},
{
"epoch": 0.8777404384701553,
"grad_norm": 2.5021440982818604,
"learning_rate": 3.547360539123924e-05,
"loss": 5.4775,
"step": 10970
},
{
"epoch": 0.8785405664906385,
"grad_norm": 3.3758370876312256,
"learning_rate": 3.54602342621811e-05,
"loss": 5.4144,
"step": 10980
},
{
"epoch": 0.8793406945111217,
"grad_norm": 2.7361087799072266,
"learning_rate": 3.5446863133122964e-05,
"loss": 5.3614,
"step": 10990
},
{
"epoch": 0.8801408225316051,
"grad_norm": 3.831631660461426,
"learning_rate": 3.543349200406482e-05,
"loss": 5.4672,
"step": 11000
},
{
"epoch": 0.8809409505520883,
"grad_norm": 2.9705264568328857,
"learning_rate": 3.542012087500668e-05,
"loss": 5.5334,
"step": 11010
},
{
"epoch": 0.8817410785725716,
"grad_norm": 3.578693389892578,
"learning_rate": 3.5406749745948545e-05,
"loss": 5.4943,
"step": 11020
},
{
"epoch": 0.8825412065930549,
"grad_norm": 2.0674843788146973,
"learning_rate": 3.539337861689041e-05,
"loss": 5.4054,
"step": 11030
},
{
"epoch": 0.8833413346135381,
"grad_norm": 2.1904194355010986,
"learning_rate": 3.538000748783227e-05,
"loss": 5.37,
"step": 11040
},
{
"epoch": 0.8841414626340215,
"grad_norm": 3.7718141078948975,
"learning_rate": 3.536663635877413e-05,
"loss": 5.6004,
"step": 11050
},
{
"epoch": 0.8849415906545047,
"grad_norm": 2.7325282096862793,
"learning_rate": 3.5353265229715996e-05,
"loss": 5.4552,
"step": 11060
},
{
"epoch": 0.885741718674988,
"grad_norm": 3.3750839233398438,
"learning_rate": 3.533989410065786e-05,
"loss": 5.5041,
"step": 11070
},
{
"epoch": 0.8865418466954713,
"grad_norm": 2.5617001056671143,
"learning_rate": 3.532652297159972e-05,
"loss": 5.4912,
"step": 11080
},
{
"epoch": 0.8873419747159546,
"grad_norm": 1.9870737791061401,
"learning_rate": 3.5313151842541584e-05,
"loss": 5.4576,
"step": 11090
},
{
"epoch": 0.8881421027364378,
"grad_norm": 2.458249568939209,
"learning_rate": 3.529978071348345e-05,
"loss": 5.7306,
"step": 11100
},
{
"epoch": 0.8889422307569211,
"grad_norm": 3.1406562328338623,
"learning_rate": 3.528640958442531e-05,
"loss": 5.5833,
"step": 11110
},
{
"epoch": 0.8897423587774044,
"grad_norm": 2.4337878227233887,
"learning_rate": 3.527303845536717e-05,
"loss": 5.4938,
"step": 11120
},
{
"epoch": 0.8905424867978876,
"grad_norm": 2.925147294998169,
"learning_rate": 3.5259667326309035e-05,
"loss": 5.5591,
"step": 11130
},
{
"epoch": 0.891342614818371,
"grad_norm": 2.5177969932556152,
"learning_rate": 3.52462961972509e-05,
"loss": 5.5199,
"step": 11140
},
{
"epoch": 0.8921427428388542,
"grad_norm": 2.3133068084716797,
"learning_rate": 3.523292506819276e-05,
"loss": 5.3506,
"step": 11150
},
{
"epoch": 0.8929428708593375,
"grad_norm": 2.1670310497283936,
"learning_rate": 3.521955393913462e-05,
"loss": 5.3459,
"step": 11160
},
{
"epoch": 0.8937429988798208,
"grad_norm": 2.875126838684082,
"learning_rate": 3.5206182810076486e-05,
"loss": 5.3948,
"step": 11170
},
{
"epoch": 0.894543126900304,
"grad_norm": 2.3784403800964355,
"learning_rate": 3.519281168101835e-05,
"loss": 5.431,
"step": 11180
},
{
"epoch": 0.8953432549207874,
"grad_norm": 2.400426149368286,
"learning_rate": 3.517944055196021e-05,
"loss": 5.4228,
"step": 11190
},
{
"epoch": 0.8961433829412706,
"grad_norm": 2.2166919708251953,
"learning_rate": 3.5166069422902074e-05,
"loss": 5.6408,
"step": 11200
},
{
"epoch": 0.8969435109617538,
"grad_norm": 1.7938240766525269,
"learning_rate": 3.5152698293843936e-05,
"loss": 5.3972,
"step": 11210
},
{
"epoch": 0.8977436389822372,
"grad_norm": 2.4942996501922607,
"learning_rate": 3.51393271647858e-05,
"loss": 5.5523,
"step": 11220
},
{
"epoch": 0.8985437670027204,
"grad_norm": 2.706131935119629,
"learning_rate": 3.512595603572766e-05,
"loss": 5.6029,
"step": 11230
},
{
"epoch": 0.8993438950232037,
"grad_norm": 3.6749794483184814,
"learning_rate": 3.5112584906669524e-05,
"loss": 5.5903,
"step": 11240
},
{
"epoch": 0.900144023043687,
"grad_norm": 2.8764829635620117,
"learning_rate": 3.509921377761139e-05,
"loss": 5.392,
"step": 11250
},
{
"epoch": 0.9009441510641703,
"grad_norm": 1.9971251487731934,
"learning_rate": 3.508584264855325e-05,
"loss": 5.5115,
"step": 11260
},
{
"epoch": 0.9017442790846536,
"grad_norm": 1.9127808809280396,
"learning_rate": 3.507247151949511e-05,
"loss": 5.6273,
"step": 11270
},
{
"epoch": 0.9025444071051368,
"grad_norm": 2.679152727127075,
"learning_rate": 3.5059100390436975e-05,
"loss": 5.5216,
"step": 11280
},
{
"epoch": 0.9033445351256201,
"grad_norm": 3.1412837505340576,
"learning_rate": 3.504572926137884e-05,
"loss": 5.5665,
"step": 11290
},
{
"epoch": 0.9041446631461034,
"grad_norm": 3.2604153156280518,
"learning_rate": 3.50323581323207e-05,
"loss": 5.6283,
"step": 11300
},
{
"epoch": 0.9049447911665867,
"grad_norm": 2.2050578594207764,
"learning_rate": 3.5018987003262557e-05,
"loss": 5.429,
"step": 11310
},
{
"epoch": 0.9057449191870699,
"grad_norm": 3.6569366455078125,
"learning_rate": 3.500561587420442e-05,
"loss": 5.5833,
"step": 11320
},
{
"epoch": 0.9065450472075532,
"grad_norm": 2.38771653175354,
"learning_rate": 3.499224474514628e-05,
"loss": 5.4127,
"step": 11330
},
{
"epoch": 0.9073451752280365,
"grad_norm": 2.1471800804138184,
"learning_rate": 3.4978873616088145e-05,
"loss": 5.4064,
"step": 11340
},
{
"epoch": 0.9081453032485197,
"grad_norm": 2.340174674987793,
"learning_rate": 3.496550248703001e-05,
"loss": 5.5581,
"step": 11350
},
{
"epoch": 0.9089454312690031,
"grad_norm": 2.771235466003418,
"learning_rate": 3.495213135797187e-05,
"loss": 5.4221,
"step": 11360
},
{
"epoch": 0.9097455592894863,
"grad_norm": 2.7797491550445557,
"learning_rate": 3.493876022891373e-05,
"loss": 5.5604,
"step": 11370
},
{
"epoch": 0.9105456873099695,
"grad_norm": 2.0206966400146484,
"learning_rate": 3.4925389099855595e-05,
"loss": 5.3382,
"step": 11380
},
{
"epoch": 0.9113458153304529,
"grad_norm": 3.5101125240325928,
"learning_rate": 3.491201797079746e-05,
"loss": 5.5358,
"step": 11390
},
{
"epoch": 0.9121459433509361,
"grad_norm": 2.3375003337860107,
"learning_rate": 3.489864684173932e-05,
"loss": 5.5492,
"step": 11400
},
{
"epoch": 0.9129460713714195,
"grad_norm": 2.4977264404296875,
"learning_rate": 3.4885275712681183e-05,
"loss": 5.507,
"step": 11410
},
{
"epoch": 0.9137461993919027,
"grad_norm": 2.0408174991607666,
"learning_rate": 3.4871904583623046e-05,
"loss": 5.5587,
"step": 11420
},
{
"epoch": 0.914546327412386,
"grad_norm": 2.525320053100586,
"learning_rate": 3.485853345456491e-05,
"loss": 5.5013,
"step": 11430
},
{
"epoch": 0.9153464554328693,
"grad_norm": 2.946377992630005,
"learning_rate": 3.484516232550677e-05,
"loss": 5.5959,
"step": 11440
},
{
"epoch": 0.9161465834533525,
"grad_norm": 2.138331174850464,
"learning_rate": 3.4831791196448634e-05,
"loss": 5.4817,
"step": 11450
},
{
"epoch": 0.9169467114738358,
"grad_norm": 1.7159631252288818,
"learning_rate": 3.48184200673905e-05,
"loss": 5.5036,
"step": 11460
},
{
"epoch": 0.9177468394943191,
"grad_norm": 2.5576088428497314,
"learning_rate": 3.480504893833235e-05,
"loss": 5.4721,
"step": 11470
},
{
"epoch": 0.9185469675148024,
"grad_norm": 2.057349443435669,
"learning_rate": 3.4791677809274215e-05,
"loss": 5.5468,
"step": 11480
},
{
"epoch": 0.9193470955352856,
"grad_norm": 2.4942944049835205,
"learning_rate": 3.477830668021608e-05,
"loss": 5.5999,
"step": 11490
},
{
"epoch": 0.920147223555769,
"grad_norm": 3.3070192337036133,
"learning_rate": 3.476493555115794e-05,
"loss": 5.5418,
"step": 11500
},
{
"epoch": 0.9209473515762522,
"grad_norm": 2.2323672771453857,
"learning_rate": 3.4751564422099804e-05,
"loss": 5.398,
"step": 11510
},
{
"epoch": 0.9217474795967355,
"grad_norm": 1.9982457160949707,
"learning_rate": 3.4738193293041666e-05,
"loss": 5.4668,
"step": 11520
},
{
"epoch": 0.9225476076172188,
"grad_norm": 3.4668660163879395,
"learning_rate": 3.472482216398353e-05,
"loss": 5.5433,
"step": 11530
},
{
"epoch": 0.923347735637702,
"grad_norm": 2.7247307300567627,
"learning_rate": 3.471145103492539e-05,
"loss": 5.4156,
"step": 11540
},
{
"epoch": 0.9241478636581854,
"grad_norm": 2.42948317527771,
"learning_rate": 3.4698079905867254e-05,
"loss": 5.4336,
"step": 11550
},
{
"epoch": 0.9249479916786686,
"grad_norm": 4.134993076324463,
"learning_rate": 3.468470877680912e-05,
"loss": 5.3362,
"step": 11560
},
{
"epoch": 0.9257481196991518,
"grad_norm": 2.0852134227752686,
"learning_rate": 3.467133764775098e-05,
"loss": 5.4117,
"step": 11570
},
{
"epoch": 0.9265482477196352,
"grad_norm": 2.224235773086548,
"learning_rate": 3.465796651869284e-05,
"loss": 5.4132,
"step": 11580
},
{
"epoch": 0.9273483757401184,
"grad_norm": 2.0093464851379395,
"learning_rate": 3.4644595389634705e-05,
"loss": 5.3876,
"step": 11590
},
{
"epoch": 0.9281485037606017,
"grad_norm": 1.9892866611480713,
"learning_rate": 3.463122426057657e-05,
"loss": 5.4069,
"step": 11600
},
{
"epoch": 0.928948631781085,
"grad_norm": 3.9974398612976074,
"learning_rate": 3.461785313151843e-05,
"loss": 5.4892,
"step": 11610
},
{
"epoch": 0.9297487598015682,
"grad_norm": 1.9878896474838257,
"learning_rate": 3.4604482002460286e-05,
"loss": 5.5017,
"step": 11620
},
{
"epoch": 0.9305488878220515,
"grad_norm": 3.1477320194244385,
"learning_rate": 3.459111087340215e-05,
"loss": 5.3199,
"step": 11630
},
{
"epoch": 0.9313490158425348,
"grad_norm": 2.434946298599243,
"learning_rate": 3.457773974434401e-05,
"loss": 5.4885,
"step": 11640
},
{
"epoch": 0.9321491438630181,
"grad_norm": 3.2463152408599854,
"learning_rate": 3.4564368615285874e-05,
"loss": 5.5232,
"step": 11650
},
{
"epoch": 0.9329492718835014,
"grad_norm": 3.733612537384033,
"learning_rate": 3.455099748622774e-05,
"loss": 5.4918,
"step": 11660
},
{
"epoch": 0.9337493999039846,
"grad_norm": 3.3726518154144287,
"learning_rate": 3.45376263571696e-05,
"loss": 5.3887,
"step": 11670
},
{
"epoch": 0.9345495279244679,
"grad_norm": 2.527639627456665,
"learning_rate": 3.452425522811146e-05,
"loss": 5.4,
"step": 11680
},
{
"epoch": 0.9353496559449512,
"grad_norm": 3.3945000171661377,
"learning_rate": 3.4510884099053325e-05,
"loss": 5.4835,
"step": 11690
},
{
"epoch": 0.9361497839654345,
"grad_norm": 2.492178201675415,
"learning_rate": 3.449751296999519e-05,
"loss": 5.5472,
"step": 11700
},
{
"epoch": 0.9369499119859177,
"grad_norm": 2.2719671726226807,
"learning_rate": 3.448414184093705e-05,
"loss": 5.3069,
"step": 11710
},
{
"epoch": 0.937750040006401,
"grad_norm": 4.121431350708008,
"learning_rate": 3.447077071187891e-05,
"loss": 5.3377,
"step": 11720
},
{
"epoch": 0.9385501680268843,
"grad_norm": 2.2480831146240234,
"learning_rate": 3.4457399582820776e-05,
"loss": 5.3888,
"step": 11730
},
{
"epoch": 0.9393502960473675,
"grad_norm": 3.118621349334717,
"learning_rate": 3.444402845376264e-05,
"loss": 5.3225,
"step": 11740
},
{
"epoch": 0.9401504240678509,
"grad_norm": 2.513777494430542,
"learning_rate": 3.44306573247045e-05,
"loss": 5.4971,
"step": 11750
},
{
"epoch": 0.9409505520883341,
"grad_norm": 2.491767406463623,
"learning_rate": 3.4417286195646364e-05,
"loss": 5.5061,
"step": 11760
},
{
"epoch": 0.9417506801088175,
"grad_norm": 2.8964290618896484,
"learning_rate": 3.440391506658823e-05,
"loss": 5.3395,
"step": 11770
},
{
"epoch": 0.9425508081293007,
"grad_norm": 2.1613073348999023,
"learning_rate": 3.439054393753008e-05,
"loss": 5.512,
"step": 11780
},
{
"epoch": 0.9433509361497839,
"grad_norm": 3.5444371700286865,
"learning_rate": 3.4377172808471945e-05,
"loss": 5.4804,
"step": 11790
},
{
"epoch": 0.9441510641702673,
"grad_norm": 3.0833287239074707,
"learning_rate": 3.436380167941381e-05,
"loss": 5.5711,
"step": 11800
},
{
"epoch": 0.9449511921907505,
"grad_norm": 2.2267260551452637,
"learning_rate": 3.435043055035567e-05,
"loss": 5.3964,
"step": 11810
},
{
"epoch": 0.9457513202112338,
"grad_norm": 3.114546537399292,
"learning_rate": 3.4337059421297533e-05,
"loss": 5.4296,
"step": 11820
},
{
"epoch": 0.9465514482317171,
"grad_norm": 3.316612958908081,
"learning_rate": 3.4323688292239396e-05,
"loss": 5.451,
"step": 11830
},
{
"epoch": 0.9473515762522003,
"grad_norm": 2.97145414352417,
"learning_rate": 3.431031716318126e-05,
"loss": 5.6184,
"step": 11840
},
{
"epoch": 0.9481517042726836,
"grad_norm": 2.2837045192718506,
"learning_rate": 3.429694603412312e-05,
"loss": 5.3398,
"step": 11850
},
{
"epoch": 0.9489518322931669,
"grad_norm": 2.2095916271209717,
"learning_rate": 3.4283574905064984e-05,
"loss": 5.3933,
"step": 11860
},
{
"epoch": 0.9497519603136502,
"grad_norm": 1.9592795372009277,
"learning_rate": 3.427020377600685e-05,
"loss": 5.4423,
"step": 11870
},
{
"epoch": 0.9505520883341335,
"grad_norm": 2.9245188236236572,
"learning_rate": 3.425683264694871e-05,
"loss": 5.4927,
"step": 11880
},
{
"epoch": 0.9513522163546168,
"grad_norm": 2.5000531673431396,
"learning_rate": 3.424346151789057e-05,
"loss": 5.3523,
"step": 11890
},
{
"epoch": 0.9521523443751,
"grad_norm": 2.4692375659942627,
"learning_rate": 3.4230090388832435e-05,
"loss": 5.5949,
"step": 11900
},
{
"epoch": 0.9529524723955833,
"grad_norm": 2.387812852859497,
"learning_rate": 3.42167192597743e-05,
"loss": 5.4971,
"step": 11910
},
{
"epoch": 0.9537526004160666,
"grad_norm": 2.938291072845459,
"learning_rate": 3.420334813071616e-05,
"loss": 5.3849,
"step": 11920
},
{
"epoch": 0.9545527284365498,
"grad_norm": 2.608431339263916,
"learning_rate": 3.4189977001658016e-05,
"loss": 5.3414,
"step": 11930
},
{
"epoch": 0.9553528564570332,
"grad_norm": 2.695615530014038,
"learning_rate": 3.417660587259988e-05,
"loss": 5.2343,
"step": 11940
},
{
"epoch": 0.9561529844775164,
"grad_norm": 3.0142087936401367,
"learning_rate": 3.416323474354174e-05,
"loss": 5.3293,
"step": 11950
},
{
"epoch": 0.9569531124979996,
"grad_norm": 2.5953242778778076,
"learning_rate": 3.4149863614483604e-05,
"loss": 5.459,
"step": 11960
},
{
"epoch": 0.957753240518483,
"grad_norm": 2.2795822620391846,
"learning_rate": 3.413649248542547e-05,
"loss": 5.5305,
"step": 11970
},
{
"epoch": 0.9585533685389662,
"grad_norm": 2.5979270935058594,
"learning_rate": 3.412312135636733e-05,
"loss": 5.4866,
"step": 11980
},
{
"epoch": 0.9593534965594495,
"grad_norm": 2.66823673248291,
"learning_rate": 3.410975022730919e-05,
"loss": 5.5734,
"step": 11990
},
{
"epoch": 0.9601536245799328,
"grad_norm": 2.3899004459381104,
"learning_rate": 3.4096379098251055e-05,
"loss": 5.5367,
"step": 12000
},
{
"epoch": 0.960953752600416,
"grad_norm": 2.233553171157837,
"learning_rate": 3.408300796919292e-05,
"loss": 5.3773,
"step": 12010
},
{
"epoch": 0.9617538806208994,
"grad_norm": 2.2967305183410645,
"learning_rate": 3.406963684013478e-05,
"loss": 5.4409,
"step": 12020
},
{
"epoch": 0.9625540086413826,
"grad_norm": 2.4291601181030273,
"learning_rate": 3.405626571107664e-05,
"loss": 5.4198,
"step": 12030
},
{
"epoch": 0.9633541366618659,
"grad_norm": 2.6325435638427734,
"learning_rate": 3.4042894582018506e-05,
"loss": 5.6044,
"step": 12040
},
{
"epoch": 0.9641542646823492,
"grad_norm": 2.4688518047332764,
"learning_rate": 3.402952345296037e-05,
"loss": 5.3633,
"step": 12050
},
{
"epoch": 0.9649543927028325,
"grad_norm": 2.3974521160125732,
"learning_rate": 3.401615232390223e-05,
"loss": 5.3022,
"step": 12060
},
{
"epoch": 0.9657545207233157,
"grad_norm": 2.146742105484009,
"learning_rate": 3.4002781194844094e-05,
"loss": 5.2753,
"step": 12070
},
{
"epoch": 0.966554648743799,
"grad_norm": 2.1239147186279297,
"learning_rate": 3.3989410065785957e-05,
"loss": 5.466,
"step": 12080
},
{
"epoch": 0.9673547767642823,
"grad_norm": 2.939096450805664,
"learning_rate": 3.397603893672782e-05,
"loss": 5.5288,
"step": 12090
},
{
"epoch": 0.9681549047847655,
"grad_norm": 2.6875243186950684,
"learning_rate": 3.396266780766968e-05,
"loss": 5.4279,
"step": 12100
},
{
"epoch": 0.9689550328052489,
"grad_norm": 3.1991941928863525,
"learning_rate": 3.3949296678611545e-05,
"loss": 5.5397,
"step": 12110
},
{
"epoch": 0.9697551608257321,
"grad_norm": 2.4558470249176025,
"learning_rate": 3.393592554955341e-05,
"loss": 5.3246,
"step": 12120
},
{
"epoch": 0.9705552888462154,
"grad_norm": 2.2693309783935547,
"learning_rate": 3.392255442049527e-05,
"loss": 5.5941,
"step": 12130
},
{
"epoch": 0.9713554168666987,
"grad_norm": 2.8864657878875732,
"learning_rate": 3.390918329143713e-05,
"loss": 5.4632,
"step": 12140
},
{
"epoch": 0.9721555448871819,
"grad_norm": 2.3996002674102783,
"learning_rate": 3.3895812162378995e-05,
"loss": 5.4724,
"step": 12150
},
{
"epoch": 0.9729556729076653,
"grad_norm": 1.979028582572937,
"learning_rate": 3.388244103332086e-05,
"loss": 5.4229,
"step": 12160
},
{
"epoch": 0.9737558009281485,
"grad_norm": 2.0203795433044434,
"learning_rate": 3.386906990426272e-05,
"loss": 5.5592,
"step": 12170
},
{
"epoch": 0.9745559289486317,
"grad_norm": 2.0890145301818848,
"learning_rate": 3.3855698775204583e-05,
"loss": 5.4313,
"step": 12180
},
{
"epoch": 0.9753560569691151,
"grad_norm": 2.4817287921905518,
"learning_rate": 3.3842327646146446e-05,
"loss": 5.5,
"step": 12190
},
{
"epoch": 0.9761561849895983,
"grad_norm": 2.2497968673706055,
"learning_rate": 3.382895651708831e-05,
"loss": 5.3126,
"step": 12200
},
{
"epoch": 0.9769563130100816,
"grad_norm": 3.2818548679351807,
"learning_rate": 3.381558538803017e-05,
"loss": 5.3421,
"step": 12210
},
{
"epoch": 0.9777564410305649,
"grad_norm": 7.580129623413086,
"learning_rate": 3.3802214258972034e-05,
"loss": 5.6585,
"step": 12220
},
{
"epoch": 0.9785565690510482,
"grad_norm": 3.0450634956359863,
"learning_rate": 3.37888431299139e-05,
"loss": 5.4403,
"step": 12230
},
{
"epoch": 0.9793566970715314,
"grad_norm": 2.5230050086975098,
"learning_rate": 3.377547200085575e-05,
"loss": 5.5331,
"step": 12240
},
{
"epoch": 0.9801568250920147,
"grad_norm": 3.398266315460205,
"learning_rate": 3.3762100871797616e-05,
"loss": 5.3996,
"step": 12250
},
{
"epoch": 0.980956953112498,
"grad_norm": 2.2126028537750244,
"learning_rate": 3.374872974273948e-05,
"loss": 5.4175,
"step": 12260
},
{
"epoch": 0.9817570811329813,
"grad_norm": 3.0015792846679688,
"learning_rate": 3.373535861368134e-05,
"loss": 5.3961,
"step": 12270
},
{
"epoch": 0.9825572091534646,
"grad_norm": 2.5461559295654297,
"learning_rate": 3.3721987484623204e-05,
"loss": 5.6026,
"step": 12280
},
{
"epoch": 0.9833573371739478,
"grad_norm": 2.498425245285034,
"learning_rate": 3.3708616355565066e-05,
"loss": 5.3524,
"step": 12290
},
{
"epoch": 0.9841574651944311,
"grad_norm": 2.9614803791046143,
"learning_rate": 3.369524522650693e-05,
"loss": 5.5101,
"step": 12300
},
{
"epoch": 0.9849575932149144,
"grad_norm": 2.7508606910705566,
"learning_rate": 3.368187409744879e-05,
"loss": 5.3776,
"step": 12310
},
{
"epoch": 0.9857577212353976,
"grad_norm": 2.0286755561828613,
"learning_rate": 3.3668502968390654e-05,
"loss": 5.4913,
"step": 12320
},
{
"epoch": 0.986557849255881,
"grad_norm": 3.728842258453369,
"learning_rate": 3.365513183933252e-05,
"loss": 5.4477,
"step": 12330
},
{
"epoch": 0.9873579772763642,
"grad_norm": 3.3132193088531494,
"learning_rate": 3.364176071027438e-05,
"loss": 5.2361,
"step": 12340
},
{
"epoch": 0.9881581052968474,
"grad_norm": 2.515298843383789,
"learning_rate": 3.362838958121624e-05,
"loss": 5.4632,
"step": 12350
},
{
"epoch": 0.9889582333173308,
"grad_norm": 2.0937442779541016,
"learning_rate": 3.3615018452158105e-05,
"loss": 5.5075,
"step": 12360
},
{
"epoch": 0.989758361337814,
"grad_norm": 3.3019323348999023,
"learning_rate": 3.360164732309997e-05,
"loss": 5.4566,
"step": 12370
},
{
"epoch": 0.9905584893582974,
"grad_norm": 3.502408266067505,
"learning_rate": 3.358827619404183e-05,
"loss": 5.464,
"step": 12380
},
{
"epoch": 0.9913586173787806,
"grad_norm": 2.3667659759521484,
"learning_rate": 3.357490506498369e-05,
"loss": 5.5423,
"step": 12390
},
{
"epoch": 0.9921587453992639,
"grad_norm": 2.15498423576355,
"learning_rate": 3.356153393592555e-05,
"loss": 5.4031,
"step": 12400
},
{
"epoch": 0.9929588734197472,
"grad_norm": 2.733090877532959,
"learning_rate": 3.354816280686741e-05,
"loss": 5.4771,
"step": 12410
},
{
"epoch": 0.9937590014402304,
"grad_norm": 2.595238208770752,
"learning_rate": 3.3534791677809274e-05,
"loss": 5.4538,
"step": 12420
},
{
"epoch": 0.9945591294607137,
"grad_norm": 2.3755598068237305,
"learning_rate": 3.352142054875114e-05,
"loss": 5.432,
"step": 12430
},
{
"epoch": 0.995359257481197,
"grad_norm": 2.2179529666900635,
"learning_rate": 3.3508049419693e-05,
"loss": 5.4359,
"step": 12440
},
{
"epoch": 0.9961593855016803,
"grad_norm": 2.264469623565674,
"learning_rate": 3.349467829063486e-05,
"loss": 5.4514,
"step": 12450
},
{
"epoch": 0.9969595135221635,
"grad_norm": 2.9361791610717773,
"learning_rate": 3.3481307161576725e-05,
"loss": 5.4411,
"step": 12460
},
{
"epoch": 0.9977596415426468,
"grad_norm": 2.6548573970794678,
"learning_rate": 3.346793603251859e-05,
"loss": 5.4368,
"step": 12470
},
{
"epoch": 0.9985597695631301,
"grad_norm": 3.5749149322509766,
"learning_rate": 3.345456490346045e-05,
"loss": 5.6314,
"step": 12480
},
{
"epoch": 0.9993598975836134,
"grad_norm": 2.848527193069458,
"learning_rate": 3.344119377440231e-05,
"loss": 5.3849,
"step": 12490
},
{
"epoch": 1.0001600256040966,
"grad_norm": 2.036498546600342,
"learning_rate": 3.3427822645344176e-05,
"loss": 5.5973,
"step": 12500
},
{
"epoch": 1.00096015362458,
"grad_norm": 3.499455451965332,
"learning_rate": 3.341445151628604e-05,
"loss": 5.1882,
"step": 12510
},
{
"epoch": 1.0017602816450633,
"grad_norm": 2.4391655921936035,
"learning_rate": 3.34010803872279e-05,
"loss": 5.0281,
"step": 12520
},
{
"epoch": 1.0025604096655465,
"grad_norm": 2.522850513458252,
"learning_rate": 3.3387709258169764e-05,
"loss": 5.1038,
"step": 12530
},
{
"epoch": 1.0033605376860297,
"grad_norm": 2.631127119064331,
"learning_rate": 3.337433812911163e-05,
"loss": 4.9671,
"step": 12540
},
{
"epoch": 1.004160665706513,
"grad_norm": 2.9861068725585938,
"learning_rate": 3.336096700005348e-05,
"loss": 5.2225,
"step": 12550
},
{
"epoch": 1.0049607937269964,
"grad_norm": 2.59002423286438,
"learning_rate": 3.3347595870995345e-05,
"loss": 5.142,
"step": 12560
},
{
"epoch": 1.0057609217474797,
"grad_norm": 2.830385208129883,
"learning_rate": 3.333422474193721e-05,
"loss": 5.0919,
"step": 12570
},
{
"epoch": 1.006561049767963,
"grad_norm": 2.6355655193328857,
"learning_rate": 3.332085361287907e-05,
"loss": 5.0604,
"step": 12580
},
{
"epoch": 1.0073611777884461,
"grad_norm": 2.8990426063537598,
"learning_rate": 3.3307482483820933e-05,
"loss": 5.0488,
"step": 12590
},
{
"epoch": 1.0081613058089294,
"grad_norm": 2.657283067703247,
"learning_rate": 3.3294111354762796e-05,
"loss": 5.157,
"step": 12600
},
{
"epoch": 1.0089614338294126,
"grad_norm": 3.652735710144043,
"learning_rate": 3.328074022570466e-05,
"loss": 5.1629,
"step": 12610
},
{
"epoch": 1.009761561849896,
"grad_norm": 2.9064295291900635,
"learning_rate": 3.326736909664652e-05,
"loss": 5.1757,
"step": 12620
},
{
"epoch": 1.0105616898703793,
"grad_norm": 3.015488386154175,
"learning_rate": 3.3253997967588384e-05,
"loss": 5.2311,
"step": 12630
},
{
"epoch": 1.0113618178908625,
"grad_norm": 9.49726390838623,
"learning_rate": 3.324062683853025e-05,
"loss": 5.1402,
"step": 12640
},
{
"epoch": 1.0121619459113458,
"grad_norm": 6.71565055847168,
"learning_rate": 3.322725570947211e-05,
"loss": 4.7297,
"step": 12650
},
{
"epoch": 1.012962073931829,
"grad_norm": 4.39326286315918,
"learning_rate": 3.321388458041397e-05,
"loss": 5.1663,
"step": 12660
},
{
"epoch": 1.0137622019523125,
"grad_norm": 2.8973264694213867,
"learning_rate": 3.3200513451355835e-05,
"loss": 5.0674,
"step": 12670
},
{
"epoch": 1.0145623299727957,
"grad_norm": 3.1058743000030518,
"learning_rate": 3.31871423222977e-05,
"loss": 4.9689,
"step": 12680
},
{
"epoch": 1.015362457993279,
"grad_norm": 2.688951253890991,
"learning_rate": 3.317377119323956e-05,
"loss": 5.0916,
"step": 12690
},
{
"epoch": 1.0161625860137622,
"grad_norm": 2.9495773315429688,
"learning_rate": 3.3160400064181416e-05,
"loss": 5.0939,
"step": 12700
},
{
"epoch": 1.0169627140342454,
"grad_norm": 2.5915777683258057,
"learning_rate": 3.314702893512328e-05,
"loss": 5.134,
"step": 12710
},
{
"epoch": 1.0177628420547287,
"grad_norm": 2.703012228012085,
"learning_rate": 3.313365780606514e-05,
"loss": 5.1285,
"step": 12720
},
{
"epoch": 1.0185629700752121,
"grad_norm": 3.0492970943450928,
"learning_rate": 3.3120286677007004e-05,
"loss": 5.1477,
"step": 12730
},
{
"epoch": 1.0193630980956954,
"grad_norm": 2.756546974182129,
"learning_rate": 3.310691554794887e-05,
"loss": 5.0668,
"step": 12740
},
{
"epoch": 1.0201632261161786,
"grad_norm": 4.764959335327148,
"learning_rate": 3.309354441889073e-05,
"loss": 5.1243,
"step": 12750
},
{
"epoch": 1.0209633541366618,
"grad_norm": 5.539842128753662,
"learning_rate": 3.308017328983259e-05,
"loss": 5.0519,
"step": 12760
},
{
"epoch": 1.021763482157145,
"grad_norm": 3.8945937156677246,
"learning_rate": 3.3066802160774455e-05,
"loss": 5.1758,
"step": 12770
},
{
"epoch": 1.0225636101776283,
"grad_norm": 2.5580265522003174,
"learning_rate": 3.305343103171632e-05,
"loss": 5.0893,
"step": 12780
},
{
"epoch": 1.0233637381981118,
"grad_norm": 2.8203110694885254,
"learning_rate": 3.304005990265818e-05,
"loss": 5.2472,
"step": 12790
},
{
"epoch": 1.024163866218595,
"grad_norm": 3.5090975761413574,
"learning_rate": 3.302668877360004e-05,
"loss": 5.0594,
"step": 12800
},
{
"epoch": 1.0249639942390782,
"grad_norm": 2.915062189102173,
"learning_rate": 3.3013317644541906e-05,
"loss": 5.0673,
"step": 12810
},
{
"epoch": 1.0257641222595615,
"grad_norm": 2.648737668991089,
"learning_rate": 3.299994651548377e-05,
"loss": 4.8937,
"step": 12820
},
{
"epoch": 1.0265642502800447,
"grad_norm": 3.2576730251312256,
"learning_rate": 3.298657538642563e-05,
"loss": 5.1564,
"step": 12830
},
{
"epoch": 1.0273643783005282,
"grad_norm": 5.624968528747559,
"learning_rate": 3.2973204257367494e-05,
"loss": 5.3011,
"step": 12840
},
{
"epoch": 1.0281645063210114,
"grad_norm": 2.492978811264038,
"learning_rate": 3.2959833128309357e-05,
"loss": 5.0935,
"step": 12850
},
{
"epoch": 1.0289646343414947,
"grad_norm": 2.4655046463012695,
"learning_rate": 3.294646199925121e-05,
"loss": 5.1768,
"step": 12860
},
{
"epoch": 1.029764762361978,
"grad_norm": 3.4421567916870117,
"learning_rate": 3.2933090870193075e-05,
"loss": 5.0756,
"step": 12870
},
{
"epoch": 1.0305648903824611,
"grad_norm": 2.6774377822875977,
"learning_rate": 3.291971974113494e-05,
"loss": 5.036,
"step": 12880
},
{
"epoch": 1.0313650184029444,
"grad_norm": 2.665099859237671,
"learning_rate": 3.29063486120768e-05,
"loss": 5.1284,
"step": 12890
},
{
"epoch": 1.0321651464234278,
"grad_norm": 3.7092061042785645,
"learning_rate": 3.289297748301866e-05,
"loss": 4.8892,
"step": 12900
},
{
"epoch": 1.032965274443911,
"grad_norm": 2.875427484512329,
"learning_rate": 3.2879606353960526e-05,
"loss": 4.928,
"step": 12910
},
{
"epoch": 1.0337654024643943,
"grad_norm": 2.409395694732666,
"learning_rate": 3.286623522490239e-05,
"loss": 5.0545,
"step": 12920
},
{
"epoch": 1.0345655304848775,
"grad_norm": 3.936565637588501,
"learning_rate": 3.285286409584425e-05,
"loss": 5.0556,
"step": 12930
},
{
"epoch": 1.0353656585053608,
"grad_norm": 3.52986216545105,
"learning_rate": 3.2839492966786114e-05,
"loss": 5.0738,
"step": 12940
},
{
"epoch": 1.0361657865258442,
"grad_norm": 3.0732507705688477,
"learning_rate": 3.282612183772798e-05,
"loss": 5.0852,
"step": 12950
},
{
"epoch": 1.0369659145463275,
"grad_norm": 2.800020217895508,
"learning_rate": 3.281275070866984e-05,
"loss": 4.9983,
"step": 12960
},
{
"epoch": 1.0377660425668107,
"grad_norm": 2.682191848754883,
"learning_rate": 3.27993795796117e-05,
"loss": 4.8372,
"step": 12970
},
{
"epoch": 1.038566170587294,
"grad_norm": 5.331565856933594,
"learning_rate": 3.2786008450553565e-05,
"loss": 5.1444,
"step": 12980
},
{
"epoch": 1.0393662986077772,
"grad_norm": 3.530069589614868,
"learning_rate": 3.277263732149543e-05,
"loss": 5.1467,
"step": 12990
},
{
"epoch": 1.0401664266282604,
"grad_norm": 2.296837568283081,
"learning_rate": 3.275926619243729e-05,
"loss": 5.0782,
"step": 13000
},
{
"epoch": 1.0409665546487439,
"grad_norm": 4.3493146896362305,
"learning_rate": 3.274589506337915e-05,
"loss": 5.0574,
"step": 13010
},
{
"epoch": 1.0417666826692271,
"grad_norm": 3.2167856693267822,
"learning_rate": 3.2732523934321016e-05,
"loss": 5.1219,
"step": 13020
},
{
"epoch": 1.0425668106897104,
"grad_norm": 3.200861692428589,
"learning_rate": 3.271915280526288e-05,
"loss": 5.0674,
"step": 13030
},
{
"epoch": 1.0433669387101936,
"grad_norm": 2.286841869354248,
"learning_rate": 3.270578167620474e-05,
"loss": 5.0125,
"step": 13040
},
{
"epoch": 1.0441670667306768,
"grad_norm": 3.6788413524627686,
"learning_rate": 3.2692410547146604e-05,
"loss": 5.2975,
"step": 13050
},
{
"epoch": 1.0449671947511603,
"grad_norm": 2.77284574508667,
"learning_rate": 3.2679039418088466e-05,
"loss": 5.0099,
"step": 13060
},
{
"epoch": 1.0457673227716435,
"grad_norm": 4.33493185043335,
"learning_rate": 3.266566828903033e-05,
"loss": 5.0362,
"step": 13070
},
{
"epoch": 1.0465674507921268,
"grad_norm": 3.2839553356170654,
"learning_rate": 3.265229715997219e-05,
"loss": 4.9569,
"step": 13080
},
{
"epoch": 1.04736757881261,
"grad_norm": 2.9086809158325195,
"learning_rate": 3.2638926030914054e-05,
"loss": 5.0341,
"step": 13090
},
{
"epoch": 1.0481677068330932,
"grad_norm": 2.565225124359131,
"learning_rate": 3.262555490185592e-05,
"loss": 5.0601,
"step": 13100
},
{
"epoch": 1.0489678348535765,
"grad_norm": 2.8457388877868652,
"learning_rate": 3.261218377279778e-05,
"loss": 4.9952,
"step": 13110
},
{
"epoch": 1.04976796287406,
"grad_norm": 2.5370593070983887,
"learning_rate": 3.259881264373964e-05,
"loss": 5.1425,
"step": 13120
},
{
"epoch": 1.0505680908945432,
"grad_norm": 2.504817008972168,
"learning_rate": 3.2585441514681505e-05,
"loss": 4.9605,
"step": 13130
},
{
"epoch": 1.0513682189150264,
"grad_norm": 2.9582226276397705,
"learning_rate": 3.257207038562337e-05,
"loss": 5.1436,
"step": 13140
},
{
"epoch": 1.0521683469355096,
"grad_norm": 3.7598915100097656,
"learning_rate": 3.255869925656523e-05,
"loss": 5.0743,
"step": 13150
},
{
"epoch": 1.0529684749559929,
"grad_norm": 3.2642862796783447,
"learning_rate": 3.254532812750709e-05,
"loss": 5.139,
"step": 13160
},
{
"epoch": 1.0537686029764763,
"grad_norm": 3.4917502403259277,
"learning_rate": 3.253195699844895e-05,
"loss": 5.0566,
"step": 13170
},
{
"epoch": 1.0545687309969596,
"grad_norm": 2.9878995418548584,
"learning_rate": 3.251858586939081e-05,
"loss": 5.2385,
"step": 13180
},
{
"epoch": 1.0553688590174428,
"grad_norm": 2.9996213912963867,
"learning_rate": 3.2505214740332674e-05,
"loss": 5.1138,
"step": 13190
},
{
"epoch": 1.056168987037926,
"grad_norm": 5.470676422119141,
"learning_rate": 3.249184361127454e-05,
"loss": 5.0921,
"step": 13200
},
{
"epoch": 1.0569691150584093,
"grad_norm": 2.9724602699279785,
"learning_rate": 3.24784724822164e-05,
"loss": 4.9315,
"step": 13210
},
{
"epoch": 1.0577692430788925,
"grad_norm": 3.191342353820801,
"learning_rate": 3.246510135315826e-05,
"loss": 5.1095,
"step": 13220
},
{
"epoch": 1.058569371099376,
"grad_norm": 4.010619163513184,
"learning_rate": 3.2451730224100125e-05,
"loss": 5.1697,
"step": 13230
},
{
"epoch": 1.0593694991198592,
"grad_norm": 2.828768253326416,
"learning_rate": 3.243835909504199e-05,
"loss": 5.1114,
"step": 13240
},
{
"epoch": 1.0601696271403425,
"grad_norm": 4.081239223480225,
"learning_rate": 3.242498796598385e-05,
"loss": 5.0629,
"step": 13250
},
{
"epoch": 1.0609697551608257,
"grad_norm": 3.347407817840576,
"learning_rate": 3.241161683692571e-05,
"loss": 5.0355,
"step": 13260
},
{
"epoch": 1.061769883181309,
"grad_norm": 2.902289390563965,
"learning_rate": 3.2398245707867576e-05,
"loss": 5.1561,
"step": 13270
},
{
"epoch": 1.0625700112017924,
"grad_norm": 15.202888488769531,
"learning_rate": 3.238487457880944e-05,
"loss": 5.2149,
"step": 13280
},
{
"epoch": 1.0633701392222756,
"grad_norm": 3.353285551071167,
"learning_rate": 3.23715034497513e-05,
"loss": 4.8566,
"step": 13290
},
{
"epoch": 1.0641702672427589,
"grad_norm": 4.258049011230469,
"learning_rate": 3.2358132320693164e-05,
"loss": 5.0358,
"step": 13300
},
{
"epoch": 1.064970395263242,
"grad_norm": 2.727367639541626,
"learning_rate": 3.234476119163503e-05,
"loss": 4.9733,
"step": 13310
},
{
"epoch": 1.0657705232837253,
"grad_norm": 4.626856803894043,
"learning_rate": 3.233139006257688e-05,
"loss": 5.162,
"step": 13320
},
{
"epoch": 1.0665706513042086,
"grad_norm": 3.074949264526367,
"learning_rate": 3.2318018933518745e-05,
"loss": 5.1322,
"step": 13330
},
{
"epoch": 1.067370779324692,
"grad_norm": 4.150319576263428,
"learning_rate": 3.230464780446061e-05,
"loss": 5.0567,
"step": 13340
},
{
"epoch": 1.0681709073451753,
"grad_norm": 5.132182598114014,
"learning_rate": 3.229127667540247e-05,
"loss": 5.1743,
"step": 13350
},
{
"epoch": 1.0689710353656585,
"grad_norm": 4.4582839012146,
"learning_rate": 3.2277905546344333e-05,
"loss": 5.2236,
"step": 13360
},
{
"epoch": 1.0697711633861418,
"grad_norm": 2.9640562534332275,
"learning_rate": 3.2264534417286196e-05,
"loss": 5.0974,
"step": 13370
},
{
"epoch": 1.070571291406625,
"grad_norm": 2.8978335857391357,
"learning_rate": 3.225116328822806e-05,
"loss": 5.1591,
"step": 13380
},
{
"epoch": 1.0713714194271082,
"grad_norm": 2.773488759994507,
"learning_rate": 3.223779215916992e-05,
"loss": 5.0814,
"step": 13390
},
{
"epoch": 1.0721715474475917,
"grad_norm": 2.719374656677246,
"learning_rate": 3.2224421030111784e-05,
"loss": 5.0352,
"step": 13400
},
{
"epoch": 1.072971675468075,
"grad_norm": 2.918991804122925,
"learning_rate": 3.221104990105365e-05,
"loss": 5.0955,
"step": 13410
},
{
"epoch": 1.0737718034885582,
"grad_norm": 3.3438122272491455,
"learning_rate": 3.219767877199551e-05,
"loss": 5.0205,
"step": 13420
},
{
"epoch": 1.0745719315090414,
"grad_norm": 2.915687322616577,
"learning_rate": 3.218430764293737e-05,
"loss": 5.0708,
"step": 13430
},
{
"epoch": 1.0753720595295246,
"grad_norm": 2.3897652626037598,
"learning_rate": 3.2170936513879235e-05,
"loss": 5.0898,
"step": 13440
},
{
"epoch": 1.076172187550008,
"grad_norm": 2.5261075496673584,
"learning_rate": 3.21575653848211e-05,
"loss": 5.0002,
"step": 13450
},
{
"epoch": 1.0769723155704913,
"grad_norm": 4.839473247528076,
"learning_rate": 3.214419425576296e-05,
"loss": 5.1853,
"step": 13460
},
{
"epoch": 1.0777724435909746,
"grad_norm": 2.396831512451172,
"learning_rate": 3.213082312670482e-05,
"loss": 5.0397,
"step": 13470
},
{
"epoch": 1.0785725716114578,
"grad_norm": 4.165911674499512,
"learning_rate": 3.211745199764668e-05,
"loss": 5.2065,
"step": 13480
},
{
"epoch": 1.079372699631941,
"grad_norm": 2.74873423576355,
"learning_rate": 3.210408086858854e-05,
"loss": 5.2217,
"step": 13490
},
{
"epoch": 1.0801728276524245,
"grad_norm": 3.480703353881836,
"learning_rate": 3.2090709739530404e-05,
"loss": 4.9929,
"step": 13500
},
{
"epoch": 1.0809729556729077,
"grad_norm": 3.747199773788452,
"learning_rate": 3.207733861047227e-05,
"loss": 5.1235,
"step": 13510
},
{
"epoch": 1.081773083693391,
"grad_norm": 3.634990692138672,
"learning_rate": 3.206396748141413e-05,
"loss": 4.9466,
"step": 13520
},
{
"epoch": 1.0825732117138742,
"grad_norm": 3.6419565677642822,
"learning_rate": 3.205059635235599e-05,
"loss": 5.1791,
"step": 13530
},
{
"epoch": 1.0833733397343575,
"grad_norm": 3.413770914077759,
"learning_rate": 3.2037225223297855e-05,
"loss": 5.1777,
"step": 13540
},
{
"epoch": 1.0841734677548407,
"grad_norm": 5.771011829376221,
"learning_rate": 3.202385409423972e-05,
"loss": 5.0543,
"step": 13550
},
{
"epoch": 1.0849735957753242,
"grad_norm": 2.9491965770721436,
"learning_rate": 3.201048296518158e-05,
"loss": 4.9719,
"step": 13560
},
{
"epoch": 1.0857737237958074,
"grad_norm": 3.3095767498016357,
"learning_rate": 3.199711183612344e-05,
"loss": 5.2155,
"step": 13570
},
{
"epoch": 1.0865738518162906,
"grad_norm": 4.941197395324707,
"learning_rate": 3.1983740707065306e-05,
"loss": 5.073,
"step": 13580
},
{
"epoch": 1.0873739798367739,
"grad_norm": 2.3605270385742188,
"learning_rate": 3.197036957800717e-05,
"loss": 5.1746,
"step": 13590
},
{
"epoch": 1.088174107857257,
"grad_norm": 2.9810526371002197,
"learning_rate": 3.195699844894903e-05,
"loss": 5.157,
"step": 13600
},
{
"epoch": 1.0889742358777403,
"grad_norm": 2.767223358154297,
"learning_rate": 3.1943627319890894e-05,
"loss": 5.0831,
"step": 13610
},
{
"epoch": 1.0897743638982238,
"grad_norm": 6.959831714630127,
"learning_rate": 3.193025619083276e-05,
"loss": 4.883,
"step": 13620
},
{
"epoch": 1.090574491918707,
"grad_norm": 6.120983123779297,
"learning_rate": 3.191688506177461e-05,
"loss": 5.0368,
"step": 13630
},
{
"epoch": 1.0913746199391903,
"grad_norm": 2.680748462677002,
"learning_rate": 3.1903513932716475e-05,
"loss": 5.1996,
"step": 13640
},
{
"epoch": 1.0921747479596735,
"grad_norm": 4.287043571472168,
"learning_rate": 3.189014280365834e-05,
"loss": 4.9824,
"step": 13650
},
{
"epoch": 1.0929748759801567,
"grad_norm": 2.647005319595337,
"learning_rate": 3.18767716746002e-05,
"loss": 4.9845,
"step": 13660
},
{
"epoch": 1.0937750040006402,
"grad_norm": 2.9568288326263428,
"learning_rate": 3.186340054554206e-05,
"loss": 5.0804,
"step": 13670
},
{
"epoch": 1.0945751320211234,
"grad_norm": 4.118317127227783,
"learning_rate": 3.1850029416483926e-05,
"loss": 5.0375,
"step": 13680
},
{
"epoch": 1.0953752600416067,
"grad_norm": 3.7457168102264404,
"learning_rate": 3.183665828742579e-05,
"loss": 5.0193,
"step": 13690
},
{
"epoch": 1.09617538806209,
"grad_norm": 2.829274892807007,
"learning_rate": 3.182328715836765e-05,
"loss": 5.1896,
"step": 13700
},
{
"epoch": 1.0969755160825732,
"grad_norm": 3.568166971206665,
"learning_rate": 3.1809916029309514e-05,
"loss": 5.0527,
"step": 13710
},
{
"epoch": 1.0977756441030564,
"grad_norm": 2.8555142879486084,
"learning_rate": 3.179654490025138e-05,
"loss": 5.0873,
"step": 13720
},
{
"epoch": 1.0985757721235399,
"grad_norm": 2.9258460998535156,
"learning_rate": 3.178317377119324e-05,
"loss": 4.9293,
"step": 13730
},
{
"epoch": 1.099375900144023,
"grad_norm": 3.3614535331726074,
"learning_rate": 3.17698026421351e-05,
"loss": 4.992,
"step": 13740
},
{
"epoch": 1.1001760281645063,
"grad_norm": 3.859238624572754,
"learning_rate": 3.1756431513076965e-05,
"loss": 4.9695,
"step": 13750
},
{
"epoch": 1.1009761561849896,
"grad_norm": 2.9869918823242188,
"learning_rate": 3.174306038401883e-05,
"loss": 5.0833,
"step": 13760
},
{
"epoch": 1.1017762842054728,
"grad_norm": 2.874736785888672,
"learning_rate": 3.172968925496069e-05,
"loss": 5.0329,
"step": 13770
},
{
"epoch": 1.102576412225956,
"grad_norm": 3.2926857471466064,
"learning_rate": 3.171631812590255e-05,
"loss": 5.0431,
"step": 13780
},
{
"epoch": 1.1033765402464395,
"grad_norm": 3.0349912643432617,
"learning_rate": 3.1702946996844416e-05,
"loss": 5.0485,
"step": 13790
},
{
"epoch": 1.1041766682669227,
"grad_norm": 3.0139970779418945,
"learning_rate": 3.168957586778628e-05,
"loss": 5.0519,
"step": 13800
},
{
"epoch": 1.104976796287406,
"grad_norm": 3.5662894248962402,
"learning_rate": 3.167620473872814e-05,
"loss": 5.2053,
"step": 13810
},
{
"epoch": 1.1057769243078892,
"grad_norm": 3.348515033721924,
"learning_rate": 3.1662833609670004e-05,
"loss": 5.0588,
"step": 13820
},
{
"epoch": 1.1065770523283724,
"grad_norm": 2.439892292022705,
"learning_rate": 3.1649462480611866e-05,
"loss": 5.0894,
"step": 13830
},
{
"epoch": 1.107377180348856,
"grad_norm": 3.85776948928833,
"learning_rate": 3.163609135155373e-05,
"loss": 5.0345,
"step": 13840
},
{
"epoch": 1.1081773083693391,
"grad_norm": 2.6576287746429443,
"learning_rate": 3.162272022249559e-05,
"loss": 5.0607,
"step": 13850
},
{
"epoch": 1.1089774363898224,
"grad_norm": 2.6049861907958984,
"learning_rate": 3.1609349093437454e-05,
"loss": 5.0033,
"step": 13860
},
{
"epoch": 1.1097775644103056,
"grad_norm": 2.5496983528137207,
"learning_rate": 3.159597796437932e-05,
"loss": 5.2102,
"step": 13870
},
{
"epoch": 1.1105776924307889,
"grad_norm": 4.300173282623291,
"learning_rate": 3.158260683532118e-05,
"loss": 5.1137,
"step": 13880
},
{
"epoch": 1.1113778204512723,
"grad_norm": 2.4413559436798096,
"learning_rate": 3.156923570626304e-05,
"loss": 4.9222,
"step": 13890
},
{
"epoch": 1.1121779484717556,
"grad_norm": 2.4938573837280273,
"learning_rate": 3.1555864577204905e-05,
"loss": 5.1414,
"step": 13900
},
{
"epoch": 1.1129780764922388,
"grad_norm": 3.333294153213501,
"learning_rate": 3.154249344814677e-05,
"loss": 5.1243,
"step": 13910
},
{
"epoch": 1.113778204512722,
"grad_norm": 3.8718490600585938,
"learning_rate": 3.152912231908863e-05,
"loss": 5.2178,
"step": 13920
},
{
"epoch": 1.1145783325332053,
"grad_norm": 4.667349338531494,
"learning_rate": 3.151575119003049e-05,
"loss": 5.185,
"step": 13930
},
{
"epoch": 1.1153784605536885,
"grad_norm": 3.7269580364227295,
"learning_rate": 3.150238006097235e-05,
"loss": 4.9231,
"step": 13940
},
{
"epoch": 1.116178588574172,
"grad_norm": 3.8037633895874023,
"learning_rate": 3.148900893191421e-05,
"loss": 5.0166,
"step": 13950
},
{
"epoch": 1.1169787165946552,
"grad_norm": 3.2636613845825195,
"learning_rate": 3.1475637802856075e-05,
"loss": 5.0339,
"step": 13960
},
{
"epoch": 1.1177788446151384,
"grad_norm": 4.069303035736084,
"learning_rate": 3.146226667379794e-05,
"loss": 5.1558,
"step": 13970
},
{
"epoch": 1.1185789726356217,
"grad_norm": 3.160214424133301,
"learning_rate": 3.14488955447398e-05,
"loss": 5.0048,
"step": 13980
},
{
"epoch": 1.119379100656105,
"grad_norm": 2.7678611278533936,
"learning_rate": 3.143552441568166e-05,
"loss": 5.0992,
"step": 13990
},
{
"epoch": 1.1201792286765881,
"grad_norm": 3.162316083908081,
"learning_rate": 3.1422153286623525e-05,
"loss": 5.0398,
"step": 14000
},
{
"epoch": 1.1201792286765881,
"eval_loss": 5.684463977813721,
"eval_runtime": 11.9219,
"eval_samples_per_second": 3.355,
"eval_steps_per_second": 0.419,
"step": 14000
}
],
"logging_steps": 10,
"max_steps": 37494,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 7000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}