{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.5600896143382941,
"eval_steps": 7000,
"global_step": 7000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0,
"eval_loss": 9.306169509887695,
"eval_runtime": 10.9126,
"eval_samples_per_second": 3.665,
"eval_steps_per_second": 0.458,
"step": 0
},
{
"epoch": 0.0008001280204832773,
"grad_norm": 8.51533031463623,
"learning_rate": 3.5000000000000004e-06,
"loss": 8.786,
"step": 10
},
{
"epoch": 0.0016002560409665546,
"grad_norm": 10.90935230255127,
"learning_rate": 8.500000000000002e-06,
"loss": 8.3433,
"step": 20
},
{
"epoch": 0.002400384061449832,
"grad_norm": 7.269016265869141,
"learning_rate": 1.3500000000000001e-05,
"loss": 7.549,
"step": 30
},
{
"epoch": 0.003200512081933109,
"grad_norm": 8.790578842163086,
"learning_rate": 1.85e-05,
"loss": 7.2574,
"step": 40
},
{
"epoch": 0.004000640102416387,
"grad_norm": 6.52068567276001,
"learning_rate": 2.35e-05,
"loss": 7.0024,
"step": 50
},
{
"epoch": 0.004800768122899664,
"grad_norm": 6.902959823608398,
"learning_rate": 2.8499999999999998e-05,
"loss": 6.9074,
"step": 60
},
{
"epoch": 0.005600896143382941,
"grad_norm": 5.350945949554443,
"learning_rate": 3.35e-05,
"loss": 6.8765,
"step": 70
},
{
"epoch": 0.006401024163866218,
"grad_norm": 5.928489685058594,
"learning_rate": 3.85e-05,
"loss": 6.5663,
"step": 80
},
{
"epoch": 0.007201152184349496,
"grad_norm": 9.222543716430664,
"learning_rate": 4.35e-05,
"loss": 6.6131,
"step": 90
},
{
"epoch": 0.008001280204832774,
"grad_norm": 6.57027006149292,
"learning_rate": 4.85e-05,
"loss": 6.5829,
"step": 100
},
{
"epoch": 0.00880140822531605,
"grad_norm": 5.280848503112793,
"learning_rate": 4.999064020965931e-05,
"loss": 6.5996,
"step": 110
},
{
"epoch": 0.009601536245799328,
"grad_norm": 5.950971603393555,
"learning_rate": 4.997726908060117e-05,
"loss": 6.6075,
"step": 120
},
{
"epoch": 0.010401664266282605,
"grad_norm": 4.300549507141113,
"learning_rate": 4.996389795154303e-05,
"loss": 6.5074,
"step": 130
},
{
"epoch": 0.011201792286765882,
"grad_norm": 4.824333190917969,
"learning_rate": 4.9950526822484896e-05,
"loss": 6.6072,
"step": 140
},
{
"epoch": 0.01200192030724916,
"grad_norm": 5.4324116706848145,
"learning_rate": 4.993715569342676e-05,
"loss": 6.6183,
"step": 150
},
{
"epoch": 0.012802048327732437,
"grad_norm": 4.087579250335693,
"learning_rate": 4.992378456436862e-05,
"loss": 6.4806,
"step": 160
},
{
"epoch": 0.013602176348215714,
"grad_norm": 7.260207653045654,
"learning_rate": 4.9910413435310484e-05,
"loss": 6.3709,
"step": 170
},
{
"epoch": 0.014402304368698993,
"grad_norm": 4.145061016082764,
"learning_rate": 4.9897042306252346e-05,
"loss": 6.2951,
"step": 180
},
{
"epoch": 0.01520243238918227,
"grad_norm": 3.2026450634002686,
"learning_rate": 4.98836711771942e-05,
"loss": 6.3255,
"step": 190
},
{
"epoch": 0.016002560409665547,
"grad_norm": 3.443145751953125,
"learning_rate": 4.9870300048136065e-05,
"loss": 6.4894,
"step": 200
},
{
"epoch": 0.016802688430148822,
"grad_norm": 5.324231147766113,
"learning_rate": 4.985692891907793e-05,
"loss": 6.4312,
"step": 210
},
{
"epoch": 0.0176028164506321,
"grad_norm": 3.2833452224731445,
"learning_rate": 4.984355779001979e-05,
"loss": 6.513,
"step": 220
},
{
"epoch": 0.018402944471115377,
"grad_norm": 3.8984358310699463,
"learning_rate": 4.983018666096165e-05,
"loss": 6.1683,
"step": 230
},
{
"epoch": 0.019203072491598656,
"grad_norm": 4.183676719665527,
"learning_rate": 4.9816815531903516e-05,
"loss": 6.329,
"step": 240
},
{
"epoch": 0.020003200512081935,
"grad_norm": 3.136693239212036,
"learning_rate": 4.980344440284538e-05,
"loss": 6.466,
"step": 250
},
{
"epoch": 0.02080332853256521,
"grad_norm": 4.185967445373535,
"learning_rate": 4.979007327378724e-05,
"loss": 6.4613,
"step": 260
},
{
"epoch": 0.02160345655304849,
"grad_norm": 3.105653762817383,
"learning_rate": 4.9776702144729104e-05,
"loss": 6.3596,
"step": 270
},
{
"epoch": 0.022403584573531764,
"grad_norm": 3.927561044692993,
"learning_rate": 4.9763331015670967e-05,
"loss": 6.2604,
"step": 280
},
{
"epoch": 0.023203712594015043,
"grad_norm": 3.513439178466797,
"learning_rate": 4.974995988661283e-05,
"loss": 6.2747,
"step": 290
},
{
"epoch": 0.02400384061449832,
"grad_norm": 3.07377290725708,
"learning_rate": 4.973658875755469e-05,
"loss": 6.202,
"step": 300
},
{
"epoch": 0.024803968634981598,
"grad_norm": 3.045619249343872,
"learning_rate": 4.9723217628496555e-05,
"loss": 6.1022,
"step": 310
},
{
"epoch": 0.025604096655464873,
"grad_norm": 3.330648183822632,
"learning_rate": 4.970984649943842e-05,
"loss": 6.1544,
"step": 320
},
{
"epoch": 0.026404224675948152,
"grad_norm": 3.0299668312072754,
"learning_rate": 4.969647537038028e-05,
"loss": 6.3119,
"step": 330
},
{
"epoch": 0.027204352696431428,
"grad_norm": 3.687938928604126,
"learning_rate": 4.9683104241322136e-05,
"loss": 6.333,
"step": 340
},
{
"epoch": 0.028004480716914706,
"grad_norm": 4.0919413566589355,
"learning_rate": 4.9669733112264e-05,
"loss": 6.1711,
"step": 350
},
{
"epoch": 0.028804608737397985,
"grad_norm": 3.1327242851257324,
"learning_rate": 4.965636198320586e-05,
"loss": 6.3365,
"step": 360
},
{
"epoch": 0.02960473675788126,
"grad_norm": 4.531859874725342,
"learning_rate": 4.9642990854147724e-05,
"loss": 6.2121,
"step": 370
},
{
"epoch": 0.03040486477836454,
"grad_norm": 2.522672414779663,
"learning_rate": 4.962961972508959e-05,
"loss": 6.2388,
"step": 380
},
{
"epoch": 0.031204992798847815,
"grad_norm": 5.62153959274292,
"learning_rate": 4.961624859603145e-05,
"loss": 6.168,
"step": 390
},
{
"epoch": 0.032005120819331094,
"grad_norm": 3.522804021835327,
"learning_rate": 4.960287746697331e-05,
"loss": 6.1207,
"step": 400
},
{
"epoch": 0.03280524883981437,
"grad_norm": 7.260324478149414,
"learning_rate": 4.9589506337915175e-05,
"loss": 6.31,
"step": 410
},
{
"epoch": 0.033605376860297645,
"grad_norm": 4.309441566467285,
"learning_rate": 4.957613520885704e-05,
"loss": 6.1107,
"step": 420
},
{
"epoch": 0.034405504880780924,
"grad_norm": 3.2409913539886475,
"learning_rate": 4.95627640797989e-05,
"loss": 6.2082,
"step": 430
},
{
"epoch": 0.0352056329012642,
"grad_norm": 3.9414610862731934,
"learning_rate": 4.954939295074076e-05,
"loss": 6.2102,
"step": 440
},
{
"epoch": 0.03600576092174748,
"grad_norm": 2.441235303878784,
"learning_rate": 4.9536021821682626e-05,
"loss": 6.1023,
"step": 450
},
{
"epoch": 0.036805888942230754,
"grad_norm": 2.997591972351074,
"learning_rate": 4.952265069262449e-05,
"loss": 6.1147,
"step": 460
},
{
"epoch": 0.03760601696271403,
"grad_norm": 3.950436592102051,
"learning_rate": 4.950927956356635e-05,
"loss": 6.0725,
"step": 470
},
{
"epoch": 0.03840614498319731,
"grad_norm": 3.4340896606445312,
"learning_rate": 4.9495908434508214e-05,
"loss": 6.1336,
"step": 480
},
{
"epoch": 0.03920627300368059,
"grad_norm": 3.28839373588562,
"learning_rate": 4.948253730545007e-05,
"loss": 6.1709,
"step": 490
},
{
"epoch": 0.04000640102416387,
"grad_norm": 2.976365566253662,
"learning_rate": 4.946916617639193e-05,
"loss": 6.2074,
"step": 500
},
{
"epoch": 0.04080652904464714,
"grad_norm": 4.156027793884277,
"learning_rate": 4.9455795047333795e-05,
"loss": 6.1694,
"step": 510
},
{
"epoch": 0.04160665706513042,
"grad_norm": 3.4855797290802,
"learning_rate": 4.944242391827566e-05,
"loss": 6.1218,
"step": 520
},
{
"epoch": 0.0424067850856137,
"grad_norm": 4.489185333251953,
"learning_rate": 4.942905278921752e-05,
"loss": 6.1507,
"step": 530
},
{
"epoch": 0.04320691310609698,
"grad_norm": 3.2751166820526123,
"learning_rate": 4.941568166015938e-05,
"loss": 6.1055,
"step": 540
},
{
"epoch": 0.04400704112658025,
"grad_norm": 2.4234585762023926,
"learning_rate": 4.9402310531101246e-05,
"loss": 6.1755,
"step": 550
},
{
"epoch": 0.04480716914706353,
"grad_norm": 3.4436991214752197,
"learning_rate": 4.938893940204311e-05,
"loss": 6.1882,
"step": 560
},
{
"epoch": 0.04560729716754681,
"grad_norm": 3.3731908798217773,
"learning_rate": 4.937556827298497e-05,
"loss": 6.0648,
"step": 570
},
{
"epoch": 0.04640742518803009,
"grad_norm": 3.8733670711517334,
"learning_rate": 4.9362197143926834e-05,
"loss": 6.0621,
"step": 580
},
{
"epoch": 0.04720755320851336,
"grad_norm": 4.126636505126953,
"learning_rate": 4.9348826014868696e-05,
"loss": 6.122,
"step": 590
},
{
"epoch": 0.04800768122899664,
"grad_norm": 3.8605775833129883,
"learning_rate": 4.933545488581056e-05,
"loss": 5.9788,
"step": 600
},
{
"epoch": 0.048807809249479917,
"grad_norm": 2.9509966373443604,
"learning_rate": 4.932208375675242e-05,
"loss": 6.2045,
"step": 610
},
{
"epoch": 0.049607937269963195,
"grad_norm": 4.4266510009765625,
"learning_rate": 4.9308712627694285e-05,
"loss": 5.9981,
"step": 620
},
{
"epoch": 0.050408065290446474,
"grad_norm": 2.79042649269104,
"learning_rate": 4.929534149863615e-05,
"loss": 6.1882,
"step": 630
},
{
"epoch": 0.051208193310929746,
"grad_norm": 2.8986568450927734,
"learning_rate": 4.928197036957801e-05,
"loss": 6.1739,
"step": 640
},
{
"epoch": 0.052008321331413025,
"grad_norm": 4.294217586517334,
"learning_rate": 4.926859924051987e-05,
"loss": 6.0566,
"step": 650
},
{
"epoch": 0.052808449351896304,
"grad_norm": 8.848836898803711,
"learning_rate": 4.9255228111461735e-05,
"loss": 6.2994,
"step": 660
},
{
"epoch": 0.05360857737237958,
"grad_norm": 3.2204337120056152,
"learning_rate": 4.92418569824036e-05,
"loss": 6.0573,
"step": 670
},
{
"epoch": 0.054408705392862855,
"grad_norm": 4.775251865386963,
"learning_rate": 4.922848585334546e-05,
"loss": 5.9764,
"step": 680
},
{
"epoch": 0.055208833413346134,
"grad_norm": 3.5426905155181885,
"learning_rate": 4.921511472428732e-05,
"loss": 6.0402,
"step": 690
},
{
"epoch": 0.05600896143382941,
"grad_norm": 10.72481632232666,
"learning_rate": 4.9201743595229186e-05,
"loss": 6.0024,
"step": 700
},
{
"epoch": 0.05680908945431269,
"grad_norm": 2.441681385040283,
"learning_rate": 4.918837246617105e-05,
"loss": 6.1122,
"step": 710
},
{
"epoch": 0.05760921747479597,
"grad_norm": 3.375319480895996,
"learning_rate": 4.917500133711291e-05,
"loss": 6.058,
"step": 720
},
{
"epoch": 0.05840934549527924,
"grad_norm": 2.821507453918457,
"learning_rate": 4.9161630208054774e-05,
"loss": 6.0586,
"step": 730
},
{
"epoch": 0.05920947351576252,
"grad_norm": 2.8658957481384277,
"learning_rate": 4.914825907899664e-05,
"loss": 6.0115,
"step": 740
},
{
"epoch": 0.0600096015362458,
"grad_norm": 2.239774227142334,
"learning_rate": 4.91348879499385e-05,
"loss": 6.0669,
"step": 750
},
{
"epoch": 0.06080972955672908,
"grad_norm": 3.5249900817871094,
"learning_rate": 4.912151682088036e-05,
"loss": 6.1013,
"step": 760
},
{
"epoch": 0.06160985757721235,
"grad_norm": 2.790356159210205,
"learning_rate": 4.9108145691822225e-05,
"loss": 6.0099,
"step": 770
},
{
"epoch": 0.06240998559769563,
"grad_norm": 3.0729963779449463,
"learning_rate": 4.909477456276409e-05,
"loss": 6.1376,
"step": 780
},
{
"epoch": 0.06321011361817891,
"grad_norm": 2.9490275382995605,
"learning_rate": 4.908140343370595e-05,
"loss": 6.1457,
"step": 790
},
{
"epoch": 0.06401024163866219,
"grad_norm": 2.7475438117980957,
"learning_rate": 4.9068032304647806e-05,
"loss": 6.0041,
"step": 800
},
{
"epoch": 0.06481036965914547,
"grad_norm": 2.755703926086426,
"learning_rate": 4.905466117558967e-05,
"loss": 6.0242,
"step": 810
},
{
"epoch": 0.06561049767962875,
"grad_norm": 2.724515676498413,
"learning_rate": 4.904129004653153e-05,
"loss": 6.1827,
"step": 820
},
{
"epoch": 0.06641062570011202,
"grad_norm": 4.498260974884033,
"learning_rate": 4.9027918917473394e-05,
"loss": 6.0892,
"step": 830
},
{
"epoch": 0.06721075372059529,
"grad_norm": 2.4399070739746094,
"learning_rate": 4.901454778841526e-05,
"loss": 6.0197,
"step": 840
},
{
"epoch": 0.06801088174107857,
"grad_norm": 2.7584304809570312,
"learning_rate": 4.900117665935712e-05,
"loss": 5.9056,
"step": 850
},
{
"epoch": 0.06881100976156185,
"grad_norm": 2.8177144527435303,
"learning_rate": 4.898780553029898e-05,
"loss": 6.1484,
"step": 860
},
{
"epoch": 0.06961113778204513,
"grad_norm": 4.181133270263672,
"learning_rate": 4.8974434401240845e-05,
"loss": 5.9376,
"step": 870
},
{
"epoch": 0.0704112658025284,
"grad_norm": 3.677849769592285,
"learning_rate": 4.896106327218271e-05,
"loss": 6.0403,
"step": 880
},
{
"epoch": 0.07121139382301168,
"grad_norm": 3.1553192138671875,
"learning_rate": 4.894769214312457e-05,
"loss": 6.0488,
"step": 890
},
{
"epoch": 0.07201152184349496,
"grad_norm": 3.2580947875976562,
"learning_rate": 4.893432101406643e-05,
"loss": 6.1002,
"step": 900
},
{
"epoch": 0.07281164986397824,
"grad_norm": 6.328150749206543,
"learning_rate": 4.8920949885008296e-05,
"loss": 6.0225,
"step": 910
},
{
"epoch": 0.07361177788446151,
"grad_norm": 2.7467615604400635,
"learning_rate": 4.890757875595016e-05,
"loss": 5.9622,
"step": 920
},
{
"epoch": 0.07441190590494479,
"grad_norm": 2.86570405960083,
"learning_rate": 4.889420762689202e-05,
"loss": 5.9718,
"step": 930
},
{
"epoch": 0.07521203392542807,
"grad_norm": 2.544917106628418,
"learning_rate": 4.8880836497833884e-05,
"loss": 5.8697,
"step": 940
},
{
"epoch": 0.07601216194591134,
"grad_norm": 2.5245840549468994,
"learning_rate": 4.8867465368775746e-05,
"loss": 5.9973,
"step": 950
},
{
"epoch": 0.07681228996639462,
"grad_norm": 3.6830902099609375,
"learning_rate": 4.88540942397176e-05,
"loss": 5.943,
"step": 960
},
{
"epoch": 0.0776124179868779,
"grad_norm": 2.6643354892730713,
"learning_rate": 4.8840723110659465e-05,
"loss": 5.8958,
"step": 970
},
{
"epoch": 0.07841254600736118,
"grad_norm": 6.4623565673828125,
"learning_rate": 4.882735198160133e-05,
"loss": 6.0236,
"step": 980
},
{
"epoch": 0.07921267402784446,
"grad_norm": 2.186974048614502,
"learning_rate": 4.881398085254319e-05,
"loss": 6.0481,
"step": 990
},
{
"epoch": 0.08001280204832774,
"grad_norm": 2.4983859062194824,
"learning_rate": 4.880060972348505e-05,
"loss": 6.075,
"step": 1000
},
{
"epoch": 0.080812930068811,
"grad_norm": 2.778280258178711,
"learning_rate": 4.8787238594426916e-05,
"loss": 6.0757,
"step": 1010
},
{
"epoch": 0.08161305808929428,
"grad_norm": 2.706965923309326,
"learning_rate": 4.877386746536878e-05,
"loss": 6.1504,
"step": 1020
},
{
"epoch": 0.08241318610977756,
"grad_norm": 3.4069600105285645,
"learning_rate": 4.876049633631064e-05,
"loss": 6.0889,
"step": 1030
},
{
"epoch": 0.08321331413026084,
"grad_norm": 3.179551124572754,
"learning_rate": 4.8747125207252504e-05,
"loss": 6.0057,
"step": 1040
},
{
"epoch": 0.08401344215074412,
"grad_norm": 2.924018383026123,
"learning_rate": 4.873375407819437e-05,
"loss": 5.8406,
"step": 1050
},
{
"epoch": 0.0848135701712274,
"grad_norm": 3.103912115097046,
"learning_rate": 4.872038294913623e-05,
"loss": 6.0351,
"step": 1060
},
{
"epoch": 0.08561369819171068,
"grad_norm": 2.8037219047546387,
"learning_rate": 4.870701182007809e-05,
"loss": 6.0272,
"step": 1070
},
{
"epoch": 0.08641382621219396,
"grad_norm": 2.477062940597534,
"learning_rate": 4.8693640691019955e-05,
"loss": 5.9269,
"step": 1080
},
{
"epoch": 0.08721395423267723,
"grad_norm": 2.748488187789917,
"learning_rate": 4.868026956196182e-05,
"loss": 5.943,
"step": 1090
},
{
"epoch": 0.0880140822531605,
"grad_norm": 3.3991920948028564,
"learning_rate": 4.866689843290368e-05,
"loss": 6.1455,
"step": 1100
},
{
"epoch": 0.08881421027364378,
"grad_norm": 3.208509683609009,
"learning_rate": 4.8653527303845536e-05,
"loss": 5.9746,
"step": 1110
},
{
"epoch": 0.08961433829412706,
"grad_norm": 3.3378469944000244,
"learning_rate": 4.86401561747874e-05,
"loss": 5.9185,
"step": 1120
},
{
"epoch": 0.09041446631461034,
"grad_norm": 2.269606113433838,
"learning_rate": 4.862678504572926e-05,
"loss": 5.9369,
"step": 1130
},
{
"epoch": 0.09121459433509362,
"grad_norm": 2.749335765838623,
"learning_rate": 4.8613413916671124e-05,
"loss": 6.0648,
"step": 1140
},
{
"epoch": 0.0920147223555769,
"grad_norm": 2.821913480758667,
"learning_rate": 4.860004278761299e-05,
"loss": 5.952,
"step": 1150
},
{
"epoch": 0.09281485037606017,
"grad_norm": 2.640990734100342,
"learning_rate": 4.858667165855485e-05,
"loss": 6.0537,
"step": 1160
},
{
"epoch": 0.09361497839654345,
"grad_norm": 3.570896625518799,
"learning_rate": 4.857330052949671e-05,
"loss": 5.7721,
"step": 1170
},
{
"epoch": 0.09441510641702672,
"grad_norm": 3.245318651199341,
"learning_rate": 4.8559929400438575e-05,
"loss": 5.7305,
"step": 1180
},
{
"epoch": 0.09521523443751,
"grad_norm": 4.075076580047607,
"learning_rate": 4.854655827138044e-05,
"loss": 5.974,
"step": 1190
},
{
"epoch": 0.09601536245799328,
"grad_norm": 2.429893732070923,
"learning_rate": 4.85331871423223e-05,
"loss": 5.7828,
"step": 1200
},
{
"epoch": 0.09681549047847655,
"grad_norm": 2.7077040672302246,
"learning_rate": 4.851981601326416e-05,
"loss": 5.9143,
"step": 1210
},
{
"epoch": 0.09761561849895983,
"grad_norm": 2.767918586730957,
"learning_rate": 4.8506444884206026e-05,
"loss": 5.9449,
"step": 1220
},
{
"epoch": 0.09841574651944311,
"grad_norm": 2.4544034004211426,
"learning_rate": 4.849307375514789e-05,
"loss": 6.0034,
"step": 1230
},
{
"epoch": 0.09921587453992639,
"grad_norm": 5.215607643127441,
"learning_rate": 4.847970262608975e-05,
"loss": 5.867,
"step": 1240
},
{
"epoch": 0.10001600256040967,
"grad_norm": 2.7856080532073975,
"learning_rate": 4.8466331497031614e-05,
"loss": 6.0213,
"step": 1250
},
{
"epoch": 0.10081613058089295,
"grad_norm": 2.5528719425201416,
"learning_rate": 4.8452960367973476e-05,
"loss": 5.9634,
"step": 1260
},
{
"epoch": 0.10161625860137621,
"grad_norm": 2.4917409420013428,
"learning_rate": 4.843958923891533e-05,
"loss": 5.887,
"step": 1270
},
{
"epoch": 0.10241638662185949,
"grad_norm": 6.125699520111084,
"learning_rate": 4.8426218109857195e-05,
"loss": 6.1189,
"step": 1280
},
{
"epoch": 0.10321651464234277,
"grad_norm": 2.783156156539917,
"learning_rate": 4.841284698079906e-05,
"loss": 5.9064,
"step": 1290
},
{
"epoch": 0.10401664266282605,
"grad_norm": 3.611070156097412,
"learning_rate": 4.839947585174092e-05,
"loss": 5.9405,
"step": 1300
},
{
"epoch": 0.10481677068330933,
"grad_norm": 4.296909809112549,
"learning_rate": 4.838610472268278e-05,
"loss": 5.9067,
"step": 1310
},
{
"epoch": 0.10561689870379261,
"grad_norm": 2.4273040294647217,
"learning_rate": 4.8372733593624646e-05,
"loss": 5.888,
"step": 1320
},
{
"epoch": 0.10641702672427589,
"grad_norm": 2.6499924659729004,
"learning_rate": 4.835936246456651e-05,
"loss": 5.9683,
"step": 1330
},
{
"epoch": 0.10721715474475917,
"grad_norm": 3.1474297046661377,
"learning_rate": 4.834599133550837e-05,
"loss": 5.8946,
"step": 1340
},
{
"epoch": 0.10801728276524244,
"grad_norm": 3.5050199031829834,
"learning_rate": 4.8332620206450234e-05,
"loss": 5.9179,
"step": 1350
},
{
"epoch": 0.10881741078572571,
"grad_norm": 2.693700075149536,
"learning_rate": 4.8319249077392096e-05,
"loss": 5.7965,
"step": 1360
},
{
"epoch": 0.10961753880620899,
"grad_norm": 2.8202953338623047,
"learning_rate": 4.830587794833396e-05,
"loss": 5.9526,
"step": 1370
},
{
"epoch": 0.11041766682669227,
"grad_norm": 2.514862060546875,
"learning_rate": 4.829250681927582e-05,
"loss": 5.936,
"step": 1380
},
{
"epoch": 0.11121779484717555,
"grad_norm": 3.18804931640625,
"learning_rate": 4.8279135690217685e-05,
"loss": 5.9246,
"step": 1390
},
{
"epoch": 0.11201792286765883,
"grad_norm": 2.77697491645813,
"learning_rate": 4.826576456115955e-05,
"loss": 5.9576,
"step": 1400
},
{
"epoch": 0.1128180508881421,
"grad_norm": 2.762524127960205,
"learning_rate": 4.825239343210141e-05,
"loss": 5.9085,
"step": 1410
},
{
"epoch": 0.11361817890862538,
"grad_norm": 2.4407670497894287,
"learning_rate": 4.8239022303043266e-05,
"loss": 5.9518,
"step": 1420
},
{
"epoch": 0.11441830692910866,
"grad_norm": 3.1036713123321533,
"learning_rate": 4.822565117398513e-05,
"loss": 5.8412,
"step": 1430
},
{
"epoch": 0.11521843494959194,
"grad_norm": 3.319058418273926,
"learning_rate": 4.821228004492699e-05,
"loss": 5.9733,
"step": 1440
},
{
"epoch": 0.1160185629700752,
"grad_norm": 2.13468599319458,
"learning_rate": 4.8198908915868854e-05,
"loss": 5.9193,
"step": 1450
},
{
"epoch": 0.11681869099055849,
"grad_norm": 2.6057028770446777,
"learning_rate": 4.8185537786810717e-05,
"loss": 5.9807,
"step": 1460
},
{
"epoch": 0.11761881901104176,
"grad_norm": 2.7509753704071045,
"learning_rate": 4.817216665775258e-05,
"loss": 5.9534,
"step": 1470
},
{
"epoch": 0.11841894703152504,
"grad_norm": 2.111055850982666,
"learning_rate": 4.815879552869444e-05,
"loss": 5.9207,
"step": 1480
},
{
"epoch": 0.11921907505200832,
"grad_norm": 2.5271990299224854,
"learning_rate": 4.8145424399636305e-05,
"loss": 5.7148,
"step": 1490
},
{
"epoch": 0.1200192030724916,
"grad_norm": 2.814138174057007,
"learning_rate": 4.813205327057817e-05,
"loss": 5.9498,
"step": 1500
},
{
"epoch": 0.12081933109297488,
"grad_norm": 3.449355363845825,
"learning_rate": 4.811868214152003e-05,
"loss": 5.7814,
"step": 1510
},
{
"epoch": 0.12161945911345816,
"grad_norm": 2.813746213912964,
"learning_rate": 4.810531101246189e-05,
"loss": 5.9517,
"step": 1520
},
{
"epoch": 0.12241958713394142,
"grad_norm": 2.529242753982544,
"learning_rate": 4.8091939883403755e-05,
"loss": 5.8227,
"step": 1530
},
{
"epoch": 0.1232197151544247,
"grad_norm": 2.2425034046173096,
"learning_rate": 4.807856875434562e-05,
"loss": 6.1064,
"step": 1540
},
{
"epoch": 0.12401984317490798,
"grad_norm": 2.7732784748077393,
"learning_rate": 4.806519762528748e-05,
"loss": 5.8888,
"step": 1550
},
{
"epoch": 0.12481997119539126,
"grad_norm": 2.5558009147644043,
"learning_rate": 4.8051826496229343e-05,
"loss": 5.8185,
"step": 1560
},
{
"epoch": 0.12562009921587455,
"grad_norm": 2.884411096572876,
"learning_rate": 4.8038455367171206e-05,
"loss": 6.0534,
"step": 1570
},
{
"epoch": 0.12642022723635782,
"grad_norm": 2.5747668743133545,
"learning_rate": 4.802508423811307e-05,
"loss": 5.8186,
"step": 1580
},
{
"epoch": 0.12722035525684108,
"grad_norm": 2.324767827987671,
"learning_rate": 4.801171310905493e-05,
"loss": 5.8642,
"step": 1590
},
{
"epoch": 0.12802048327732438,
"grad_norm": 2.2255160808563232,
"learning_rate": 4.7998341979996794e-05,
"loss": 5.8559,
"step": 1600
},
{
"epoch": 0.12882061129780764,
"grad_norm": 2.97525954246521,
"learning_rate": 4.798497085093866e-05,
"loss": 5.8744,
"step": 1610
},
{
"epoch": 0.12962073931829093,
"grad_norm": 2.23962664604187,
"learning_rate": 4.797159972188052e-05,
"loss": 5.7545,
"step": 1620
},
{
"epoch": 0.1304208673387742,
"grad_norm": 3.6182124614715576,
"learning_rate": 4.795822859282238e-05,
"loss": 5.8872,
"step": 1630
},
{
"epoch": 0.1312209953592575,
"grad_norm": 4.068545341491699,
"learning_rate": 4.7944857463764245e-05,
"loss": 5.9008,
"step": 1640
},
{
"epoch": 0.13202112337974076,
"grad_norm": 3.627082109451294,
"learning_rate": 4.793148633470611e-05,
"loss": 5.8215,
"step": 1650
},
{
"epoch": 0.13282125140022405,
"grad_norm": 3.0080721378326416,
"learning_rate": 4.791811520564797e-05,
"loss": 5.9086,
"step": 1660
},
{
"epoch": 0.13362137942070731,
"grad_norm": 2.5463860034942627,
"learning_rate": 4.790474407658983e-05,
"loss": 5.776,
"step": 1670
},
{
"epoch": 0.13442150744119058,
"grad_norm": 2.212488889694214,
"learning_rate": 4.7891372947531696e-05,
"loss": 6.006,
"step": 1680
},
{
"epoch": 0.13522163546167387,
"grad_norm": 4.147563934326172,
"learning_rate": 4.787800181847356e-05,
"loss": 5.886,
"step": 1690
},
{
"epoch": 0.13602176348215714,
"grad_norm": 2.6021018028259277,
"learning_rate": 4.786463068941542e-05,
"loss": 5.9182,
"step": 1700
},
{
"epoch": 0.13682189150264043,
"grad_norm": 2.3109893798828125,
"learning_rate": 4.7851259560357284e-05,
"loss": 5.8084,
"step": 1710
},
{
"epoch": 0.1376220195231237,
"grad_norm": 2.8678529262542725,
"learning_rate": 4.7837888431299147e-05,
"loss": 6.0363,
"step": 1720
},
{
"epoch": 0.138422147543607,
"grad_norm": 2.1921958923339844,
"learning_rate": 4.7824517302241e-05,
"loss": 5.7667,
"step": 1730
},
{
"epoch": 0.13922227556409025,
"grad_norm": 2.6883316040039062,
"learning_rate": 4.7811146173182865e-05,
"loss": 5.7906,
"step": 1740
},
{
"epoch": 0.14002240358457352,
"grad_norm": 2.4079957008361816,
"learning_rate": 4.779777504412473e-05,
"loss": 5.7698,
"step": 1750
},
{
"epoch": 0.1408225316050568,
"grad_norm": 4.29390287399292,
"learning_rate": 4.778440391506659e-05,
"loss": 5.9639,
"step": 1760
},
{
"epoch": 0.14162265962554008,
"grad_norm": 4.133132457733154,
"learning_rate": 4.777103278600845e-05,
"loss": 6.0901,
"step": 1770
},
{
"epoch": 0.14242278764602337,
"grad_norm": 3.871561288833618,
"learning_rate": 4.7757661656950316e-05,
"loss": 5.7455,
"step": 1780
},
{
"epoch": 0.14322291566650663,
"grad_norm": 4.266111850738525,
"learning_rate": 4.774429052789218e-05,
"loss": 5.9971,
"step": 1790
},
{
"epoch": 0.14402304368698993,
"grad_norm": 2.9000513553619385,
"learning_rate": 4.773091939883404e-05,
"loss": 5.9025,
"step": 1800
},
{
"epoch": 0.1448231717074732,
"grad_norm": 2.549964189529419,
"learning_rate": 4.7717548269775904e-05,
"loss": 5.768,
"step": 1810
},
{
"epoch": 0.14562329972795648,
"grad_norm": 2.2882704734802246,
"learning_rate": 4.770417714071777e-05,
"loss": 6.022,
"step": 1820
},
{
"epoch": 0.14642342774843975,
"grad_norm": 2.6501784324645996,
"learning_rate": 4.769080601165963e-05,
"loss": 5.8539,
"step": 1830
},
{
"epoch": 0.14722355576892301,
"grad_norm": 2.3417108058929443,
"learning_rate": 4.767743488260149e-05,
"loss": 5.7734,
"step": 1840
},
{
"epoch": 0.1480236837894063,
"grad_norm": 2.2151668071746826,
"learning_rate": 4.7664063753543355e-05,
"loss": 5.84,
"step": 1850
},
{
"epoch": 0.14882381180988957,
"grad_norm": 3.114260196685791,
"learning_rate": 4.765069262448522e-05,
"loss": 5.9409,
"step": 1860
},
{
"epoch": 0.14962393983037287,
"grad_norm": 2.4931910037994385,
"learning_rate": 4.763732149542708e-05,
"loss": 5.9396,
"step": 1870
},
{
"epoch": 0.15042406785085613,
"grad_norm": 3.736487865447998,
"learning_rate": 4.7623950366368936e-05,
"loss": 5.7427,
"step": 1880
},
{
"epoch": 0.15122419587133942,
"grad_norm": 4.730785846710205,
"learning_rate": 4.76105792373108e-05,
"loss": 5.9181,
"step": 1890
},
{
"epoch": 0.1520243238918227,
"grad_norm": 2.9264132976531982,
"learning_rate": 4.759720810825266e-05,
"loss": 5.8967,
"step": 1900
},
{
"epoch": 0.15282445191230598,
"grad_norm": 3.2538132667541504,
"learning_rate": 4.7583836979194524e-05,
"loss": 5.8459,
"step": 1910
},
{
"epoch": 0.15362457993278925,
"grad_norm": 2.7208549976348877,
"learning_rate": 4.757046585013639e-05,
"loss": 5.7038,
"step": 1920
},
{
"epoch": 0.1544247079532725,
"grad_norm": 2.7510788440704346,
"learning_rate": 4.755709472107825e-05,
"loss": 5.8524,
"step": 1930
},
{
"epoch": 0.1552248359737558,
"grad_norm": 2.6565892696380615,
"learning_rate": 4.754372359202011e-05,
"loss": 5.6324,
"step": 1940
},
{
"epoch": 0.15602496399423907,
"grad_norm": 2.954798936843872,
"learning_rate": 4.7530352462961975e-05,
"loss": 5.8388,
"step": 1950
},
{
"epoch": 0.15682509201472236,
"grad_norm": 2.291714668273926,
"learning_rate": 4.751698133390384e-05,
"loss": 5.7504,
"step": 1960
},
{
"epoch": 0.15762522003520563,
"grad_norm": 2.1387598514556885,
"learning_rate": 4.75036102048457e-05,
"loss": 5.7556,
"step": 1970
},
{
"epoch": 0.15842534805568892,
"grad_norm": 2.290407180786133,
"learning_rate": 4.749023907578756e-05,
"loss": 5.7089,
"step": 1980
},
{
"epoch": 0.15922547607617218,
"grad_norm": 2.852696657180786,
"learning_rate": 4.7476867946729426e-05,
"loss": 5.8656,
"step": 1990
},
{
"epoch": 0.16002560409665548,
"grad_norm": 2.8190526962280273,
"learning_rate": 4.746349681767129e-05,
"loss": 6.0134,
"step": 2000
},
{
"epoch": 0.16082573211713874,
"grad_norm": 2.705008029937744,
"learning_rate": 4.745012568861315e-05,
"loss": 5.8713,
"step": 2010
},
{
"epoch": 0.161625860137622,
"grad_norm": 3.571394205093384,
"learning_rate": 4.7436754559555014e-05,
"loss": 5.8329,
"step": 2020
},
{
"epoch": 0.1624259881581053,
"grad_norm": 2.687455177307129,
"learning_rate": 4.7423383430496876e-05,
"loss": 5.8355,
"step": 2030
},
{
"epoch": 0.16322611617858857,
"grad_norm": 2.6158690452575684,
"learning_rate": 4.741001230143873e-05,
"loss": 5.6938,
"step": 2040
},
{
"epoch": 0.16402624419907186,
"grad_norm": 2.9657154083251953,
"learning_rate": 4.7396641172380595e-05,
"loss": 5.7514,
"step": 2050
},
{
"epoch": 0.16482637221955512,
"grad_norm": 2.310607433319092,
"learning_rate": 4.738327004332246e-05,
"loss": 5.7397,
"step": 2060
},
{
"epoch": 0.16562650024003842,
"grad_norm": 2.855271339416504,
"learning_rate": 4.736989891426432e-05,
"loss": 5.7645,
"step": 2070
},
{
"epoch": 0.16642662826052168,
"grad_norm": 2.778768301010132,
"learning_rate": 4.735652778520618e-05,
"loss": 5.9582,
"step": 2080
},
{
"epoch": 0.16722675628100497,
"grad_norm": 3.069973945617676,
"learning_rate": 4.7343156656148046e-05,
"loss": 5.8205,
"step": 2090
},
{
"epoch": 0.16802688430148824,
"grad_norm": 3.5799551010131836,
"learning_rate": 4.732978552708991e-05,
"loss": 5.9001,
"step": 2100
},
{
"epoch": 0.1688270123219715,
"grad_norm": 2.556668758392334,
"learning_rate": 4.731641439803177e-05,
"loss": 5.7258,
"step": 2110
},
{
"epoch": 0.1696271403424548,
"grad_norm": 2.7847707271575928,
"learning_rate": 4.7303043268973634e-05,
"loss": 5.9007,
"step": 2120
},
{
"epoch": 0.17042726836293806,
"grad_norm": 4.071508407592773,
"learning_rate": 4.7289672139915496e-05,
"loss": 5.7035,
"step": 2130
},
{
"epoch": 0.17122739638342135,
"grad_norm": 2.6188418865203857,
"learning_rate": 4.727630101085736e-05,
"loss": 5.651,
"step": 2140
},
{
"epoch": 0.17202752440390462,
"grad_norm": 1.952249526977539,
"learning_rate": 4.726292988179922e-05,
"loss": 6.1107,
"step": 2150
},
{
"epoch": 0.1728276524243879,
"grad_norm": 2.299018144607544,
"learning_rate": 4.7249558752741085e-05,
"loss": 5.7609,
"step": 2160
},
{
"epoch": 0.17362778044487118,
"grad_norm": 2.5578439235687256,
"learning_rate": 4.723618762368295e-05,
"loss": 5.792,
"step": 2170
},
{
"epoch": 0.17442790846535447,
"grad_norm": 3.9921529293060303,
"learning_rate": 4.722281649462481e-05,
"loss": 5.7233,
"step": 2180
},
{
"epoch": 0.17522803648583773,
"grad_norm": 2.5521302223205566,
"learning_rate": 4.7209445365566666e-05,
"loss": 5.807,
"step": 2190
},
{
"epoch": 0.176028164506321,
"grad_norm": 2.71401047706604,
"learning_rate": 4.719607423650853e-05,
"loss": 5.6689,
"step": 2200
},
{
"epoch": 0.1768282925268043,
"grad_norm": 3.782607316970825,
"learning_rate": 4.718270310745039e-05,
"loss": 5.734,
"step": 2210
},
{
"epoch": 0.17762842054728756,
"grad_norm": 2.57356333732605,
"learning_rate": 4.7169331978392254e-05,
"loss": 5.8101,
"step": 2220
},
{
"epoch": 0.17842854856777085,
"grad_norm": 2.7005815505981445,
"learning_rate": 4.715596084933412e-05,
"loss": 6.0603,
"step": 2230
},
{
"epoch": 0.17922867658825412,
"grad_norm": 2.081550359725952,
"learning_rate": 4.714258972027598e-05,
"loss": 5.7677,
"step": 2240
},
{
"epoch": 0.1800288046087374,
"grad_norm": 3.6565728187561035,
"learning_rate": 4.712921859121784e-05,
"loss": 5.9672,
"step": 2250
},
{
"epoch": 0.18082893262922067,
"grad_norm": 2.4702320098876953,
"learning_rate": 4.7115847462159705e-05,
"loss": 5.8397,
"step": 2260
},
{
"epoch": 0.18162906064970397,
"grad_norm": 3.335736036300659,
"learning_rate": 4.710247633310157e-05,
"loss": 5.7021,
"step": 2270
},
{
"epoch": 0.18242918867018723,
"grad_norm": 3.3939075469970703,
"learning_rate": 4.708910520404343e-05,
"loss": 5.8464,
"step": 2280
},
{
"epoch": 0.1832293166906705,
"grad_norm": 2.4869279861450195,
"learning_rate": 4.707573407498529e-05,
"loss": 5.6904,
"step": 2290
},
{
"epoch": 0.1840294447111538,
"grad_norm": 2.4240360260009766,
"learning_rate": 4.7062362945927155e-05,
"loss": 5.7227,
"step": 2300
},
{
"epoch": 0.18482957273163705,
"grad_norm": 2.428786039352417,
"learning_rate": 4.704899181686902e-05,
"loss": 5.8295,
"step": 2310
},
{
"epoch": 0.18562970075212035,
"grad_norm": 3.3214187622070312,
"learning_rate": 4.703562068781088e-05,
"loss": 5.8341,
"step": 2320
},
{
"epoch": 0.1864298287726036,
"grad_norm": 3.2146456241607666,
"learning_rate": 4.7022249558752744e-05,
"loss": 5.7217,
"step": 2330
},
{
"epoch": 0.1872299567930869,
"grad_norm": 4.442914009094238,
"learning_rate": 4.7008878429694606e-05,
"loss": 5.9003,
"step": 2340
},
{
"epoch": 0.18803008481357017,
"grad_norm": 1.9268267154693604,
"learning_rate": 4.699550730063646e-05,
"loss": 5.8292,
"step": 2350
},
{
"epoch": 0.18883021283405343,
"grad_norm": 3.130021095275879,
"learning_rate": 4.6982136171578325e-05,
"loss": 5.6864,
"step": 2360
},
{
"epoch": 0.18963034085453673,
"grad_norm": 2.8835690021514893,
"learning_rate": 4.696876504252019e-05,
"loss": 5.829,
"step": 2370
},
{
"epoch": 0.19043046887502,
"grad_norm": 2.4171135425567627,
"learning_rate": 4.695539391346205e-05,
"loss": 5.7972,
"step": 2380
},
{
"epoch": 0.19123059689550329,
"grad_norm": 3.782817840576172,
"learning_rate": 4.694202278440391e-05,
"loss": 5.8497,
"step": 2390
},
{
"epoch": 0.19203072491598655,
"grad_norm": 2.475249767303467,
"learning_rate": 4.6928651655345776e-05,
"loss": 5.9237,
"step": 2400
},
{
"epoch": 0.19283085293646984,
"grad_norm": 2.5809242725372314,
"learning_rate": 4.691528052628764e-05,
"loss": 5.7756,
"step": 2410
},
{
"epoch": 0.1936309809569531,
"grad_norm": 2.6922059059143066,
"learning_rate": 4.69019093972295e-05,
"loss": 5.9326,
"step": 2420
},
{
"epoch": 0.1944311089774364,
"grad_norm": 2.7542431354522705,
"learning_rate": 4.6888538268171364e-05,
"loss": 5.6279,
"step": 2430
},
{
"epoch": 0.19523123699791967,
"grad_norm": 2.4063303470611572,
"learning_rate": 4.6875167139113226e-05,
"loss": 5.91,
"step": 2440
},
{
"epoch": 0.19603136501840293,
"grad_norm": 4.855547904968262,
"learning_rate": 4.686179601005509e-05,
"loss": 5.7286,
"step": 2450
},
{
"epoch": 0.19683149303888622,
"grad_norm": 2.9875595569610596,
"learning_rate": 4.684842488099695e-05,
"loss": 5.8299,
"step": 2460
},
{
"epoch": 0.1976316210593695,
"grad_norm": 4.467639923095703,
"learning_rate": 4.6835053751938814e-05,
"loss": 5.8469,
"step": 2470
},
{
"epoch": 0.19843174907985278,
"grad_norm": 2.2144124507904053,
"learning_rate": 4.682168262288068e-05,
"loss": 5.7871,
"step": 2480
},
{
"epoch": 0.19923187710033605,
"grad_norm": 2.4507012367248535,
"learning_rate": 4.680831149382254e-05,
"loss": 5.7529,
"step": 2490
},
{
"epoch": 0.20003200512081934,
"grad_norm": 2.208648681640625,
"learning_rate": 4.67949403647644e-05,
"loss": 5.7265,
"step": 2500
},
{
"epoch": 0.2008321331413026,
"grad_norm": 2.560302257537842,
"learning_rate": 4.6781569235706265e-05,
"loss": 5.7842,
"step": 2510
},
{
"epoch": 0.2016322611617859,
"grad_norm": 2.354292154312134,
"learning_rate": 4.676819810664813e-05,
"loss": 5.8468,
"step": 2520
},
{
"epoch": 0.20243238918226916,
"grad_norm": 2.9559860229492188,
"learning_rate": 4.675482697758999e-05,
"loss": 5.7003,
"step": 2530
},
{
"epoch": 0.20323251720275243,
"grad_norm": 3.251077651977539,
"learning_rate": 4.674145584853185e-05,
"loss": 5.8129,
"step": 2540
},
{
"epoch": 0.20403264522323572,
"grad_norm": 2.7863471508026123,
"learning_rate": 4.6728084719473716e-05,
"loss": 5.6814,
"step": 2550
},
{
"epoch": 0.20483277324371899,
"grad_norm": 2.9006989002227783,
"learning_rate": 4.671471359041558e-05,
"loss": 5.8292,
"step": 2560
},
{
"epoch": 0.20563290126420228,
"grad_norm": 2.930689573287964,
"learning_rate": 4.670134246135744e-05,
"loss": 5.8825,
"step": 2570
},
{
"epoch": 0.20643302928468554,
"grad_norm": 2.3105032444000244,
"learning_rate": 4.6687971332299304e-05,
"loss": 5.7039,
"step": 2580
},
{
"epoch": 0.20723315730516884,
"grad_norm": 3.1141879558563232,
"learning_rate": 4.667460020324117e-05,
"loss": 5.8692,
"step": 2590
},
{
"epoch": 0.2080332853256521,
"grad_norm": 3.5017199516296387,
"learning_rate": 4.666122907418303e-05,
"loss": 5.7922,
"step": 2600
},
{
"epoch": 0.2088334133461354,
"grad_norm": 2.657975912094116,
"learning_rate": 4.664785794512489e-05,
"loss": 5.7736,
"step": 2610
},
{
"epoch": 0.20963354136661866,
"grad_norm": 3.246952772140503,
"learning_rate": 4.6634486816066755e-05,
"loss": 5.768,
"step": 2620
},
{
"epoch": 0.21043366938710192,
"grad_norm": 6.832335948944092,
"learning_rate": 4.662111568700862e-05,
"loss": 5.6752,
"step": 2630
},
{
"epoch": 0.21123379740758522,
"grad_norm": 3.2479753494262695,
"learning_rate": 4.660774455795048e-05,
"loss": 5.8015,
"step": 2640
},
{
"epoch": 0.21203392542806848,
"grad_norm": 2.809082508087158,
"learning_rate": 4.659437342889234e-05,
"loss": 5.8663,
"step": 2650
},
{
"epoch": 0.21283405344855177,
"grad_norm": 3.7948036193847656,
"learning_rate": 4.65810022998342e-05,
"loss": 5.889,
"step": 2660
},
{
"epoch": 0.21363418146903504,
"grad_norm": 2.836090564727783,
"learning_rate": 4.656763117077606e-05,
"loss": 5.7516,
"step": 2670
},
{
"epoch": 0.21443430948951833,
"grad_norm": 3.0940232276916504,
"learning_rate": 4.6554260041717924e-05,
"loss": 5.7033,
"step": 2680
},
{
"epoch": 0.2152344375100016,
"grad_norm": 2.436757802963257,
"learning_rate": 4.654088891265979e-05,
"loss": 5.746,
"step": 2690
},
{
"epoch": 0.2160345655304849,
"grad_norm": 2.4339609146118164,
"learning_rate": 4.652751778360165e-05,
"loss": 5.828,
"step": 2700
},
{
"epoch": 0.21683469355096816,
"grad_norm": 2.379366874694824,
"learning_rate": 4.651414665454351e-05,
"loss": 5.719,
"step": 2710
},
{
"epoch": 0.21763482157145142,
"grad_norm": 2.1722371578216553,
"learning_rate": 4.6500775525485375e-05,
"loss": 5.7875,
"step": 2720
},
{
"epoch": 0.2184349495919347,
"grad_norm": 3.633279800415039,
"learning_rate": 4.648740439642724e-05,
"loss": 5.802,
"step": 2730
},
{
"epoch": 0.21923507761241798,
"grad_norm": 2.4091219902038574,
"learning_rate": 4.64740332673691e-05,
"loss": 5.8197,
"step": 2740
},
{
"epoch": 0.22003520563290127,
"grad_norm": 2.7289021015167236,
"learning_rate": 4.646066213831096e-05,
"loss": 5.9445,
"step": 2750
},
{
"epoch": 0.22083533365338454,
"grad_norm": 2.376481294631958,
"learning_rate": 4.6447291009252826e-05,
"loss": 5.9943,
"step": 2760
},
{
"epoch": 0.22163546167386783,
"grad_norm": 2.6542563438415527,
"learning_rate": 4.643391988019469e-05,
"loss": 5.6049,
"step": 2770
},
{
"epoch": 0.2224355896943511,
"grad_norm": 2.320472240447998,
"learning_rate": 4.642054875113655e-05,
"loss": 5.7637,
"step": 2780
},
{
"epoch": 0.2232357177148344,
"grad_norm": 2.8923239707946777,
"learning_rate": 4.6407177622078414e-05,
"loss": 5.9666,
"step": 2790
},
{
"epoch": 0.22403584573531765,
"grad_norm": 4.277271270751953,
"learning_rate": 4.6393806493020276e-05,
"loss": 5.8393,
"step": 2800
},
{
"epoch": 0.22483597375580092,
"grad_norm": 2.797428607940674,
"learning_rate": 4.638043536396213e-05,
"loss": 5.759,
"step": 2810
},
{
"epoch": 0.2256361017762842,
"grad_norm": 2.1849517822265625,
"learning_rate": 4.6367064234903995e-05,
"loss": 5.7514,
"step": 2820
},
{
"epoch": 0.22643622979676747,
"grad_norm": 2.8607492446899414,
"learning_rate": 4.635369310584586e-05,
"loss": 5.7545,
"step": 2830
},
{
"epoch": 0.22723635781725077,
"grad_norm": 3.722041130065918,
"learning_rate": 4.634032197678772e-05,
"loss": 5.8011,
"step": 2840
},
{
"epoch": 0.22803648583773403,
"grad_norm": 2.8563833236694336,
"learning_rate": 4.632695084772958e-05,
"loss": 5.8569,
"step": 2850
},
{
"epoch": 0.22883661385821732,
"grad_norm": 3.5724806785583496,
"learning_rate": 4.6313579718671446e-05,
"loss": 5.9649,
"step": 2860
},
{
"epoch": 0.2296367418787006,
"grad_norm": 2.380469560623169,
"learning_rate": 4.630020858961331e-05,
"loss": 5.7467,
"step": 2870
},
{
"epoch": 0.23043686989918388,
"grad_norm": 3.1629838943481445,
"learning_rate": 4.628683746055517e-05,
"loss": 5.642,
"step": 2880
},
{
"epoch": 0.23123699791966715,
"grad_norm": 2.1239373683929443,
"learning_rate": 4.6273466331497034e-05,
"loss": 5.6483,
"step": 2890
},
{
"epoch": 0.2320371259401504,
"grad_norm": 3.049079418182373,
"learning_rate": 4.6260095202438897e-05,
"loss": 5.9736,
"step": 2900
},
{
"epoch": 0.2328372539606337,
"grad_norm": 2.556830406188965,
"learning_rate": 4.624672407338076e-05,
"loss": 5.6037,
"step": 2910
},
{
"epoch": 0.23363738198111697,
"grad_norm": 2.8762035369873047,
"learning_rate": 4.623335294432262e-05,
"loss": 5.6345,
"step": 2920
},
{
"epoch": 0.23443751000160026,
"grad_norm": 2.11167573928833,
"learning_rate": 4.6219981815264485e-05,
"loss": 5.7822,
"step": 2930
},
{
"epoch": 0.23523763802208353,
"grad_norm": 4.623869895935059,
"learning_rate": 4.620661068620635e-05,
"loss": 5.7063,
"step": 2940
},
{
"epoch": 0.23603776604256682,
"grad_norm": 2.4420578479766846,
"learning_rate": 4.619323955714821e-05,
"loss": 5.686,
"step": 2950
},
{
"epoch": 0.2368378940630501,
"grad_norm": 2.6543869972229004,
"learning_rate": 4.617986842809007e-05,
"loss": 5.7802,
"step": 2960
},
{
"epoch": 0.23763802208353338,
"grad_norm": 2.6264312267303467,
"learning_rate": 4.616649729903193e-05,
"loss": 5.6667,
"step": 2970
},
{
"epoch": 0.23843815010401664,
"grad_norm": 2.4579195976257324,
"learning_rate": 4.615312616997379e-05,
"loss": 5.6738,
"step": 2980
},
{
"epoch": 0.2392382781244999,
"grad_norm": 2.299448251724243,
"learning_rate": 4.6139755040915654e-05,
"loss": 5.8622,
"step": 2990
},
{
"epoch": 0.2400384061449832,
"grad_norm": 3.6527328491210938,
"learning_rate": 4.612638391185752e-05,
"loss": 5.6346,
"step": 3000
},
{
"epoch": 0.24083853416546647,
"grad_norm": 2.217876434326172,
"learning_rate": 4.611301278279938e-05,
"loss": 5.7892,
"step": 3010
},
{
"epoch": 0.24163866218594976,
"grad_norm": 3.500544309616089,
"learning_rate": 4.609964165374124e-05,
"loss": 5.8026,
"step": 3020
},
{
"epoch": 0.24243879020643302,
"grad_norm": 3.1694483757019043,
"learning_rate": 4.6086270524683105e-05,
"loss": 5.827,
"step": 3030
},
{
"epoch": 0.24323891822691632,
"grad_norm": 2.899625778198242,
"learning_rate": 4.607289939562497e-05,
"loss": 5.7384,
"step": 3040
},
{
"epoch": 0.24403904624739958,
"grad_norm": 2.8286776542663574,
"learning_rate": 4.605952826656683e-05,
"loss": 5.7629,
"step": 3050
},
{
"epoch": 0.24483917426788285,
"grad_norm": 2.7585489749908447,
"learning_rate": 4.604615713750869e-05,
"loss": 5.7462,
"step": 3060
},
{
"epoch": 0.24563930228836614,
"grad_norm": 2.2017667293548584,
"learning_rate": 4.6032786008450555e-05,
"loss": 5.844,
"step": 3070
},
{
"epoch": 0.2464394303088494,
"grad_norm": 4.679725170135498,
"learning_rate": 4.601941487939242e-05,
"loss": 5.7254,
"step": 3080
},
{
"epoch": 0.2472395583293327,
"grad_norm": 2.923884868621826,
"learning_rate": 4.600604375033428e-05,
"loss": 5.703,
"step": 3090
},
{
"epoch": 0.24803968634981596,
"grad_norm": 2.2205090522766113,
"learning_rate": 4.5992672621276144e-05,
"loss": 5.7185,
"step": 3100
},
{
"epoch": 0.24883981437029926,
"grad_norm": 2.852313280105591,
"learning_rate": 4.5979301492218006e-05,
"loss": 5.5653,
"step": 3110
},
{
"epoch": 0.24963994239078252,
"grad_norm": 2.7683911323547363,
"learning_rate": 4.596593036315986e-05,
"loss": 5.7262,
"step": 3120
},
{
"epoch": 0.2504400704112658,
"grad_norm": 3.1315665245056152,
"learning_rate": 4.5952559234101725e-05,
"loss": 5.7524,
"step": 3130
},
{
"epoch": 0.2512401984317491,
"grad_norm": 2.5233592987060547,
"learning_rate": 4.593918810504359e-05,
"loss": 5.7443,
"step": 3140
},
{
"epoch": 0.25204032645223234,
"grad_norm": 2.3802831172943115,
"learning_rate": 4.592581697598545e-05,
"loss": 5.8091,
"step": 3150
},
{
"epoch": 0.25284045447271564,
"grad_norm": 2.378218412399292,
"learning_rate": 4.591244584692731e-05,
"loss": 5.7741,
"step": 3160
},
{
"epoch": 0.25364058249319893,
"grad_norm": 4.712483882904053,
"learning_rate": 4.5899074717869176e-05,
"loss": 5.8643,
"step": 3170
},
{
"epoch": 0.25444071051368217,
"grad_norm": 2.798752784729004,
"learning_rate": 4.588570358881104e-05,
"loss": 5.7984,
"step": 3180
},
{
"epoch": 0.25524083853416546,
"grad_norm": 2.302037477493286,
"learning_rate": 4.58723324597529e-05,
"loss": 5.6548,
"step": 3190
},
{
"epoch": 0.25604096655464875,
"grad_norm": 2.8621273040771484,
"learning_rate": 4.5858961330694764e-05,
"loss": 5.6875,
"step": 3200
},
{
"epoch": 0.25684109457513205,
"grad_norm": 2.9079480171203613,
"learning_rate": 4.5845590201636626e-05,
"loss": 5.8801,
"step": 3210
},
{
"epoch": 0.2576412225956153,
"grad_norm": 2.9576847553253174,
"learning_rate": 4.583221907257849e-05,
"loss": 5.6646,
"step": 3220
},
{
"epoch": 0.2584413506160986,
"grad_norm": 4.085951805114746,
"learning_rate": 4.581884794352035e-05,
"loss": 5.9078,
"step": 3230
},
{
"epoch": 0.25924147863658187,
"grad_norm": 2.622903347015381,
"learning_rate": 4.5805476814462214e-05,
"loss": 5.6821,
"step": 3240
},
{
"epoch": 0.2600416066570651,
"grad_norm": 1.794255256652832,
"learning_rate": 4.579210568540408e-05,
"loss": 5.751,
"step": 3250
},
{
"epoch": 0.2608417346775484,
"grad_norm": 3.074042558670044,
"learning_rate": 4.577873455634594e-05,
"loss": 5.7864,
"step": 3260
},
{
"epoch": 0.2616418626980317,
"grad_norm": 2.3138844966888428,
"learning_rate": 4.57653634272878e-05,
"loss": 5.693,
"step": 3270
},
{
"epoch": 0.262441990718515,
"grad_norm": 3.8877549171447754,
"learning_rate": 4.5751992298229665e-05,
"loss": 5.7154,
"step": 3280
},
{
"epoch": 0.2632421187389982,
"grad_norm": 2.9623680114746094,
"learning_rate": 4.573862116917153e-05,
"loss": 5.7514,
"step": 3290
},
{
"epoch": 0.2640422467594815,
"grad_norm": 2.840122938156128,
"learning_rate": 4.572525004011339e-05,
"loss": 5.7397,
"step": 3300
},
{
"epoch": 0.2648423747799648,
"grad_norm": 2.9699277877807617,
"learning_rate": 4.571187891105525e-05,
"loss": 5.7626,
"step": 3310
},
{
"epoch": 0.2656425028004481,
"grad_norm": 2.6493773460388184,
"learning_rate": 4.5698507781997116e-05,
"loss": 5.7619,
"step": 3320
},
{
"epoch": 0.26644263082093134,
"grad_norm": 2.283259868621826,
"learning_rate": 4.568513665293898e-05,
"loss": 5.8409,
"step": 3330
},
{
"epoch": 0.26724275884141463,
"grad_norm": 1.9254164695739746,
"learning_rate": 4.567176552388084e-05,
"loss": 5.8218,
"step": 3340
},
{
"epoch": 0.2680428868618979,
"grad_norm": 2.382345676422119,
"learning_rate": 4.5658394394822704e-05,
"loss": 5.6865,
"step": 3350
},
{
"epoch": 0.26884301488238116,
"grad_norm": 2.6039271354675293,
"learning_rate": 4.564502326576457e-05,
"loss": 5.7254,
"step": 3360
},
{
"epoch": 0.26964314290286445,
"grad_norm": 2.0948996543884277,
"learning_rate": 4.563165213670643e-05,
"loss": 5.7589,
"step": 3370
},
{
"epoch": 0.27044327092334774,
"grad_norm": 2.939955711364746,
"learning_rate": 4.561828100764829e-05,
"loss": 5.8298,
"step": 3380
},
{
"epoch": 0.27124339894383104,
"grad_norm": 2.748307466506958,
"learning_rate": 4.5604909878590155e-05,
"loss": 5.8505,
"step": 3390
},
{
"epoch": 0.2720435269643143,
"grad_norm": 2.7122459411621094,
"learning_rate": 4.559153874953202e-05,
"loss": 5.9027,
"step": 3400
},
{
"epoch": 0.27284365498479757,
"grad_norm": 3.6053593158721924,
"learning_rate": 4.557816762047388e-05,
"loss": 5.6746,
"step": 3410
},
{
"epoch": 0.27364378300528086,
"grad_norm": 4.433299541473389,
"learning_rate": 4.556479649141574e-05,
"loss": 5.7713,
"step": 3420
},
{
"epoch": 0.2744439110257641,
"grad_norm": 2.5253539085388184,
"learning_rate": 4.55514253623576e-05,
"loss": 5.8219,
"step": 3430
},
{
"epoch": 0.2752440390462474,
"grad_norm": 4.9358062744140625,
"learning_rate": 4.553805423329946e-05,
"loss": 5.7971,
"step": 3440
},
{
"epoch": 0.2760441670667307,
"grad_norm": 2.6247594356536865,
"learning_rate": 4.5524683104241324e-05,
"loss": 5.1528,
"step": 3450
},
{
"epoch": 0.276844295087214,
"grad_norm": 2.8152048587799072,
"learning_rate": 4.551131197518319e-05,
"loss": 5.7955,
"step": 3460
},
{
"epoch": 0.2776444231076972,
"grad_norm": 2.143275499343872,
"learning_rate": 4.549794084612505e-05,
"loss": 5.6875,
"step": 3470
},
{
"epoch": 0.2784445511281805,
"grad_norm": 2.9896023273468018,
"learning_rate": 4.548456971706691e-05,
"loss": 5.7981,
"step": 3480
},
{
"epoch": 0.2792446791486638,
"grad_norm": 3.5231759548187256,
"learning_rate": 4.5471198588008775e-05,
"loss": 5.7343,
"step": 3490
},
{
"epoch": 0.28004480716914704,
"grad_norm": 2.391721487045288,
"learning_rate": 4.545782745895064e-05,
"loss": 5.6821,
"step": 3500
},
{
"epoch": 0.28084493518963033,
"grad_norm": 2.414992332458496,
"learning_rate": 4.54444563298925e-05,
"loss": 5.7357,
"step": 3510
},
{
"epoch": 0.2816450632101136,
"grad_norm": 2.7502214908599854,
"learning_rate": 4.543108520083436e-05,
"loss": 5.6511,
"step": 3520
},
{
"epoch": 0.2824451912305969,
"grad_norm": 2.1601436138153076,
"learning_rate": 4.5417714071776226e-05,
"loss": 5.6249,
"step": 3530
},
{
"epoch": 0.28324531925108015,
"grad_norm": 2.89013671875,
"learning_rate": 4.540434294271809e-05,
"loss": 5.7583,
"step": 3540
},
{
"epoch": 0.28404544727156344,
"grad_norm": 2.4915778636932373,
"learning_rate": 4.539097181365995e-05,
"loss": 5.6957,
"step": 3550
},
{
"epoch": 0.28484557529204674,
"grad_norm": 5.053386688232422,
"learning_rate": 4.5377600684601814e-05,
"loss": 5.632,
"step": 3560
},
{
"epoch": 0.28564570331253003,
"grad_norm": 2.6207687854766846,
"learning_rate": 4.5364229555543676e-05,
"loss": 5.8514,
"step": 3570
},
{
"epoch": 0.28644583133301327,
"grad_norm": 4.157670497894287,
"learning_rate": 4.535085842648553e-05,
"loss": 5.7608,
"step": 3580
},
{
"epoch": 0.28724595935349656,
"grad_norm": 3.4464797973632812,
"learning_rate": 4.5337487297427395e-05,
"loss": 5.6737,
"step": 3590
},
{
"epoch": 0.28804608737397985,
"grad_norm": 4.255002498626709,
"learning_rate": 4.532411616836926e-05,
"loss": 5.7977,
"step": 3600
},
{
"epoch": 0.2888462153944631,
"grad_norm": 2.7926547527313232,
"learning_rate": 4.531074503931112e-05,
"loss": 5.6891,
"step": 3610
},
{
"epoch": 0.2896463434149464,
"grad_norm": 3.150400400161743,
"learning_rate": 4.529737391025298e-05,
"loss": 5.7931,
"step": 3620
},
{
"epoch": 0.2904464714354297,
"grad_norm": 2.1223199367523193,
"learning_rate": 4.5284002781194846e-05,
"loss": 5.8646,
"step": 3630
},
{
"epoch": 0.29124659945591297,
"grad_norm": 3.950665235519409,
"learning_rate": 4.527063165213671e-05,
"loss": 5.7008,
"step": 3640
},
{
"epoch": 0.2920467274763962,
"grad_norm": 2.995692729949951,
"learning_rate": 4.525726052307857e-05,
"loss": 5.688,
"step": 3650
},
{
"epoch": 0.2928468554968795,
"grad_norm": 2.041736125946045,
"learning_rate": 4.5243889394020434e-05,
"loss": 5.7301,
"step": 3660
},
{
"epoch": 0.2936469835173628,
"grad_norm": 2.541757106781006,
"learning_rate": 4.5230518264962297e-05,
"loss": 5.5606,
"step": 3670
},
{
"epoch": 0.29444711153784603,
"grad_norm": 2.140761613845825,
"learning_rate": 4.521714713590416e-05,
"loss": 5.7671,
"step": 3680
},
{
"epoch": 0.2952472395583293,
"grad_norm": 2.6869146823883057,
"learning_rate": 4.520377600684602e-05,
"loss": 5.6452,
"step": 3690
},
{
"epoch": 0.2960473675788126,
"grad_norm": 3.072376012802124,
"learning_rate": 4.5190404877787885e-05,
"loss": 5.6956,
"step": 3700
},
{
"epoch": 0.2968474955992959,
"grad_norm": 2.5933837890625,
"learning_rate": 4.517703374872975e-05,
"loss": 5.6212,
"step": 3710
},
{
"epoch": 0.29764762361977914,
"grad_norm": 3.0443103313446045,
"learning_rate": 4.516366261967161e-05,
"loss": 5.7849,
"step": 3720
},
{
"epoch": 0.29844775164026244,
"grad_norm": 2.673583745956421,
"learning_rate": 4.515029149061347e-05,
"loss": 5.6186,
"step": 3730
},
{
"epoch": 0.29924787966074573,
"grad_norm": 2.3276283740997314,
"learning_rate": 4.513692036155533e-05,
"loss": 5.9188,
"step": 3740
},
{
"epoch": 0.300048007681229,
"grad_norm": 5.504491329193115,
"learning_rate": 4.512354923249719e-05,
"loss": 5.5676,
"step": 3750
},
{
"epoch": 0.30084813570171226,
"grad_norm": 2.4181482791900635,
"learning_rate": 4.5110178103439054e-05,
"loss": 5.6852,
"step": 3760
},
{
"epoch": 0.30164826372219555,
"grad_norm": 2.2489006519317627,
"learning_rate": 4.509680697438092e-05,
"loss": 5.7003,
"step": 3770
},
{
"epoch": 0.30244839174267885,
"grad_norm": 2.6925253868103027,
"learning_rate": 4.508343584532278e-05,
"loss": 5.8176,
"step": 3780
},
{
"epoch": 0.3032485197631621,
"grad_norm": 2.904318332672119,
"learning_rate": 4.507006471626464e-05,
"loss": 5.6912,
"step": 3790
},
{
"epoch": 0.3040486477836454,
"grad_norm": 3.3189070224761963,
"learning_rate": 4.5056693587206505e-05,
"loss": 5.8706,
"step": 3800
},
{
"epoch": 0.30484877580412867,
"grad_norm": 2.8324170112609863,
"learning_rate": 4.504332245814837e-05,
"loss": 5.8795,
"step": 3810
},
{
"epoch": 0.30564890382461196,
"grad_norm": 3.113417148590088,
"learning_rate": 4.502995132909023e-05,
"loss": 5.8689,
"step": 3820
},
{
"epoch": 0.3064490318450952,
"grad_norm": 2.469269275665283,
"learning_rate": 4.501658020003209e-05,
"loss": 5.7934,
"step": 3830
},
{
"epoch": 0.3072491598655785,
"grad_norm": 2.778571128845215,
"learning_rate": 4.5003209070973956e-05,
"loss": 5.8577,
"step": 3840
},
{
"epoch": 0.3080492878860618,
"grad_norm": 3.4269161224365234,
"learning_rate": 4.498983794191582e-05,
"loss": 5.8378,
"step": 3850
},
{
"epoch": 0.308849415906545,
"grad_norm": 3.417850971221924,
"learning_rate": 4.497646681285768e-05,
"loss": 5.6532,
"step": 3860
},
{
"epoch": 0.3096495439270283,
"grad_norm": 2.389784097671509,
"learning_rate": 4.4963095683799544e-05,
"loss": 5.5454,
"step": 3870
},
{
"epoch": 0.3104496719475116,
"grad_norm": 2.384453296661377,
"learning_rate": 4.4949724554741406e-05,
"loss": 5.8014,
"step": 3880
},
{
"epoch": 0.3112497999679949,
"grad_norm": 1.913668155670166,
"learning_rate": 4.493635342568326e-05,
"loss": 5.6033,
"step": 3890
},
{
"epoch": 0.31204992798847814,
"grad_norm": 3.4930074214935303,
"learning_rate": 4.4922982296625125e-05,
"loss": 5.7649,
"step": 3900
},
{
"epoch": 0.31285005600896143,
"grad_norm": 3.517458200454712,
"learning_rate": 4.490961116756699e-05,
"loss": 5.5635,
"step": 3910
},
{
"epoch": 0.3136501840294447,
"grad_norm": 2.611274480819702,
"learning_rate": 4.489624003850885e-05,
"loss": 5.8121,
"step": 3920
},
{
"epoch": 0.314450312049928,
"grad_norm": 2.373997926712036,
"learning_rate": 4.488286890945071e-05,
"loss": 5.6002,
"step": 3930
},
{
"epoch": 0.31525044007041125,
"grad_norm": 2.554847002029419,
"learning_rate": 4.4869497780392576e-05,
"loss": 5.6432,
"step": 3940
},
{
"epoch": 0.31605056809089455,
"grad_norm": 3.3720595836639404,
"learning_rate": 4.485612665133444e-05,
"loss": 5.5794,
"step": 3950
},
{
"epoch": 0.31685069611137784,
"grad_norm": 2.2308788299560547,
"learning_rate": 4.48427555222763e-05,
"loss": 5.794,
"step": 3960
},
{
"epoch": 0.3176508241318611,
"grad_norm": 2.0659661293029785,
"learning_rate": 4.4829384393218164e-05,
"loss": 5.5383,
"step": 3970
},
{
"epoch": 0.31845095215234437,
"grad_norm": 3.2644894123077393,
"learning_rate": 4.4816013264160026e-05,
"loss": 5.6979,
"step": 3980
},
{
"epoch": 0.31925108017282766,
"grad_norm": 2.3485729694366455,
"learning_rate": 4.480264213510189e-05,
"loss": 5.7214,
"step": 3990
},
{
"epoch": 0.32005120819331095,
"grad_norm": 2.7470600605010986,
"learning_rate": 4.478927100604375e-05,
"loss": 5.6032,
"step": 4000
},
{
"epoch": 0.3208513362137942,
"grad_norm": 2.1622989177703857,
"learning_rate": 4.4775899876985614e-05,
"loss": 5.7976,
"step": 4010
},
{
"epoch": 0.3216514642342775,
"grad_norm": 2.7463905811309814,
"learning_rate": 4.476252874792748e-05,
"loss": 5.7181,
"step": 4020
},
{
"epoch": 0.3224515922547608,
"grad_norm": 3.503662109375,
"learning_rate": 4.474915761886934e-05,
"loss": 5.8092,
"step": 4030
},
{
"epoch": 0.323251720275244,
"grad_norm": 2.6073853969573975,
"learning_rate": 4.47357864898112e-05,
"loss": 5.7876,
"step": 4040
},
{
"epoch": 0.3240518482957273,
"grad_norm": 3.354768991470337,
"learning_rate": 4.472241536075306e-05,
"loss": 5.7741,
"step": 4050
},
{
"epoch": 0.3248519763162106,
"grad_norm": 2.648145914077759,
"learning_rate": 4.470904423169492e-05,
"loss": 5.7522,
"step": 4060
},
{
"epoch": 0.3256521043366939,
"grad_norm": 3.086655378341675,
"learning_rate": 4.4695673102636784e-05,
"loss": 5.81,
"step": 4070
},
{
"epoch": 0.32645223235717713,
"grad_norm": 2.230905771255493,
"learning_rate": 4.4682301973578647e-05,
"loss": 5.8839,
"step": 4080
},
{
"epoch": 0.3272523603776604,
"grad_norm": 2.5391674041748047,
"learning_rate": 4.466893084452051e-05,
"loss": 5.5535,
"step": 4090
},
{
"epoch": 0.3280524883981437,
"grad_norm": 2.7574117183685303,
"learning_rate": 4.465555971546237e-05,
"loss": 5.8275,
"step": 4100
},
{
"epoch": 0.32885261641862695,
"grad_norm": 3.1114678382873535,
"learning_rate": 4.4642188586404235e-05,
"loss": 5.6876,
"step": 4110
},
{
"epoch": 0.32965274443911025,
"grad_norm": 2.404892683029175,
"learning_rate": 4.46288174573461e-05,
"loss": 5.6876,
"step": 4120
},
{
"epoch": 0.33045287245959354,
"grad_norm": 2.590759754180908,
"learning_rate": 4.461544632828796e-05,
"loss": 5.802,
"step": 4130
},
{
"epoch": 0.33125300048007683,
"grad_norm": 2.4358649253845215,
"learning_rate": 4.460207519922982e-05,
"loss": 5.632,
"step": 4140
},
{
"epoch": 0.33205312850056007,
"grad_norm": 3.9567458629608154,
"learning_rate": 4.4588704070171685e-05,
"loss": 5.8761,
"step": 4150
},
{
"epoch": 0.33285325652104336,
"grad_norm": 2.3808743953704834,
"learning_rate": 4.457533294111355e-05,
"loss": 5.6815,
"step": 4160
},
{
"epoch": 0.33365338454152665,
"grad_norm": 2.6527156829833984,
"learning_rate": 4.456196181205541e-05,
"loss": 5.805,
"step": 4170
},
{
"epoch": 0.33445351256200995,
"grad_norm": 2.351062536239624,
"learning_rate": 4.4548590682997273e-05,
"loss": 5.6681,
"step": 4180
},
{
"epoch": 0.3352536405824932,
"grad_norm": 2.3213460445404053,
"learning_rate": 4.4535219553939136e-05,
"loss": 5.6363,
"step": 4190
},
{
"epoch": 0.3360537686029765,
"grad_norm": 1.9470767974853516,
"learning_rate": 4.4521848424881e-05,
"loss": 5.8772,
"step": 4200
},
{
"epoch": 0.33685389662345977,
"grad_norm": 4.303500652313232,
"learning_rate": 4.450847729582286e-05,
"loss": 5.6185,
"step": 4210
},
{
"epoch": 0.337654024643943,
"grad_norm": 2.713275909423828,
"learning_rate": 4.4495106166764724e-05,
"loss": 5.6754,
"step": 4220
},
{
"epoch": 0.3384541526644263,
"grad_norm": 2.34993314743042,
"learning_rate": 4.448173503770659e-05,
"loss": 5.7003,
"step": 4230
},
{
"epoch": 0.3392542806849096,
"grad_norm": 2.276228666305542,
"learning_rate": 4.446836390864845e-05,
"loss": 5.6,
"step": 4240
},
{
"epoch": 0.3400544087053929,
"grad_norm": 2.3635685443878174,
"learning_rate": 4.445499277959031e-05,
"loss": 5.7373,
"step": 4250
},
{
"epoch": 0.3408545367258761,
"grad_norm": 3.100604772567749,
"learning_rate": 4.4441621650532175e-05,
"loss": 5.7354,
"step": 4260
},
{
"epoch": 0.3416546647463594,
"grad_norm": 2.6743876934051514,
"learning_rate": 4.442825052147404e-05,
"loss": 5.7544,
"step": 4270
},
{
"epoch": 0.3424547927668427,
"grad_norm": 2.5783612728118896,
"learning_rate": 4.44148793924159e-05,
"loss": 5.8826,
"step": 4280
},
{
"epoch": 0.34325492078732595,
"grad_norm": 2.8976659774780273,
"learning_rate": 4.440150826335776e-05,
"loss": 5.5418,
"step": 4290
},
{
"epoch": 0.34405504880780924,
"grad_norm": 2.1061089038848877,
"learning_rate": 4.4388137134299626e-05,
"loss": 5.6406,
"step": 4300
},
{
"epoch": 0.34485517682829253,
"grad_norm": 2.1303789615631104,
"learning_rate": 4.437476600524149e-05,
"loss": 5.6491,
"step": 4310
},
{
"epoch": 0.3456553048487758,
"grad_norm": 2.6240499019622803,
"learning_rate": 4.436139487618335e-05,
"loss": 5.7161,
"step": 4320
},
{
"epoch": 0.34645543286925906,
"grad_norm": 2.325155019760132,
"learning_rate": 4.4348023747125214e-05,
"loss": 5.6172,
"step": 4330
},
{
"epoch": 0.34725556088974235,
"grad_norm": 2.8844404220581055,
"learning_rate": 4.4334652618067076e-05,
"loss": 5.7438,
"step": 4340
},
{
"epoch": 0.34805568891022565,
"grad_norm": 2.375324249267578,
"learning_rate": 4.432128148900894e-05,
"loss": 5.8335,
"step": 4350
},
{
"epoch": 0.34885581693070894,
"grad_norm": 2.1572377681732178,
"learning_rate": 4.4307910359950795e-05,
"loss": 5.706,
"step": 4360
},
{
"epoch": 0.3496559449511922,
"grad_norm": 2.5218889713287354,
"learning_rate": 4.429453923089266e-05,
"loss": 5.7487,
"step": 4370
},
{
"epoch": 0.35045607297167547,
"grad_norm": 2.636223554611206,
"learning_rate": 4.428116810183452e-05,
"loss": 5.8327,
"step": 4380
},
{
"epoch": 0.35125620099215876,
"grad_norm": 2.436155080795288,
"learning_rate": 4.426779697277638e-05,
"loss": 5.6895,
"step": 4390
},
{
"epoch": 0.352056329012642,
"grad_norm": 3.4435484409332275,
"learning_rate": 4.4254425843718246e-05,
"loss": 5.6171,
"step": 4400
},
{
"epoch": 0.3528564570331253,
"grad_norm": 2.3990628719329834,
"learning_rate": 4.424105471466011e-05,
"loss": 5.7574,
"step": 4410
},
{
"epoch": 0.3536565850536086,
"grad_norm": 2.544774293899536,
"learning_rate": 4.422768358560197e-05,
"loss": 5.558,
"step": 4420
},
{
"epoch": 0.3544567130740919,
"grad_norm": 2.389491081237793,
"learning_rate": 4.4214312456543834e-05,
"loss": 5.6628,
"step": 4430
},
{
"epoch": 0.3552568410945751,
"grad_norm": 5.203212261199951,
"learning_rate": 4.4200941327485697e-05,
"loss": 5.5403,
"step": 4440
},
{
"epoch": 0.3560569691150584,
"grad_norm": 2.0861873626708984,
"learning_rate": 4.418757019842756e-05,
"loss": 5.625,
"step": 4450
},
{
"epoch": 0.3568570971355417,
"grad_norm": 2.2355470657348633,
"learning_rate": 4.417419906936942e-05,
"loss": 5.614,
"step": 4460
},
{
"epoch": 0.35765722515602494,
"grad_norm": 2.2239274978637695,
"learning_rate": 4.4160827940311285e-05,
"loss": 5.6885,
"step": 4470
},
{
"epoch": 0.35845735317650823,
"grad_norm": 4.571592807769775,
"learning_rate": 4.414745681125315e-05,
"loss": 5.8495,
"step": 4480
},
{
"epoch": 0.3592574811969915,
"grad_norm": 2.6501150131225586,
"learning_rate": 4.413408568219501e-05,
"loss": 5.6158,
"step": 4490
},
{
"epoch": 0.3600576092174748,
"grad_norm": 2.8568902015686035,
"learning_rate": 4.412071455313687e-05,
"loss": 5.6403,
"step": 4500
},
{
"epoch": 0.36085773723795805,
"grad_norm": 2.4179179668426514,
"learning_rate": 4.410734342407873e-05,
"loss": 5.749,
"step": 4510
},
{
"epoch": 0.36165786525844135,
"grad_norm": 2.950491189956665,
"learning_rate": 4.409397229502059e-05,
"loss": 5.7128,
"step": 4520
},
{
"epoch": 0.36245799327892464,
"grad_norm": 3.731049060821533,
"learning_rate": 4.4080601165962454e-05,
"loss": 5.6397,
"step": 4530
},
{
"epoch": 0.36325812129940793,
"grad_norm": 2.255730390548706,
"learning_rate": 4.406723003690432e-05,
"loss": 5.626,
"step": 4540
},
{
"epoch": 0.36405824931989117,
"grad_norm": 2.623455047607422,
"learning_rate": 4.405385890784618e-05,
"loss": 5.6792,
"step": 4550
},
{
"epoch": 0.36485837734037446,
"grad_norm": 2.366481065750122,
"learning_rate": 4.404048777878804e-05,
"loss": 5.5455,
"step": 4560
},
{
"epoch": 0.36565850536085776,
"grad_norm": 2.56351375579834,
"learning_rate": 4.4027116649729905e-05,
"loss": 5.7982,
"step": 4570
},
{
"epoch": 0.366458633381341,
"grad_norm": 2.3203811645507812,
"learning_rate": 4.401374552067177e-05,
"loss": 5.7969,
"step": 4580
},
{
"epoch": 0.3672587614018243,
"grad_norm": 2.3838179111480713,
"learning_rate": 4.400037439161363e-05,
"loss": 5.7484,
"step": 4590
},
{
"epoch": 0.3680588894223076,
"grad_norm": 2.0725440979003906,
"learning_rate": 4.398700326255549e-05,
"loss": 5.8405,
"step": 4600
},
{
"epoch": 0.36885901744279087,
"grad_norm": 3.49495005607605,
"learning_rate": 4.3973632133497356e-05,
"loss": 5.7151,
"step": 4610
},
{
"epoch": 0.3696591454632741,
"grad_norm": 2.643007755279541,
"learning_rate": 4.396026100443922e-05,
"loss": 5.6374,
"step": 4620
},
{
"epoch": 0.3704592734837574,
"grad_norm": 2.282304286956787,
"learning_rate": 4.394688987538108e-05,
"loss": 5.589,
"step": 4630
},
{
"epoch": 0.3712594015042407,
"grad_norm": 2.244058609008789,
"learning_rate": 4.3933518746322944e-05,
"loss": 5.7516,
"step": 4640
},
{
"epoch": 0.37205952952472393,
"grad_norm": 2.44496488571167,
"learning_rate": 4.3920147617264806e-05,
"loss": 5.8393,
"step": 4650
},
{
"epoch": 0.3728596575452072,
"grad_norm": 2.6613078117370605,
"learning_rate": 4.390677648820667e-05,
"loss": 5.6764,
"step": 4660
},
{
"epoch": 0.3736597855656905,
"grad_norm": 3.99092173576355,
"learning_rate": 4.3893405359148525e-05,
"loss": 5.8658,
"step": 4670
},
{
"epoch": 0.3744599135861738,
"grad_norm": 1.6338485479354858,
"learning_rate": 4.388003423009039e-05,
"loss": 5.7527,
"step": 4680
},
{
"epoch": 0.37526004160665705,
"grad_norm": 2.3723371028900146,
"learning_rate": 4.386666310103225e-05,
"loss": 5.7482,
"step": 4690
},
{
"epoch": 0.37606016962714034,
"grad_norm": 2.630424976348877,
"learning_rate": 4.385329197197411e-05,
"loss": 5.7539,
"step": 4700
},
{
"epoch": 0.37686029764762363,
"grad_norm": 2.3873038291931152,
"learning_rate": 4.3839920842915976e-05,
"loss": 5.6729,
"step": 4710
},
{
"epoch": 0.37766042566810687,
"grad_norm": 1.9391748905181885,
"learning_rate": 4.382654971385784e-05,
"loss": 5.6794,
"step": 4720
},
{
"epoch": 0.37846055368859016,
"grad_norm": 2.103975296020508,
"learning_rate": 4.38131785847997e-05,
"loss": 5.5104,
"step": 4730
},
{
"epoch": 0.37926068170907346,
"grad_norm": 3.731184959411621,
"learning_rate": 4.3799807455741564e-05,
"loss": 5.6699,
"step": 4740
},
{
"epoch": 0.38006080972955675,
"grad_norm": 2.881068468093872,
"learning_rate": 4.3786436326683426e-05,
"loss": 5.6394,
"step": 4750
},
{
"epoch": 0.38086093775004,
"grad_norm": 2.5963799953460693,
"learning_rate": 4.377306519762529e-05,
"loss": 5.784,
"step": 4760
},
{
"epoch": 0.3816610657705233,
"grad_norm": 1.9520230293273926,
"learning_rate": 4.375969406856715e-05,
"loss": 5.7608,
"step": 4770
},
{
"epoch": 0.38246119379100657,
"grad_norm": 2.386702537536621,
"learning_rate": 4.374766005241483e-05,
"loss": 5.5725,
"step": 4780
},
{
"epoch": 0.38326132181148986,
"grad_norm": 2.3830511569976807,
"learning_rate": 4.3734288923356694e-05,
"loss": 5.5584,
"step": 4790
},
{
"epoch": 0.3840614498319731,
"grad_norm": 2.1514739990234375,
"learning_rate": 4.3720917794298556e-05,
"loss": 5.6621,
"step": 4800
},
{
"epoch": 0.3848615778524564,
"grad_norm": 2.5376317501068115,
"learning_rate": 4.370754666524042e-05,
"loss": 5.4138,
"step": 4810
},
{
"epoch": 0.3856617058729397,
"grad_norm": 3.425899028778076,
"learning_rate": 4.3694175536182275e-05,
"loss": 5.6478,
"step": 4820
},
{
"epoch": 0.3864618338934229,
"grad_norm": 2.7518632411956787,
"learning_rate": 4.368080440712414e-05,
"loss": 5.6556,
"step": 4830
},
{
"epoch": 0.3872619619139062,
"grad_norm": 3.119227647781372,
"learning_rate": 4.3667433278066e-05,
"loss": 5.7925,
"step": 4840
},
{
"epoch": 0.3880620899343895,
"grad_norm": 3.2664616107940674,
"learning_rate": 4.365406214900786e-05,
"loss": 5.7176,
"step": 4850
},
{
"epoch": 0.3888622179548728,
"grad_norm": 2.5125045776367188,
"learning_rate": 4.3640691019949726e-05,
"loss": 5.6511,
"step": 4860
},
{
"epoch": 0.38966234597535604,
"grad_norm": 2.992112874984741,
"learning_rate": 4.362731989089159e-05,
"loss": 5.6426,
"step": 4870
},
{
"epoch": 0.39046247399583933,
"grad_norm": 4.46783971786499,
"learning_rate": 4.361394876183345e-05,
"loss": 5.736,
"step": 4880
},
{
"epoch": 0.3912626020163226,
"grad_norm": 1.8372838497161865,
"learning_rate": 4.3600577632775314e-05,
"loss": 5.7603,
"step": 4890
},
{
"epoch": 0.39206273003680586,
"grad_norm": 2.1635375022888184,
"learning_rate": 4.3587206503717176e-05,
"loss": 5.6019,
"step": 4900
},
{
"epoch": 0.39286285805728915,
"grad_norm": 2.2425310611724854,
"learning_rate": 4.357383537465904e-05,
"loss": 5.6829,
"step": 4910
},
{
"epoch": 0.39366298607777245,
"grad_norm": 2.408907413482666,
"learning_rate": 4.35604642456009e-05,
"loss": 5.6821,
"step": 4920
},
{
"epoch": 0.39446311409825574,
"grad_norm": 3.012258291244507,
"learning_rate": 4.3547093116542765e-05,
"loss": 5.7503,
"step": 4930
},
{
"epoch": 0.395263242118739,
"grad_norm": 3.187053680419922,
"learning_rate": 4.353372198748463e-05,
"loss": 5.6459,
"step": 4940
},
{
"epoch": 0.39606337013922227,
"grad_norm": 2.7528955936431885,
"learning_rate": 4.352035085842649e-05,
"loss": 5.6386,
"step": 4950
},
{
"epoch": 0.39686349815970556,
"grad_norm": 2.9744699001312256,
"learning_rate": 4.350697972936835e-05,
"loss": 5.5938,
"step": 4960
},
{
"epoch": 0.39766362618018886,
"grad_norm": 2.779604196548462,
"learning_rate": 4.3493608600310215e-05,
"loss": 5.5459,
"step": 4970
},
{
"epoch": 0.3984637542006721,
"grad_norm": 2.9092133045196533,
"learning_rate": 4.348023747125207e-05,
"loss": 5.7695,
"step": 4980
},
{
"epoch": 0.3992638822211554,
"grad_norm": 2.800872802734375,
"learning_rate": 4.3466866342193934e-05,
"loss": 5.6943,
"step": 4990
},
{
"epoch": 0.4000640102416387,
"grad_norm": 3.299595832824707,
"learning_rate": 4.3453495213135797e-05,
"loss": 5.4432,
"step": 5000
},
{
"epoch": 0.4008641382621219,
"grad_norm": 2.2425456047058105,
"learning_rate": 4.344012408407766e-05,
"loss": 5.6688,
"step": 5010
},
{
"epoch": 0.4016642662826052,
"grad_norm": 2.269378423690796,
"learning_rate": 4.342675295501952e-05,
"loss": 5.7713,
"step": 5020
},
{
"epoch": 0.4024643943030885,
"grad_norm": 2.3903868198394775,
"learning_rate": 4.3413381825961385e-05,
"loss": 5.5926,
"step": 5030
},
{
"epoch": 0.4032645223235718,
"grad_norm": 3.267918109893799,
"learning_rate": 4.340001069690325e-05,
"loss": 5.6806,
"step": 5040
},
{
"epoch": 0.40406465034405503,
"grad_norm": 3.2075066566467285,
"learning_rate": 4.338663956784511e-05,
"loss": 5.6582,
"step": 5050
},
{
"epoch": 0.4048647783645383,
"grad_norm": 2.5458226203918457,
"learning_rate": 4.337326843878697e-05,
"loss": 5.6576,
"step": 5060
},
{
"epoch": 0.4056649063850216,
"grad_norm": 2.0331077575683594,
"learning_rate": 4.3359897309728835e-05,
"loss": 5.6725,
"step": 5070
},
{
"epoch": 0.40646503440550485,
"grad_norm": 2.406907796859741,
"learning_rate": 4.33465261806707e-05,
"loss": 5.5168,
"step": 5080
},
{
"epoch": 0.40726516242598815,
"grad_norm": 2.661137580871582,
"learning_rate": 4.333315505161256e-05,
"loss": 5.5953,
"step": 5090
},
{
"epoch": 0.40806529044647144,
"grad_norm": 2.857725143432617,
"learning_rate": 4.3319783922554423e-05,
"loss": 5.6702,
"step": 5100
},
{
"epoch": 0.40886541846695473,
"grad_norm": 2.7894747257232666,
"learning_rate": 4.3306412793496286e-05,
"loss": 5.6228,
"step": 5110
},
{
"epoch": 0.40966554648743797,
"grad_norm": 2.8865861892700195,
"learning_rate": 4.329304166443815e-05,
"loss": 5.6859,
"step": 5120
},
{
"epoch": 0.41046567450792126,
"grad_norm": 2.1493608951568604,
"learning_rate": 4.3279670535380005e-05,
"loss": 5.5516,
"step": 5130
},
{
"epoch": 0.41126580252840456,
"grad_norm": 3.112820863723755,
"learning_rate": 4.326629940632187e-05,
"loss": 5.6409,
"step": 5140
},
{
"epoch": 0.41206593054888785,
"grad_norm": 2.778876543045044,
"learning_rate": 4.325292827726373e-05,
"loss": 5.6948,
"step": 5150
},
{
"epoch": 0.4128660585693711,
"grad_norm": 2.0409047603607178,
"learning_rate": 4.323955714820559e-05,
"loss": 5.5458,
"step": 5160
},
{
"epoch": 0.4136661865898544,
"grad_norm": 3.1058828830718994,
"learning_rate": 4.3226186019147456e-05,
"loss": 5.8437,
"step": 5170
},
{
"epoch": 0.41446631461033767,
"grad_norm": 3.306704044342041,
"learning_rate": 4.321281489008932e-05,
"loss": 5.691,
"step": 5180
},
{
"epoch": 0.4152664426308209,
"grad_norm": 2.9495625495910645,
"learning_rate": 4.319944376103118e-05,
"loss": 5.6364,
"step": 5190
},
{
"epoch": 0.4160665706513042,
"grad_norm": 2.1773974895477295,
"learning_rate": 4.3186072631973044e-05,
"loss": 5.6713,
"step": 5200
},
{
"epoch": 0.4168666986717875,
"grad_norm": 2.0897533893585205,
"learning_rate": 4.3172701502914906e-05,
"loss": 5.6022,
"step": 5210
},
{
"epoch": 0.4176668266922708,
"grad_norm": 2.2131927013397217,
"learning_rate": 4.315933037385677e-05,
"loss": 5.5728,
"step": 5220
},
{
"epoch": 0.418466954712754,
"grad_norm": 2.225728750228882,
"learning_rate": 4.314595924479863e-05,
"loss": 5.5374,
"step": 5230
},
{
"epoch": 0.4192670827332373,
"grad_norm": 2.219791889190674,
"learning_rate": 4.3132588115740494e-05,
"loss": 5.6986,
"step": 5240
},
{
"epoch": 0.4200672107537206,
"grad_norm": 2.720323085784912,
"learning_rate": 4.311921698668236e-05,
"loss": 5.6046,
"step": 5250
},
{
"epoch": 0.42086733877420385,
"grad_norm": 2.4254257678985596,
"learning_rate": 4.310584585762422e-05,
"loss": 5.5566,
"step": 5260
},
{
"epoch": 0.42166746679468714,
"grad_norm": 2.2297472953796387,
"learning_rate": 4.309247472856608e-05,
"loss": 5.7431,
"step": 5270
},
{
"epoch": 0.42246759481517043,
"grad_norm": 2.2767512798309326,
"learning_rate": 4.3079103599507945e-05,
"loss": 5.6661,
"step": 5280
},
{
"epoch": 0.4232677228356537,
"grad_norm": 2.8959579467773438,
"learning_rate": 4.30657324704498e-05,
"loss": 5.6584,
"step": 5290
},
{
"epoch": 0.42406785085613696,
"grad_norm": 2.49867844581604,
"learning_rate": 4.3052361341391664e-05,
"loss": 5.7564,
"step": 5300
},
{
"epoch": 0.42486797887662026,
"grad_norm": 2.1820337772369385,
"learning_rate": 4.3038990212333526e-05,
"loss": 5.6288,
"step": 5310
},
{
"epoch": 0.42566810689710355,
"grad_norm": 2.7174227237701416,
"learning_rate": 4.302561908327539e-05,
"loss": 5.6496,
"step": 5320
},
{
"epoch": 0.42646823491758684,
"grad_norm": 2.7261149883270264,
"learning_rate": 4.301224795421725e-05,
"loss": 5.6557,
"step": 5330
},
{
"epoch": 0.4272683629380701,
"grad_norm": 2.581760883331299,
"learning_rate": 4.2998876825159114e-05,
"loss": 5.604,
"step": 5340
},
{
"epoch": 0.42806849095855337,
"grad_norm": 2.43254017829895,
"learning_rate": 4.298550569610098e-05,
"loss": 5.6041,
"step": 5350
},
{
"epoch": 0.42886861897903666,
"grad_norm": 4.465782165527344,
"learning_rate": 4.297213456704284e-05,
"loss": 5.7158,
"step": 5360
},
{
"epoch": 0.4296687469995199,
"grad_norm": 2.6434614658355713,
"learning_rate": 4.29587634379847e-05,
"loss": 5.6347,
"step": 5370
},
{
"epoch": 0.4304688750200032,
"grad_norm": 2.344190835952759,
"learning_rate": 4.2945392308926565e-05,
"loss": 5.6062,
"step": 5380
},
{
"epoch": 0.4312690030404865,
"grad_norm": 4.311372756958008,
"learning_rate": 4.293202117986843e-05,
"loss": 5.7356,
"step": 5390
},
{
"epoch": 0.4320691310609698,
"grad_norm": 2.8204123973846436,
"learning_rate": 4.291865005081029e-05,
"loss": 5.63,
"step": 5400
},
{
"epoch": 0.432869259081453,
"grad_norm": 3.333059072494507,
"learning_rate": 4.290527892175215e-05,
"loss": 5.5992,
"step": 5410
},
{
"epoch": 0.4336693871019363,
"grad_norm": 2.0647048950195312,
"learning_rate": 4.2891907792694016e-05,
"loss": 5.691,
"step": 5420
},
{
"epoch": 0.4344695151224196,
"grad_norm": 2.5100045204162598,
"learning_rate": 4.287853666363588e-05,
"loss": 5.615,
"step": 5430
},
{
"epoch": 0.43526964314290284,
"grad_norm": 2.6120762825012207,
"learning_rate": 4.286516553457774e-05,
"loss": 5.746,
"step": 5440
},
{
"epoch": 0.43606977116338613,
"grad_norm": 2.2886853218078613,
"learning_rate": 4.2851794405519604e-05,
"loss": 5.6783,
"step": 5450
},
{
"epoch": 0.4368698991838694,
"grad_norm": 2.6724119186401367,
"learning_rate": 4.283842327646147e-05,
"loss": 5.6526,
"step": 5460
},
{
"epoch": 0.4376700272043527,
"grad_norm": 2.2408151626586914,
"learning_rate": 4.282505214740333e-05,
"loss": 5.6314,
"step": 5470
},
{
"epoch": 0.43847015522483596,
"grad_norm": 3.0294084548950195,
"learning_rate": 4.281168101834519e-05,
"loss": 5.6669,
"step": 5480
},
{
"epoch": 0.43927028324531925,
"grad_norm": 2.1664011478424072,
"learning_rate": 4.2798309889287055e-05,
"loss": 5.4856,
"step": 5490
},
{
"epoch": 0.44007041126580254,
"grad_norm": 3.4465417861938477,
"learning_rate": 4.278493876022892e-05,
"loss": 5.5859,
"step": 5500
},
{
"epoch": 0.4408705392862858,
"grad_norm": 2.0116310119628906,
"learning_rate": 4.277156763117078e-05,
"loss": 5.5982,
"step": 5510
},
{
"epoch": 0.44167066730676907,
"grad_norm": 2.578658103942871,
"learning_rate": 4.275819650211264e-05,
"loss": 5.4026,
"step": 5520
},
{
"epoch": 0.44247079532725236,
"grad_norm": 3.1201677322387695,
"learning_rate": 4.2744825373054506e-05,
"loss": 5.7024,
"step": 5530
},
{
"epoch": 0.44327092334773566,
"grad_norm": 2.2246837615966797,
"learning_rate": 4.273145424399637e-05,
"loss": 5.5842,
"step": 5540
},
{
"epoch": 0.4440710513682189,
"grad_norm": 2.1593568325042725,
"learning_rate": 4.271808311493823e-05,
"loss": 5.5099,
"step": 5550
},
{
"epoch": 0.4448711793887022,
"grad_norm": 3.082218885421753,
"learning_rate": 4.2704711985880094e-05,
"loss": 5.5539,
"step": 5560
},
{
"epoch": 0.4456713074091855,
"grad_norm": 3.2272634506225586,
"learning_rate": 4.2691340856821956e-05,
"loss": 5.73,
"step": 5570
},
{
"epoch": 0.4464714354296688,
"grad_norm": 2.301713466644287,
"learning_rate": 4.267796972776382e-05,
"loss": 5.5444,
"step": 5580
},
{
"epoch": 0.447271563450152,
"grad_norm": 3.2985429763793945,
"learning_rate": 4.2664598598705675e-05,
"loss": 5.7499,
"step": 5590
},
{
"epoch": 0.4480716914706353,
"grad_norm": 2.103994607925415,
"learning_rate": 4.265122746964754e-05,
"loss": 5.5627,
"step": 5600
},
{
"epoch": 0.4488718194911186,
"grad_norm": 3.260099172592163,
"learning_rate": 4.26378563405894e-05,
"loss": 5.5692,
"step": 5610
},
{
"epoch": 0.44967194751160183,
"grad_norm": 2.740907907485962,
"learning_rate": 4.262448521153126e-05,
"loss": 5.4984,
"step": 5620
},
{
"epoch": 0.4504720755320851,
"grad_norm": 5.314218997955322,
"learning_rate": 4.2611114082473126e-05,
"loss": 5.5641,
"step": 5630
},
{
"epoch": 0.4512722035525684,
"grad_norm": 3.0524938106536865,
"learning_rate": 4.259774295341499e-05,
"loss": 5.6375,
"step": 5640
},
{
"epoch": 0.4520723315730517,
"grad_norm": 3.57781982421875,
"learning_rate": 4.258437182435685e-05,
"loss": 5.6726,
"step": 5650
},
{
"epoch": 0.45287245959353495,
"grad_norm": 3.094510793685913,
"learning_rate": 4.2571000695298714e-05,
"loss": 5.7328,
"step": 5660
},
{
"epoch": 0.45367258761401824,
"grad_norm": 2.731092929840088,
"learning_rate": 4.2557629566240576e-05,
"loss": 5.6667,
"step": 5670
},
{
"epoch": 0.45447271563450153,
"grad_norm": 3.6701395511627197,
"learning_rate": 4.254425843718244e-05,
"loss": 5.641,
"step": 5680
},
{
"epoch": 0.45527284365498477,
"grad_norm": 1.9017853736877441,
"learning_rate": 4.25308873081243e-05,
"loss": 5.6521,
"step": 5690
},
{
"epoch": 0.45607297167546806,
"grad_norm": 3.2658119201660156,
"learning_rate": 4.2517516179066165e-05,
"loss": 5.6431,
"step": 5700
},
{
"epoch": 0.45687309969595136,
"grad_norm": 2.227353572845459,
"learning_rate": 4.250414505000803e-05,
"loss": 5.6198,
"step": 5710
},
{
"epoch": 0.45767322771643465,
"grad_norm": 1.7804296016693115,
"learning_rate": 4.249077392094989e-05,
"loss": 5.618,
"step": 5720
},
{
"epoch": 0.4584733557369179,
"grad_norm": 2.9357879161834717,
"learning_rate": 4.247740279189175e-05,
"loss": 5.5222,
"step": 5730
},
{
"epoch": 0.4592734837574012,
"grad_norm": 5.074959754943848,
"learning_rate": 4.2464031662833615e-05,
"loss": 5.7604,
"step": 5740
},
{
"epoch": 0.4600736117778845,
"grad_norm": 2.4961061477661133,
"learning_rate": 4.245066053377547e-05,
"loss": 5.5699,
"step": 5750
},
{
"epoch": 0.46087373979836777,
"grad_norm": 2.636403799057007,
"learning_rate": 4.2437289404717334e-05,
"loss": 5.745,
"step": 5760
},
{
"epoch": 0.461673867818851,
"grad_norm": 2.4829630851745605,
"learning_rate": 4.2423918275659197e-05,
"loss": 5.9779,
"step": 5770
},
{
"epoch": 0.4624739958393343,
"grad_norm": 2.389112710952759,
"learning_rate": 4.241054714660106e-05,
"loss": 5.696,
"step": 5780
},
{
"epoch": 0.4632741238598176,
"grad_norm": 2.3053462505340576,
"learning_rate": 4.239717601754292e-05,
"loss": 5.6567,
"step": 5790
},
{
"epoch": 0.4640742518803008,
"grad_norm": 2.9635446071624756,
"learning_rate": 4.2383804888484785e-05,
"loss": 5.7643,
"step": 5800
},
{
"epoch": 0.4648743799007841,
"grad_norm": 3.3227570056915283,
"learning_rate": 4.237043375942665e-05,
"loss": 5.5425,
"step": 5810
},
{
"epoch": 0.4656745079212674,
"grad_norm": 3.2959067821502686,
"learning_rate": 4.235706263036851e-05,
"loss": 5.5886,
"step": 5820
},
{
"epoch": 0.4664746359417507,
"grad_norm": 2.497953176498413,
"learning_rate": 4.234369150131037e-05,
"loss": 5.6248,
"step": 5830
},
{
"epoch": 0.46727476396223394,
"grad_norm": 3.5957205295562744,
"learning_rate": 4.2330320372252235e-05,
"loss": 5.5345,
"step": 5840
},
{
"epoch": 0.46807489198271723,
"grad_norm": 2.9113316535949707,
"learning_rate": 4.23169492431941e-05,
"loss": 5.7358,
"step": 5850
},
{
"epoch": 0.4688750200032005,
"grad_norm": 3.8617255687713623,
"learning_rate": 4.230357811413596e-05,
"loss": 5.7451,
"step": 5860
},
{
"epoch": 0.46967514802368376,
"grad_norm": 2.5546538829803467,
"learning_rate": 4.2290206985077824e-05,
"loss": 5.5874,
"step": 5870
},
{
"epoch": 0.47047527604416706,
"grad_norm": 3.7215869426727295,
"learning_rate": 4.2276835856019686e-05,
"loss": 5.5462,
"step": 5880
},
{
"epoch": 0.47127540406465035,
"grad_norm": 3.3122622966766357,
"learning_rate": 4.226346472696155e-05,
"loss": 5.7368,
"step": 5890
},
{
"epoch": 0.47207553208513364,
"grad_norm": 2.3962459564208984,
"learning_rate": 4.2250093597903405e-05,
"loss": 5.7328,
"step": 5900
},
{
"epoch": 0.4728756601056169,
"grad_norm": 2.497668504714966,
"learning_rate": 4.223672246884527e-05,
"loss": 5.7063,
"step": 5910
},
{
"epoch": 0.4736757881261002,
"grad_norm": 2.301725387573242,
"learning_rate": 4.222335133978713e-05,
"loss": 5.6029,
"step": 5920
},
{
"epoch": 0.47447591614658347,
"grad_norm": 3.840155839920044,
"learning_rate": 4.220998021072899e-05,
"loss": 5.825,
"step": 5930
},
{
"epoch": 0.47527604416706676,
"grad_norm": 3.1776278018951416,
"learning_rate": 4.2196609081670856e-05,
"loss": 5.6421,
"step": 5940
},
{
"epoch": 0.47607617218755,
"grad_norm": 2.1823127269744873,
"learning_rate": 4.218323795261272e-05,
"loss": 5.7154,
"step": 5950
},
{
"epoch": 0.4768763002080333,
"grad_norm": 2.944390058517456,
"learning_rate": 4.216986682355458e-05,
"loss": 5.5429,
"step": 5960
},
{
"epoch": 0.4776764282285166,
"grad_norm": 2.035430431365967,
"learning_rate": 4.2156495694496444e-05,
"loss": 5.8187,
"step": 5970
},
{
"epoch": 0.4784765562489998,
"grad_norm": 3.167098045349121,
"learning_rate": 4.2143124565438306e-05,
"loss": 5.5891,
"step": 5980
},
{
"epoch": 0.4792766842694831,
"grad_norm": 1.9377233982086182,
"learning_rate": 4.212975343638017e-05,
"loss": 5.7428,
"step": 5990
},
{
"epoch": 0.4800768122899664,
"grad_norm": 2.759096622467041,
"learning_rate": 4.211638230732203e-05,
"loss": 5.5572,
"step": 6000
},
{
"epoch": 0.4808769403104497,
"grad_norm": 2.074033498764038,
"learning_rate": 4.2103011178263894e-05,
"loss": 5.517,
"step": 6010
},
{
"epoch": 0.48167706833093293,
"grad_norm": 2.2866854667663574,
"learning_rate": 4.208964004920576e-05,
"loss": 5.6539,
"step": 6020
},
{
"epoch": 0.4824771963514162,
"grad_norm": 1.9909095764160156,
"learning_rate": 4.207626892014762e-05,
"loss": 5.5532,
"step": 6030
},
{
"epoch": 0.4832773243718995,
"grad_norm": 3.245906114578247,
"learning_rate": 4.206289779108948e-05,
"loss": 5.6797,
"step": 6040
},
{
"epoch": 0.48407745239238276,
"grad_norm": 2.013009786605835,
"learning_rate": 4.2049526662031345e-05,
"loss": 5.6378,
"step": 6050
},
{
"epoch": 0.48487758041286605,
"grad_norm": 2.5478925704956055,
"learning_rate": 4.20361555329732e-05,
"loss": 5.555,
"step": 6060
},
{
"epoch": 0.48567770843334934,
"grad_norm": 3.079225778579712,
"learning_rate": 4.2022784403915064e-05,
"loss": 5.7618,
"step": 6070
},
{
"epoch": 0.48647783645383263,
"grad_norm": 2.2639927864074707,
"learning_rate": 4.2009413274856926e-05,
"loss": 5.8063,
"step": 6080
},
{
"epoch": 0.48727796447431587,
"grad_norm": 4.630524158477783,
"learning_rate": 4.199604214579879e-05,
"loss": 5.6403,
"step": 6090
},
{
"epoch": 0.48807809249479917,
"grad_norm": 3.11018967628479,
"learning_rate": 4.198267101674065e-05,
"loss": 5.7517,
"step": 6100
},
{
"epoch": 0.48887822051528246,
"grad_norm": 8.462982177734375,
"learning_rate": 4.1969299887682515e-05,
"loss": 5.7311,
"step": 6110
},
{
"epoch": 0.4896783485357657,
"grad_norm": 2.418065071105957,
"learning_rate": 4.195592875862438e-05,
"loss": 5.6239,
"step": 6120
},
{
"epoch": 0.490478476556249,
"grad_norm": 2.5452466011047363,
"learning_rate": 4.194255762956624e-05,
"loss": 5.7417,
"step": 6130
},
{
"epoch": 0.4912786045767323,
"grad_norm": 2.986041307449341,
"learning_rate": 4.19291865005081e-05,
"loss": 5.663,
"step": 6140
},
{
"epoch": 0.4920787325972156,
"grad_norm": 2.7642807960510254,
"learning_rate": 4.1915815371449965e-05,
"loss": 5.5379,
"step": 6150
},
{
"epoch": 0.4928788606176988,
"grad_norm": 4.326907157897949,
"learning_rate": 4.190244424239183e-05,
"loss": 5.8058,
"step": 6160
},
{
"epoch": 0.4936789886381821,
"grad_norm": 1.9514706134796143,
"learning_rate": 4.188907311333369e-05,
"loss": 5.7004,
"step": 6170
},
{
"epoch": 0.4944791166586654,
"grad_norm": 2.5721428394317627,
"learning_rate": 4.187570198427555e-05,
"loss": 5.6959,
"step": 6180
},
{
"epoch": 0.4952792446791487,
"grad_norm": 2.6619083881378174,
"learning_rate": 4.1862330855217416e-05,
"loss": 5.7196,
"step": 6190
},
{
"epoch": 0.4960793726996319,
"grad_norm": 2.322341203689575,
"learning_rate": 4.184895972615928e-05,
"loss": 5.5998,
"step": 6200
},
{
"epoch": 0.4968795007201152,
"grad_norm": 2.280777931213379,
"learning_rate": 4.183558859710114e-05,
"loss": 5.5171,
"step": 6210
},
{
"epoch": 0.4976796287405985,
"grad_norm": 1.9774320125579834,
"learning_rate": 4.1822217468043004e-05,
"loss": 5.6368,
"step": 6220
},
{
"epoch": 0.49847975676108175,
"grad_norm": 2.199708938598633,
"learning_rate": 4.180884633898487e-05,
"loss": 5.4638,
"step": 6230
},
{
"epoch": 0.49927988478156504,
"grad_norm": 2.0054879188537598,
"learning_rate": 4.179547520992673e-05,
"loss": 5.4624,
"step": 6240
},
{
"epoch": 0.5000800128020483,
"grad_norm": 2.0623903274536133,
"learning_rate": 4.178210408086859e-05,
"loss": 5.6554,
"step": 6250
},
{
"epoch": 0.5008801408225316,
"grad_norm": 2.5907487869262695,
"learning_rate": 4.1768732951810455e-05,
"loss": 5.4989,
"step": 6260
},
{
"epoch": 0.5016802688430149,
"grad_norm": 2.181987762451172,
"learning_rate": 4.175536182275232e-05,
"loss": 5.624,
"step": 6270
},
{
"epoch": 0.5024803968634982,
"grad_norm": 2.9678001403808594,
"learning_rate": 4.174199069369418e-05,
"loss": 5.6545,
"step": 6280
},
{
"epoch": 0.5032805248839815,
"grad_norm": 5.213638782501221,
"learning_rate": 4.172861956463604e-05,
"loss": 5.7048,
"step": 6290
},
{
"epoch": 0.5040806529044647,
"grad_norm": 2.465900182723999,
"learning_rate": 4.1715248435577906e-05,
"loss": 5.646,
"step": 6300
},
{
"epoch": 0.504880780924948,
"grad_norm": 2.94570255279541,
"learning_rate": 4.170187730651977e-05,
"loss": 5.6274,
"step": 6310
},
{
"epoch": 0.5056809089454313,
"grad_norm": 3.5255651473999023,
"learning_rate": 4.168850617746163e-05,
"loss": 5.5336,
"step": 6320
},
{
"epoch": 0.5064810369659145,
"grad_norm": 2.3499608039855957,
"learning_rate": 4.1675135048403494e-05,
"loss": 5.7768,
"step": 6330
},
{
"epoch": 0.5072811649863979,
"grad_norm": 2.0476951599121094,
"learning_rate": 4.1661763919345356e-05,
"loss": 5.5927,
"step": 6340
},
{
"epoch": 0.5080812930068811,
"grad_norm": 2.4708118438720703,
"learning_rate": 4.164839279028722e-05,
"loss": 5.6458,
"step": 6350
},
{
"epoch": 0.5088814210273643,
"grad_norm": 2.465075731277466,
"learning_rate": 4.163502166122908e-05,
"loss": 5.5744,
"step": 6360
},
{
"epoch": 0.5096815490478477,
"grad_norm": 2.9378490447998047,
"learning_rate": 4.162165053217094e-05,
"loss": 5.6963,
"step": 6370
},
{
"epoch": 0.5104816770683309,
"grad_norm": 2.201359987258911,
"learning_rate": 4.16082794031128e-05,
"loss": 5.613,
"step": 6380
},
{
"epoch": 0.5112818050888142,
"grad_norm": 1.8427401781082153,
"learning_rate": 4.159490827405466e-05,
"loss": 5.5494,
"step": 6390
},
{
"epoch": 0.5120819331092975,
"grad_norm": 1.9969813823699951,
"learning_rate": 4.1581537144996526e-05,
"loss": 5.5783,
"step": 6400
},
{
"epoch": 0.5128820611297807,
"grad_norm": 2.9670321941375732,
"learning_rate": 4.156816601593839e-05,
"loss": 5.7176,
"step": 6410
},
{
"epoch": 0.5136821891502641,
"grad_norm": 2.76875901222229,
"learning_rate": 4.155479488688025e-05,
"loss": 5.5584,
"step": 6420
},
{
"epoch": 0.5144823171707473,
"grad_norm": 3.2874600887298584,
"learning_rate": 4.1541423757822114e-05,
"loss": 5.8726,
"step": 6430
},
{
"epoch": 0.5152824451912306,
"grad_norm": 2.4672482013702393,
"learning_rate": 4.1528052628763977e-05,
"loss": 5.764,
"step": 6440
},
{
"epoch": 0.5160825732117139,
"grad_norm": 3.5424506664276123,
"learning_rate": 4.151468149970584e-05,
"loss": 5.6612,
"step": 6450
},
{
"epoch": 0.5168827012321972,
"grad_norm": 2.7947871685028076,
"learning_rate": 4.15013103706477e-05,
"loss": 5.668,
"step": 6460
},
{
"epoch": 0.5176828292526804,
"grad_norm": 2.624370574951172,
"learning_rate": 4.1487939241589565e-05,
"loss": 5.577,
"step": 6470
},
{
"epoch": 0.5184829572731637,
"grad_norm": 2.276289701461792,
"learning_rate": 4.147456811253143e-05,
"loss": 5.7592,
"step": 6480
},
{
"epoch": 0.519283085293647,
"grad_norm": 2.751945972442627,
"learning_rate": 4.146119698347329e-05,
"loss": 5.6251,
"step": 6490
},
{
"epoch": 0.5200832133141302,
"grad_norm": 2.1990444660186768,
"learning_rate": 4.144782585441515e-05,
"loss": 5.5141,
"step": 6500
},
{
"epoch": 0.5208833413346136,
"grad_norm": 2.732024908065796,
"learning_rate": 4.1434454725357015e-05,
"loss": 5.5938,
"step": 6510
},
{
"epoch": 0.5216834693550968,
"grad_norm": 2.6876533031463623,
"learning_rate": 4.142108359629887e-05,
"loss": 5.7126,
"step": 6520
},
{
"epoch": 0.5224835973755801,
"grad_norm": 2.660323143005371,
"learning_rate": 4.1407712467240734e-05,
"loss": 5.6261,
"step": 6530
},
{
"epoch": 0.5232837253960634,
"grad_norm": 2.567084550857544,
"learning_rate": 4.13943413381826e-05,
"loss": 5.5248,
"step": 6540
},
{
"epoch": 0.5240838534165466,
"grad_norm": 4.317018032073975,
"learning_rate": 4.138097020912446e-05,
"loss": 5.4444,
"step": 6550
},
{
"epoch": 0.52488398143703,
"grad_norm": 2.0361647605895996,
"learning_rate": 4.136759908006632e-05,
"loss": 5.7532,
"step": 6560
},
{
"epoch": 0.5256841094575132,
"grad_norm": 2.0946271419525146,
"learning_rate": 4.1354227951008185e-05,
"loss": 5.6343,
"step": 6570
},
{
"epoch": 0.5264842374779964,
"grad_norm": 3.3724842071533203,
"learning_rate": 4.134085682195005e-05,
"loss": 5.6455,
"step": 6580
},
{
"epoch": 0.5272843654984798,
"grad_norm": 4.078947067260742,
"learning_rate": 4.132748569289191e-05,
"loss": 5.6681,
"step": 6590
},
{
"epoch": 0.528084493518963,
"grad_norm": 4.288105010986328,
"learning_rate": 4.131411456383377e-05,
"loss": 5.7152,
"step": 6600
},
{
"epoch": 0.5288846215394463,
"grad_norm": 2.5208754539489746,
"learning_rate": 4.1300743434775635e-05,
"loss": 5.5715,
"step": 6610
},
{
"epoch": 0.5296847495599296,
"grad_norm": 2.6902217864990234,
"learning_rate": 4.12873723057175e-05,
"loss": 5.4997,
"step": 6620
},
{
"epoch": 0.5304848775804129,
"grad_norm": 2.4580068588256836,
"learning_rate": 4.127400117665936e-05,
"loss": 5.7656,
"step": 6630
},
{
"epoch": 0.5312850056008962,
"grad_norm": 2.5117955207824707,
"learning_rate": 4.1260630047601224e-05,
"loss": 5.6373,
"step": 6640
},
{
"epoch": 0.5320851336213794,
"grad_norm": 2.660921096801758,
"learning_rate": 4.1247258918543086e-05,
"loss": 5.6829,
"step": 6650
},
{
"epoch": 0.5328852616418627,
"grad_norm": 2.4601287841796875,
"learning_rate": 4.123388778948495e-05,
"loss": 5.7702,
"step": 6660
},
{
"epoch": 0.533685389662346,
"grad_norm": 2.9025120735168457,
"learning_rate": 4.122051666042681e-05,
"loss": 5.6374,
"step": 6670
},
{
"epoch": 0.5344855176828293,
"grad_norm": 2.8221569061279297,
"learning_rate": 4.120714553136867e-05,
"loss": 5.5568,
"step": 6680
},
{
"epoch": 0.5352856457033125,
"grad_norm": 2.3035178184509277,
"learning_rate": 4.119377440231053e-05,
"loss": 5.5845,
"step": 6690
},
{
"epoch": 0.5360857737237958,
"grad_norm": 2.0955657958984375,
"learning_rate": 4.118040327325239e-05,
"loss": 5.687,
"step": 6700
},
{
"epoch": 0.5368859017442791,
"grad_norm": 2.530156135559082,
"learning_rate": 4.1167032144194256e-05,
"loss": 5.5772,
"step": 6710
},
{
"epoch": 0.5376860297647623,
"grad_norm": 2.2060387134552,
"learning_rate": 4.115366101513612e-05,
"loss": 5.5964,
"step": 6720
},
{
"epoch": 0.5384861577852457,
"grad_norm": 2.720702886581421,
"learning_rate": 4.114028988607798e-05,
"loss": 5.5432,
"step": 6730
},
{
"epoch": 0.5392862858057289,
"grad_norm": 2.2585232257843018,
"learning_rate": 4.1126918757019844e-05,
"loss": 5.77,
"step": 6740
},
{
"epoch": 0.5400864138262121,
"grad_norm": 2.052316904067993,
"learning_rate": 4.1113547627961706e-05,
"loss": 5.5679,
"step": 6750
},
{
"epoch": 0.5408865418466955,
"grad_norm": 2.772500991821289,
"learning_rate": 4.110017649890357e-05,
"loss": 5.5608,
"step": 6760
},
{
"epoch": 0.5416866698671787,
"grad_norm": 2.158129930496216,
"learning_rate": 4.108680536984543e-05,
"loss": 5.6612,
"step": 6770
},
{
"epoch": 0.5424867978876621,
"grad_norm": 2.874685287475586,
"learning_rate": 4.1073434240787294e-05,
"loss": 5.5999,
"step": 6780
},
{
"epoch": 0.5432869259081453,
"grad_norm": 2.2797632217407227,
"learning_rate": 4.106006311172916e-05,
"loss": 5.7243,
"step": 6790
},
{
"epoch": 0.5440870539286286,
"grad_norm": 2.998309850692749,
"learning_rate": 4.1048029095576836e-05,
"loss": 5.5031,
"step": 6800
},
{
"epoch": 0.5448871819491119,
"grad_norm": 2.8155364990234375,
"learning_rate": 4.10346579665187e-05,
"loss": 5.7631,
"step": 6810
},
{
"epoch": 0.5456873099695951,
"grad_norm": 2.327279806137085,
"learning_rate": 4.102128683746056e-05,
"loss": 5.6293,
"step": 6820
},
{
"epoch": 0.5464874379900784,
"grad_norm": 3.3200621604919434,
"learning_rate": 4.100791570840242e-05,
"loss": 5.717,
"step": 6830
},
{
"epoch": 0.5472875660105617,
"grad_norm": 2.521144390106201,
"learning_rate": 4.099454457934428e-05,
"loss": 5.5705,
"step": 6840
},
{
"epoch": 0.548087694031045,
"grad_norm": 2.7198219299316406,
"learning_rate": 4.098117345028614e-05,
"loss": 5.5931,
"step": 6850
},
{
"epoch": 0.5488878220515282,
"grad_norm": 2.701251268386841,
"learning_rate": 4.0967802321228006e-05,
"loss": 5.4706,
"step": 6860
},
{
"epoch": 0.5496879500720115,
"grad_norm": 2.2789149284362793,
"learning_rate": 4.095443119216987e-05,
"loss": 5.5883,
"step": 6870
},
{
"epoch": 0.5504880780924948,
"grad_norm": 2.8821568489074707,
"learning_rate": 4.094106006311173e-05,
"loss": 5.7525,
"step": 6880
},
{
"epoch": 0.5512882061129781,
"grad_norm": 2.3450064659118652,
"learning_rate": 4.0927688934053594e-05,
"loss": 5.5166,
"step": 6890
},
{
"epoch": 0.5520883341334614,
"grad_norm": 2.639960527420044,
"learning_rate": 4.0914317804995456e-05,
"loss": 5.7001,
"step": 6900
},
{
"epoch": 0.5528884621539446,
"grad_norm": 2.6743710041046143,
"learning_rate": 4.090094667593732e-05,
"loss": 5.7049,
"step": 6910
},
{
"epoch": 0.553688590174428,
"grad_norm": 2.7540199756622314,
"learning_rate": 4.088757554687918e-05,
"loss": 5.5705,
"step": 6920
},
{
"epoch": 0.5544887181949112,
"grad_norm": 3.2703442573547363,
"learning_rate": 4.0874204417821044e-05,
"loss": 5.5585,
"step": 6930
},
{
"epoch": 0.5552888462153944,
"grad_norm": 3.684135913848877,
"learning_rate": 4.086083328876291e-05,
"loss": 5.6561,
"step": 6940
},
{
"epoch": 0.5560889742358778,
"grad_norm": 2.918989896774292,
"learning_rate": 4.084746215970477e-05,
"loss": 5.5171,
"step": 6950
},
{
"epoch": 0.556889102256361,
"grad_norm": 2.5902323722839355,
"learning_rate": 4.083409103064663e-05,
"loss": 5.6703,
"step": 6960
},
{
"epoch": 0.5576892302768442,
"grad_norm": 2.23820161819458,
"learning_rate": 4.0820719901588495e-05,
"loss": 5.7048,
"step": 6970
},
{
"epoch": 0.5584893582973276,
"grad_norm": 2.4339401721954346,
"learning_rate": 4.080734877253036e-05,
"loss": 5.4264,
"step": 6980
},
{
"epoch": 0.5592894863178108,
"grad_norm": 3.3097031116485596,
"learning_rate": 4.0793977643472214e-05,
"loss": 5.5931,
"step": 6990
},
{
"epoch": 0.5600896143382941,
"grad_norm": 2.6903202533721924,
"learning_rate": 4.0780606514414077e-05,
"loss": 5.5349,
"step": 7000
},
{
"epoch": 0.5600896143382941,
"eval_loss": 5.870830535888672,
"eval_runtime": 13.3044,
"eval_samples_per_second": 3.007,
"eval_steps_per_second": 0.376,
"step": 7000
}
],
"logging_steps": 10,
"max_steps": 37494,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 7000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}