model-hotpotqa-epo10 / trainer_state.json
ssktora's picture
Upload folder using huggingface_hub
9f85fe9 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 13290,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007524454477050414,
"grad_norm": 205.97328186035156,
"learning_rate": 3.762227238525207e-08,
"loss": 6.3478,
"step": 10
},
{
"epoch": 0.015048908954100828,
"grad_norm": 240.38754272460938,
"learning_rate": 7.524454477050414e-08,
"loss": 7.7512,
"step": 20
},
{
"epoch": 0.022573363431151242,
"grad_norm": 176.9767608642578,
"learning_rate": 1.1286681715575622e-07,
"loss": 7.3317,
"step": 30
},
{
"epoch": 0.030097817908201655,
"grad_norm": 303.99163818359375,
"learning_rate": 1.5048908954100828e-07,
"loss": 6.3909,
"step": 40
},
{
"epoch": 0.03762227238525207,
"grad_norm": 243.18605041503906,
"learning_rate": 1.8811136192626038e-07,
"loss": 5.6329,
"step": 50
},
{
"epoch": 0.045146726862302484,
"grad_norm": 218.3995819091797,
"learning_rate": 2.2573363431151243e-07,
"loss": 5.9957,
"step": 60
},
{
"epoch": 0.0526711813393529,
"grad_norm": 175.67726135253906,
"learning_rate": 2.633559066967645e-07,
"loss": 5.7306,
"step": 70
},
{
"epoch": 0.06019563581640331,
"grad_norm": 139.78561401367188,
"learning_rate": 3.0097817908201656e-07,
"loss": 4.8739,
"step": 80
},
{
"epoch": 0.06772009029345373,
"grad_norm": 208.4263916015625,
"learning_rate": 3.3860045146726866e-07,
"loss": 4.8178,
"step": 90
},
{
"epoch": 0.07524454477050414,
"grad_norm": 122.90266418457031,
"learning_rate": 3.7622272385252076e-07,
"loss": 4.3441,
"step": 100
},
{
"epoch": 0.08276899924755456,
"grad_norm": 115.08711242675781,
"learning_rate": 4.138449962377728e-07,
"loss": 3.882,
"step": 110
},
{
"epoch": 0.09029345372460497,
"grad_norm": 85.97943878173828,
"learning_rate": 4.5146726862302486e-07,
"loss": 3.7734,
"step": 120
},
{
"epoch": 0.09781790820165538,
"grad_norm": 136.2003173828125,
"learning_rate": 4.89089541008277e-07,
"loss": 3.7928,
"step": 130
},
{
"epoch": 0.1053423626787058,
"grad_norm": 109.00381469726562,
"learning_rate": 5.26711813393529e-07,
"loss": 3.1142,
"step": 140
},
{
"epoch": 0.11286681715575621,
"grad_norm": 109.21756744384766,
"learning_rate": 5.643340857787811e-07,
"loss": 2.9789,
"step": 150
},
{
"epoch": 0.12039127163280662,
"grad_norm": 89.74577331542969,
"learning_rate": 6.019563581640331e-07,
"loss": 2.8887,
"step": 160
},
{
"epoch": 0.12791572610985705,
"grad_norm": 81.73179626464844,
"learning_rate": 6.395786305492853e-07,
"loss": 2.7142,
"step": 170
},
{
"epoch": 0.13544018058690746,
"grad_norm": 77.00216674804688,
"learning_rate": 6.772009029345373e-07,
"loss": 2.5444,
"step": 180
},
{
"epoch": 0.14296463506395787,
"grad_norm": 106.53274536132812,
"learning_rate": 7.148231753197895e-07,
"loss": 2.7361,
"step": 190
},
{
"epoch": 0.1504890895410083,
"grad_norm": 71.52346801757812,
"learning_rate": 7.524454477050415e-07,
"loss": 1.9428,
"step": 200
},
{
"epoch": 0.1580135440180587,
"grad_norm": 68.6385498046875,
"learning_rate": 7.900677200902936e-07,
"loss": 2.1022,
"step": 210
},
{
"epoch": 0.1655379984951091,
"grad_norm": 67.29202270507812,
"learning_rate": 8.276899924755456e-07,
"loss": 1.8348,
"step": 220
},
{
"epoch": 0.17306245297215953,
"grad_norm": 78.09269714355469,
"learning_rate": 8.653122648607977e-07,
"loss": 2.2978,
"step": 230
},
{
"epoch": 0.18058690744920994,
"grad_norm": 52.45564651489258,
"learning_rate": 9.029345372460497e-07,
"loss": 1.7102,
"step": 240
},
{
"epoch": 0.18811136192626035,
"grad_norm": 46.46923828125,
"learning_rate": 9.405568096313019e-07,
"loss": 1.6509,
"step": 250
},
{
"epoch": 0.19563581640331076,
"grad_norm": 47.17067337036133,
"learning_rate": 9.78179082016554e-07,
"loss": 1.5885,
"step": 260
},
{
"epoch": 0.20316027088036118,
"grad_norm": 53.28254318237305,
"learning_rate": 1.0158013544018059e-06,
"loss": 1.7244,
"step": 270
},
{
"epoch": 0.2106847253574116,
"grad_norm": 40.88100814819336,
"learning_rate": 1.053423626787058e-06,
"loss": 1.4149,
"step": 280
},
{
"epoch": 0.218209179834462,
"grad_norm": 51.29297637939453,
"learning_rate": 1.0910458991723102e-06,
"loss": 1.3588,
"step": 290
},
{
"epoch": 0.22573363431151242,
"grad_norm": 46.602970123291016,
"learning_rate": 1.1286681715575621e-06,
"loss": 1.2849,
"step": 300
},
{
"epoch": 0.23325808878856283,
"grad_norm": 48.245201110839844,
"learning_rate": 1.1662904439428143e-06,
"loss": 1.1217,
"step": 310
},
{
"epoch": 0.24078254326561324,
"grad_norm": 55.32264709472656,
"learning_rate": 1.2039127163280662e-06,
"loss": 1.4073,
"step": 320
},
{
"epoch": 0.24830699774266365,
"grad_norm": 55.40262985229492,
"learning_rate": 1.2415349887133184e-06,
"loss": 1.3804,
"step": 330
},
{
"epoch": 0.2558314522197141,
"grad_norm": 55.93576431274414,
"learning_rate": 1.2791572610985705e-06,
"loss": 1.2969,
"step": 340
},
{
"epoch": 0.2633559066967645,
"grad_norm": 40.739097595214844,
"learning_rate": 1.3167795334838227e-06,
"loss": 0.8772,
"step": 350
},
{
"epoch": 0.2708803611738149,
"grad_norm": 56.57487487792969,
"learning_rate": 1.3544018058690746e-06,
"loss": 1.0485,
"step": 360
},
{
"epoch": 0.27840481565086533,
"grad_norm": 37.80082702636719,
"learning_rate": 1.3920240782543268e-06,
"loss": 0.8924,
"step": 370
},
{
"epoch": 0.28592927012791575,
"grad_norm": 40.33064651489258,
"learning_rate": 1.429646350639579e-06,
"loss": 1.0153,
"step": 380
},
{
"epoch": 0.29345372460496616,
"grad_norm": 38.92075729370117,
"learning_rate": 1.4672686230248309e-06,
"loss": 0.919,
"step": 390
},
{
"epoch": 0.3009781790820166,
"grad_norm": 36.200130462646484,
"learning_rate": 1.504890895410083e-06,
"loss": 0.8283,
"step": 400
},
{
"epoch": 0.308502633559067,
"grad_norm": 34.788963317871094,
"learning_rate": 1.542513167795335e-06,
"loss": 1.1902,
"step": 410
},
{
"epoch": 0.3160270880361174,
"grad_norm": 36.59656524658203,
"learning_rate": 1.5801354401805871e-06,
"loss": 0.9343,
"step": 420
},
{
"epoch": 0.3235515425131678,
"grad_norm": 39.81496047973633,
"learning_rate": 1.617757712565839e-06,
"loss": 0.9014,
"step": 430
},
{
"epoch": 0.3310759969902182,
"grad_norm": 24.904926300048828,
"learning_rate": 1.6553799849510912e-06,
"loss": 0.7461,
"step": 440
},
{
"epoch": 0.33860045146726864,
"grad_norm": 38.34230422973633,
"learning_rate": 1.6930022573363434e-06,
"loss": 0.8552,
"step": 450
},
{
"epoch": 0.34612490594431905,
"grad_norm": 26.474260330200195,
"learning_rate": 1.7306245297215953e-06,
"loss": 0.8746,
"step": 460
},
{
"epoch": 0.35364936042136946,
"grad_norm": 36.48841094970703,
"learning_rate": 1.7682468021068475e-06,
"loss": 0.816,
"step": 470
},
{
"epoch": 0.3611738148984199,
"grad_norm": 35.95697021484375,
"learning_rate": 1.8058690744920994e-06,
"loss": 0.7342,
"step": 480
},
{
"epoch": 0.3686982693754703,
"grad_norm": 37.21470260620117,
"learning_rate": 1.8434913468773516e-06,
"loss": 0.7724,
"step": 490
},
{
"epoch": 0.3762227238525207,
"grad_norm": 38.8176155090332,
"learning_rate": 1.8811136192626038e-06,
"loss": 0.9453,
"step": 500
},
{
"epoch": 0.3837471783295711,
"grad_norm": 37.029815673828125,
"learning_rate": 1.918735891647856e-06,
"loss": 0.7449,
"step": 510
},
{
"epoch": 0.3912716328066215,
"grad_norm": 36.2590217590332,
"learning_rate": 1.956358164033108e-06,
"loss": 0.8882,
"step": 520
},
{
"epoch": 0.39879608728367194,
"grad_norm": 31.115354537963867,
"learning_rate": 1.99398043641836e-06,
"loss": 0.8969,
"step": 530
},
{
"epoch": 0.40632054176072235,
"grad_norm": 27.557512283325195,
"learning_rate": 2.0316027088036117e-06,
"loss": 0.8884,
"step": 540
},
{
"epoch": 0.41384499623777277,
"grad_norm": 32.3388671875,
"learning_rate": 2.069224981188864e-06,
"loss": 0.7083,
"step": 550
},
{
"epoch": 0.4213694507148232,
"grad_norm": 37.25837707519531,
"learning_rate": 2.106847253574116e-06,
"loss": 0.7259,
"step": 560
},
{
"epoch": 0.4288939051918736,
"grad_norm": 20.491844177246094,
"learning_rate": 2.144469525959368e-06,
"loss": 0.7148,
"step": 570
},
{
"epoch": 0.436418359668924,
"grad_norm": 26.329750061035156,
"learning_rate": 2.1820917983446204e-06,
"loss": 0.8233,
"step": 580
},
{
"epoch": 0.4439428141459744,
"grad_norm": 36.909969329833984,
"learning_rate": 2.2197140707298723e-06,
"loss": 0.8745,
"step": 590
},
{
"epoch": 0.45146726862302483,
"grad_norm": 23.27708625793457,
"learning_rate": 2.2573363431151243e-06,
"loss": 0.7076,
"step": 600
},
{
"epoch": 0.45899172310007524,
"grad_norm": 39.00196838378906,
"learning_rate": 2.294958615500376e-06,
"loss": 0.7312,
"step": 610
},
{
"epoch": 0.46651617757712566,
"grad_norm": 25.750431060791016,
"learning_rate": 2.3325808878856286e-06,
"loss": 0.6018,
"step": 620
},
{
"epoch": 0.47404063205417607,
"grad_norm": 26.11092185974121,
"learning_rate": 2.3702031602708805e-06,
"loss": 0.6264,
"step": 630
},
{
"epoch": 0.4815650865312265,
"grad_norm": 25.451011657714844,
"learning_rate": 2.4078254326561325e-06,
"loss": 0.5659,
"step": 640
},
{
"epoch": 0.4890895410082769,
"grad_norm": 20.302892684936523,
"learning_rate": 2.445447705041385e-06,
"loss": 0.6611,
"step": 650
},
{
"epoch": 0.4966139954853273,
"grad_norm": 30.64501953125,
"learning_rate": 2.4830699774266368e-06,
"loss": 0.7641,
"step": 660
},
{
"epoch": 0.5041384499623778,
"grad_norm": 24.991497039794922,
"learning_rate": 2.520692249811889e-06,
"loss": 0.7671,
"step": 670
},
{
"epoch": 0.5116629044394282,
"grad_norm": 25.86818504333496,
"learning_rate": 2.558314522197141e-06,
"loss": 0.764,
"step": 680
},
{
"epoch": 0.5191873589164786,
"grad_norm": 17.105379104614258,
"learning_rate": 2.595936794582393e-06,
"loss": 0.6704,
"step": 690
},
{
"epoch": 0.526711813393529,
"grad_norm": 32.04035949707031,
"learning_rate": 2.6335590669676454e-06,
"loss": 0.6067,
"step": 700
},
{
"epoch": 0.5342362678705794,
"grad_norm": 28.617063522338867,
"learning_rate": 2.6711813393528973e-06,
"loss": 0.5894,
"step": 710
},
{
"epoch": 0.5417607223476298,
"grad_norm": 21.270259857177734,
"learning_rate": 2.7088036117381493e-06,
"loss": 0.636,
"step": 720
},
{
"epoch": 0.5492851768246803,
"grad_norm": 19.81507110595703,
"learning_rate": 2.7464258841234016e-06,
"loss": 0.631,
"step": 730
},
{
"epoch": 0.5568096313017307,
"grad_norm": 18.52845001220703,
"learning_rate": 2.7840481565086536e-06,
"loss": 0.6765,
"step": 740
},
{
"epoch": 0.5643340857787811,
"grad_norm": 26.92551612854004,
"learning_rate": 2.8216704288939055e-06,
"loss": 0.6776,
"step": 750
},
{
"epoch": 0.5718585402558315,
"grad_norm": 18.482553482055664,
"learning_rate": 2.859292701279158e-06,
"loss": 0.5727,
"step": 760
},
{
"epoch": 0.5793829947328819,
"grad_norm": 22.597740173339844,
"learning_rate": 2.89691497366441e-06,
"loss": 0.5505,
"step": 770
},
{
"epoch": 0.5869074492099323,
"grad_norm": 24.563085556030273,
"learning_rate": 2.9345372460496618e-06,
"loss": 0.6187,
"step": 780
},
{
"epoch": 0.5944319036869827,
"grad_norm": 15.353199005126953,
"learning_rate": 2.9721595184349137e-06,
"loss": 0.6372,
"step": 790
},
{
"epoch": 0.6019563581640331,
"grad_norm": 21.005054473876953,
"learning_rate": 3.009781790820166e-06,
"loss": 0.5677,
"step": 800
},
{
"epoch": 0.6094808126410836,
"grad_norm": 12.04046630859375,
"learning_rate": 3.047404063205418e-06,
"loss": 0.5383,
"step": 810
},
{
"epoch": 0.617005267118134,
"grad_norm": 19.093727111816406,
"learning_rate": 3.08502633559067e-06,
"loss": 0.5017,
"step": 820
},
{
"epoch": 0.6245297215951844,
"grad_norm": 20.096912384033203,
"learning_rate": 3.1226486079759224e-06,
"loss": 0.5334,
"step": 830
},
{
"epoch": 0.6320541760722348,
"grad_norm": 15.413702011108398,
"learning_rate": 3.1602708803611743e-06,
"loss": 0.5755,
"step": 840
},
{
"epoch": 0.6395786305492852,
"grad_norm": 19.66105842590332,
"learning_rate": 3.1978931527464262e-06,
"loss": 0.4816,
"step": 850
},
{
"epoch": 0.6471030850263356,
"grad_norm": 14.463790893554688,
"learning_rate": 3.235515425131678e-06,
"loss": 0.5436,
"step": 860
},
{
"epoch": 0.654627539503386,
"grad_norm": 19.423931121826172,
"learning_rate": 3.2731376975169306e-06,
"loss": 0.4423,
"step": 870
},
{
"epoch": 0.6621519939804364,
"grad_norm": 17.905784606933594,
"learning_rate": 3.3107599699021825e-06,
"loss": 0.5378,
"step": 880
},
{
"epoch": 0.6696764484574869,
"grad_norm": 19.07693862915039,
"learning_rate": 3.3483822422874344e-06,
"loss": 0.5569,
"step": 890
},
{
"epoch": 0.6772009029345373,
"grad_norm": 19.285032272338867,
"learning_rate": 3.386004514672687e-06,
"loss": 0.4729,
"step": 900
},
{
"epoch": 0.6847253574115877,
"grad_norm": 18.800764083862305,
"learning_rate": 3.4236267870579388e-06,
"loss": 0.5417,
"step": 910
},
{
"epoch": 0.6922498118886381,
"grad_norm": 20.45533561706543,
"learning_rate": 3.4612490594431907e-06,
"loss": 0.5456,
"step": 920
},
{
"epoch": 0.6997742663656885,
"grad_norm": 13.277688980102539,
"learning_rate": 3.498871331828443e-06,
"loss": 0.4199,
"step": 930
},
{
"epoch": 0.7072987208427389,
"grad_norm": 12.00981330871582,
"learning_rate": 3.536493604213695e-06,
"loss": 0.4254,
"step": 940
},
{
"epoch": 0.7148231753197893,
"grad_norm": 22.597990036010742,
"learning_rate": 3.574115876598947e-06,
"loss": 0.5816,
"step": 950
},
{
"epoch": 0.7223476297968398,
"grad_norm": 18.304636001586914,
"learning_rate": 3.611738148984199e-06,
"loss": 0.5138,
"step": 960
},
{
"epoch": 0.7298720842738902,
"grad_norm": 19.998075485229492,
"learning_rate": 3.6493604213694513e-06,
"loss": 0.4879,
"step": 970
},
{
"epoch": 0.7373965387509406,
"grad_norm": 13.545928955078125,
"learning_rate": 3.686982693754703e-06,
"loss": 0.4214,
"step": 980
},
{
"epoch": 0.744920993227991,
"grad_norm": 14.774223327636719,
"learning_rate": 3.724604966139955e-06,
"loss": 0.4998,
"step": 990
},
{
"epoch": 0.7524454477050414,
"grad_norm": 12.196511268615723,
"learning_rate": 3.7622272385252075e-06,
"loss": 0.4384,
"step": 1000
},
{
"epoch": 0.7599699021820918,
"grad_norm": 15.515963554382324,
"learning_rate": 3.7998495109104595e-06,
"loss": 0.4783,
"step": 1010
},
{
"epoch": 0.7674943566591422,
"grad_norm": 19.6131534576416,
"learning_rate": 3.837471783295712e-06,
"loss": 0.4698,
"step": 1020
},
{
"epoch": 0.7750188111361926,
"grad_norm": 15.661735534667969,
"learning_rate": 3.875094055680963e-06,
"loss": 0.4194,
"step": 1030
},
{
"epoch": 0.782543265613243,
"grad_norm": 15.907915115356445,
"learning_rate": 3.912716328066216e-06,
"loss": 0.5192,
"step": 1040
},
{
"epoch": 0.7900677200902935,
"grad_norm": 8.650323867797852,
"learning_rate": 3.950338600451468e-06,
"loss": 0.3647,
"step": 1050
},
{
"epoch": 0.7975921745673439,
"grad_norm": 14.708108901977539,
"learning_rate": 3.98796087283672e-06,
"loss": 0.5469,
"step": 1060
},
{
"epoch": 0.8051166290443943,
"grad_norm": 16.337339401245117,
"learning_rate": 4.025583145221972e-06,
"loss": 0.4287,
"step": 1070
},
{
"epoch": 0.8126410835214447,
"grad_norm": 11.901853561401367,
"learning_rate": 4.0632054176072235e-06,
"loss": 0.4745,
"step": 1080
},
{
"epoch": 0.8201655379984951,
"grad_norm": 13.786298751831055,
"learning_rate": 4.100827689992476e-06,
"loss": 0.3915,
"step": 1090
},
{
"epoch": 0.8276899924755455,
"grad_norm": 18.17781639099121,
"learning_rate": 4.138449962377728e-06,
"loss": 0.4134,
"step": 1100
},
{
"epoch": 0.835214446952596,
"grad_norm": 15.534759521484375,
"learning_rate": 4.17607223476298e-06,
"loss": 0.3891,
"step": 1110
},
{
"epoch": 0.8427389014296464,
"grad_norm": 19.764795303344727,
"learning_rate": 4.213694507148232e-06,
"loss": 0.338,
"step": 1120
},
{
"epoch": 0.8502633559066968,
"grad_norm": 19.5684814453125,
"learning_rate": 4.2513167795334845e-06,
"loss": 0.4936,
"step": 1130
},
{
"epoch": 0.8577878103837472,
"grad_norm": 16.872209548950195,
"learning_rate": 4.288939051918736e-06,
"loss": 0.354,
"step": 1140
},
{
"epoch": 0.8653122648607976,
"grad_norm": 16.32406997680664,
"learning_rate": 4.326561324303988e-06,
"loss": 0.4274,
"step": 1150
},
{
"epoch": 0.872836719337848,
"grad_norm": 16.827388763427734,
"learning_rate": 4.364183596689241e-06,
"loss": 0.4203,
"step": 1160
},
{
"epoch": 0.8803611738148984,
"grad_norm": 11.862984657287598,
"learning_rate": 4.401805869074492e-06,
"loss": 0.3622,
"step": 1170
},
{
"epoch": 0.8878856282919488,
"grad_norm": 11.026522636413574,
"learning_rate": 4.439428141459745e-06,
"loss": 0.3571,
"step": 1180
},
{
"epoch": 0.8954100827689992,
"grad_norm": 12.197684288024902,
"learning_rate": 4.477050413844997e-06,
"loss": 0.3711,
"step": 1190
},
{
"epoch": 0.9029345372460497,
"grad_norm": 14.74181079864502,
"learning_rate": 4.5146726862302485e-06,
"loss": 0.443,
"step": 1200
},
{
"epoch": 0.9104589917231001,
"grad_norm": 15.604998588562012,
"learning_rate": 4.552294958615501e-06,
"loss": 0.3191,
"step": 1210
},
{
"epoch": 0.9179834462001505,
"grad_norm": 16.301240921020508,
"learning_rate": 4.589917231000752e-06,
"loss": 0.4541,
"step": 1220
},
{
"epoch": 0.9255079006772009,
"grad_norm": 14.248018264770508,
"learning_rate": 4.627539503386005e-06,
"loss": 0.3961,
"step": 1230
},
{
"epoch": 0.9330323551542513,
"grad_norm": 12.575178146362305,
"learning_rate": 4.665161775771257e-06,
"loss": 0.4851,
"step": 1240
},
{
"epoch": 0.9405568096313017,
"grad_norm": 11.841970443725586,
"learning_rate": 4.702784048156509e-06,
"loss": 0.3934,
"step": 1250
},
{
"epoch": 0.9480812641083521,
"grad_norm": 10.266742706298828,
"learning_rate": 4.740406320541761e-06,
"loss": 0.3943,
"step": 1260
},
{
"epoch": 0.9556057185854026,
"grad_norm": 11.296609878540039,
"learning_rate": 4.778028592927013e-06,
"loss": 0.5257,
"step": 1270
},
{
"epoch": 0.963130173062453,
"grad_norm": 9.851863861083984,
"learning_rate": 4.815650865312265e-06,
"loss": 0.3885,
"step": 1280
},
{
"epoch": 0.9706546275395034,
"grad_norm": 11.191083908081055,
"learning_rate": 4.853273137697517e-06,
"loss": 0.3472,
"step": 1290
},
{
"epoch": 0.9781790820165538,
"grad_norm": 13.400888442993164,
"learning_rate": 4.89089541008277e-06,
"loss": 0.4212,
"step": 1300
},
{
"epoch": 0.9857035364936042,
"grad_norm": 13.300646781921387,
"learning_rate": 4.928517682468021e-06,
"loss": 0.4037,
"step": 1310
},
{
"epoch": 0.9932279909706546,
"grad_norm": 12.612085342407227,
"learning_rate": 4.9661399548532735e-06,
"loss": 0.4435,
"step": 1320
},
{
"epoch": 1.000752445447705,
"grad_norm": 12.677772521972656,
"learning_rate": 5.003762227238526e-06,
"loss": 0.405,
"step": 1330
},
{
"epoch": 1.0082768999247556,
"grad_norm": 10.081019401550293,
"learning_rate": 5.041384499623778e-06,
"loss": 0.3084,
"step": 1340
},
{
"epoch": 1.0158013544018059,
"grad_norm": 6.720037460327148,
"learning_rate": 5.07900677200903e-06,
"loss": 0.3676,
"step": 1350
},
{
"epoch": 1.0233258088788564,
"grad_norm": 12.8779296875,
"learning_rate": 5.116629044394282e-06,
"loss": 0.4041,
"step": 1360
},
{
"epoch": 1.0308502633559067,
"grad_norm": 12.37596321105957,
"learning_rate": 5.154251316779534e-06,
"loss": 0.4613,
"step": 1370
},
{
"epoch": 1.0383747178329572,
"grad_norm": 15.438199043273926,
"learning_rate": 5.191873589164786e-06,
"loss": 0.4007,
"step": 1380
},
{
"epoch": 1.0458991723100075,
"grad_norm": 11.500144004821777,
"learning_rate": 5.2294958615500376e-06,
"loss": 0.3785,
"step": 1390
},
{
"epoch": 1.053423626787058,
"grad_norm": 6.981774806976318,
"learning_rate": 5.267118133935291e-06,
"loss": 0.3816,
"step": 1400
},
{
"epoch": 1.0609480812641083,
"grad_norm": 11.432241439819336,
"learning_rate": 5.304740406320542e-06,
"loss": 0.4292,
"step": 1410
},
{
"epoch": 1.0684725357411589,
"grad_norm": 16.537931442260742,
"learning_rate": 5.342362678705795e-06,
"loss": 0.3867,
"step": 1420
},
{
"epoch": 1.0759969902182092,
"grad_norm": 14.49853801727295,
"learning_rate": 5.379984951091046e-06,
"loss": 0.3697,
"step": 1430
},
{
"epoch": 1.0835214446952597,
"grad_norm": 11.265471458435059,
"learning_rate": 5.4176072234762986e-06,
"loss": 0.3432,
"step": 1440
},
{
"epoch": 1.09104589917231,
"grad_norm": 10.750925064086914,
"learning_rate": 5.45522949586155e-06,
"loss": 0.363,
"step": 1450
},
{
"epoch": 1.0985703536493605,
"grad_norm": 9.414807319641113,
"learning_rate": 5.492851768246803e-06,
"loss": 0.366,
"step": 1460
},
{
"epoch": 1.1060948081264108,
"grad_norm": 12.20535945892334,
"learning_rate": 5.530474040632055e-06,
"loss": 0.3451,
"step": 1470
},
{
"epoch": 1.1136192626034613,
"grad_norm": 10.815361976623535,
"learning_rate": 5.568096313017307e-06,
"loss": 0.333,
"step": 1480
},
{
"epoch": 1.1211437170805116,
"grad_norm": 12.699690818786621,
"learning_rate": 5.605718585402559e-06,
"loss": 0.4013,
"step": 1490
},
{
"epoch": 1.1286681715575622,
"grad_norm": 13.174708366394043,
"learning_rate": 5.643340857787811e-06,
"loss": 0.3381,
"step": 1500
},
{
"epoch": 1.1361926260346125,
"grad_norm": 10.062464714050293,
"learning_rate": 5.680963130173063e-06,
"loss": 0.4081,
"step": 1510
},
{
"epoch": 1.143717080511663,
"grad_norm": 15.652090072631836,
"learning_rate": 5.718585402558316e-06,
"loss": 0.5176,
"step": 1520
},
{
"epoch": 1.1512415349887133,
"grad_norm": 12.49589729309082,
"learning_rate": 5.756207674943567e-06,
"loss": 0.3599,
"step": 1530
},
{
"epoch": 1.1587659894657638,
"grad_norm": 16.024024963378906,
"learning_rate": 5.79382994732882e-06,
"loss": 0.3548,
"step": 1540
},
{
"epoch": 1.1662904439428141,
"grad_norm": 11.08897876739502,
"learning_rate": 5.831452219714071e-06,
"loss": 0.3916,
"step": 1550
},
{
"epoch": 1.1738148984198646,
"grad_norm": 11.4227294921875,
"learning_rate": 5.8690744920993236e-06,
"loss": 0.3289,
"step": 1560
},
{
"epoch": 1.181339352896915,
"grad_norm": 8.900651931762695,
"learning_rate": 5.906696764484575e-06,
"loss": 0.3406,
"step": 1570
},
{
"epoch": 1.1888638073739655,
"grad_norm": 14.201620101928711,
"learning_rate": 5.9443190368698275e-06,
"loss": 0.341,
"step": 1580
},
{
"epoch": 1.1963882618510158,
"grad_norm": 12.406390190124512,
"learning_rate": 5.981941309255079e-06,
"loss": 0.3861,
"step": 1590
},
{
"epoch": 1.2039127163280663,
"grad_norm": 16.3964786529541,
"learning_rate": 6.019563581640332e-06,
"loss": 0.3044,
"step": 1600
},
{
"epoch": 1.2114371708051166,
"grad_norm": 7.37337064743042,
"learning_rate": 6.057185854025584e-06,
"loss": 0.3174,
"step": 1610
},
{
"epoch": 1.2189616252821671,
"grad_norm": 16.885831832885742,
"learning_rate": 6.094808126410836e-06,
"loss": 0.4005,
"step": 1620
},
{
"epoch": 1.2264860797592174,
"grad_norm": 10.093635559082031,
"learning_rate": 6.132430398796088e-06,
"loss": 0.3842,
"step": 1630
},
{
"epoch": 1.234010534236268,
"grad_norm": 10.68936824798584,
"learning_rate": 6.17005267118134e-06,
"loss": 0.2717,
"step": 1640
},
{
"epoch": 1.2415349887133182,
"grad_norm": 9.531023025512695,
"learning_rate": 6.2076749435665915e-06,
"loss": 0.32,
"step": 1650
},
{
"epoch": 1.2490594431903688,
"grad_norm": 11.77883529663086,
"learning_rate": 6.245297215951845e-06,
"loss": 0.2916,
"step": 1660
},
{
"epoch": 1.256583897667419,
"grad_norm": 13.740798950195312,
"learning_rate": 6.282919488337096e-06,
"loss": 0.3566,
"step": 1670
},
{
"epoch": 1.2641083521444696,
"grad_norm": 9.122705459594727,
"learning_rate": 6.320541760722349e-06,
"loss": 0.399,
"step": 1680
},
{
"epoch": 1.27163280662152,
"grad_norm": 8.534326553344727,
"learning_rate": 6.3581640331076e-06,
"loss": 0.2988,
"step": 1690
},
{
"epoch": 1.2791572610985704,
"grad_norm": 9.343318939208984,
"learning_rate": 6.3957863054928525e-06,
"loss": 0.2774,
"step": 1700
},
{
"epoch": 1.2866817155756207,
"grad_norm": 13.007920265197754,
"learning_rate": 6.433408577878104e-06,
"loss": 0.3807,
"step": 1710
},
{
"epoch": 1.2942061700526712,
"grad_norm": 12.973888397216797,
"learning_rate": 6.471030850263356e-06,
"loss": 0.3965,
"step": 1720
},
{
"epoch": 1.3017306245297215,
"grad_norm": 5.929915428161621,
"learning_rate": 6.508653122648608e-06,
"loss": 0.3176,
"step": 1730
},
{
"epoch": 1.309255079006772,
"grad_norm": 12.077717781066895,
"learning_rate": 6.546275395033861e-06,
"loss": 0.3405,
"step": 1740
},
{
"epoch": 1.3167795334838224,
"grad_norm": 13.16915225982666,
"learning_rate": 6.583897667419113e-06,
"loss": 0.3968,
"step": 1750
},
{
"epoch": 1.324303987960873,
"grad_norm": 7.098601341247559,
"learning_rate": 6.621519939804365e-06,
"loss": 0.3402,
"step": 1760
},
{
"epoch": 1.3318284424379232,
"grad_norm": 7.718910217285156,
"learning_rate": 6.6591422121896165e-06,
"loss": 0.3193,
"step": 1770
},
{
"epoch": 1.3393528969149737,
"grad_norm": 12.224291801452637,
"learning_rate": 6.696764484574869e-06,
"loss": 0.4085,
"step": 1780
},
{
"epoch": 1.346877351392024,
"grad_norm": 10.205738067626953,
"learning_rate": 6.73438675696012e-06,
"loss": 0.3806,
"step": 1790
},
{
"epoch": 1.3544018058690745,
"grad_norm": 13.36560344696045,
"learning_rate": 6.772009029345374e-06,
"loss": 0.4265,
"step": 1800
},
{
"epoch": 1.3619262603461249,
"grad_norm": 10.773680686950684,
"learning_rate": 6.809631301730625e-06,
"loss": 0.3876,
"step": 1810
},
{
"epoch": 1.3694507148231754,
"grad_norm": 12.094820976257324,
"learning_rate": 6.8472535741158775e-06,
"loss": 0.3982,
"step": 1820
},
{
"epoch": 1.3769751693002257,
"grad_norm": 10.780938148498535,
"learning_rate": 6.884875846501129e-06,
"loss": 0.3569,
"step": 1830
},
{
"epoch": 1.3844996237772762,
"grad_norm": 14.833603858947754,
"learning_rate": 6.922498118886381e-06,
"loss": 0.4042,
"step": 1840
},
{
"epoch": 1.3920240782543265,
"grad_norm": 10.232943534851074,
"learning_rate": 6.960120391271633e-06,
"loss": 0.3469,
"step": 1850
},
{
"epoch": 1.399548532731377,
"grad_norm": 8.74258041381836,
"learning_rate": 6.997742663656886e-06,
"loss": 0.2916,
"step": 1860
},
{
"epoch": 1.4070729872084273,
"grad_norm": 11.700883865356445,
"learning_rate": 7.035364936042137e-06,
"loss": 0.3765,
"step": 1870
},
{
"epoch": 1.4145974416854779,
"grad_norm": 7.85889196395874,
"learning_rate": 7.07298720842739e-06,
"loss": 0.3984,
"step": 1880
},
{
"epoch": 1.4221218961625282,
"grad_norm": 10.270554542541504,
"learning_rate": 7.1106094808126415e-06,
"loss": 0.2839,
"step": 1890
},
{
"epoch": 1.4296463506395787,
"grad_norm": 10.699418067932129,
"learning_rate": 7.148231753197894e-06,
"loss": 0.3338,
"step": 1900
},
{
"epoch": 1.437170805116629,
"grad_norm": 9.80615234375,
"learning_rate": 7.185854025583145e-06,
"loss": 0.299,
"step": 1910
},
{
"epoch": 1.4446952595936795,
"grad_norm": 11.072823524475098,
"learning_rate": 7.223476297968398e-06,
"loss": 0.3196,
"step": 1920
},
{
"epoch": 1.4522197140707298,
"grad_norm": 10.377110481262207,
"learning_rate": 7.261098570353649e-06,
"loss": 0.3172,
"step": 1930
},
{
"epoch": 1.4597441685477803,
"grad_norm": 13.72491455078125,
"learning_rate": 7.2987208427389025e-06,
"loss": 0.3497,
"step": 1940
},
{
"epoch": 1.4672686230248306,
"grad_norm": 7.965284824371338,
"learning_rate": 7.336343115124154e-06,
"loss": 0.3693,
"step": 1950
},
{
"epoch": 1.4747930775018812,
"grad_norm": 11.198545455932617,
"learning_rate": 7.373965387509406e-06,
"loss": 0.3379,
"step": 1960
},
{
"epoch": 1.4823175319789315,
"grad_norm": 8.948925971984863,
"learning_rate": 7.411587659894658e-06,
"loss": 0.3532,
"step": 1970
},
{
"epoch": 1.489841986455982,
"grad_norm": 12.37880802154541,
"learning_rate": 7.44920993227991e-06,
"loss": 0.3462,
"step": 1980
},
{
"epoch": 1.4973664409330323,
"grad_norm": 12.647643089294434,
"learning_rate": 7.486832204665162e-06,
"loss": 0.3667,
"step": 1990
},
{
"epoch": 1.5048908954100828,
"grad_norm": 10.249937057495117,
"learning_rate": 7.524454477050415e-06,
"loss": 0.3679,
"step": 2000
},
{
"epoch": 1.5124153498871333,
"grad_norm": 6.153459548950195,
"learning_rate": 7.5620767494356666e-06,
"loss": 0.3438,
"step": 2010
},
{
"epoch": 1.5199398043641836,
"grad_norm": 10.269697189331055,
"learning_rate": 7.599699021820919e-06,
"loss": 0.317,
"step": 2020
},
{
"epoch": 1.527464258841234,
"grad_norm": 6.430722236633301,
"learning_rate": 7.63732129420617e-06,
"loss": 0.3408,
"step": 2030
},
{
"epoch": 1.5349887133182845,
"grad_norm": 11.511789321899414,
"learning_rate": 7.674943566591424e-06,
"loss": 0.3218,
"step": 2040
},
{
"epoch": 1.542513167795335,
"grad_norm": 7.904773712158203,
"learning_rate": 7.712565838976675e-06,
"loss": 0.3227,
"step": 2050
},
{
"epoch": 1.5500376222723853,
"grad_norm": 8.252975463867188,
"learning_rate": 7.750188111361927e-06,
"loss": 0.3499,
"step": 2060
},
{
"epoch": 1.5575620767494356,
"grad_norm": 13.122828483581543,
"learning_rate": 7.787810383747178e-06,
"loss": 0.3733,
"step": 2070
},
{
"epoch": 1.565086531226486,
"grad_norm": 8.126852035522461,
"learning_rate": 7.825432656132431e-06,
"loss": 0.3419,
"step": 2080
},
{
"epoch": 1.5726109857035366,
"grad_norm": 12.114632606506348,
"learning_rate": 7.863054928517683e-06,
"loss": 0.2854,
"step": 2090
},
{
"epoch": 1.580135440180587,
"grad_norm": 11.291882514953613,
"learning_rate": 7.900677200902936e-06,
"loss": 0.2942,
"step": 2100
},
{
"epoch": 1.5876598946576372,
"grad_norm": 12.11705493927002,
"learning_rate": 7.938299473288188e-06,
"loss": 0.2712,
"step": 2110
},
{
"epoch": 1.5951843491346878,
"grad_norm": 9.345537185668945,
"learning_rate": 7.97592174567344e-06,
"loss": 0.2612,
"step": 2120
},
{
"epoch": 1.6027088036117383,
"grad_norm": 9.715005874633789,
"learning_rate": 8.01354401805869e-06,
"loss": 0.3601,
"step": 2130
},
{
"epoch": 1.6102332580887886,
"grad_norm": 13.192370414733887,
"learning_rate": 8.051166290443944e-06,
"loss": 0.3084,
"step": 2140
},
{
"epoch": 1.617757712565839,
"grad_norm": 7.89211368560791,
"learning_rate": 8.088788562829195e-06,
"loss": 0.3257,
"step": 2150
},
{
"epoch": 1.6252821670428894,
"grad_norm": 12.189757347106934,
"learning_rate": 8.126410835214447e-06,
"loss": 0.3334,
"step": 2160
},
{
"epoch": 1.63280662151994,
"grad_norm": 8.994878768920898,
"learning_rate": 8.164033107599699e-06,
"loss": 0.3087,
"step": 2170
},
{
"epoch": 1.6403310759969902,
"grad_norm": 7.929622173309326,
"learning_rate": 8.201655379984952e-06,
"loss": 0.3207,
"step": 2180
},
{
"epoch": 1.6478555304740405,
"grad_norm": 8.899545669555664,
"learning_rate": 8.239277652370203e-06,
"loss": 0.3533,
"step": 2190
},
{
"epoch": 1.655379984951091,
"grad_norm": 10.590201377868652,
"learning_rate": 8.276899924755456e-06,
"loss": 0.2689,
"step": 2200
},
{
"epoch": 1.6629044394281416,
"grad_norm": 8.942581176757812,
"learning_rate": 8.314522197140708e-06,
"loss": 0.375,
"step": 2210
},
{
"epoch": 1.670428893905192,
"grad_norm": 9.189504623413086,
"learning_rate": 8.35214446952596e-06,
"loss": 0.2767,
"step": 2220
},
{
"epoch": 1.6779533483822422,
"grad_norm": 9.808874130249023,
"learning_rate": 8.389766741911211e-06,
"loss": 0.3574,
"step": 2230
},
{
"epoch": 1.6854778028592927,
"grad_norm": 11.146899223327637,
"learning_rate": 8.427389014296464e-06,
"loss": 0.3606,
"step": 2240
},
{
"epoch": 1.6930022573363432,
"grad_norm": 7.3132147789001465,
"learning_rate": 8.465011286681716e-06,
"loss": 0.3625,
"step": 2250
},
{
"epoch": 1.7005267118133935,
"grad_norm": 8.39673137664795,
"learning_rate": 8.502633559066969e-06,
"loss": 0.303,
"step": 2260
},
{
"epoch": 1.7080511662904438,
"grad_norm": 7.691243648529053,
"learning_rate": 8.54025583145222e-06,
"loss": 0.2824,
"step": 2270
},
{
"epoch": 1.7155756207674944,
"grad_norm": 9.144414901733398,
"learning_rate": 8.577878103837472e-06,
"loss": 0.3247,
"step": 2280
},
{
"epoch": 1.723100075244545,
"grad_norm": 6.159042835235596,
"learning_rate": 8.615500376222724e-06,
"loss": 0.3105,
"step": 2290
},
{
"epoch": 1.7306245297215952,
"grad_norm": 11.039318084716797,
"learning_rate": 8.653122648607977e-06,
"loss": 0.3481,
"step": 2300
},
{
"epoch": 1.7381489841986455,
"grad_norm": 8.701876640319824,
"learning_rate": 8.690744920993228e-06,
"loss": 0.3395,
"step": 2310
},
{
"epoch": 1.745673438675696,
"grad_norm": 9.022965431213379,
"learning_rate": 8.728367193378481e-06,
"loss": 0.2569,
"step": 2320
},
{
"epoch": 1.7531978931527465,
"grad_norm": 13.029145240783691,
"learning_rate": 8.765989465763733e-06,
"loss": 0.3193,
"step": 2330
},
{
"epoch": 1.7607223476297968,
"grad_norm": 10.46054744720459,
"learning_rate": 8.803611738148985e-06,
"loss": 0.3152,
"step": 2340
},
{
"epoch": 1.7682468021068471,
"grad_norm": 8.319682121276855,
"learning_rate": 8.841234010534236e-06,
"loss": 0.3078,
"step": 2350
},
{
"epoch": 1.7757712565838977,
"grad_norm": 10.117603302001953,
"learning_rate": 8.87885628291949e-06,
"loss": 0.2769,
"step": 2360
},
{
"epoch": 1.7832957110609482,
"grad_norm": 10.86999797821045,
"learning_rate": 8.91647855530474e-06,
"loss": 0.3529,
"step": 2370
},
{
"epoch": 1.7908201655379985,
"grad_norm": 9.869336128234863,
"learning_rate": 8.954100827689994e-06,
"loss": 0.2522,
"step": 2380
},
{
"epoch": 1.7983446200150488,
"grad_norm": 11.369247436523438,
"learning_rate": 8.991723100075246e-06,
"loss": 0.2605,
"step": 2390
},
{
"epoch": 1.8058690744920993,
"grad_norm": 7.237309455871582,
"learning_rate": 9.029345372460497e-06,
"loss": 0.2847,
"step": 2400
},
{
"epoch": 1.8133935289691498,
"grad_norm": 8.000275611877441,
"learning_rate": 9.066967644845749e-06,
"loss": 0.3287,
"step": 2410
},
{
"epoch": 1.8209179834462002,
"grad_norm": 9.019206047058105,
"learning_rate": 9.104589917231002e-06,
"loss": 0.3403,
"step": 2420
},
{
"epoch": 1.8284424379232505,
"grad_norm": 8.670087814331055,
"learning_rate": 9.142212189616253e-06,
"loss": 0.3597,
"step": 2430
},
{
"epoch": 1.835966892400301,
"grad_norm": 8.644824028015137,
"learning_rate": 9.179834462001505e-06,
"loss": 0.3055,
"step": 2440
},
{
"epoch": 1.8434913468773515,
"grad_norm": 6.5850324630737305,
"learning_rate": 9.217456734386758e-06,
"loss": 0.3047,
"step": 2450
},
{
"epoch": 1.8510158013544018,
"grad_norm": 7.342716693878174,
"learning_rate": 9.25507900677201e-06,
"loss": 0.3001,
"step": 2460
},
{
"epoch": 1.858540255831452,
"grad_norm": 10.784932136535645,
"learning_rate": 9.292701279157263e-06,
"loss": 0.327,
"step": 2470
},
{
"epoch": 1.8660647103085026,
"grad_norm": 11.683695793151855,
"learning_rate": 9.330323551542514e-06,
"loss": 0.3108,
"step": 2480
},
{
"epoch": 1.8735891647855532,
"grad_norm": 7.701004505157471,
"learning_rate": 9.367945823927766e-06,
"loss": 0.3059,
"step": 2490
},
{
"epoch": 1.8811136192626035,
"grad_norm": 13.68051528930664,
"learning_rate": 9.405568096313017e-06,
"loss": 0.3088,
"step": 2500
},
{
"epoch": 1.8886380737396538,
"grad_norm": 8.20069408416748,
"learning_rate": 9.44319036869827e-06,
"loss": 0.292,
"step": 2510
},
{
"epoch": 1.8961625282167043,
"grad_norm": 9.032533645629883,
"learning_rate": 9.480812641083522e-06,
"loss": 0.3392,
"step": 2520
},
{
"epoch": 1.9036869826937548,
"grad_norm": 8.278692245483398,
"learning_rate": 9.518434913468775e-06,
"loss": 0.2788,
"step": 2530
},
{
"epoch": 1.911211437170805,
"grad_norm": 8.68079948425293,
"learning_rate": 9.556057185854027e-06,
"loss": 0.3204,
"step": 2540
},
{
"epoch": 1.9187358916478554,
"grad_norm": 6.456122875213623,
"learning_rate": 9.593679458239278e-06,
"loss": 0.2492,
"step": 2550
},
{
"epoch": 1.926260346124906,
"grad_norm": 9.514248847961426,
"learning_rate": 9.63130173062453e-06,
"loss": 0.3392,
"step": 2560
},
{
"epoch": 1.9337848006019565,
"grad_norm": 11.629880905151367,
"learning_rate": 9.668924003009783e-06,
"loss": 0.3183,
"step": 2570
},
{
"epoch": 1.9413092550790068,
"grad_norm": 9.674823760986328,
"learning_rate": 9.706546275395035e-06,
"loss": 0.3305,
"step": 2580
},
{
"epoch": 1.948833709556057,
"grad_norm": 12.23519515991211,
"learning_rate": 9.744168547780288e-06,
"loss": 0.3487,
"step": 2590
},
{
"epoch": 1.9563581640331076,
"grad_norm": 10.810482025146484,
"learning_rate": 9.78179082016554e-06,
"loss": 0.3323,
"step": 2600
},
{
"epoch": 1.963882618510158,
"grad_norm": 11.973986625671387,
"learning_rate": 9.81941309255079e-06,
"loss": 0.2864,
"step": 2610
},
{
"epoch": 1.9714070729872084,
"grad_norm": 13.691604614257812,
"learning_rate": 9.857035364936042e-06,
"loss": 0.3814,
"step": 2620
},
{
"epoch": 1.9789315274642587,
"grad_norm": 8.417920112609863,
"learning_rate": 9.894657637321296e-06,
"loss": 0.249,
"step": 2630
},
{
"epoch": 1.9864559819413092,
"grad_norm": 7.412376880645752,
"learning_rate": 9.932279909706547e-06,
"loss": 0.3331,
"step": 2640
},
{
"epoch": 1.9939804364183598,
"grad_norm": 10.459517478942871,
"learning_rate": 9.9699021820918e-06,
"loss": 0.3042,
"step": 2650
},
{
"epoch": 2.00150489089541,
"grad_norm": 13.24636459350586,
"learning_rate": 9.999163949502551e-06,
"loss": 0.3484,
"step": 2660
},
{
"epoch": 2.0090293453724604,
"grad_norm": 9.26960277557373,
"learning_rate": 9.9949836970153e-06,
"loss": 0.3132,
"step": 2670
},
{
"epoch": 2.016553799849511,
"grad_norm": 7.454202175140381,
"learning_rate": 9.99080344452805e-06,
"loss": 0.3403,
"step": 2680
},
{
"epoch": 2.0240782543265614,
"grad_norm": 6.4221391677856445,
"learning_rate": 9.9866231920408e-06,
"loss": 0.2614,
"step": 2690
},
{
"epoch": 2.0316027088036117,
"grad_norm": 9.066787719726562,
"learning_rate": 9.982442939553549e-06,
"loss": 0.3235,
"step": 2700
},
{
"epoch": 2.039127163280662,
"grad_norm": 12.979838371276855,
"learning_rate": 9.9782626870663e-06,
"loss": 0.2818,
"step": 2710
},
{
"epoch": 2.0466516177577128,
"grad_norm": 8.948103904724121,
"learning_rate": 9.97408243457905e-06,
"loss": 0.2811,
"step": 2720
},
{
"epoch": 2.054176072234763,
"grad_norm": 4.697547912597656,
"learning_rate": 9.9699021820918e-06,
"loss": 0.2272,
"step": 2730
},
{
"epoch": 2.0617005267118134,
"grad_norm": 10.13204288482666,
"learning_rate": 9.965721929604549e-06,
"loss": 0.3073,
"step": 2740
},
{
"epoch": 2.0692249811888637,
"grad_norm": 10.628745079040527,
"learning_rate": 9.9615416771173e-06,
"loss": 0.2673,
"step": 2750
},
{
"epoch": 2.0767494356659144,
"grad_norm": 9.946393966674805,
"learning_rate": 9.957361424630048e-06,
"loss": 0.3093,
"step": 2760
},
{
"epoch": 2.0842738901429647,
"grad_norm": 9.905533790588379,
"learning_rate": 9.953181172142798e-06,
"loss": 0.2494,
"step": 2770
},
{
"epoch": 2.091798344620015,
"grad_norm": 12.323580741882324,
"learning_rate": 9.949000919655549e-06,
"loss": 0.2977,
"step": 2780
},
{
"epoch": 2.0993227990970653,
"grad_norm": 12.842970848083496,
"learning_rate": 9.944820667168298e-06,
"loss": 0.2829,
"step": 2790
},
{
"epoch": 2.106847253574116,
"grad_norm": 8.416594505310059,
"learning_rate": 9.940640414681048e-06,
"loss": 0.2974,
"step": 2800
},
{
"epoch": 2.1143717080511664,
"grad_norm": 8.839917182922363,
"learning_rate": 9.936460162193797e-06,
"loss": 0.2905,
"step": 2810
},
{
"epoch": 2.1218961625282167,
"grad_norm": 9.319620132446289,
"learning_rate": 9.932279909706547e-06,
"loss": 0.2812,
"step": 2820
},
{
"epoch": 2.129420617005267,
"grad_norm": 6.369785785675049,
"learning_rate": 9.928099657219297e-06,
"loss": 0.2398,
"step": 2830
},
{
"epoch": 2.1369450714823177,
"grad_norm": 9.347760200500488,
"learning_rate": 9.923919404732046e-06,
"loss": 0.2982,
"step": 2840
},
{
"epoch": 2.144469525959368,
"grad_norm": 10.478535652160645,
"learning_rate": 9.919739152244797e-06,
"loss": 0.3347,
"step": 2850
},
{
"epoch": 2.1519939804364183,
"grad_norm": 9.874757766723633,
"learning_rate": 9.915558899757545e-06,
"loss": 0.2499,
"step": 2860
},
{
"epoch": 2.1595184349134686,
"grad_norm": 10.045086860656738,
"learning_rate": 9.911378647270296e-06,
"loss": 0.2496,
"step": 2870
},
{
"epoch": 2.1670428893905194,
"grad_norm": 10.65018367767334,
"learning_rate": 9.907198394783046e-06,
"loss": 0.2742,
"step": 2880
},
{
"epoch": 2.1745673438675697,
"grad_norm": 7.430573463439941,
"learning_rate": 9.903018142295795e-06,
"loss": 0.2774,
"step": 2890
},
{
"epoch": 2.18209179834462,
"grad_norm": 6.933887958526611,
"learning_rate": 9.898837889808545e-06,
"loss": 0.2552,
"step": 2900
},
{
"epoch": 2.1896162528216703,
"grad_norm": 6.639419078826904,
"learning_rate": 9.894657637321296e-06,
"loss": 0.3397,
"step": 2910
},
{
"epoch": 2.197140707298721,
"grad_norm": 10.355103492736816,
"learning_rate": 9.890477384834046e-06,
"loss": 0.3017,
"step": 2920
},
{
"epoch": 2.2046651617757713,
"grad_norm": 10.879512786865234,
"learning_rate": 9.886297132346795e-06,
"loss": 0.3511,
"step": 2930
},
{
"epoch": 2.2121896162528216,
"grad_norm": 10.200553894042969,
"learning_rate": 9.882116879859545e-06,
"loss": 0.2442,
"step": 2940
},
{
"epoch": 2.219714070729872,
"grad_norm": 9.567547798156738,
"learning_rate": 9.877936627372294e-06,
"loss": 0.3113,
"step": 2950
},
{
"epoch": 2.2272385252069227,
"grad_norm": 8.075740814208984,
"learning_rate": 9.873756374885044e-06,
"loss": 0.2494,
"step": 2960
},
{
"epoch": 2.234762979683973,
"grad_norm": 10.302295684814453,
"learning_rate": 9.869576122397795e-06,
"loss": 0.2663,
"step": 2970
},
{
"epoch": 2.2422874341610233,
"grad_norm": 8.013357162475586,
"learning_rate": 9.865395869910543e-06,
"loss": 0.3004,
"step": 2980
},
{
"epoch": 2.2498118886380736,
"grad_norm": 6.954658031463623,
"learning_rate": 9.861215617423294e-06,
"loss": 0.3297,
"step": 2990
},
{
"epoch": 2.2573363431151243,
"grad_norm": 9.596476554870605,
"learning_rate": 9.857035364936042e-06,
"loss": 0.3145,
"step": 3000
},
{
"epoch": 2.2648607975921746,
"grad_norm": 7.96026086807251,
"learning_rate": 9.852855112448793e-06,
"loss": 0.282,
"step": 3010
},
{
"epoch": 2.272385252069225,
"grad_norm": 12.317435264587402,
"learning_rate": 9.848674859961541e-06,
"loss": 0.3573,
"step": 3020
},
{
"epoch": 2.2799097065462752,
"grad_norm": 6.821436405181885,
"learning_rate": 9.844494607474292e-06,
"loss": 0.2626,
"step": 3030
},
{
"epoch": 2.287434161023326,
"grad_norm": 12.15832233428955,
"learning_rate": 9.840314354987042e-06,
"loss": 0.2442,
"step": 3040
},
{
"epoch": 2.2949586155003763,
"grad_norm": 9.358490943908691,
"learning_rate": 9.836134102499791e-06,
"loss": 0.2295,
"step": 3050
},
{
"epoch": 2.3024830699774266,
"grad_norm": 6.147063255310059,
"learning_rate": 9.831953850012541e-06,
"loss": 0.2546,
"step": 3060
},
{
"epoch": 2.310007524454477,
"grad_norm": 9.920206069946289,
"learning_rate": 9.82777359752529e-06,
"loss": 0.3145,
"step": 3070
},
{
"epoch": 2.3175319789315276,
"grad_norm": 7.524102210998535,
"learning_rate": 9.823593345038042e-06,
"loss": 0.2276,
"step": 3080
},
{
"epoch": 2.325056433408578,
"grad_norm": 10.635570526123047,
"learning_rate": 9.81941309255079e-06,
"loss": 0.2949,
"step": 3090
},
{
"epoch": 2.3325808878856282,
"grad_norm": 11.139577865600586,
"learning_rate": 9.815232840063541e-06,
"loss": 0.2903,
"step": 3100
},
{
"epoch": 2.3401053423626785,
"grad_norm": 9.326565742492676,
"learning_rate": 9.81105258757629e-06,
"loss": 0.2521,
"step": 3110
},
{
"epoch": 2.3476297968397293,
"grad_norm": 11.433185577392578,
"learning_rate": 9.80687233508904e-06,
"loss": 0.3183,
"step": 3120
},
{
"epoch": 2.3551542513167796,
"grad_norm": 11.322211265563965,
"learning_rate": 9.80269208260179e-06,
"loss": 0.343,
"step": 3130
},
{
"epoch": 2.36267870579383,
"grad_norm": 11.234041213989258,
"learning_rate": 9.79851183011454e-06,
"loss": 0.3125,
"step": 3140
},
{
"epoch": 2.37020316027088,
"grad_norm": 10.041305541992188,
"learning_rate": 9.79433157762729e-06,
"loss": 0.3574,
"step": 3150
},
{
"epoch": 2.377727614747931,
"grad_norm": 9.322949409484863,
"learning_rate": 9.790151325140038e-06,
"loss": 0.2861,
"step": 3160
},
{
"epoch": 2.3852520692249812,
"grad_norm": 6.304590225219727,
"learning_rate": 9.785971072652789e-06,
"loss": 0.3074,
"step": 3170
},
{
"epoch": 2.3927765237020315,
"grad_norm": 9.552847862243652,
"learning_rate": 9.78179082016554e-06,
"loss": 0.308,
"step": 3180
},
{
"epoch": 2.400300978179082,
"grad_norm": 10.4456148147583,
"learning_rate": 9.777610567678288e-06,
"loss": 0.3444,
"step": 3190
},
{
"epoch": 2.4078254326561326,
"grad_norm": 9.106879234313965,
"learning_rate": 9.773430315191038e-06,
"loss": 0.2704,
"step": 3200
},
{
"epoch": 2.415349887133183,
"grad_norm": 9.76545524597168,
"learning_rate": 9.769250062703787e-06,
"loss": 0.2425,
"step": 3210
},
{
"epoch": 2.422874341610233,
"grad_norm": 9.640020370483398,
"learning_rate": 9.765069810216537e-06,
"loss": 0.3644,
"step": 3220
},
{
"epoch": 2.4303987960872835,
"grad_norm": 9.917800903320312,
"learning_rate": 9.760889557729288e-06,
"loss": 0.2608,
"step": 3230
},
{
"epoch": 2.4379232505643342,
"grad_norm": 8.935245513916016,
"learning_rate": 9.756709305242037e-06,
"loss": 0.3022,
"step": 3240
},
{
"epoch": 2.4454477050413845,
"grad_norm": 13.130419731140137,
"learning_rate": 9.752529052754787e-06,
"loss": 0.3011,
"step": 3250
},
{
"epoch": 2.452972159518435,
"grad_norm": 7.8220906257629395,
"learning_rate": 9.748348800267537e-06,
"loss": 0.3396,
"step": 3260
},
{
"epoch": 2.460496613995485,
"grad_norm": 6.676682472229004,
"learning_rate": 9.744168547780288e-06,
"loss": 0.2493,
"step": 3270
},
{
"epoch": 2.468021068472536,
"grad_norm": 10.557263374328613,
"learning_rate": 9.739988295293036e-06,
"loss": 0.2786,
"step": 3280
},
{
"epoch": 2.475545522949586,
"grad_norm": 11.085116386413574,
"learning_rate": 9.735808042805787e-06,
"loss": 0.2823,
"step": 3290
},
{
"epoch": 2.4830699774266365,
"grad_norm": 12.245048522949219,
"learning_rate": 9.731627790318536e-06,
"loss": 0.2572,
"step": 3300
},
{
"epoch": 2.490594431903687,
"grad_norm": 10.905339241027832,
"learning_rate": 9.727447537831286e-06,
"loss": 0.2343,
"step": 3310
},
{
"epoch": 2.4981188863807375,
"grad_norm": 10.4273681640625,
"learning_rate": 9.723267285344036e-06,
"loss": 0.2734,
"step": 3320
},
{
"epoch": 2.505643340857788,
"grad_norm": 9.312917709350586,
"learning_rate": 9.719087032856785e-06,
"loss": 0.2631,
"step": 3330
},
{
"epoch": 2.513167795334838,
"grad_norm": 4.157855033874512,
"learning_rate": 9.714906780369535e-06,
"loss": 0.2968,
"step": 3340
},
{
"epoch": 2.520692249811889,
"grad_norm": 10.894584655761719,
"learning_rate": 9.710726527882284e-06,
"loss": 0.3412,
"step": 3350
},
{
"epoch": 2.528216704288939,
"grad_norm": 10.197948455810547,
"learning_rate": 9.706546275395035e-06,
"loss": 0.301,
"step": 3360
},
{
"epoch": 2.5357411587659895,
"grad_norm": 10.176607131958008,
"learning_rate": 9.702366022907785e-06,
"loss": 0.289,
"step": 3370
},
{
"epoch": 2.54326561324304,
"grad_norm": 10.578222274780273,
"learning_rate": 9.698185770420534e-06,
"loss": 0.245,
"step": 3380
},
{
"epoch": 2.55079006772009,
"grad_norm": 7.295693874359131,
"learning_rate": 9.694005517933284e-06,
"loss": 0.2981,
"step": 3390
},
{
"epoch": 2.558314522197141,
"grad_norm": 8.082398414611816,
"learning_rate": 9.689825265446033e-06,
"loss": 0.2726,
"step": 3400
},
{
"epoch": 2.565838976674191,
"grad_norm": 7.935963153839111,
"learning_rate": 9.685645012958783e-06,
"loss": 0.2706,
"step": 3410
},
{
"epoch": 2.5733634311512414,
"grad_norm": 10.23719596862793,
"learning_rate": 9.681464760471534e-06,
"loss": 0.2492,
"step": 3420
},
{
"epoch": 2.580887885628292,
"grad_norm": 8.610269546508789,
"learning_rate": 9.677284507984282e-06,
"loss": 0.2644,
"step": 3430
},
{
"epoch": 2.5884123401053425,
"grad_norm": 8.948534965515137,
"learning_rate": 9.673104255497033e-06,
"loss": 0.2806,
"step": 3440
},
{
"epoch": 2.595936794582393,
"grad_norm": 10.290976524353027,
"learning_rate": 9.668924003009783e-06,
"loss": 0.2455,
"step": 3450
},
{
"epoch": 2.603461249059443,
"grad_norm": 6.816092014312744,
"learning_rate": 9.664743750522533e-06,
"loss": 0.2674,
"step": 3460
},
{
"epoch": 2.6109857035364934,
"grad_norm": 6.8829264640808105,
"learning_rate": 9.660563498035282e-06,
"loss": 0.2256,
"step": 3470
},
{
"epoch": 2.618510158013544,
"grad_norm": 10.476147651672363,
"learning_rate": 9.656383245548033e-06,
"loss": 0.239,
"step": 3480
},
{
"epoch": 2.6260346124905944,
"grad_norm": 10.579778671264648,
"learning_rate": 9.652202993060781e-06,
"loss": 0.3197,
"step": 3490
},
{
"epoch": 2.6335590669676447,
"grad_norm": 7.390209197998047,
"learning_rate": 9.648022740573532e-06,
"loss": 0.2487,
"step": 3500
},
{
"epoch": 2.6410835214446955,
"grad_norm": 7.1727447509765625,
"learning_rate": 9.643842488086282e-06,
"loss": 0.276,
"step": 3510
},
{
"epoch": 2.648607975921746,
"grad_norm": 10.80611515045166,
"learning_rate": 9.63966223559903e-06,
"loss": 0.2532,
"step": 3520
},
{
"epoch": 2.656132430398796,
"grad_norm": 5.822349548339844,
"learning_rate": 9.635481983111781e-06,
"loss": 0.2249,
"step": 3530
},
{
"epoch": 2.6636568848758464,
"grad_norm": 8.362287521362305,
"learning_rate": 9.63130173062453e-06,
"loss": 0.2445,
"step": 3540
},
{
"epoch": 2.6711813393528967,
"grad_norm": 10.724336624145508,
"learning_rate": 9.62712147813728e-06,
"loss": 0.3065,
"step": 3550
},
{
"epoch": 2.6787057938299474,
"grad_norm": 7.8322601318359375,
"learning_rate": 9.62294122565003e-06,
"loss": 0.2532,
"step": 3560
},
{
"epoch": 2.6862302483069977,
"grad_norm": 8.452668190002441,
"learning_rate": 9.61876097316278e-06,
"loss": 0.2857,
"step": 3570
},
{
"epoch": 2.693754702784048,
"grad_norm": 7.749220848083496,
"learning_rate": 9.61458072067553e-06,
"loss": 0.2676,
"step": 3580
},
{
"epoch": 2.701279157261099,
"grad_norm": 5.934022426605225,
"learning_rate": 9.610400468188278e-06,
"loss": 0.2524,
"step": 3590
},
{
"epoch": 2.708803611738149,
"grad_norm": 8.625337600708008,
"learning_rate": 9.606220215701029e-06,
"loss": 0.3011,
"step": 3600
},
{
"epoch": 2.7163280662151994,
"grad_norm": 10.652859687805176,
"learning_rate": 9.60203996321378e-06,
"loss": 0.2581,
"step": 3610
},
{
"epoch": 2.7238525206922497,
"grad_norm": 6.213090896606445,
"learning_rate": 9.59785971072653e-06,
"loss": 0.2808,
"step": 3620
},
{
"epoch": 2.7313769751693,
"grad_norm": 10.768575668334961,
"learning_rate": 9.593679458239278e-06,
"loss": 0.3132,
"step": 3630
},
{
"epoch": 2.7389014296463507,
"grad_norm": 9.935160636901855,
"learning_rate": 9.589499205752029e-06,
"loss": 0.2761,
"step": 3640
},
{
"epoch": 2.746425884123401,
"grad_norm": 10.432900428771973,
"learning_rate": 9.585318953264779e-06,
"loss": 0.2838,
"step": 3650
},
{
"epoch": 2.7539503386004514,
"grad_norm": 7.264293670654297,
"learning_rate": 9.581138700777528e-06,
"loss": 0.267,
"step": 3660
},
{
"epoch": 2.761474793077502,
"grad_norm": 10.48466968536377,
"learning_rate": 9.576958448290278e-06,
"loss": 0.2085,
"step": 3670
},
{
"epoch": 2.7689992475545524,
"grad_norm": 10.649408340454102,
"learning_rate": 9.572778195803027e-06,
"loss": 0.3237,
"step": 3680
},
{
"epoch": 2.7765237020316027,
"grad_norm": 7.883304119110107,
"learning_rate": 9.568597943315777e-06,
"loss": 0.2566,
"step": 3690
},
{
"epoch": 2.784048156508653,
"grad_norm": 8.996721267700195,
"learning_rate": 9.564417690828528e-06,
"loss": 0.243,
"step": 3700
},
{
"epoch": 2.7915726109857033,
"grad_norm": 10.833441734313965,
"learning_rate": 9.560237438341276e-06,
"loss": 0.2329,
"step": 3710
},
{
"epoch": 2.799097065462754,
"grad_norm": 8.895788192749023,
"learning_rate": 9.556057185854027e-06,
"loss": 0.2802,
"step": 3720
},
{
"epoch": 2.8066215199398044,
"grad_norm": 10.032631874084473,
"learning_rate": 9.551876933366775e-06,
"loss": 0.2527,
"step": 3730
},
{
"epoch": 2.8141459744168547,
"grad_norm": 11.826578140258789,
"learning_rate": 9.547696680879526e-06,
"loss": 0.2601,
"step": 3740
},
{
"epoch": 2.8216704288939054,
"grad_norm": 9.224230766296387,
"learning_rate": 9.543516428392276e-06,
"loss": 0.2083,
"step": 3750
},
{
"epoch": 2.8291948833709557,
"grad_norm": 8.846405029296875,
"learning_rate": 9.539336175905025e-06,
"loss": 0.2695,
"step": 3760
},
{
"epoch": 2.836719337848006,
"grad_norm": 8.925997734069824,
"learning_rate": 9.535155923417775e-06,
"loss": 0.2629,
"step": 3770
},
{
"epoch": 2.8442437923250563,
"grad_norm": 7.406662940979004,
"learning_rate": 9.530975670930524e-06,
"loss": 0.2627,
"step": 3780
},
{
"epoch": 2.8517682468021066,
"grad_norm": 8.157476425170898,
"learning_rate": 9.526795418443274e-06,
"loss": 0.2843,
"step": 3790
},
{
"epoch": 2.8592927012791574,
"grad_norm": 8.885982513427734,
"learning_rate": 9.522615165956023e-06,
"loss": 0.2227,
"step": 3800
},
{
"epoch": 2.8668171557562077,
"grad_norm": 10.690934181213379,
"learning_rate": 9.518434913468775e-06,
"loss": 0.2469,
"step": 3810
},
{
"epoch": 2.874341610233258,
"grad_norm": 7.025122165679932,
"learning_rate": 9.514254660981524e-06,
"loss": 0.2279,
"step": 3820
},
{
"epoch": 2.8818660647103087,
"grad_norm": 10.257140159606934,
"learning_rate": 9.510074408494274e-06,
"loss": 0.2779,
"step": 3830
},
{
"epoch": 2.889390519187359,
"grad_norm": 8.374255180358887,
"learning_rate": 9.505894156007025e-06,
"loss": 0.2543,
"step": 3840
},
{
"epoch": 2.8969149736644093,
"grad_norm": 9.439533233642578,
"learning_rate": 9.501713903519773e-06,
"loss": 0.2717,
"step": 3850
},
{
"epoch": 2.9044394281414596,
"grad_norm": 6.661174774169922,
"learning_rate": 9.497533651032524e-06,
"loss": 0.2418,
"step": 3860
},
{
"epoch": 2.91196388261851,
"grad_norm": 9.886220932006836,
"learning_rate": 9.493353398545273e-06,
"loss": 0.2159,
"step": 3870
},
{
"epoch": 2.9194883370955607,
"grad_norm": 6.797257423400879,
"learning_rate": 9.489173146058023e-06,
"loss": 0.1922,
"step": 3880
},
{
"epoch": 2.927012791572611,
"grad_norm": 7.842899322509766,
"learning_rate": 9.484992893570772e-06,
"loss": 0.2535,
"step": 3890
},
{
"epoch": 2.9345372460496613,
"grad_norm": 9.077611923217773,
"learning_rate": 9.480812641083522e-06,
"loss": 0.2766,
"step": 3900
},
{
"epoch": 2.942061700526712,
"grad_norm": 8.896077156066895,
"learning_rate": 9.476632388596272e-06,
"loss": 0.2413,
"step": 3910
},
{
"epoch": 2.9495861550037623,
"grad_norm": 3.8827147483825684,
"learning_rate": 9.472452136109021e-06,
"loss": 0.2291,
"step": 3920
},
{
"epoch": 2.9571106094808126,
"grad_norm": 8.659483909606934,
"learning_rate": 9.468271883621772e-06,
"loss": 0.2654,
"step": 3930
},
{
"epoch": 2.964635063957863,
"grad_norm": 8.626335144042969,
"learning_rate": 9.46409163113452e-06,
"loss": 0.244,
"step": 3940
},
{
"epoch": 2.972159518434913,
"grad_norm": 11.298845291137695,
"learning_rate": 9.45991137864727e-06,
"loss": 0.298,
"step": 3950
},
{
"epoch": 2.979683972911964,
"grad_norm": 6.526998519897461,
"learning_rate": 9.455731126160021e-06,
"loss": 0.1483,
"step": 3960
},
{
"epoch": 2.9872084273890143,
"grad_norm": 5.75825309753418,
"learning_rate": 9.45155087367277e-06,
"loss": 0.215,
"step": 3970
},
{
"epoch": 2.9947328818660646,
"grad_norm": 6.820727348327637,
"learning_rate": 9.44737062118552e-06,
"loss": 0.2889,
"step": 3980
},
{
"epoch": 3.0022573363431153,
"grad_norm": 6.299858093261719,
"learning_rate": 9.44319036869827e-06,
"loss": 0.1838,
"step": 3990
},
{
"epoch": 3.0097817908201656,
"grad_norm": 8.910431861877441,
"learning_rate": 9.439010116211021e-06,
"loss": 0.265,
"step": 4000
},
{
"epoch": 3.017306245297216,
"grad_norm": 6.495157718658447,
"learning_rate": 9.43482986372377e-06,
"loss": 0.2014,
"step": 4010
},
{
"epoch": 3.024830699774266,
"grad_norm": 8.76952075958252,
"learning_rate": 9.43064961123652e-06,
"loss": 0.2443,
"step": 4020
},
{
"epoch": 3.032355154251317,
"grad_norm": 11.289894104003906,
"learning_rate": 9.426469358749269e-06,
"loss": 0.2421,
"step": 4030
},
{
"epoch": 3.0398796087283673,
"grad_norm": 9.681814193725586,
"learning_rate": 9.422289106262019e-06,
"loss": 0.2,
"step": 4040
},
{
"epoch": 3.0474040632054176,
"grad_norm": 9.460895538330078,
"learning_rate": 9.41810885377477e-06,
"loss": 0.22,
"step": 4050
},
{
"epoch": 3.054928517682468,
"grad_norm": 8.664941787719727,
"learning_rate": 9.413928601287518e-06,
"loss": 0.241,
"step": 4060
},
{
"epoch": 3.0624529721595186,
"grad_norm": 8.543761253356934,
"learning_rate": 9.409748348800269e-06,
"loss": 0.2509,
"step": 4070
},
{
"epoch": 3.069977426636569,
"grad_norm": 8.258243560791016,
"learning_rate": 9.405568096313017e-06,
"loss": 0.2447,
"step": 4080
},
{
"epoch": 3.077501881113619,
"grad_norm": 11.222923278808594,
"learning_rate": 9.401387843825768e-06,
"loss": 0.2414,
"step": 4090
},
{
"epoch": 3.0850263355906695,
"grad_norm": 10.979453086853027,
"learning_rate": 9.397207591338518e-06,
"loss": 0.2449,
"step": 4100
},
{
"epoch": 3.0925507900677203,
"grad_norm": 7.181826591491699,
"learning_rate": 9.393027338851267e-06,
"loss": 0.2719,
"step": 4110
},
{
"epoch": 3.1000752445447706,
"grad_norm": 7.384873867034912,
"learning_rate": 9.388847086364017e-06,
"loss": 0.2834,
"step": 4120
},
{
"epoch": 3.107599699021821,
"grad_norm": 6.647815704345703,
"learning_rate": 9.384666833876766e-06,
"loss": 0.2227,
"step": 4130
},
{
"epoch": 3.115124153498871,
"grad_norm": 6.282780170440674,
"learning_rate": 9.380486581389516e-06,
"loss": 0.2574,
"step": 4140
},
{
"epoch": 3.122648607975922,
"grad_norm": 10.085737228393555,
"learning_rate": 9.376306328902267e-06,
"loss": 0.292,
"step": 4150
},
{
"epoch": 3.130173062452972,
"grad_norm": 10.34396743774414,
"learning_rate": 9.372126076415015e-06,
"loss": 0.2346,
"step": 4160
},
{
"epoch": 3.1376975169300225,
"grad_norm": 11.896784782409668,
"learning_rate": 9.367945823927766e-06,
"loss": 0.2423,
"step": 4170
},
{
"epoch": 3.145221971407073,
"grad_norm": 10.978114128112793,
"learning_rate": 9.363765571440516e-06,
"loss": 0.2671,
"step": 4180
},
{
"epoch": 3.1527464258841236,
"grad_norm": 6.393170356750488,
"learning_rate": 9.359585318953267e-06,
"loss": 0.1836,
"step": 4190
},
{
"epoch": 3.160270880361174,
"grad_norm": 6.737048149108887,
"learning_rate": 9.355405066466015e-06,
"loss": 0.1594,
"step": 4200
},
{
"epoch": 3.167795334838224,
"grad_norm": 6.920555591583252,
"learning_rate": 9.351224813978766e-06,
"loss": 0.2712,
"step": 4210
},
{
"epoch": 3.1753197893152745,
"grad_norm": 6.888144493103027,
"learning_rate": 9.347044561491514e-06,
"loss": 0.2113,
"step": 4220
},
{
"epoch": 3.1828442437923252,
"grad_norm": 7.4439167976379395,
"learning_rate": 9.342864309004265e-06,
"loss": 0.2273,
"step": 4230
},
{
"epoch": 3.1903686982693755,
"grad_norm": 5.45890998840332,
"learning_rate": 9.338684056517015e-06,
"loss": 0.2555,
"step": 4240
},
{
"epoch": 3.197893152746426,
"grad_norm": 7.8858208656311035,
"learning_rate": 9.334503804029764e-06,
"loss": 0.2308,
"step": 4250
},
{
"epoch": 3.205417607223476,
"grad_norm": 8.43179702758789,
"learning_rate": 9.330323551542514e-06,
"loss": 0.2704,
"step": 4260
},
{
"epoch": 3.212942061700527,
"grad_norm": 9.604415893554688,
"learning_rate": 9.326143299055263e-06,
"loss": 0.2345,
"step": 4270
},
{
"epoch": 3.220466516177577,
"grad_norm": 5.360594749450684,
"learning_rate": 9.321963046568013e-06,
"loss": 0.2418,
"step": 4280
},
{
"epoch": 3.2279909706546275,
"grad_norm": 10.218423843383789,
"learning_rate": 9.317782794080764e-06,
"loss": 0.2553,
"step": 4290
},
{
"epoch": 3.235515425131678,
"grad_norm": 6.115988254547119,
"learning_rate": 9.313602541593512e-06,
"loss": 0.2389,
"step": 4300
},
{
"epoch": 3.2430398796087285,
"grad_norm": 8.351349830627441,
"learning_rate": 9.309422289106263e-06,
"loss": 0.2156,
"step": 4310
},
{
"epoch": 3.250564334085779,
"grad_norm": 8.038823127746582,
"learning_rate": 9.305242036619012e-06,
"loss": 0.1934,
"step": 4320
},
{
"epoch": 3.258088788562829,
"grad_norm": 7.686810493469238,
"learning_rate": 9.301061784131762e-06,
"loss": 0.2239,
"step": 4330
},
{
"epoch": 3.2656132430398794,
"grad_norm": 9.159664154052734,
"learning_rate": 9.296881531644512e-06,
"loss": 0.2275,
"step": 4340
},
{
"epoch": 3.27313769751693,
"grad_norm": 8.89784049987793,
"learning_rate": 9.292701279157263e-06,
"loss": 0.2308,
"step": 4350
},
{
"epoch": 3.2806621519939805,
"grad_norm": 6.140933990478516,
"learning_rate": 9.288521026670011e-06,
"loss": 0.2337,
"step": 4360
},
{
"epoch": 3.288186606471031,
"grad_norm": 9.233074188232422,
"learning_rate": 9.284340774182762e-06,
"loss": 0.263,
"step": 4370
},
{
"epoch": 3.295711060948081,
"grad_norm": 9.521575927734375,
"learning_rate": 9.280160521695512e-06,
"loss": 0.2321,
"step": 4380
},
{
"epoch": 3.303235515425132,
"grad_norm": 6.050286769866943,
"learning_rate": 9.275980269208261e-06,
"loss": 0.2889,
"step": 4390
},
{
"epoch": 3.310759969902182,
"grad_norm": 5.523685455322266,
"learning_rate": 9.271800016721011e-06,
"loss": 0.2682,
"step": 4400
},
{
"epoch": 3.3182844243792324,
"grad_norm": 10.648170471191406,
"learning_rate": 9.26761976423376e-06,
"loss": 0.238,
"step": 4410
},
{
"epoch": 3.3258088788562827,
"grad_norm": 8.683711051940918,
"learning_rate": 9.26343951174651e-06,
"loss": 0.1859,
"step": 4420
},
{
"epoch": 3.3333333333333335,
"grad_norm": 9.345295906066895,
"learning_rate": 9.25925925925926e-06,
"loss": 0.2727,
"step": 4430
},
{
"epoch": 3.340857787810384,
"grad_norm": 7.622416973114014,
"learning_rate": 9.25507900677201e-06,
"loss": 0.1864,
"step": 4440
},
{
"epoch": 3.348382242287434,
"grad_norm": 7.486326694488525,
"learning_rate": 9.25089875428476e-06,
"loss": 0.3072,
"step": 4450
},
{
"epoch": 3.3559066967644844,
"grad_norm": 8.330912590026855,
"learning_rate": 9.246718501797509e-06,
"loss": 0.2326,
"step": 4460
},
{
"epoch": 3.363431151241535,
"grad_norm": 6.218571662902832,
"learning_rate": 9.242538249310259e-06,
"loss": 0.2677,
"step": 4470
},
{
"epoch": 3.3709556057185854,
"grad_norm": 9.183655738830566,
"learning_rate": 9.23835799682301e-06,
"loss": 0.1941,
"step": 4480
},
{
"epoch": 3.3784800601956357,
"grad_norm": 5.508805274963379,
"learning_rate": 9.234177744335758e-06,
"loss": 0.1838,
"step": 4490
},
{
"epoch": 3.386004514672686,
"grad_norm": 7.009641647338867,
"learning_rate": 9.229997491848509e-06,
"loss": 0.2221,
"step": 4500
},
{
"epoch": 3.393528969149737,
"grad_norm": 8.951751708984375,
"learning_rate": 9.225817239361257e-06,
"loss": 0.2573,
"step": 4510
},
{
"epoch": 3.401053423626787,
"grad_norm": 11.373517990112305,
"learning_rate": 9.221636986874008e-06,
"loss": 0.1813,
"step": 4520
},
{
"epoch": 3.4085778781038374,
"grad_norm": 11.15352725982666,
"learning_rate": 9.217456734386758e-06,
"loss": 0.2251,
"step": 4530
},
{
"epoch": 3.4161023325808877,
"grad_norm": 9.159383773803711,
"learning_rate": 9.213276481899508e-06,
"loss": 0.2496,
"step": 4540
},
{
"epoch": 3.4236267870579384,
"grad_norm": 11.02437973022461,
"learning_rate": 9.209096229412257e-06,
"loss": 0.2551,
"step": 4550
},
{
"epoch": 3.4311512415349887,
"grad_norm": 9.18223762512207,
"learning_rate": 9.204915976925008e-06,
"loss": 0.2264,
"step": 4560
},
{
"epoch": 3.438675696012039,
"grad_norm": 7.3303961753845215,
"learning_rate": 9.200735724437758e-06,
"loss": 0.2193,
"step": 4570
},
{
"epoch": 3.44620015048909,
"grad_norm": 8.85081672668457,
"learning_rate": 9.196555471950507e-06,
"loss": 0.2481,
"step": 4580
},
{
"epoch": 3.45372460496614,
"grad_norm": 6.6570539474487305,
"learning_rate": 9.192375219463257e-06,
"loss": 0.2238,
"step": 4590
},
{
"epoch": 3.4612490594431904,
"grad_norm": 7.035717964172363,
"learning_rate": 9.188194966976006e-06,
"loss": 0.2144,
"step": 4600
},
{
"epoch": 3.4687735139202407,
"grad_norm": 6.024372100830078,
"learning_rate": 9.184014714488756e-06,
"loss": 0.209,
"step": 4610
},
{
"epoch": 3.476297968397291,
"grad_norm": 7.535146713256836,
"learning_rate": 9.179834462001505e-06,
"loss": 0.2353,
"step": 4620
},
{
"epoch": 3.4838224228743417,
"grad_norm": 6.437473773956299,
"learning_rate": 9.175654209514255e-06,
"loss": 0.2614,
"step": 4630
},
{
"epoch": 3.491346877351392,
"grad_norm": 8.510677337646484,
"learning_rate": 9.171473957027006e-06,
"loss": 0.259,
"step": 4640
},
{
"epoch": 3.4988713318284423,
"grad_norm": 6.0311360359191895,
"learning_rate": 9.167293704539754e-06,
"loss": 0.2372,
"step": 4650
},
{
"epoch": 3.506395786305493,
"grad_norm": 10.722698211669922,
"learning_rate": 9.163113452052505e-06,
"loss": 0.2969,
"step": 4660
},
{
"epoch": 3.5139202407825434,
"grad_norm": 9.502874374389648,
"learning_rate": 9.158933199565253e-06,
"loss": 0.2657,
"step": 4670
},
{
"epoch": 3.5214446952595937,
"grad_norm": 9.616406440734863,
"learning_rate": 9.154752947078004e-06,
"loss": 0.2262,
"step": 4680
},
{
"epoch": 3.528969149736644,
"grad_norm": 8.309393882751465,
"learning_rate": 9.150572694590754e-06,
"loss": 0.1627,
"step": 4690
},
{
"epoch": 3.5364936042136943,
"grad_norm": 9.303479194641113,
"learning_rate": 9.146392442103503e-06,
"loss": 0.1774,
"step": 4700
},
{
"epoch": 3.544018058690745,
"grad_norm": 8.03176212310791,
"learning_rate": 9.142212189616253e-06,
"loss": 0.248,
"step": 4710
},
{
"epoch": 3.5515425131677953,
"grad_norm": 7.2166242599487305,
"learning_rate": 9.138031937129004e-06,
"loss": 0.2682,
"step": 4720
},
{
"epoch": 3.5590669676448456,
"grad_norm": 7.98032283782959,
"learning_rate": 9.133851684641754e-06,
"loss": 0.2244,
"step": 4730
},
{
"epoch": 3.5665914221218964,
"grad_norm": 9.044601440429688,
"learning_rate": 9.129671432154503e-06,
"loss": 0.2135,
"step": 4740
},
{
"epoch": 3.5741158765989467,
"grad_norm": 10.123546600341797,
"learning_rate": 9.125491179667253e-06,
"loss": 0.2442,
"step": 4750
},
{
"epoch": 3.581640331075997,
"grad_norm": 6.806577682495117,
"learning_rate": 9.121310927180002e-06,
"loss": 0.2396,
"step": 4760
},
{
"epoch": 3.5891647855530473,
"grad_norm": 7.733734130859375,
"learning_rate": 9.117130674692752e-06,
"loss": 0.231,
"step": 4770
},
{
"epoch": 3.5966892400300976,
"grad_norm": 9.914606094360352,
"learning_rate": 9.112950422205503e-06,
"loss": 0.2523,
"step": 4780
},
{
"epoch": 3.6042136945071483,
"grad_norm": 11.457139015197754,
"learning_rate": 9.108770169718251e-06,
"loss": 0.2646,
"step": 4790
},
{
"epoch": 3.6117381489841986,
"grad_norm": 8.741832733154297,
"learning_rate": 9.104589917231002e-06,
"loss": 0.2069,
"step": 4800
},
{
"epoch": 3.619262603461249,
"grad_norm": 6.032299995422363,
"learning_rate": 9.10040966474375e-06,
"loss": 0.2159,
"step": 4810
},
{
"epoch": 3.6267870579382997,
"grad_norm": 12.495452880859375,
"learning_rate": 9.096229412256501e-06,
"loss": 0.2467,
"step": 4820
},
{
"epoch": 3.63431151241535,
"grad_norm": 8.642016410827637,
"learning_rate": 9.092049159769251e-06,
"loss": 0.2526,
"step": 4830
},
{
"epoch": 3.6418359668924003,
"grad_norm": 9.123907089233398,
"learning_rate": 9.087868907282e-06,
"loss": 0.2534,
"step": 4840
},
{
"epoch": 3.6493604213694506,
"grad_norm": 9.16486644744873,
"learning_rate": 9.08368865479475e-06,
"loss": 0.2276,
"step": 4850
},
{
"epoch": 3.656884875846501,
"grad_norm": 8.667745590209961,
"learning_rate": 9.079508402307499e-06,
"loss": 0.2348,
"step": 4860
},
{
"epoch": 3.6644093303235517,
"grad_norm": 6.899482727050781,
"learning_rate": 9.07532814982025e-06,
"loss": 0.2081,
"step": 4870
},
{
"epoch": 3.671933784800602,
"grad_norm": 9.625839233398438,
"learning_rate": 9.071147897333e-06,
"loss": 0.2378,
"step": 4880
},
{
"epoch": 3.6794582392776523,
"grad_norm": 6.170398712158203,
"learning_rate": 9.066967644845749e-06,
"loss": 0.2897,
"step": 4890
},
{
"epoch": 3.686982693754703,
"grad_norm": 9.555203437805176,
"learning_rate": 9.062787392358499e-06,
"loss": 0.2446,
"step": 4900
},
{
"epoch": 3.6945071482317533,
"grad_norm": 6.0709052085876465,
"learning_rate": 9.05860713987125e-06,
"loss": 0.2073,
"step": 4910
},
{
"epoch": 3.7020316027088036,
"grad_norm": 8.169535636901855,
"learning_rate": 9.054426887384e-06,
"loss": 0.3046,
"step": 4920
},
{
"epoch": 3.709556057185854,
"grad_norm": 9.92249870300293,
"learning_rate": 9.050246634896748e-06,
"loss": 0.2819,
"step": 4930
},
{
"epoch": 3.717080511662904,
"grad_norm": 8.319649696350098,
"learning_rate": 9.046066382409499e-06,
"loss": 0.184,
"step": 4940
},
{
"epoch": 3.724604966139955,
"grad_norm": 9.96754264831543,
"learning_rate": 9.041886129922248e-06,
"loss": 0.2558,
"step": 4950
},
{
"epoch": 3.7321294206170053,
"grad_norm": 9.089340209960938,
"learning_rate": 9.037705877434998e-06,
"loss": 0.1661,
"step": 4960
},
{
"epoch": 3.7396538750940556,
"grad_norm": 8.855995178222656,
"learning_rate": 9.033525624947748e-06,
"loss": 0.1761,
"step": 4970
},
{
"epoch": 3.7471783295711063,
"grad_norm": 8.411288261413574,
"learning_rate": 9.029345372460497e-06,
"loss": 0.2158,
"step": 4980
},
{
"epoch": 3.7547027840481566,
"grad_norm": 6.084978103637695,
"learning_rate": 9.025165119973247e-06,
"loss": 0.1652,
"step": 4990
},
{
"epoch": 3.762227238525207,
"grad_norm": 5.3416428565979,
"learning_rate": 9.020984867485996e-06,
"loss": 0.1894,
"step": 5000
},
{
"epoch": 3.769751693002257,
"grad_norm": 6.2633137702941895,
"learning_rate": 9.016804614998747e-06,
"loss": 0.2604,
"step": 5010
},
{
"epoch": 3.7772761474793075,
"grad_norm": 11.048465728759766,
"learning_rate": 9.012624362511497e-06,
"loss": 0.2726,
"step": 5020
},
{
"epoch": 3.7848006019563583,
"grad_norm": 8.949687004089355,
"learning_rate": 9.008444110024246e-06,
"loss": 0.2216,
"step": 5030
},
{
"epoch": 3.7923250564334086,
"grad_norm": 8.664268493652344,
"learning_rate": 9.004263857536996e-06,
"loss": 0.2437,
"step": 5040
},
{
"epoch": 3.799849510910459,
"grad_norm": 10.612902641296387,
"learning_rate": 9.000083605049745e-06,
"loss": 0.2243,
"step": 5050
},
{
"epoch": 3.8073739653875096,
"grad_norm": 6.690532684326172,
"learning_rate": 8.995903352562495e-06,
"loss": 0.1877,
"step": 5060
},
{
"epoch": 3.81489841986456,
"grad_norm": 6.29967737197876,
"learning_rate": 8.991723100075246e-06,
"loss": 0.2449,
"step": 5070
},
{
"epoch": 3.82242287434161,
"grad_norm": 8.491719245910645,
"learning_rate": 8.987542847587996e-06,
"loss": 0.2135,
"step": 5080
},
{
"epoch": 3.8299473288186605,
"grad_norm": 8.454863548278809,
"learning_rate": 8.983362595100745e-06,
"loss": 0.2471,
"step": 5090
},
{
"epoch": 3.837471783295711,
"grad_norm": 6.257556438446045,
"learning_rate": 8.979182342613495e-06,
"loss": 0.2089,
"step": 5100
},
{
"epoch": 3.8449962377727616,
"grad_norm": 8.640929222106934,
"learning_rate": 8.975002090126245e-06,
"loss": 0.2339,
"step": 5110
},
{
"epoch": 3.852520692249812,
"grad_norm": 7.349085807800293,
"learning_rate": 8.970821837638994e-06,
"loss": 0.2119,
"step": 5120
},
{
"epoch": 3.860045146726862,
"grad_norm": 10.50108814239502,
"learning_rate": 8.966641585151744e-06,
"loss": 0.263,
"step": 5130
},
{
"epoch": 3.867569601203913,
"grad_norm": 7.560617923736572,
"learning_rate": 8.962461332664493e-06,
"loss": 0.2156,
"step": 5140
},
{
"epoch": 3.875094055680963,
"grad_norm": 9.218497276306152,
"learning_rate": 8.958281080177244e-06,
"loss": 0.1824,
"step": 5150
},
{
"epoch": 3.8826185101580135,
"grad_norm": 7.747318744659424,
"learning_rate": 8.954100827689994e-06,
"loss": 0.1957,
"step": 5160
},
{
"epoch": 3.890142964635064,
"grad_norm": 5.597620487213135,
"learning_rate": 8.949920575202743e-06,
"loss": 0.2798,
"step": 5170
},
{
"epoch": 3.897667419112114,
"grad_norm": 6.475823402404785,
"learning_rate": 8.945740322715493e-06,
"loss": 0.2045,
"step": 5180
},
{
"epoch": 3.905191873589165,
"grad_norm": 8.230575561523438,
"learning_rate": 8.941560070228242e-06,
"loss": 0.2547,
"step": 5190
},
{
"epoch": 3.912716328066215,
"grad_norm": 7.861040115356445,
"learning_rate": 8.937379817740992e-06,
"loss": 0.2188,
"step": 5200
},
{
"epoch": 3.9202407825432655,
"grad_norm": 7.680462837219238,
"learning_rate": 8.933199565253743e-06,
"loss": 0.1942,
"step": 5210
},
{
"epoch": 3.927765237020316,
"grad_norm": 7.623648166656494,
"learning_rate": 8.929019312766491e-06,
"loss": 0.2264,
"step": 5220
},
{
"epoch": 3.9352896914973665,
"grad_norm": 7.062371253967285,
"learning_rate": 8.924839060279242e-06,
"loss": 0.2293,
"step": 5230
},
{
"epoch": 3.942814145974417,
"grad_norm": 10.482141494750977,
"learning_rate": 8.92065880779199e-06,
"loss": 0.2453,
"step": 5240
},
{
"epoch": 3.950338600451467,
"grad_norm": 5.7604522705078125,
"learning_rate": 8.91647855530474e-06,
"loss": 0.2279,
"step": 5250
},
{
"epoch": 3.9578630549285174,
"grad_norm": 12.028779983520508,
"learning_rate": 8.912298302817491e-06,
"loss": 0.2511,
"step": 5260
},
{
"epoch": 3.965387509405568,
"grad_norm": 7.708649158477783,
"learning_rate": 8.908118050330242e-06,
"loss": 0.199,
"step": 5270
},
{
"epoch": 3.9729119638826185,
"grad_norm": 7.3175578117370605,
"learning_rate": 8.90393779784299e-06,
"loss": 0.2619,
"step": 5280
},
{
"epoch": 3.9804364183596688,
"grad_norm": 8.127973556518555,
"learning_rate": 8.89975754535574e-06,
"loss": 0.2279,
"step": 5290
},
{
"epoch": 3.9879608728367195,
"grad_norm": 8.595681190490723,
"learning_rate": 8.895577292868491e-06,
"loss": 0.2238,
"step": 5300
},
{
"epoch": 3.99548532731377,
"grad_norm": 4.470604419708252,
"learning_rate": 8.89139704038124e-06,
"loss": 0.1792,
"step": 5310
},
{
"epoch": 4.00300978179082,
"grad_norm": 5.3823676109313965,
"learning_rate": 8.88721678789399e-06,
"loss": 0.178,
"step": 5320
},
{
"epoch": 4.010534236267871,
"grad_norm": 7.360695838928223,
"learning_rate": 8.883036535406739e-06,
"loss": 0.2077,
"step": 5330
},
{
"epoch": 4.018058690744921,
"grad_norm": 7.8422770500183105,
"learning_rate": 8.87885628291949e-06,
"loss": 0.158,
"step": 5340
},
{
"epoch": 4.0255831452219715,
"grad_norm": 6.228549480438232,
"learning_rate": 8.87467603043224e-06,
"loss": 0.2119,
"step": 5350
},
{
"epoch": 4.033107599699022,
"grad_norm": 10.569890022277832,
"learning_rate": 8.870495777944988e-06,
"loss": 0.1691,
"step": 5360
},
{
"epoch": 4.040632054176072,
"grad_norm": 8.88987922668457,
"learning_rate": 8.866315525457739e-06,
"loss": 0.1894,
"step": 5370
},
{
"epoch": 4.048156508653123,
"grad_norm": 3.7573678493499756,
"learning_rate": 8.862135272970487e-06,
"loss": 0.1762,
"step": 5380
},
{
"epoch": 4.055680963130173,
"grad_norm": 10.3701810836792,
"learning_rate": 8.857955020483238e-06,
"loss": 0.1756,
"step": 5390
},
{
"epoch": 4.063205417607223,
"grad_norm": 12.642711639404297,
"learning_rate": 8.853774767995987e-06,
"loss": 0.2486,
"step": 5400
},
{
"epoch": 4.070729872084274,
"grad_norm": 7.448667526245117,
"learning_rate": 8.849594515508737e-06,
"loss": 0.2261,
"step": 5410
},
{
"epoch": 4.078254326561324,
"grad_norm": 4.713575839996338,
"learning_rate": 8.845414263021487e-06,
"loss": 0.1614,
"step": 5420
},
{
"epoch": 4.085778781038375,
"grad_norm": 8.637237548828125,
"learning_rate": 8.841234010534236e-06,
"loss": 0.1796,
"step": 5430
},
{
"epoch": 4.0933032355154255,
"grad_norm": 8.004151344299316,
"learning_rate": 8.837053758046988e-06,
"loss": 0.1982,
"step": 5440
},
{
"epoch": 4.100827689992475,
"grad_norm": 7.788305759429932,
"learning_rate": 8.832873505559737e-06,
"loss": 0.2108,
"step": 5450
},
{
"epoch": 4.108352144469526,
"grad_norm": 9.111312866210938,
"learning_rate": 8.828693253072487e-06,
"loss": 0.2049,
"step": 5460
},
{
"epoch": 4.115876598946576,
"grad_norm": 8.527594566345215,
"learning_rate": 8.824513000585236e-06,
"loss": 0.1699,
"step": 5470
},
{
"epoch": 4.123401053423627,
"grad_norm": 11.286643981933594,
"learning_rate": 8.820332748097986e-06,
"loss": 0.2459,
"step": 5480
},
{
"epoch": 4.1309255079006775,
"grad_norm": 6.5479888916015625,
"learning_rate": 8.816152495610735e-06,
"loss": 0.2793,
"step": 5490
},
{
"epoch": 4.138449962377727,
"grad_norm": 9.018651008605957,
"learning_rate": 8.811972243123485e-06,
"loss": 0.2286,
"step": 5500
},
{
"epoch": 4.145974416854778,
"grad_norm": 7.51389741897583,
"learning_rate": 8.807791990636236e-06,
"loss": 0.2164,
"step": 5510
},
{
"epoch": 4.153498871331829,
"grad_norm": 9.141986846923828,
"learning_rate": 8.803611738148985e-06,
"loss": 0.1989,
"step": 5520
},
{
"epoch": 4.161023325808879,
"grad_norm": 11.049951553344727,
"learning_rate": 8.799431485661735e-06,
"loss": 0.2415,
"step": 5530
},
{
"epoch": 4.168547780285929,
"grad_norm": 8.980281829833984,
"learning_rate": 8.795251233174484e-06,
"loss": 0.1959,
"step": 5540
},
{
"epoch": 4.176072234762979,
"grad_norm": 9.708146095275879,
"learning_rate": 8.791070980687234e-06,
"loss": 0.2255,
"step": 5550
},
{
"epoch": 4.18359668924003,
"grad_norm": 7.6197943687438965,
"learning_rate": 8.786890728199984e-06,
"loss": 0.1875,
"step": 5560
},
{
"epoch": 4.191121143717081,
"grad_norm": 7.607595920562744,
"learning_rate": 8.782710475712733e-06,
"loss": 0.209,
"step": 5570
},
{
"epoch": 4.198645598194131,
"grad_norm": 5.671550750732422,
"learning_rate": 8.778530223225483e-06,
"loss": 0.1943,
"step": 5580
},
{
"epoch": 4.206170052671181,
"grad_norm": 7.70829963684082,
"learning_rate": 8.774349970738232e-06,
"loss": 0.1776,
"step": 5590
},
{
"epoch": 4.213694507148232,
"grad_norm": 8.614123344421387,
"learning_rate": 8.770169718250983e-06,
"loss": 0.1654,
"step": 5600
},
{
"epoch": 4.221218961625282,
"grad_norm": 8.027443885803223,
"learning_rate": 8.765989465763733e-06,
"loss": 0.1863,
"step": 5610
},
{
"epoch": 4.228743416102333,
"grad_norm": 7.041518211364746,
"learning_rate": 8.761809213276483e-06,
"loss": 0.218,
"step": 5620
},
{
"epoch": 4.236267870579383,
"grad_norm": 7.971219062805176,
"learning_rate": 8.757628960789232e-06,
"loss": 0.1982,
"step": 5630
},
{
"epoch": 4.243792325056433,
"grad_norm": 7.808959007263184,
"learning_rate": 8.753448708301982e-06,
"loss": 0.2318,
"step": 5640
},
{
"epoch": 4.251316779533484,
"grad_norm": 8.63860034942627,
"learning_rate": 8.749268455814733e-06,
"loss": 0.2269,
"step": 5650
},
{
"epoch": 4.258841234010534,
"grad_norm": 5.921948432922363,
"learning_rate": 8.745088203327482e-06,
"loss": 0.1728,
"step": 5660
},
{
"epoch": 4.266365688487585,
"grad_norm": 9.674760818481445,
"learning_rate": 8.740907950840232e-06,
"loss": 0.2094,
"step": 5670
},
{
"epoch": 4.273890142964635,
"grad_norm": 7.121826648712158,
"learning_rate": 8.73672769835298e-06,
"loss": 0.2164,
"step": 5680
},
{
"epoch": 4.281414597441685,
"grad_norm": 8.577960968017578,
"learning_rate": 8.732547445865731e-06,
"loss": 0.2483,
"step": 5690
},
{
"epoch": 4.288939051918736,
"grad_norm": 5.841510772705078,
"learning_rate": 8.728367193378481e-06,
"loss": 0.2338,
"step": 5700
},
{
"epoch": 4.296463506395786,
"grad_norm": 7.245088577270508,
"learning_rate": 8.72418694089123e-06,
"loss": 0.1879,
"step": 5710
},
{
"epoch": 4.303987960872837,
"grad_norm": 9.421006202697754,
"learning_rate": 8.72000668840398e-06,
"loss": 0.2822,
"step": 5720
},
{
"epoch": 4.311512415349887,
"grad_norm": 6.611949920654297,
"learning_rate": 8.71582643591673e-06,
"loss": 0.1874,
"step": 5730
},
{
"epoch": 4.319036869826937,
"grad_norm": 9.723703384399414,
"learning_rate": 8.71164618342948e-06,
"loss": 0.2005,
"step": 5740
},
{
"epoch": 4.326561324303988,
"grad_norm": 8.469480514526367,
"learning_rate": 8.70746593094223e-06,
"loss": 0.22,
"step": 5750
},
{
"epoch": 4.334085778781039,
"grad_norm": 9.64450454711914,
"learning_rate": 8.703285678454979e-06,
"loss": 0.2411,
"step": 5760
},
{
"epoch": 4.341610233258089,
"grad_norm": 11.526230812072754,
"learning_rate": 8.69910542596773e-06,
"loss": 0.2353,
"step": 5770
},
{
"epoch": 4.349134687735139,
"grad_norm": 5.962429523468018,
"learning_rate": 8.694925173480478e-06,
"loss": 0.1535,
"step": 5780
},
{
"epoch": 4.356659142212189,
"grad_norm": 5.385570526123047,
"learning_rate": 8.690744920993228e-06,
"loss": 0.2185,
"step": 5790
},
{
"epoch": 4.36418359668924,
"grad_norm": 5.992058753967285,
"learning_rate": 8.686564668505979e-06,
"loss": 0.2033,
"step": 5800
},
{
"epoch": 4.371708051166291,
"grad_norm": 6.653234481811523,
"learning_rate": 8.682384416018729e-06,
"loss": 0.1728,
"step": 5810
},
{
"epoch": 4.3792325056433405,
"grad_norm": 6.744475841522217,
"learning_rate": 8.678204163531478e-06,
"loss": 0.1967,
"step": 5820
},
{
"epoch": 4.386756960120391,
"grad_norm": 9.5464448928833,
"learning_rate": 8.674023911044228e-06,
"loss": 0.2279,
"step": 5830
},
{
"epoch": 4.394281414597442,
"grad_norm": 8.616525650024414,
"learning_rate": 8.669843658556979e-06,
"loss": 0.2534,
"step": 5840
},
{
"epoch": 4.401805869074492,
"grad_norm": 4.971557140350342,
"learning_rate": 8.665663406069727e-06,
"loss": 0.1697,
"step": 5850
},
{
"epoch": 4.409330323551543,
"grad_norm": 10.338679313659668,
"learning_rate": 8.661483153582478e-06,
"loss": 0.1967,
"step": 5860
},
{
"epoch": 4.416854778028593,
"grad_norm": 7.161668300628662,
"learning_rate": 8.657302901095226e-06,
"loss": 0.1669,
"step": 5870
},
{
"epoch": 4.424379232505643,
"grad_norm": 4.379268646240234,
"learning_rate": 8.653122648607977e-06,
"loss": 0.131,
"step": 5880
},
{
"epoch": 4.431903686982694,
"grad_norm": 8.893242835998535,
"learning_rate": 8.648942396120727e-06,
"loss": 0.2316,
"step": 5890
},
{
"epoch": 4.439428141459744,
"grad_norm": 6.582338333129883,
"learning_rate": 8.644762143633476e-06,
"loss": 0.2434,
"step": 5900
},
{
"epoch": 4.446952595936795,
"grad_norm": 6.187091827392578,
"learning_rate": 8.640581891146226e-06,
"loss": 0.1217,
"step": 5910
},
{
"epoch": 4.454477050413845,
"grad_norm": 8.071149826049805,
"learning_rate": 8.636401638658975e-06,
"loss": 0.1744,
"step": 5920
},
{
"epoch": 4.462001504890895,
"grad_norm": 12.656167030334473,
"learning_rate": 8.632221386171725e-06,
"loss": 0.2249,
"step": 5930
},
{
"epoch": 4.469525959367946,
"grad_norm": 9.815470695495605,
"learning_rate": 8.628041133684476e-06,
"loss": 0.1969,
"step": 5940
},
{
"epoch": 4.477050413844996,
"grad_norm": 8.92492389678955,
"learning_rate": 8.623860881197224e-06,
"loss": 0.1994,
"step": 5950
},
{
"epoch": 4.4845748683220465,
"grad_norm": 5.436546802520752,
"learning_rate": 8.619680628709975e-06,
"loss": 0.1291,
"step": 5960
},
{
"epoch": 4.492099322799097,
"grad_norm": 3.1124415397644043,
"learning_rate": 8.615500376222724e-06,
"loss": 0.1929,
"step": 5970
},
{
"epoch": 4.499623777276147,
"grad_norm": 9.723775863647461,
"learning_rate": 8.611320123735476e-06,
"loss": 0.1891,
"step": 5980
},
{
"epoch": 4.507148231753198,
"grad_norm": 9.545393943786621,
"learning_rate": 8.607139871248224e-06,
"loss": 0.1785,
"step": 5990
},
{
"epoch": 4.514672686230249,
"grad_norm": 5.5943098068237305,
"learning_rate": 8.602959618760975e-06,
"loss": 0.1526,
"step": 6000
},
{
"epoch": 4.5221971407072985,
"grad_norm": 5.557697772979736,
"learning_rate": 8.598779366273723e-06,
"loss": 0.2211,
"step": 6010
},
{
"epoch": 4.529721595184349,
"grad_norm": 3.947209596633911,
"learning_rate": 8.594599113786474e-06,
"loss": 0.1528,
"step": 6020
},
{
"epoch": 4.5372460496614,
"grad_norm": 7.38191556930542,
"learning_rate": 8.590418861299224e-06,
"loss": 0.1562,
"step": 6030
},
{
"epoch": 4.54477050413845,
"grad_norm": 10.425215721130371,
"learning_rate": 8.586238608811973e-06,
"loss": 0.1897,
"step": 6040
},
{
"epoch": 4.552294958615501,
"grad_norm": 5.334704399108887,
"learning_rate": 8.582058356324723e-06,
"loss": 0.174,
"step": 6050
},
{
"epoch": 4.5598194130925505,
"grad_norm": 6.482622146606445,
"learning_rate": 8.577878103837472e-06,
"loss": 0.1774,
"step": 6060
},
{
"epoch": 4.567343867569601,
"grad_norm": 9.02678394317627,
"learning_rate": 8.573697851350222e-06,
"loss": 0.2446,
"step": 6070
},
{
"epoch": 4.574868322046652,
"grad_norm": 3.656137466430664,
"learning_rate": 8.569517598862973e-06,
"loss": 0.1866,
"step": 6080
},
{
"epoch": 4.582392776523702,
"grad_norm": 6.158934593200684,
"learning_rate": 8.565337346375721e-06,
"loss": 0.2248,
"step": 6090
},
{
"epoch": 4.5899172310007526,
"grad_norm": 10.076424598693848,
"learning_rate": 8.561157093888472e-06,
"loss": 0.2325,
"step": 6100
},
{
"epoch": 4.597441685477802,
"grad_norm": 8.406209945678711,
"learning_rate": 8.55697684140122e-06,
"loss": 0.2106,
"step": 6110
},
{
"epoch": 4.604966139954853,
"grad_norm": 7.778573989868164,
"learning_rate": 8.552796588913971e-06,
"loss": 0.1637,
"step": 6120
},
{
"epoch": 4.612490594431904,
"grad_norm": 7.563833713531494,
"learning_rate": 8.54861633642672e-06,
"loss": 0.1741,
"step": 6130
},
{
"epoch": 4.620015048908954,
"grad_norm": 8.280135154724121,
"learning_rate": 8.54443608393947e-06,
"loss": 0.1773,
"step": 6140
},
{
"epoch": 4.6275395033860045,
"grad_norm": 9.068934440612793,
"learning_rate": 8.54025583145222e-06,
"loss": 0.2054,
"step": 6150
},
{
"epoch": 4.635063957863055,
"grad_norm": 6.354267597198486,
"learning_rate": 8.53607557896497e-06,
"loss": 0.201,
"step": 6160
},
{
"epoch": 4.642588412340105,
"grad_norm": 8.397027969360352,
"learning_rate": 8.531895326477721e-06,
"loss": 0.2111,
"step": 6170
},
{
"epoch": 4.650112866817156,
"grad_norm": 8.86711597442627,
"learning_rate": 8.52771507399047e-06,
"loss": 0.1677,
"step": 6180
},
{
"epoch": 4.657637321294207,
"grad_norm": 8.797131538391113,
"learning_rate": 8.52353482150322e-06,
"loss": 0.2505,
"step": 6190
},
{
"epoch": 4.6651617757712565,
"grad_norm": 7.0519208908081055,
"learning_rate": 8.519354569015969e-06,
"loss": 0.1844,
"step": 6200
},
{
"epoch": 4.672686230248307,
"grad_norm": 4.867956161499023,
"learning_rate": 8.51517431652872e-06,
"loss": 0.2102,
"step": 6210
},
{
"epoch": 4.680210684725357,
"grad_norm": 4.225649833679199,
"learning_rate": 8.510994064041468e-06,
"loss": 0.2158,
"step": 6220
},
{
"epoch": 4.687735139202408,
"grad_norm": 6.078823566436768,
"learning_rate": 8.506813811554219e-06,
"loss": 0.1992,
"step": 6230
},
{
"epoch": 4.6952595936794586,
"grad_norm": 10.225565910339355,
"learning_rate": 8.502633559066969e-06,
"loss": 0.2692,
"step": 6240
},
{
"epoch": 4.702784048156508,
"grad_norm": 4.66995096206665,
"learning_rate": 8.498453306579718e-06,
"loss": 0.1973,
"step": 6250
},
{
"epoch": 4.710308502633559,
"grad_norm": 6.222870826721191,
"learning_rate": 8.494273054092468e-06,
"loss": 0.2045,
"step": 6260
},
{
"epoch": 4.717832957110609,
"grad_norm": 8.30414867401123,
"learning_rate": 8.490092801605217e-06,
"loss": 0.2066,
"step": 6270
},
{
"epoch": 4.72535741158766,
"grad_norm": 9.2549467086792,
"learning_rate": 8.485912549117967e-06,
"loss": 0.2159,
"step": 6280
},
{
"epoch": 4.7328818660647105,
"grad_norm": 7.878752708435059,
"learning_rate": 8.481732296630718e-06,
"loss": 0.1806,
"step": 6290
},
{
"epoch": 4.74040632054176,
"grad_norm": 11.239638328552246,
"learning_rate": 8.477552044143466e-06,
"loss": 0.1698,
"step": 6300
},
{
"epoch": 4.747930775018811,
"grad_norm": 5.1768670082092285,
"learning_rate": 8.473371791656217e-06,
"loss": 0.1374,
"step": 6310
},
{
"epoch": 4.755455229495862,
"grad_norm": 9.750138282775879,
"learning_rate": 8.469191539168965e-06,
"loss": 0.167,
"step": 6320
},
{
"epoch": 4.762979683972912,
"grad_norm": 7.808964252471924,
"learning_rate": 8.465011286681716e-06,
"loss": 0.2278,
"step": 6330
},
{
"epoch": 4.7705041384499625,
"grad_norm": 5.109748840332031,
"learning_rate": 8.460831034194466e-06,
"loss": 0.1781,
"step": 6340
},
{
"epoch": 4.778028592927013,
"grad_norm": 8.040872573852539,
"learning_rate": 8.456650781707217e-06,
"loss": 0.2723,
"step": 6350
},
{
"epoch": 4.785553047404063,
"grad_norm": 8.216675758361816,
"learning_rate": 8.452470529219965e-06,
"loss": 0.1732,
"step": 6360
},
{
"epoch": 4.793077501881114,
"grad_norm": 10.798053741455078,
"learning_rate": 8.448290276732716e-06,
"loss": 0.208,
"step": 6370
},
{
"epoch": 4.800601956358164,
"grad_norm": 6.185002326965332,
"learning_rate": 8.444110024245466e-06,
"loss": 0.1798,
"step": 6380
},
{
"epoch": 4.808126410835214,
"grad_norm": 5.082795143127441,
"learning_rate": 8.439929771758215e-06,
"loss": 0.166,
"step": 6390
},
{
"epoch": 4.815650865312265,
"grad_norm": 7.006508827209473,
"learning_rate": 8.435749519270965e-06,
"loss": 0.176,
"step": 6400
},
{
"epoch": 4.823175319789315,
"grad_norm": 8.347260475158691,
"learning_rate": 8.431569266783714e-06,
"loss": 0.1977,
"step": 6410
},
{
"epoch": 4.830699774266366,
"grad_norm": 9.687190055847168,
"learning_rate": 8.427389014296464e-06,
"loss": 0.281,
"step": 6420
},
{
"epoch": 4.838224228743416,
"grad_norm": 5.852231979370117,
"learning_rate": 8.423208761809215e-06,
"loss": 0.2346,
"step": 6430
},
{
"epoch": 4.845748683220466,
"grad_norm": 11.33639144897461,
"learning_rate": 8.419028509321963e-06,
"loss": 0.2193,
"step": 6440
},
{
"epoch": 4.853273137697517,
"grad_norm": 6.299645900726318,
"learning_rate": 8.414848256834714e-06,
"loss": 0.1589,
"step": 6450
},
{
"epoch": 4.860797592174567,
"grad_norm": 10.101712226867676,
"learning_rate": 8.410668004347462e-06,
"loss": 0.2342,
"step": 6460
},
{
"epoch": 4.868322046651618,
"grad_norm": 6.40759801864624,
"learning_rate": 8.406487751860213e-06,
"loss": 0.2052,
"step": 6470
},
{
"epoch": 4.8758465011286685,
"grad_norm": 9.549158096313477,
"learning_rate": 8.402307499372963e-06,
"loss": 0.201,
"step": 6480
},
{
"epoch": 4.883370955605718,
"grad_norm": 9.75588321685791,
"learning_rate": 8.398127246885712e-06,
"loss": 0.1658,
"step": 6490
},
{
"epoch": 4.890895410082769,
"grad_norm": 8.68608283996582,
"learning_rate": 8.393946994398462e-06,
"loss": 0.1837,
"step": 6500
},
{
"epoch": 4.89841986455982,
"grad_norm": 9.34019660949707,
"learning_rate": 8.389766741911211e-06,
"loss": 0.1958,
"step": 6510
},
{
"epoch": 4.90594431903687,
"grad_norm": 8.452191352844238,
"learning_rate": 8.385586489423961e-06,
"loss": 0.2024,
"step": 6520
},
{
"epoch": 4.91346877351392,
"grad_norm": 8.62346076965332,
"learning_rate": 8.381406236936712e-06,
"loss": 0.2091,
"step": 6530
},
{
"epoch": 4.92099322799097,
"grad_norm": 6.744731903076172,
"learning_rate": 8.377225984449462e-06,
"loss": 0.215,
"step": 6540
},
{
"epoch": 4.928517682468021,
"grad_norm": 7.814108371734619,
"learning_rate": 8.373045731962211e-06,
"loss": 0.1593,
"step": 6550
},
{
"epoch": 4.936042136945072,
"grad_norm": 11.607344627380371,
"learning_rate": 8.368865479474961e-06,
"loss": 0.225,
"step": 6560
},
{
"epoch": 4.943566591422122,
"grad_norm": 9.131011009216309,
"learning_rate": 8.364685226987712e-06,
"loss": 0.1857,
"step": 6570
},
{
"epoch": 4.951091045899172,
"grad_norm": 5.4191999435424805,
"learning_rate": 8.36050497450046e-06,
"loss": 0.1724,
"step": 6580
},
{
"epoch": 4.958615500376222,
"grad_norm": 5.488987922668457,
"learning_rate": 8.35632472201321e-06,
"loss": 0.1378,
"step": 6590
},
{
"epoch": 4.966139954853273,
"grad_norm": 5.251589775085449,
"learning_rate": 8.35214446952596e-06,
"loss": 0.2182,
"step": 6600
},
{
"epoch": 4.973664409330324,
"grad_norm": 5.991082191467285,
"learning_rate": 8.34796421703871e-06,
"loss": 0.1919,
"step": 6610
},
{
"epoch": 4.981188863807374,
"grad_norm": 10.733721733093262,
"learning_rate": 8.34378396455146e-06,
"loss": 0.1839,
"step": 6620
},
{
"epoch": 4.988713318284424,
"grad_norm": 11.137850761413574,
"learning_rate": 8.339603712064209e-06,
"loss": 0.2389,
"step": 6630
},
{
"epoch": 4.996237772761475,
"grad_norm": 11.486577987670898,
"learning_rate": 8.33542345957696e-06,
"loss": 0.2458,
"step": 6640
},
{
"epoch": 5.003762227238525,
"grad_norm": 5.592317581176758,
"learning_rate": 8.331243207089708e-06,
"loss": 0.1371,
"step": 6650
},
{
"epoch": 5.011286681715576,
"grad_norm": 6.482915878295898,
"learning_rate": 8.327062954602458e-06,
"loss": 0.177,
"step": 6660
},
{
"epoch": 5.018811136192626,
"grad_norm": 7.077815055847168,
"learning_rate": 8.322882702115209e-06,
"loss": 0.1495,
"step": 6670
},
{
"epoch": 5.026335590669676,
"grad_norm": 7.756783962249756,
"learning_rate": 8.318702449627958e-06,
"loss": 0.1427,
"step": 6680
},
{
"epoch": 5.033860045146727,
"grad_norm": 6.053231239318848,
"learning_rate": 8.314522197140708e-06,
"loss": 0.1547,
"step": 6690
},
{
"epoch": 5.041384499623777,
"grad_norm": 6.568417072296143,
"learning_rate": 8.310341944653457e-06,
"loss": 0.1753,
"step": 6700
},
{
"epoch": 5.048908954100828,
"grad_norm": 5.332026958465576,
"learning_rate": 8.306161692166209e-06,
"loss": 0.1305,
"step": 6710
},
{
"epoch": 5.056433408577878,
"grad_norm": 9.05020809173584,
"learning_rate": 8.301981439678957e-06,
"loss": 0.1973,
"step": 6720
},
{
"epoch": 5.063957863054928,
"grad_norm": 8.415604591369629,
"learning_rate": 8.297801187191708e-06,
"loss": 0.2205,
"step": 6730
},
{
"epoch": 5.071482317531979,
"grad_norm": 5.819530487060547,
"learning_rate": 8.293620934704457e-06,
"loss": 0.1862,
"step": 6740
},
{
"epoch": 5.07900677200903,
"grad_norm": 7.946175575256348,
"learning_rate": 8.289440682217207e-06,
"loss": 0.1132,
"step": 6750
},
{
"epoch": 5.08653122648608,
"grad_norm": 10.299882888793945,
"learning_rate": 8.285260429729957e-06,
"loss": 0.1745,
"step": 6760
},
{
"epoch": 5.09405568096313,
"grad_norm": 7.671544551849365,
"learning_rate": 8.281080177242706e-06,
"loss": 0.1829,
"step": 6770
},
{
"epoch": 5.10158013544018,
"grad_norm": 9.563514709472656,
"learning_rate": 8.276899924755456e-06,
"loss": 0.1952,
"step": 6780
},
{
"epoch": 5.109104589917231,
"grad_norm": 5.524237632751465,
"learning_rate": 8.272719672268205e-06,
"loss": 0.1327,
"step": 6790
},
{
"epoch": 5.116629044394282,
"grad_norm": 7.356444835662842,
"learning_rate": 8.268539419780956e-06,
"loss": 0.1503,
"step": 6800
},
{
"epoch": 5.1241534988713315,
"grad_norm": 10.332460403442383,
"learning_rate": 8.264359167293706e-06,
"loss": 0.1583,
"step": 6810
},
{
"epoch": 5.131677953348382,
"grad_norm": 8.877423286437988,
"learning_rate": 8.260178914806455e-06,
"loss": 0.2441,
"step": 6820
},
{
"epoch": 5.139202407825433,
"grad_norm": 11.976480484008789,
"learning_rate": 8.255998662319205e-06,
"loss": 0.2151,
"step": 6830
},
{
"epoch": 5.146726862302483,
"grad_norm": 8.162457466125488,
"learning_rate": 8.251818409831954e-06,
"loss": 0.1607,
"step": 6840
},
{
"epoch": 5.154251316779534,
"grad_norm": 8.65699577331543,
"learning_rate": 8.247638157344704e-06,
"loss": 0.1581,
"step": 6850
},
{
"epoch": 5.1617757712565835,
"grad_norm": 8.015094757080078,
"learning_rate": 8.243457904857453e-06,
"loss": 0.2031,
"step": 6860
},
{
"epoch": 5.169300225733634,
"grad_norm": 8.649307250976562,
"learning_rate": 8.239277652370203e-06,
"loss": 0.1524,
"step": 6870
},
{
"epoch": 5.176824680210685,
"grad_norm": 11.47341537475586,
"learning_rate": 8.235097399882954e-06,
"loss": 0.1759,
"step": 6880
},
{
"epoch": 5.184349134687735,
"grad_norm": 11.23306941986084,
"learning_rate": 8.230917147395704e-06,
"loss": 0.1342,
"step": 6890
},
{
"epoch": 5.191873589164786,
"grad_norm": 4.827985763549805,
"learning_rate": 8.226736894908454e-06,
"loss": 0.2029,
"step": 6900
},
{
"epoch": 5.199398043641836,
"grad_norm": 6.810028076171875,
"learning_rate": 8.222556642421203e-06,
"loss": 0.1814,
"step": 6910
},
{
"epoch": 5.206922498118886,
"grad_norm": 9.269369125366211,
"learning_rate": 8.218376389933954e-06,
"loss": 0.1445,
"step": 6920
},
{
"epoch": 5.214446952595937,
"grad_norm": 4.727482318878174,
"learning_rate": 8.214196137446702e-06,
"loss": 0.1737,
"step": 6930
},
{
"epoch": 5.221971407072987,
"grad_norm": 7.885960102081299,
"learning_rate": 8.210015884959453e-06,
"loss": 0.1524,
"step": 6940
},
{
"epoch": 5.2294958615500375,
"grad_norm": 9.722622871398926,
"learning_rate": 8.205835632472201e-06,
"loss": 0.213,
"step": 6950
},
{
"epoch": 5.237020316027088,
"grad_norm": 7.540585994720459,
"learning_rate": 8.201655379984952e-06,
"loss": 0.1726,
"step": 6960
},
{
"epoch": 5.244544770504138,
"grad_norm": 9.993691444396973,
"learning_rate": 8.197475127497702e-06,
"loss": 0.1434,
"step": 6970
},
{
"epoch": 5.252069224981189,
"grad_norm": 11.053601264953613,
"learning_rate": 8.19329487501045e-06,
"loss": 0.1883,
"step": 6980
},
{
"epoch": 5.25959367945824,
"grad_norm": 8.340205192565918,
"learning_rate": 8.189114622523201e-06,
"loss": 0.1497,
"step": 6990
},
{
"epoch": 5.2671181339352895,
"grad_norm": 8.768745422363281,
"learning_rate": 8.18493437003595e-06,
"loss": 0.2163,
"step": 7000
},
{
"epoch": 5.27464258841234,
"grad_norm": 7.16154146194458,
"learning_rate": 8.1807541175487e-06,
"loss": 0.1395,
"step": 7010
},
{
"epoch": 5.282167042889391,
"grad_norm": 8.094215393066406,
"learning_rate": 8.17657386506145e-06,
"loss": 0.1749,
"step": 7020
},
{
"epoch": 5.289691497366441,
"grad_norm": 9.044081687927246,
"learning_rate": 8.1723936125742e-06,
"loss": 0.1954,
"step": 7030
},
{
"epoch": 5.297215951843492,
"grad_norm": 7.638929843902588,
"learning_rate": 8.16821336008695e-06,
"loss": 0.1637,
"step": 7040
},
{
"epoch": 5.3047404063205414,
"grad_norm": 9.528214454650879,
"learning_rate": 8.164033107599699e-06,
"loss": 0.2195,
"step": 7050
},
{
"epoch": 5.312264860797592,
"grad_norm": 8.045819282531738,
"learning_rate": 8.159852855112449e-06,
"loss": 0.16,
"step": 7060
},
{
"epoch": 5.319789315274643,
"grad_norm": 8.95510196685791,
"learning_rate": 8.1556726026252e-06,
"loss": 0.161,
"step": 7070
},
{
"epoch": 5.327313769751693,
"grad_norm": 8.730019569396973,
"learning_rate": 8.15149235013795e-06,
"loss": 0.1873,
"step": 7080
},
{
"epoch": 5.3348382242287435,
"grad_norm": 6.827531814575195,
"learning_rate": 8.147312097650698e-06,
"loss": 0.1695,
"step": 7090
},
{
"epoch": 5.342362678705793,
"grad_norm": 6.7226104736328125,
"learning_rate": 8.143131845163449e-06,
"loss": 0.1567,
"step": 7100
},
{
"epoch": 5.349887133182844,
"grad_norm": 7.858177661895752,
"learning_rate": 8.1389515926762e-06,
"loss": 0.1619,
"step": 7110
},
{
"epoch": 5.357411587659895,
"grad_norm": 7.37957763671875,
"learning_rate": 8.134771340188948e-06,
"loss": 0.1582,
"step": 7120
},
{
"epoch": 5.364936042136945,
"grad_norm": 9.118856430053711,
"learning_rate": 8.130591087701698e-06,
"loss": 0.2016,
"step": 7130
},
{
"epoch": 5.3724604966139955,
"grad_norm": 11.332245826721191,
"learning_rate": 8.126410835214447e-06,
"loss": 0.1848,
"step": 7140
},
{
"epoch": 5.379984951091046,
"grad_norm": 8.34212589263916,
"learning_rate": 8.122230582727197e-06,
"loss": 0.1948,
"step": 7150
},
{
"epoch": 5.387509405568096,
"grad_norm": 8.121428489685059,
"learning_rate": 8.118050330239948e-06,
"loss": 0.2094,
"step": 7160
},
{
"epoch": 5.395033860045147,
"grad_norm": 8.244184494018555,
"learning_rate": 8.113870077752696e-06,
"loss": 0.1341,
"step": 7170
},
{
"epoch": 5.402558314522198,
"grad_norm": 5.444628715515137,
"learning_rate": 8.109689825265447e-06,
"loss": 0.1488,
"step": 7180
},
{
"epoch": 5.4100827689992474,
"grad_norm": 7.506073951721191,
"learning_rate": 8.105509572778196e-06,
"loss": 0.1999,
"step": 7190
},
{
"epoch": 5.417607223476298,
"grad_norm": 4.846856594085693,
"learning_rate": 8.101329320290946e-06,
"loss": 0.2043,
"step": 7200
},
{
"epoch": 5.425131677953348,
"grad_norm": 11.258270263671875,
"learning_rate": 8.097149067803696e-06,
"loss": 0.1968,
"step": 7210
},
{
"epoch": 5.432656132430399,
"grad_norm": 4.456273078918457,
"learning_rate": 8.092968815316445e-06,
"loss": 0.1815,
"step": 7220
},
{
"epoch": 5.4401805869074495,
"grad_norm": 5.355717182159424,
"learning_rate": 8.088788562829195e-06,
"loss": 0.1554,
"step": 7230
},
{
"epoch": 5.447705041384499,
"grad_norm": 8.809466361999512,
"learning_rate": 8.084608310341944e-06,
"loss": 0.2145,
"step": 7240
},
{
"epoch": 5.45522949586155,
"grad_norm": 8.372282981872559,
"learning_rate": 8.080428057854695e-06,
"loss": 0.2094,
"step": 7250
},
{
"epoch": 5.4627539503386,
"grad_norm": 7.424798488616943,
"learning_rate": 8.076247805367445e-06,
"loss": 0.1853,
"step": 7260
},
{
"epoch": 5.470278404815651,
"grad_norm": 8.875120162963867,
"learning_rate": 8.072067552880195e-06,
"loss": 0.1997,
"step": 7270
},
{
"epoch": 5.4778028592927015,
"grad_norm": 7.930933475494385,
"learning_rate": 8.067887300392944e-06,
"loss": 0.212,
"step": 7280
},
{
"epoch": 5.485327313769751,
"grad_norm": 5.328012466430664,
"learning_rate": 8.063707047905694e-06,
"loss": 0.158,
"step": 7290
},
{
"epoch": 5.492851768246802,
"grad_norm": 6.645898818969727,
"learning_rate": 8.059526795418445e-06,
"loss": 0.168,
"step": 7300
},
{
"epoch": 5.500376222723853,
"grad_norm": 9.081933975219727,
"learning_rate": 8.055346542931194e-06,
"loss": 0.1814,
"step": 7310
},
{
"epoch": 5.507900677200903,
"grad_norm": 7.437675476074219,
"learning_rate": 8.051166290443944e-06,
"loss": 0.2142,
"step": 7320
},
{
"epoch": 5.5154251316779535,
"grad_norm": 6.354644775390625,
"learning_rate": 8.046986037956693e-06,
"loss": 0.165,
"step": 7330
},
{
"epoch": 5.522949586155004,
"grad_norm": 7.42842435836792,
"learning_rate": 8.042805785469443e-06,
"loss": 0.1831,
"step": 7340
},
{
"epoch": 5.530474040632054,
"grad_norm": 9.511542320251465,
"learning_rate": 8.038625532982193e-06,
"loss": 0.1782,
"step": 7350
},
{
"epoch": 5.537998495109105,
"grad_norm": 7.416419982910156,
"learning_rate": 8.034445280494942e-06,
"loss": 0.1983,
"step": 7360
},
{
"epoch": 5.545522949586155,
"grad_norm": 8.564135551452637,
"learning_rate": 8.030265028007693e-06,
"loss": 0.1695,
"step": 7370
},
{
"epoch": 5.553047404063205,
"grad_norm": 6.778014659881592,
"learning_rate": 8.026084775520441e-06,
"loss": 0.1599,
"step": 7380
},
{
"epoch": 5.560571858540256,
"grad_norm": 6.545214653015137,
"learning_rate": 8.021904523033192e-06,
"loss": 0.1395,
"step": 7390
},
{
"epoch": 5.568096313017306,
"grad_norm": 3.063966989517212,
"learning_rate": 8.017724270545942e-06,
"loss": 0.1383,
"step": 7400
},
{
"epoch": 5.575620767494357,
"grad_norm": 5.198939800262451,
"learning_rate": 8.01354401805869e-06,
"loss": 0.176,
"step": 7410
},
{
"epoch": 5.583145221971407,
"grad_norm": 8.018445014953613,
"learning_rate": 8.009363765571441e-06,
"loss": 0.1773,
"step": 7420
},
{
"epoch": 5.590669676448457,
"grad_norm": 8.4862060546875,
"learning_rate": 8.00518351308419e-06,
"loss": 0.2351,
"step": 7430
},
{
"epoch": 5.598194130925508,
"grad_norm": 3.4017207622528076,
"learning_rate": 8.001003260596942e-06,
"loss": 0.1826,
"step": 7440
},
{
"epoch": 5.605718585402558,
"grad_norm": 8.308197021484375,
"learning_rate": 7.99682300810969e-06,
"loss": 0.2002,
"step": 7450
},
{
"epoch": 5.613243039879609,
"grad_norm": 6.5446038246154785,
"learning_rate": 7.992642755622441e-06,
"loss": 0.1472,
"step": 7460
},
{
"epoch": 5.6207674943566595,
"grad_norm": 12.85771369934082,
"learning_rate": 7.98846250313519e-06,
"loss": 0.2382,
"step": 7470
},
{
"epoch": 5.628291948833709,
"grad_norm": 7.108580589294434,
"learning_rate": 7.98428225064794e-06,
"loss": 0.1629,
"step": 7480
},
{
"epoch": 5.63581640331076,
"grad_norm": 10.055646896362305,
"learning_rate": 7.98010199816069e-06,
"loss": 0.2107,
"step": 7490
},
{
"epoch": 5.643340857787811,
"grad_norm": 3.524488687515259,
"learning_rate": 7.97592174567344e-06,
"loss": 0.1614,
"step": 7500
},
{
"epoch": 5.650865312264861,
"grad_norm": 8.577242851257324,
"learning_rate": 7.97174149318619e-06,
"loss": 0.1752,
"step": 7510
},
{
"epoch": 5.658389766741911,
"grad_norm": 4.436498641967773,
"learning_rate": 7.967561240698938e-06,
"loss": 0.2289,
"step": 7520
},
{
"epoch": 5.665914221218961,
"grad_norm": 9.47412395477295,
"learning_rate": 7.963380988211689e-06,
"loss": 0.1646,
"step": 7530
},
{
"epoch": 5.673438675696012,
"grad_norm": 11.120841026306152,
"learning_rate": 7.959200735724439e-06,
"loss": 0.1805,
"step": 7540
},
{
"epoch": 5.680963130173063,
"grad_norm": 8.272597312927246,
"learning_rate": 7.955020483237188e-06,
"loss": 0.2055,
"step": 7550
},
{
"epoch": 5.688487584650113,
"grad_norm": 7.943882465362549,
"learning_rate": 7.950840230749938e-06,
"loss": 0.1849,
"step": 7560
},
{
"epoch": 5.696012039127163,
"grad_norm": 9.238296508789062,
"learning_rate": 7.946659978262687e-06,
"loss": 0.1651,
"step": 7570
},
{
"epoch": 5.703536493604213,
"grad_norm": 11.098034858703613,
"learning_rate": 7.942479725775437e-06,
"loss": 0.216,
"step": 7580
},
{
"epoch": 5.711060948081264,
"grad_norm": 11.07455825805664,
"learning_rate": 7.938299473288188e-06,
"loss": 0.2109,
"step": 7590
},
{
"epoch": 5.718585402558315,
"grad_norm": 6.9234819412231445,
"learning_rate": 7.934119220800936e-06,
"loss": 0.1456,
"step": 7600
},
{
"epoch": 5.726109857035365,
"grad_norm": 5.608709812164307,
"learning_rate": 7.929938968313687e-06,
"loss": 0.1424,
"step": 7610
},
{
"epoch": 5.733634311512415,
"grad_norm": 13.218008995056152,
"learning_rate": 7.925758715826437e-06,
"loss": 0.1895,
"step": 7620
},
{
"epoch": 5.741158765989466,
"grad_norm": 6.975159168243408,
"learning_rate": 7.921578463339188e-06,
"loss": 0.1616,
"step": 7630
},
{
"epoch": 5.748683220466516,
"grad_norm": 8.910100936889648,
"learning_rate": 7.917398210851936e-06,
"loss": 0.2042,
"step": 7640
},
{
"epoch": 5.756207674943567,
"grad_norm": 8.068119049072266,
"learning_rate": 7.913217958364687e-06,
"loss": 0.1748,
"step": 7650
},
{
"epoch": 5.763732129420617,
"grad_norm": 9.351244926452637,
"learning_rate": 7.909037705877435e-06,
"loss": 0.169,
"step": 7660
},
{
"epoch": 5.771256583897667,
"grad_norm": 7.046148300170898,
"learning_rate": 7.904857453390186e-06,
"loss": 0.2126,
"step": 7670
},
{
"epoch": 5.778781038374718,
"grad_norm": 7.9526686668396,
"learning_rate": 7.900677200902936e-06,
"loss": 0.1893,
"step": 7680
},
{
"epoch": 5.786305492851768,
"grad_norm": 10.231925964355469,
"learning_rate": 7.896496948415685e-06,
"loss": 0.2103,
"step": 7690
},
{
"epoch": 5.793829947328819,
"grad_norm": 9.099306106567383,
"learning_rate": 7.892316695928435e-06,
"loss": 0.1875,
"step": 7700
},
{
"epoch": 5.801354401805869,
"grad_norm": 9.167414665222168,
"learning_rate": 7.888136443441184e-06,
"loss": 0.1893,
"step": 7710
},
{
"epoch": 5.808878856282919,
"grad_norm": 12.738779067993164,
"learning_rate": 7.883956190953934e-06,
"loss": 0.2392,
"step": 7720
},
{
"epoch": 5.81640331075997,
"grad_norm": 7.125742435455322,
"learning_rate": 7.879775938466683e-06,
"loss": 0.1697,
"step": 7730
},
{
"epoch": 5.82392776523702,
"grad_norm": 6.5781450271606445,
"learning_rate": 7.875595685979433e-06,
"loss": 0.1694,
"step": 7740
},
{
"epoch": 5.831452219714071,
"grad_norm": 4.691234588623047,
"learning_rate": 7.871415433492184e-06,
"loss": 0.2257,
"step": 7750
},
{
"epoch": 5.838976674191121,
"grad_norm": 8.69082260131836,
"learning_rate": 7.867235181004933e-06,
"loss": 0.2275,
"step": 7760
},
{
"epoch": 5.846501128668171,
"grad_norm": 9.822325706481934,
"learning_rate": 7.863054928517683e-06,
"loss": 0.1628,
"step": 7770
},
{
"epoch": 5.854025583145222,
"grad_norm": 4.987610816955566,
"learning_rate": 7.858874676030432e-06,
"loss": 0.1633,
"step": 7780
},
{
"epoch": 5.861550037622273,
"grad_norm": 7.953482151031494,
"learning_rate": 7.854694423543182e-06,
"loss": 0.1631,
"step": 7790
},
{
"epoch": 5.8690744920993225,
"grad_norm": 9.97362995147705,
"learning_rate": 7.850514171055932e-06,
"loss": 0.2077,
"step": 7800
},
{
"epoch": 5.876598946576373,
"grad_norm": 10.382735252380371,
"learning_rate": 7.846333918568683e-06,
"loss": 0.1484,
"step": 7810
},
{
"epoch": 5.884123401053424,
"grad_norm": 11.967596054077148,
"learning_rate": 7.842153666081432e-06,
"loss": 0.1525,
"step": 7820
},
{
"epoch": 5.891647855530474,
"grad_norm": 6.947834014892578,
"learning_rate": 7.837973413594182e-06,
"loss": 0.1685,
"step": 7830
},
{
"epoch": 5.899172310007525,
"grad_norm": 9.017217636108398,
"learning_rate": 7.833793161106932e-06,
"loss": 0.1737,
"step": 7840
},
{
"epoch": 5.9066967644845745,
"grad_norm": 9.230958938598633,
"learning_rate": 7.829612908619681e-06,
"loss": 0.142,
"step": 7850
},
{
"epoch": 5.914221218961625,
"grad_norm": 7.715651035308838,
"learning_rate": 7.825432656132431e-06,
"loss": 0.1249,
"step": 7860
},
{
"epoch": 5.921745673438676,
"grad_norm": 2.6331450939178467,
"learning_rate": 7.82125240364518e-06,
"loss": 0.1421,
"step": 7870
},
{
"epoch": 5.929270127915726,
"grad_norm": 4.872918128967285,
"learning_rate": 7.81707215115793e-06,
"loss": 0.1519,
"step": 7880
},
{
"epoch": 5.936794582392777,
"grad_norm": 11.077669143676758,
"learning_rate": 7.812891898670681e-06,
"loss": 0.1566,
"step": 7890
},
{
"epoch": 5.944319036869827,
"grad_norm": 9.00078296661377,
"learning_rate": 7.80871164618343e-06,
"loss": 0.1939,
"step": 7900
},
{
"epoch": 5.951843491346877,
"grad_norm": 9.66382122039795,
"learning_rate": 7.80453139369618e-06,
"loss": 0.1994,
"step": 7910
},
{
"epoch": 5.959367945823928,
"grad_norm": 6.525386333465576,
"learning_rate": 7.800351141208929e-06,
"loss": 0.1517,
"step": 7920
},
{
"epoch": 5.966892400300978,
"grad_norm": 5.410614967346191,
"learning_rate": 7.796170888721679e-06,
"loss": 0.1293,
"step": 7930
},
{
"epoch": 5.9744168547780285,
"grad_norm": 8.867262840270996,
"learning_rate": 7.79199063623443e-06,
"loss": 0.1892,
"step": 7940
},
{
"epoch": 5.981941309255079,
"grad_norm": 8.707972526550293,
"learning_rate": 7.787810383747178e-06,
"loss": 0.1489,
"step": 7950
},
{
"epoch": 5.989465763732129,
"grad_norm": 10.210127830505371,
"learning_rate": 7.783630131259929e-06,
"loss": 0.1879,
"step": 7960
},
{
"epoch": 5.99699021820918,
"grad_norm": 5.96549129486084,
"learning_rate": 7.779449878772677e-06,
"loss": 0.1821,
"step": 7970
},
{
"epoch": 6.004514672686231,
"grad_norm": 7.090506553649902,
"learning_rate": 7.77526962628543e-06,
"loss": 0.1218,
"step": 7980
},
{
"epoch": 6.0120391271632805,
"grad_norm": 8.773550033569336,
"learning_rate": 7.771089373798178e-06,
"loss": 0.1558,
"step": 7990
},
{
"epoch": 6.019563581640331,
"grad_norm": 8.615151405334473,
"learning_rate": 7.766909121310929e-06,
"loss": 0.1811,
"step": 8000
},
{
"epoch": 6.027088036117381,
"grad_norm": 4.9068922996521,
"learning_rate": 7.762728868823677e-06,
"loss": 0.2012,
"step": 8010
},
{
"epoch": 6.034612490594432,
"grad_norm": 9.907990455627441,
"learning_rate": 7.758548616336428e-06,
"loss": 0.1682,
"step": 8020
},
{
"epoch": 6.042136945071483,
"grad_norm": 5.487863063812256,
"learning_rate": 7.754368363849178e-06,
"loss": 0.1742,
"step": 8030
},
{
"epoch": 6.049661399548532,
"grad_norm": 8.537084579467773,
"learning_rate": 7.750188111361927e-06,
"loss": 0.1652,
"step": 8040
},
{
"epoch": 6.057185854025583,
"grad_norm": 10.835139274597168,
"learning_rate": 7.746007858874677e-06,
"loss": 0.1993,
"step": 8050
},
{
"epoch": 6.064710308502634,
"grad_norm": 7.074265003204346,
"learning_rate": 7.741827606387426e-06,
"loss": 0.2075,
"step": 8060
},
{
"epoch": 6.072234762979684,
"grad_norm": 6.608835697174072,
"learning_rate": 7.737647353900176e-06,
"loss": 0.144,
"step": 8070
},
{
"epoch": 6.0797592174567345,
"grad_norm": 11.262980461120605,
"learning_rate": 7.733467101412927e-06,
"loss": 0.1397,
"step": 8080
},
{
"epoch": 6.087283671933784,
"grad_norm": 6.517685890197754,
"learning_rate": 7.729286848925675e-06,
"loss": 0.1875,
"step": 8090
},
{
"epoch": 6.094808126410835,
"grad_norm": 6.834563255310059,
"learning_rate": 7.725106596438426e-06,
"loss": 0.1883,
"step": 8100
},
{
"epoch": 6.102332580887886,
"grad_norm": 7.841973304748535,
"learning_rate": 7.720926343951174e-06,
"loss": 0.1722,
"step": 8110
},
{
"epoch": 6.109857035364936,
"grad_norm": 7.006568431854248,
"learning_rate": 7.716746091463925e-06,
"loss": 0.1157,
"step": 8120
},
{
"epoch": 6.1173814898419865,
"grad_norm": 7.466751575469971,
"learning_rate": 7.712565838976675e-06,
"loss": 0.122,
"step": 8130
},
{
"epoch": 6.124905944319037,
"grad_norm": 6.685294151306152,
"learning_rate": 7.708385586489424e-06,
"loss": 0.1509,
"step": 8140
},
{
"epoch": 6.132430398796087,
"grad_norm": 9.151078224182129,
"learning_rate": 7.704205334002174e-06,
"loss": 0.1428,
"step": 8150
},
{
"epoch": 6.139954853273138,
"grad_norm": 9.963911056518555,
"learning_rate": 7.700025081514925e-06,
"loss": 0.1362,
"step": 8160
},
{
"epoch": 6.147479307750188,
"grad_norm": 7.9521942138671875,
"learning_rate": 7.695844829027675e-06,
"loss": 0.2037,
"step": 8170
},
{
"epoch": 6.155003762227238,
"grad_norm": 10.13383960723877,
"learning_rate": 7.691664576540424e-06,
"loss": 0.1683,
"step": 8180
},
{
"epoch": 6.162528216704289,
"grad_norm": 7.142587184906006,
"learning_rate": 7.687484324053174e-06,
"loss": 0.1904,
"step": 8190
},
{
"epoch": 6.170052671181339,
"grad_norm": 5.827476501464844,
"learning_rate": 7.683304071565923e-06,
"loss": 0.1182,
"step": 8200
},
{
"epoch": 6.17757712565839,
"grad_norm": 3.5819246768951416,
"learning_rate": 7.679123819078673e-06,
"loss": 0.1291,
"step": 8210
},
{
"epoch": 6.1851015801354405,
"grad_norm": 10.036810874938965,
"learning_rate": 7.674943566591424e-06,
"loss": 0.1531,
"step": 8220
},
{
"epoch": 6.19262603461249,
"grad_norm": 10.45486068725586,
"learning_rate": 7.670763314104172e-06,
"loss": 0.1943,
"step": 8230
},
{
"epoch": 6.200150489089541,
"grad_norm": 2.4924612045288086,
"learning_rate": 7.666583061616923e-06,
"loss": 0.148,
"step": 8240
},
{
"epoch": 6.207674943566591,
"grad_norm": 4.770188808441162,
"learning_rate": 7.662402809129671e-06,
"loss": 0.14,
"step": 8250
},
{
"epoch": 6.215199398043642,
"grad_norm": 11.903753280639648,
"learning_rate": 7.658222556642422e-06,
"loss": 0.1417,
"step": 8260
},
{
"epoch": 6.2227238525206925,
"grad_norm": 2.068080425262451,
"learning_rate": 7.654042304155172e-06,
"loss": 0.1679,
"step": 8270
},
{
"epoch": 6.230248306997742,
"grad_norm": 7.85164737701416,
"learning_rate": 7.649862051667921e-06,
"loss": 0.1778,
"step": 8280
},
{
"epoch": 6.237772761474793,
"grad_norm": 5.108582019805908,
"learning_rate": 7.645681799180671e-06,
"loss": 0.1471,
"step": 8290
},
{
"epoch": 6.245297215951844,
"grad_norm": 4.792969703674316,
"learning_rate": 7.64150154669342e-06,
"loss": 0.126,
"step": 8300
},
{
"epoch": 6.252821670428894,
"grad_norm": 8.351136207580566,
"learning_rate": 7.63732129420617e-06,
"loss": 0.1144,
"step": 8310
},
{
"epoch": 6.260346124905944,
"grad_norm": 9.570772171020508,
"learning_rate": 7.63314104171892e-06,
"loss": 0.2101,
"step": 8320
},
{
"epoch": 6.267870579382995,
"grad_norm": 10.960972785949707,
"learning_rate": 7.6289607892316695e-06,
"loss": 0.2095,
"step": 8330
},
{
"epoch": 6.275395033860045,
"grad_norm": 8.7171049118042,
"learning_rate": 7.624780536744421e-06,
"loss": 0.1396,
"step": 8340
},
{
"epoch": 6.282919488337096,
"grad_norm": 6.793865203857422,
"learning_rate": 7.62060028425717e-06,
"loss": 0.1524,
"step": 8350
},
{
"epoch": 6.290443942814146,
"grad_norm": 13.218555450439453,
"learning_rate": 7.61642003176992e-06,
"loss": 0.1649,
"step": 8360
},
{
"epoch": 6.297968397291196,
"grad_norm": 5.084217071533203,
"learning_rate": 7.6122397792826694e-06,
"loss": 0.144,
"step": 8370
},
{
"epoch": 6.305492851768247,
"grad_norm": 9.322858810424805,
"learning_rate": 7.608059526795419e-06,
"loss": 0.1576,
"step": 8380
},
{
"epoch": 6.313017306245297,
"grad_norm": 7.766299247741699,
"learning_rate": 7.603879274308169e-06,
"loss": 0.1581,
"step": 8390
},
{
"epoch": 6.320541760722348,
"grad_norm": 10.082528114318848,
"learning_rate": 7.599699021820919e-06,
"loss": 0.1198,
"step": 8400
},
{
"epoch": 6.328066215199398,
"grad_norm": 6.375280857086182,
"learning_rate": 7.5955187693336685e-06,
"loss": 0.1053,
"step": 8410
},
{
"epoch": 6.335590669676448,
"grad_norm": 8.377439498901367,
"learning_rate": 7.591338516846418e-06,
"loss": 0.1259,
"step": 8420
},
{
"epoch": 6.343115124153499,
"grad_norm": 7.567884922027588,
"learning_rate": 7.5871582643591676e-06,
"loss": 0.1555,
"step": 8430
},
{
"epoch": 6.350639578630549,
"grad_norm": 9.82593059539795,
"learning_rate": 7.582978011871918e-06,
"loss": 0.228,
"step": 8440
},
{
"epoch": 6.3581640331076,
"grad_norm": 3.6331682205200195,
"learning_rate": 7.5787977593846675e-06,
"loss": 0.1114,
"step": 8450
},
{
"epoch": 6.3656884875846504,
"grad_norm": 6.77184534072876,
"learning_rate": 7.574617506897417e-06,
"loss": 0.1114,
"step": 8460
},
{
"epoch": 6.3732129420617,
"grad_norm": 9.455371856689453,
"learning_rate": 7.570437254410167e-06,
"loss": 0.1958,
"step": 8470
},
{
"epoch": 6.380737396538751,
"grad_norm": 9.314596176147461,
"learning_rate": 7.566257001922916e-06,
"loss": 0.1667,
"step": 8480
},
{
"epoch": 6.388261851015802,
"grad_norm": 7.729313850402832,
"learning_rate": 7.5620767494356666e-06,
"loss": 0.1515,
"step": 8490
},
{
"epoch": 6.395786305492852,
"grad_norm": 6.675351619720459,
"learning_rate": 7.557896496948416e-06,
"loss": 0.1819,
"step": 8500
},
{
"epoch": 6.403310759969902,
"grad_norm": 8.883378028869629,
"learning_rate": 7.553716244461166e-06,
"loss": 0.1788,
"step": 8510
},
{
"epoch": 6.410835214446952,
"grad_norm": 5.213517665863037,
"learning_rate": 7.549535991973915e-06,
"loss": 0.1769,
"step": 8520
},
{
"epoch": 6.418359668924003,
"grad_norm": 7.817873954772949,
"learning_rate": 7.5453557394866664e-06,
"loss": 0.1976,
"step": 8530
},
{
"epoch": 6.425884123401054,
"grad_norm": 10.222846984863281,
"learning_rate": 7.541175486999416e-06,
"loss": 0.209,
"step": 8540
},
{
"epoch": 6.433408577878104,
"grad_norm": 7.177674770355225,
"learning_rate": 7.5369952345121655e-06,
"loss": 0.1755,
"step": 8550
},
{
"epoch": 6.440933032355154,
"grad_norm": 8.744006156921387,
"learning_rate": 7.532814982024915e-06,
"loss": 0.1434,
"step": 8560
},
{
"epoch": 6.448457486832204,
"grad_norm": 7.118343353271484,
"learning_rate": 7.528634729537665e-06,
"loss": 0.1724,
"step": 8570
},
{
"epoch": 6.455981941309255,
"grad_norm": 10.586155891418457,
"learning_rate": 7.524454477050415e-06,
"loss": 0.1731,
"step": 8580
},
{
"epoch": 6.463506395786306,
"grad_norm": 4.590671539306641,
"learning_rate": 7.520274224563165e-06,
"loss": 0.1541,
"step": 8590
},
{
"epoch": 6.471030850263356,
"grad_norm": 5.464056015014648,
"learning_rate": 7.516093972075914e-06,
"loss": 0.1574,
"step": 8600
},
{
"epoch": 6.478555304740406,
"grad_norm": 8.129666328430176,
"learning_rate": 7.511913719588664e-06,
"loss": 0.1433,
"step": 8610
},
{
"epoch": 6.486079759217457,
"grad_norm": 5.378341197967529,
"learning_rate": 7.507733467101413e-06,
"loss": 0.1796,
"step": 8620
},
{
"epoch": 6.493604213694507,
"grad_norm": 6.7451605796813965,
"learning_rate": 7.503553214614163e-06,
"loss": 0.1559,
"step": 8630
},
{
"epoch": 6.501128668171558,
"grad_norm": 8.130012512207031,
"learning_rate": 7.499372962126913e-06,
"loss": 0.1444,
"step": 8640
},
{
"epoch": 6.508653122648608,
"grad_norm": 12.124320983886719,
"learning_rate": 7.495192709639663e-06,
"loss": 0.1965,
"step": 8650
},
{
"epoch": 6.516177577125658,
"grad_norm": 6.617115020751953,
"learning_rate": 7.491012457152412e-06,
"loss": 0.1638,
"step": 8660
},
{
"epoch": 6.523702031602709,
"grad_norm": 9.720376968383789,
"learning_rate": 7.486832204665162e-06,
"loss": 0.1831,
"step": 8670
},
{
"epoch": 6.531226486079759,
"grad_norm": 9.170286178588867,
"learning_rate": 7.482651952177911e-06,
"loss": 0.1484,
"step": 8680
},
{
"epoch": 6.53875094055681,
"grad_norm": 7.603338241577148,
"learning_rate": 7.478471699690662e-06,
"loss": 0.1604,
"step": 8690
},
{
"epoch": 6.54627539503386,
"grad_norm": 10.253371238708496,
"learning_rate": 7.474291447203411e-06,
"loss": 0.1618,
"step": 8700
},
{
"epoch": 6.55379984951091,
"grad_norm": 7.7780890464782715,
"learning_rate": 7.470111194716162e-06,
"loss": 0.1819,
"step": 8710
},
{
"epoch": 6.561324303987961,
"grad_norm": 8.585005760192871,
"learning_rate": 7.465930942228912e-06,
"loss": 0.1276,
"step": 8720
},
{
"epoch": 6.568848758465011,
"grad_norm": 7.06712532043457,
"learning_rate": 7.461750689741662e-06,
"loss": 0.1413,
"step": 8730
},
{
"epoch": 6.576373212942062,
"grad_norm": 8.646645545959473,
"learning_rate": 7.457570437254411e-06,
"loss": 0.1467,
"step": 8740
},
{
"epoch": 6.583897667419112,
"grad_norm": 4.02337646484375,
"learning_rate": 7.453390184767161e-06,
"loss": 0.1563,
"step": 8750
},
{
"epoch": 6.591422121896162,
"grad_norm": 8.277909278869629,
"learning_rate": 7.44920993227991e-06,
"loss": 0.1693,
"step": 8760
},
{
"epoch": 6.598946576373213,
"grad_norm": 10.024460792541504,
"learning_rate": 7.44502967979266e-06,
"loss": 0.1785,
"step": 8770
},
{
"epoch": 6.606471030850264,
"grad_norm": 8.2098388671875,
"learning_rate": 7.44084942730541e-06,
"loss": 0.1633,
"step": 8780
},
{
"epoch": 6.6139954853273135,
"grad_norm": 11.487617492675781,
"learning_rate": 7.43666917481816e-06,
"loss": 0.142,
"step": 8790
},
{
"epoch": 6.621519939804364,
"grad_norm": 10.144819259643555,
"learning_rate": 7.432488922330909e-06,
"loss": 0.1462,
"step": 8800
},
{
"epoch": 6.629044394281415,
"grad_norm": 7.244460582733154,
"learning_rate": 7.428308669843659e-06,
"loss": 0.1517,
"step": 8810
},
{
"epoch": 6.636568848758465,
"grad_norm": 2.242067575454712,
"learning_rate": 7.4241284173564084e-06,
"loss": 0.1339,
"step": 8820
},
{
"epoch": 6.644093303235516,
"grad_norm": 6.76652193069458,
"learning_rate": 7.419948164869159e-06,
"loss": 0.1485,
"step": 8830
},
{
"epoch": 6.6516177577125655,
"grad_norm": 10.644311904907227,
"learning_rate": 7.415767912381908e-06,
"loss": 0.1635,
"step": 8840
},
{
"epoch": 6.659142212189616,
"grad_norm": 10.187577247619629,
"learning_rate": 7.411587659894658e-06,
"loss": 0.1605,
"step": 8850
},
{
"epoch": 6.666666666666667,
"grad_norm": 4.452393054962158,
"learning_rate": 7.4074074074074075e-06,
"loss": 0.1381,
"step": 8860
},
{
"epoch": 6.674191121143717,
"grad_norm": 4.998362064361572,
"learning_rate": 7.403227154920157e-06,
"loss": 0.1349,
"step": 8870
},
{
"epoch": 6.681715575620768,
"grad_norm": 6.7441534996032715,
"learning_rate": 7.3990469024329074e-06,
"loss": 0.1463,
"step": 8880
},
{
"epoch": 6.689240030097817,
"grad_norm": 7.791492462158203,
"learning_rate": 7.394866649945658e-06,
"loss": 0.1352,
"step": 8890
},
{
"epoch": 6.696764484574868,
"grad_norm": 4.74707555770874,
"learning_rate": 7.390686397458407e-06,
"loss": 0.1588,
"step": 8900
},
{
"epoch": 6.704288939051919,
"grad_norm": 6.361732482910156,
"learning_rate": 7.386506144971157e-06,
"loss": 0.194,
"step": 8910
},
{
"epoch": 6.711813393528969,
"grad_norm": 5.4700093269348145,
"learning_rate": 7.382325892483907e-06,
"loss": 0.0973,
"step": 8920
},
{
"epoch": 6.7193378480060195,
"grad_norm": 6.188055992126465,
"learning_rate": 7.378145639996657e-06,
"loss": 0.1872,
"step": 8930
},
{
"epoch": 6.72686230248307,
"grad_norm": 9.050508499145508,
"learning_rate": 7.373965387509406e-06,
"loss": 0.1253,
"step": 8940
},
{
"epoch": 6.73438675696012,
"grad_norm": 7.569061756134033,
"learning_rate": 7.369785135022156e-06,
"loss": 0.14,
"step": 8950
},
{
"epoch": 6.741911211437171,
"grad_norm": 4.738898754119873,
"learning_rate": 7.3656048825349055e-06,
"loss": 0.1375,
"step": 8960
},
{
"epoch": 6.749435665914222,
"grad_norm": 8.632952690124512,
"learning_rate": 7.361424630047656e-06,
"loss": 0.1495,
"step": 8970
},
{
"epoch": 6.7569601203912715,
"grad_norm": 7.746445655822754,
"learning_rate": 7.3572443775604055e-06,
"loss": 0.1338,
"step": 8980
},
{
"epoch": 6.764484574868322,
"grad_norm": 8.78629207611084,
"learning_rate": 7.353064125073155e-06,
"loss": 0.1672,
"step": 8990
},
{
"epoch": 6.772009029345372,
"grad_norm": 10.222789764404297,
"learning_rate": 7.3488838725859046e-06,
"loss": 0.2174,
"step": 9000
},
{
"epoch": 6.779533483822423,
"grad_norm": 9.454453468322754,
"learning_rate": 7.344703620098654e-06,
"loss": 0.2156,
"step": 9010
},
{
"epoch": 6.787057938299474,
"grad_norm": 9.131001472473145,
"learning_rate": 7.340523367611404e-06,
"loss": 0.1334,
"step": 9020
},
{
"epoch": 6.794582392776523,
"grad_norm": 9.862558364868164,
"learning_rate": 7.336343115124154e-06,
"loss": 0.1254,
"step": 9030
},
{
"epoch": 6.802106847253574,
"grad_norm": 12.024687767028809,
"learning_rate": 7.332162862636904e-06,
"loss": 0.1612,
"step": 9040
},
{
"epoch": 6.809631301730624,
"grad_norm": 6.9240946769714355,
"learning_rate": 7.327982610149653e-06,
"loss": 0.1136,
"step": 9050
},
{
"epoch": 6.817155756207675,
"grad_norm": 6.730030536651611,
"learning_rate": 7.323802357662403e-06,
"loss": 0.1415,
"step": 9060
},
{
"epoch": 6.8246802106847255,
"grad_norm": 10.351944923400879,
"learning_rate": 7.319622105175154e-06,
"loss": 0.1725,
"step": 9070
},
{
"epoch": 6.832204665161775,
"grad_norm": 5.732717514038086,
"learning_rate": 7.3154418526879035e-06,
"loss": 0.1599,
"step": 9080
},
{
"epoch": 6.839729119638826,
"grad_norm": 10.521687507629395,
"learning_rate": 7.311261600200653e-06,
"loss": 0.1625,
"step": 9090
},
{
"epoch": 6.847253574115877,
"grad_norm": 9.526365280151367,
"learning_rate": 7.307081347713403e-06,
"loss": 0.1078,
"step": 9100
},
{
"epoch": 6.854778028592927,
"grad_norm": 2.771315097808838,
"learning_rate": 7.302901095226152e-06,
"loss": 0.123,
"step": 9110
},
{
"epoch": 6.8623024830699775,
"grad_norm": 9.39211654663086,
"learning_rate": 7.2987208427389025e-06,
"loss": 0.1554,
"step": 9120
},
{
"epoch": 6.869826937547028,
"grad_norm": 9.30453109741211,
"learning_rate": 7.294540590251652e-06,
"loss": 0.1153,
"step": 9130
},
{
"epoch": 6.877351392024078,
"grad_norm": 8.631791114807129,
"learning_rate": 7.290360337764402e-06,
"loss": 0.1256,
"step": 9140
},
{
"epoch": 6.884875846501129,
"grad_norm": 7.919106960296631,
"learning_rate": 7.286180085277151e-06,
"loss": 0.226,
"step": 9150
},
{
"epoch": 6.89240030097818,
"grad_norm": 12.256912231445312,
"learning_rate": 7.281999832789901e-06,
"loss": 0.1843,
"step": 9160
},
{
"epoch": 6.899924755455229,
"grad_norm": 9.76773452758789,
"learning_rate": 7.277819580302651e-06,
"loss": 0.1758,
"step": 9170
},
{
"epoch": 6.90744920993228,
"grad_norm": 7.5923075675964355,
"learning_rate": 7.273639327815401e-06,
"loss": 0.1876,
"step": 9180
},
{
"epoch": 6.91497366440933,
"grad_norm": 9.641352653503418,
"learning_rate": 7.26945907532815e-06,
"loss": 0.1493,
"step": 9190
},
{
"epoch": 6.922498118886381,
"grad_norm": 7.658628940582275,
"learning_rate": 7.2652788228409e-06,
"loss": 0.1506,
"step": 9200
},
{
"epoch": 6.9300225733634315,
"grad_norm": 10.023941040039062,
"learning_rate": 7.261098570353649e-06,
"loss": 0.1427,
"step": 9210
},
{
"epoch": 6.937547027840481,
"grad_norm": 10.135645866394043,
"learning_rate": 7.2569183178664e-06,
"loss": 0.1777,
"step": 9220
},
{
"epoch": 6.945071482317532,
"grad_norm": 8.289971351623535,
"learning_rate": 7.252738065379149e-06,
"loss": 0.1751,
"step": 9230
},
{
"epoch": 6.952595936794582,
"grad_norm": 8.860151290893555,
"learning_rate": 7.248557812891899e-06,
"loss": 0.1426,
"step": 9240
},
{
"epoch": 6.960120391271633,
"grad_norm": 6.4370903968811035,
"learning_rate": 7.244377560404649e-06,
"loss": 0.1784,
"step": 9250
},
{
"epoch": 6.9676448457486835,
"grad_norm": 12.02550220489502,
"learning_rate": 7.2401973079174e-06,
"loss": 0.1894,
"step": 9260
},
{
"epoch": 6.975169300225733,
"grad_norm": 10.728776931762695,
"learning_rate": 7.236017055430149e-06,
"loss": 0.1437,
"step": 9270
},
{
"epoch": 6.982693754702784,
"grad_norm": 6.684060096740723,
"learning_rate": 7.231836802942899e-06,
"loss": 0.1483,
"step": 9280
},
{
"epoch": 6.990218209179835,
"grad_norm": 9.681809425354004,
"learning_rate": 7.227656550455648e-06,
"loss": 0.1908,
"step": 9290
},
{
"epoch": 6.997742663656885,
"grad_norm": 7.41337251663208,
"learning_rate": 7.223476297968398e-06,
"loss": 0.1655,
"step": 9300
},
{
"epoch": 7.005267118133935,
"grad_norm": 9.84195327758789,
"learning_rate": 7.219296045481148e-06,
"loss": 0.1548,
"step": 9310
},
{
"epoch": 7.012791572610985,
"grad_norm": 4.892616271972656,
"learning_rate": 7.215115792993898e-06,
"loss": 0.1249,
"step": 9320
},
{
"epoch": 7.020316027088036,
"grad_norm": 5.397826671600342,
"learning_rate": 7.210935540506647e-06,
"loss": 0.1382,
"step": 9330
},
{
"epoch": 7.027840481565087,
"grad_norm": 5.65526008605957,
"learning_rate": 7.206755288019397e-06,
"loss": 0.1215,
"step": 9340
},
{
"epoch": 7.035364936042137,
"grad_norm": 10.712868690490723,
"learning_rate": 7.202575035532146e-06,
"loss": 0.1923,
"step": 9350
},
{
"epoch": 7.042889390519187,
"grad_norm": 5.386087417602539,
"learning_rate": 7.198394783044897e-06,
"loss": 0.1403,
"step": 9360
},
{
"epoch": 7.050413844996238,
"grad_norm": 5.49284029006958,
"learning_rate": 7.194214530557646e-06,
"loss": 0.1359,
"step": 9370
},
{
"epoch": 7.057938299473288,
"grad_norm": 10.279863357543945,
"learning_rate": 7.190034278070396e-06,
"loss": 0.1786,
"step": 9380
},
{
"epoch": 7.065462753950339,
"grad_norm": 1.2782478332519531,
"learning_rate": 7.185854025583145e-06,
"loss": 0.1316,
"step": 9390
},
{
"epoch": 7.072987208427389,
"grad_norm": 10.440459251403809,
"learning_rate": 7.181673773095895e-06,
"loss": 0.1618,
"step": 9400
},
{
"epoch": 7.080511662904439,
"grad_norm": 2.517453908920288,
"learning_rate": 7.1774935206086445e-06,
"loss": 0.1037,
"step": 9410
},
{
"epoch": 7.08803611738149,
"grad_norm": 13.917778968811035,
"learning_rate": 7.173313268121395e-06,
"loss": 0.1435,
"step": 9420
},
{
"epoch": 7.09556057185854,
"grad_norm": 12.317310333251953,
"learning_rate": 7.169133015634145e-06,
"loss": 0.1318,
"step": 9430
},
{
"epoch": 7.103085026335591,
"grad_norm": 9.198356628417969,
"learning_rate": 7.164952763146895e-06,
"loss": 0.1214,
"step": 9440
},
{
"epoch": 7.110609480812641,
"grad_norm": 9.149123191833496,
"learning_rate": 7.160772510659645e-06,
"loss": 0.1442,
"step": 9450
},
{
"epoch": 7.118133935289691,
"grad_norm": 7.893470287322998,
"learning_rate": 7.156592258172395e-06,
"loss": 0.1711,
"step": 9460
},
{
"epoch": 7.125658389766742,
"grad_norm": 11.737761497497559,
"learning_rate": 7.152412005685144e-06,
"loss": 0.1204,
"step": 9470
},
{
"epoch": 7.133182844243792,
"grad_norm": 7.861152648925781,
"learning_rate": 7.148231753197894e-06,
"loss": 0.1883,
"step": 9480
},
{
"epoch": 7.140707298720843,
"grad_norm": 4.260904312133789,
"learning_rate": 7.1440515007106434e-06,
"loss": 0.1673,
"step": 9490
},
{
"epoch": 7.148231753197893,
"grad_norm": 4.621326923370361,
"learning_rate": 7.139871248223393e-06,
"loss": 0.1511,
"step": 9500
},
{
"epoch": 7.155756207674943,
"grad_norm": 6.470260143280029,
"learning_rate": 7.135690995736143e-06,
"loss": 0.1465,
"step": 9510
},
{
"epoch": 7.163280662151994,
"grad_norm": 13.358270645141602,
"learning_rate": 7.131510743248893e-06,
"loss": 0.1431,
"step": 9520
},
{
"epoch": 7.170805116629045,
"grad_norm": 6.210266590118408,
"learning_rate": 7.1273304907616425e-06,
"loss": 0.141,
"step": 9530
},
{
"epoch": 7.178329571106095,
"grad_norm": 11.789570808410645,
"learning_rate": 7.123150238274392e-06,
"loss": 0.1504,
"step": 9540
},
{
"epoch": 7.185854025583145,
"grad_norm": 6.791030406951904,
"learning_rate": 7.118969985787142e-06,
"loss": 0.1321,
"step": 9550
},
{
"epoch": 7.193378480060195,
"grad_norm": 4.915945529937744,
"learning_rate": 7.114789733299892e-06,
"loss": 0.1531,
"step": 9560
},
{
"epoch": 7.200902934537246,
"grad_norm": 5.299534797668457,
"learning_rate": 7.1106094808126415e-06,
"loss": 0.1585,
"step": 9570
},
{
"epoch": 7.208427389014297,
"grad_norm": 7.799785137176514,
"learning_rate": 7.106429228325391e-06,
"loss": 0.1507,
"step": 9580
},
{
"epoch": 7.2159518434913466,
"grad_norm": 6.652139663696289,
"learning_rate": 7.102248975838141e-06,
"loss": 0.1303,
"step": 9590
},
{
"epoch": 7.223476297968397,
"grad_norm": 8.317214965820312,
"learning_rate": 7.09806872335089e-06,
"loss": 0.1529,
"step": 9600
},
{
"epoch": 7.231000752445448,
"grad_norm": 7.286659240722656,
"learning_rate": 7.093888470863641e-06,
"loss": 0.1251,
"step": 9610
},
{
"epoch": 7.238525206922498,
"grad_norm": 5.869454860687256,
"learning_rate": 7.089708218376391e-06,
"loss": 0.1439,
"step": 9620
},
{
"epoch": 7.246049661399549,
"grad_norm": 7.951076984405518,
"learning_rate": 7.0855279658891405e-06,
"loss": 0.1603,
"step": 9630
},
{
"epoch": 7.253574115876599,
"grad_norm": 2.182021141052246,
"learning_rate": 7.08134771340189e-06,
"loss": 0.1119,
"step": 9640
},
{
"epoch": 7.261098570353649,
"grad_norm": 11.444308280944824,
"learning_rate": 7.0771674609146405e-06,
"loss": 0.1395,
"step": 9650
},
{
"epoch": 7.2686230248307,
"grad_norm": 9.718505859375,
"learning_rate": 7.07298720842739e-06,
"loss": 0.1535,
"step": 9660
},
{
"epoch": 7.27614747930775,
"grad_norm": 6.41453218460083,
"learning_rate": 7.0688069559401396e-06,
"loss": 0.1113,
"step": 9670
},
{
"epoch": 7.283671933784801,
"grad_norm": 11.704097747802734,
"learning_rate": 7.064626703452889e-06,
"loss": 0.1497,
"step": 9680
},
{
"epoch": 7.291196388261851,
"grad_norm": 8.959087371826172,
"learning_rate": 7.060446450965639e-06,
"loss": 0.1714,
"step": 9690
},
{
"epoch": 7.298720842738901,
"grad_norm": 4.717805862426758,
"learning_rate": 7.056266198478389e-06,
"loss": 0.1232,
"step": 9700
},
{
"epoch": 7.306245297215952,
"grad_norm": 6.030142307281494,
"learning_rate": 7.052085945991139e-06,
"loss": 0.0906,
"step": 9710
},
{
"epoch": 7.313769751693002,
"grad_norm": 3.481350898742676,
"learning_rate": 7.047905693503888e-06,
"loss": 0.1353,
"step": 9720
},
{
"epoch": 7.3212942061700526,
"grad_norm": 12.188994407653809,
"learning_rate": 7.043725441016638e-06,
"loss": 0.2042,
"step": 9730
},
{
"epoch": 7.328818660647103,
"grad_norm": 4.677647113800049,
"learning_rate": 7.039545188529387e-06,
"loss": 0.1216,
"step": 9740
},
{
"epoch": 7.336343115124153,
"grad_norm": 9.761384010314941,
"learning_rate": 7.035364936042137e-06,
"loss": 0.1193,
"step": 9750
},
{
"epoch": 7.343867569601204,
"grad_norm": 8.550495147705078,
"learning_rate": 7.031184683554887e-06,
"loss": 0.1348,
"step": 9760
},
{
"epoch": 7.351392024078255,
"grad_norm": 8.725384712219238,
"learning_rate": 7.027004431067637e-06,
"loss": 0.1579,
"step": 9770
},
{
"epoch": 7.3589164785553045,
"grad_norm": 7.324394702911377,
"learning_rate": 7.022824178580386e-06,
"loss": 0.1552,
"step": 9780
},
{
"epoch": 7.366440933032355,
"grad_norm": 10.771610260009766,
"learning_rate": 7.018643926093136e-06,
"loss": 0.095,
"step": 9790
},
{
"epoch": 7.373965387509406,
"grad_norm": 9.422534942626953,
"learning_rate": 7.014463673605887e-06,
"loss": 0.1488,
"step": 9800
},
{
"epoch": 7.381489841986456,
"grad_norm": 10.947734832763672,
"learning_rate": 7.010283421118637e-06,
"loss": 0.1326,
"step": 9810
},
{
"epoch": 7.389014296463507,
"grad_norm": 7.345083236694336,
"learning_rate": 7.006103168631386e-06,
"loss": 0.1573,
"step": 9820
},
{
"epoch": 7.3965387509405565,
"grad_norm": 10.11557674407959,
"learning_rate": 7.001922916144136e-06,
"loss": 0.1321,
"step": 9830
},
{
"epoch": 7.404063205417607,
"grad_norm": 7.984189033508301,
"learning_rate": 6.997742663656886e-06,
"loss": 0.152,
"step": 9840
},
{
"epoch": 7.411587659894658,
"grad_norm": 12.054664611816406,
"learning_rate": 6.993562411169636e-06,
"loss": 0.1605,
"step": 9850
},
{
"epoch": 7.419112114371708,
"grad_norm": 5.764801025390625,
"learning_rate": 6.989382158682385e-06,
"loss": 0.1356,
"step": 9860
},
{
"epoch": 7.426636568848759,
"grad_norm": 6.675129413604736,
"learning_rate": 6.985201906195135e-06,
"loss": 0.1324,
"step": 9870
},
{
"epoch": 7.434161023325808,
"grad_norm": 8.892173767089844,
"learning_rate": 6.981021653707884e-06,
"loss": 0.1916,
"step": 9880
},
{
"epoch": 7.441685477802859,
"grad_norm": 8.714239120483398,
"learning_rate": 6.976841401220634e-06,
"loss": 0.15,
"step": 9890
},
{
"epoch": 7.44920993227991,
"grad_norm": 8.057106971740723,
"learning_rate": 6.972661148733384e-06,
"loss": 0.22,
"step": 9900
},
{
"epoch": 7.45673438675696,
"grad_norm": 4.123317718505859,
"learning_rate": 6.968480896246134e-06,
"loss": 0.1306,
"step": 9910
},
{
"epoch": 7.4642588412340105,
"grad_norm": 3.9179821014404297,
"learning_rate": 6.964300643758883e-06,
"loss": 0.1524,
"step": 9920
},
{
"epoch": 7.471783295711061,
"grad_norm": 3.3642544746398926,
"learning_rate": 6.960120391271633e-06,
"loss": 0.1418,
"step": 9930
},
{
"epoch": 7.479307750188111,
"grad_norm": 9.933600425720215,
"learning_rate": 6.9559401387843825e-06,
"loss": 0.1309,
"step": 9940
},
{
"epoch": 7.486832204665162,
"grad_norm": 7.034138202667236,
"learning_rate": 6.951759886297133e-06,
"loss": 0.1102,
"step": 9950
},
{
"epoch": 7.494356659142213,
"grad_norm": 6.040769100189209,
"learning_rate": 6.947579633809882e-06,
"loss": 0.169,
"step": 9960
},
{
"epoch": 7.5018811136192625,
"grad_norm": 6.807097434997559,
"learning_rate": 6.943399381322632e-06,
"loss": 0.1343,
"step": 9970
},
{
"epoch": 7.509405568096313,
"grad_norm": 8.873719215393066,
"learning_rate": 6.939219128835382e-06,
"loss": 0.2087,
"step": 9980
},
{
"epoch": 7.516930022573363,
"grad_norm": 6.61569356918335,
"learning_rate": 6.935038876348133e-06,
"loss": 0.1491,
"step": 9990
},
{
"epoch": 7.524454477050414,
"grad_norm": 7.067235946655273,
"learning_rate": 6.930858623860882e-06,
"loss": 0.1299,
"step": 10000
},
{
"epoch": 7.531978931527465,
"grad_norm": 6.512937068939209,
"learning_rate": 6.926678371373632e-06,
"loss": 0.1534,
"step": 10010
},
{
"epoch": 7.539503386004514,
"grad_norm": 6.557222843170166,
"learning_rate": 6.922498118886381e-06,
"loss": 0.1079,
"step": 10020
},
{
"epoch": 7.547027840481565,
"grad_norm": 11.565508842468262,
"learning_rate": 6.918317866399131e-06,
"loss": 0.1389,
"step": 10030
},
{
"epoch": 7.554552294958615,
"grad_norm": 6.491377830505371,
"learning_rate": 6.914137613911881e-06,
"loss": 0.1371,
"step": 10040
},
{
"epoch": 7.562076749435666,
"grad_norm": 7.546027660369873,
"learning_rate": 6.909957361424631e-06,
"loss": 0.1332,
"step": 10050
},
{
"epoch": 7.5696012039127165,
"grad_norm": 7.451356887817383,
"learning_rate": 6.9057771089373804e-06,
"loss": 0.1036,
"step": 10060
},
{
"epoch": 7.577125658389766,
"grad_norm": 11.477706909179688,
"learning_rate": 6.90159685645013e-06,
"loss": 0.1742,
"step": 10070
},
{
"epoch": 7.584650112866817,
"grad_norm": 6.118583679199219,
"learning_rate": 6.8974166039628795e-06,
"loss": 0.1317,
"step": 10080
},
{
"epoch": 7.592174567343868,
"grad_norm": 8.546781539916992,
"learning_rate": 6.89323635147563e-06,
"loss": 0.1236,
"step": 10090
},
{
"epoch": 7.599699021820918,
"grad_norm": 8.164753913879395,
"learning_rate": 6.8890560989883795e-06,
"loss": 0.1189,
"step": 10100
},
{
"epoch": 7.6072234762979685,
"grad_norm": 14.010082244873047,
"learning_rate": 6.884875846501129e-06,
"loss": 0.1578,
"step": 10110
},
{
"epoch": 7.614747930775019,
"grad_norm": 3.0297963619232178,
"learning_rate": 6.8806955940138786e-06,
"loss": 0.1092,
"step": 10120
},
{
"epoch": 7.622272385252069,
"grad_norm": 9.559354782104492,
"learning_rate": 6.876515341526628e-06,
"loss": 0.1473,
"step": 10130
},
{
"epoch": 7.62979683972912,
"grad_norm": 5.298492431640625,
"learning_rate": 6.872335089039378e-06,
"loss": 0.1332,
"step": 10140
},
{
"epoch": 7.63732129420617,
"grad_norm": 3.04228138923645,
"learning_rate": 6.868154836552128e-06,
"loss": 0.1232,
"step": 10150
},
{
"epoch": 7.64484574868322,
"grad_norm": 2.9913089275360107,
"learning_rate": 6.8639745840648785e-06,
"loss": 0.1245,
"step": 10160
},
{
"epoch": 7.652370203160271,
"grad_norm": 5.246922969818115,
"learning_rate": 6.859794331577628e-06,
"loss": 0.1365,
"step": 10170
},
{
"epoch": 7.659894657637321,
"grad_norm": 13.609551429748535,
"learning_rate": 6.855614079090378e-06,
"loss": 0.1861,
"step": 10180
},
{
"epoch": 7.667419112114372,
"grad_norm": 7.830902576446533,
"learning_rate": 6.851433826603128e-06,
"loss": 0.1085,
"step": 10190
},
{
"epoch": 7.674943566591422,
"grad_norm": 9.319975852966309,
"learning_rate": 6.8472535741158775e-06,
"loss": 0.147,
"step": 10200
},
{
"epoch": 7.682468021068472,
"grad_norm": 5.328279495239258,
"learning_rate": 6.843073321628627e-06,
"loss": 0.1406,
"step": 10210
},
{
"epoch": 7.689992475545523,
"grad_norm": 8.353121757507324,
"learning_rate": 6.838893069141377e-06,
"loss": 0.1533,
"step": 10220
},
{
"epoch": 7.697516930022573,
"grad_norm": 7.57094669342041,
"learning_rate": 6.834712816654126e-06,
"loss": 0.1177,
"step": 10230
},
{
"epoch": 7.705041384499624,
"grad_norm": 7.212436199188232,
"learning_rate": 6.8305325641668765e-06,
"loss": 0.1369,
"step": 10240
},
{
"epoch": 7.7125658389766745,
"grad_norm": 4.2448320388793945,
"learning_rate": 6.826352311679626e-06,
"loss": 0.1551,
"step": 10250
},
{
"epoch": 7.720090293453724,
"grad_norm": 8.300276756286621,
"learning_rate": 6.822172059192376e-06,
"loss": 0.188,
"step": 10260
},
{
"epoch": 7.727614747930775,
"grad_norm": 9.391825675964355,
"learning_rate": 6.817991806705125e-06,
"loss": 0.1468,
"step": 10270
},
{
"epoch": 7.735139202407826,
"grad_norm": 9.549453735351562,
"learning_rate": 6.813811554217875e-06,
"loss": 0.1597,
"step": 10280
},
{
"epoch": 7.742663656884876,
"grad_norm": 8.50312328338623,
"learning_rate": 6.809631301730625e-06,
"loss": 0.1622,
"step": 10290
},
{
"epoch": 7.750188111361926,
"grad_norm": 7.836225509643555,
"learning_rate": 6.805451049243375e-06,
"loss": 0.1124,
"step": 10300
},
{
"epoch": 7.757712565838977,
"grad_norm": 6.063055515289307,
"learning_rate": 6.801270796756124e-06,
"loss": 0.1202,
"step": 10310
},
{
"epoch": 7.765237020316027,
"grad_norm": 7.710981369018555,
"learning_rate": 6.797090544268874e-06,
"loss": 0.1505,
"step": 10320
},
{
"epoch": 7.772761474793078,
"grad_norm": 4.136016845703125,
"learning_rate": 6.792910291781623e-06,
"loss": 0.0856,
"step": 10330
},
{
"epoch": 7.780285929270128,
"grad_norm": 5.1353983879089355,
"learning_rate": 6.7887300392943746e-06,
"loss": 0.1399,
"step": 10340
},
{
"epoch": 7.787810383747178,
"grad_norm": 7.657298564910889,
"learning_rate": 6.784549786807124e-06,
"loss": 0.1707,
"step": 10350
},
{
"epoch": 7.795334838224229,
"grad_norm": 8.755562782287598,
"learning_rate": 6.780369534319874e-06,
"loss": 0.1414,
"step": 10360
},
{
"epoch": 7.802859292701279,
"grad_norm": 9.025842666625977,
"learning_rate": 6.776189281832623e-06,
"loss": 0.1517,
"step": 10370
},
{
"epoch": 7.81038374717833,
"grad_norm": 8.332140922546387,
"learning_rate": 6.772009029345374e-06,
"loss": 0.152,
"step": 10380
},
{
"epoch": 7.81790820165538,
"grad_norm": 5.564815998077393,
"learning_rate": 6.767828776858123e-06,
"loss": 0.1647,
"step": 10390
},
{
"epoch": 7.82543265613243,
"grad_norm": 9.147259712219238,
"learning_rate": 6.763648524370873e-06,
"loss": 0.1692,
"step": 10400
},
{
"epoch": 7.832957110609481,
"grad_norm": 3.85477614402771,
"learning_rate": 6.759468271883622e-06,
"loss": 0.1395,
"step": 10410
},
{
"epoch": 7.840481565086531,
"grad_norm": 10.106462478637695,
"learning_rate": 6.755288019396372e-06,
"loss": 0.1652,
"step": 10420
},
{
"epoch": 7.848006019563582,
"grad_norm": 9.62307357788086,
"learning_rate": 6.751107766909122e-06,
"loss": 0.1578,
"step": 10430
},
{
"epoch": 7.855530474040632,
"grad_norm": 10.081789016723633,
"learning_rate": 6.746927514421872e-06,
"loss": 0.1153,
"step": 10440
},
{
"epoch": 7.863054928517682,
"grad_norm": 8.883935928344727,
"learning_rate": 6.742747261934621e-06,
"loss": 0.1476,
"step": 10450
},
{
"epoch": 7.870579382994733,
"grad_norm": 4.969008922576904,
"learning_rate": 6.738567009447371e-06,
"loss": 0.1149,
"step": 10460
},
{
"epoch": 7.878103837471784,
"grad_norm": 6.76956033706665,
"learning_rate": 6.73438675696012e-06,
"loss": 0.1159,
"step": 10470
},
{
"epoch": 7.885628291948834,
"grad_norm": 11.153620719909668,
"learning_rate": 6.730206504472871e-06,
"loss": 0.1176,
"step": 10480
},
{
"epoch": 7.893152746425884,
"grad_norm": 9.627654075622559,
"learning_rate": 6.72602625198562e-06,
"loss": 0.1475,
"step": 10490
},
{
"epoch": 7.900677200902934,
"grad_norm": 5.48689603805542,
"learning_rate": 6.72184599949837e-06,
"loss": 0.1462,
"step": 10500
},
{
"epoch": 7.908201655379985,
"grad_norm": 9.38402271270752,
"learning_rate": 6.7176657470111194e-06,
"loss": 0.1272,
"step": 10510
},
{
"epoch": 7.915726109857036,
"grad_norm": 4.6745452880859375,
"learning_rate": 6.713485494523871e-06,
"loss": 0.1367,
"step": 10520
},
{
"epoch": 7.923250564334086,
"grad_norm": 9.271729469299316,
"learning_rate": 6.70930524203662e-06,
"loss": 0.1049,
"step": 10530
},
{
"epoch": 7.930775018811136,
"grad_norm": 8.473658561706543,
"learning_rate": 6.70512498954937e-06,
"loss": 0.1422,
"step": 10540
},
{
"epoch": 7.938299473288186,
"grad_norm": 11.029475212097168,
"learning_rate": 6.700944737062119e-06,
"loss": 0.141,
"step": 10550
},
{
"epoch": 7.945823927765237,
"grad_norm": 3.050215005874634,
"learning_rate": 6.696764484574869e-06,
"loss": 0.1018,
"step": 10560
},
{
"epoch": 7.953348382242288,
"grad_norm": 9.241053581237793,
"learning_rate": 6.692584232087619e-06,
"loss": 0.2095,
"step": 10570
},
{
"epoch": 7.9608728367193375,
"grad_norm": 6.521198749542236,
"learning_rate": 6.688403979600369e-06,
"loss": 0.1362,
"step": 10580
},
{
"epoch": 7.968397291196388,
"grad_norm": 6.801877021789551,
"learning_rate": 6.684223727113118e-06,
"loss": 0.1676,
"step": 10590
},
{
"epoch": 7.975921745673439,
"grad_norm": 1.8709025382995605,
"learning_rate": 6.680043474625868e-06,
"loss": 0.1454,
"step": 10600
},
{
"epoch": 7.983446200150489,
"grad_norm": 8.10856819152832,
"learning_rate": 6.6758632221386175e-06,
"loss": 0.1387,
"step": 10610
},
{
"epoch": 7.99097065462754,
"grad_norm": 8.119205474853516,
"learning_rate": 6.671682969651367e-06,
"loss": 0.1514,
"step": 10620
},
{
"epoch": 7.99849510910459,
"grad_norm": 5.9093918800354,
"learning_rate": 6.667502717164117e-06,
"loss": 0.1428,
"step": 10630
},
{
"epoch": 8.00601956358164,
"grad_norm": 6.897785186767578,
"learning_rate": 6.663322464676867e-06,
"loss": 0.1647,
"step": 10640
},
{
"epoch": 8.01354401805869,
"grad_norm": 6.557318210601807,
"learning_rate": 6.6591422121896165e-06,
"loss": 0.1131,
"step": 10650
},
{
"epoch": 8.021068472535742,
"grad_norm": 9.699820518493652,
"learning_rate": 6.654961959702366e-06,
"loss": 0.1507,
"step": 10660
},
{
"epoch": 8.028592927012792,
"grad_norm": 4.894190311431885,
"learning_rate": 6.650781707215116e-06,
"loss": 0.1215,
"step": 10670
},
{
"epoch": 8.036117381489841,
"grad_norm": 4.050377368927002,
"learning_rate": 6.646601454727866e-06,
"loss": 0.1528,
"step": 10680
},
{
"epoch": 8.043641835966893,
"grad_norm": 6.824169158935547,
"learning_rate": 6.6424212022406156e-06,
"loss": 0.1206,
"step": 10690
},
{
"epoch": 8.051166290443943,
"grad_norm": 8.67491340637207,
"learning_rate": 6.638240949753366e-06,
"loss": 0.1241,
"step": 10700
},
{
"epoch": 8.058690744920993,
"grad_norm": 3.6065561771392822,
"learning_rate": 6.6340606972661155e-06,
"loss": 0.125,
"step": 10710
},
{
"epoch": 8.066215199398044,
"grad_norm": 6.743816375732422,
"learning_rate": 6.629880444778866e-06,
"loss": 0.1568,
"step": 10720
},
{
"epoch": 8.073739653875094,
"grad_norm": 9.00485897064209,
"learning_rate": 6.6257001922916154e-06,
"loss": 0.2092,
"step": 10730
},
{
"epoch": 8.081264108352144,
"grad_norm": 6.077624797821045,
"learning_rate": 6.621519939804365e-06,
"loss": 0.1088,
"step": 10740
},
{
"epoch": 8.088788562829194,
"grad_norm": 11.94995403289795,
"learning_rate": 6.6173396873171145e-06,
"loss": 0.1527,
"step": 10750
},
{
"epoch": 8.096313017306246,
"grad_norm": 2.177652597427368,
"learning_rate": 6.613159434829864e-06,
"loss": 0.0979,
"step": 10760
},
{
"epoch": 8.103837471783295,
"grad_norm": 6.277168273925781,
"learning_rate": 6.6089791823426145e-06,
"loss": 0.0971,
"step": 10770
},
{
"epoch": 8.111361926260345,
"grad_norm": 4.3780646324157715,
"learning_rate": 6.604798929855364e-06,
"loss": 0.1324,
"step": 10780
},
{
"epoch": 8.118886380737397,
"grad_norm": 6.6845903396606445,
"learning_rate": 6.600618677368114e-06,
"loss": 0.2028,
"step": 10790
},
{
"epoch": 8.126410835214447,
"grad_norm": 7.709301471710205,
"learning_rate": 6.596438424880863e-06,
"loss": 0.1619,
"step": 10800
},
{
"epoch": 8.133935289691497,
"grad_norm": 4.897986888885498,
"learning_rate": 6.592258172393613e-06,
"loss": 0.1379,
"step": 10810
},
{
"epoch": 8.141459744168548,
"grad_norm": 5.920723915100098,
"learning_rate": 6.588077919906363e-06,
"loss": 0.1152,
"step": 10820
},
{
"epoch": 8.148984198645598,
"grad_norm": 7.485513687133789,
"learning_rate": 6.583897667419113e-06,
"loss": 0.1393,
"step": 10830
},
{
"epoch": 8.156508653122648,
"grad_norm": 7.579458236694336,
"learning_rate": 6.579717414931862e-06,
"loss": 0.1251,
"step": 10840
},
{
"epoch": 8.1640331075997,
"grad_norm": 5.181578636169434,
"learning_rate": 6.575537162444612e-06,
"loss": 0.0906,
"step": 10850
},
{
"epoch": 8.17155756207675,
"grad_norm": 6.30539083480835,
"learning_rate": 6.571356909957361e-06,
"loss": 0.1349,
"step": 10860
},
{
"epoch": 8.1790820165538,
"grad_norm": 10.169859886169434,
"learning_rate": 6.567176657470111e-06,
"loss": 0.1416,
"step": 10870
},
{
"epoch": 8.186606471030851,
"grad_norm": 13.061054229736328,
"learning_rate": 6.562996404982861e-06,
"loss": 0.1334,
"step": 10880
},
{
"epoch": 8.194130925507901,
"grad_norm": 6.3449482917785645,
"learning_rate": 6.558816152495612e-06,
"loss": 0.1587,
"step": 10890
},
{
"epoch": 8.20165537998495,
"grad_norm": 7.411153316497803,
"learning_rate": 6.554635900008361e-06,
"loss": 0.1223,
"step": 10900
},
{
"epoch": 8.209179834462002,
"grad_norm": 6.868873119354248,
"learning_rate": 6.5504556475211116e-06,
"loss": 0.1375,
"step": 10910
},
{
"epoch": 8.216704288939052,
"grad_norm": 6.650938510894775,
"learning_rate": 6.546275395033861e-06,
"loss": 0.1059,
"step": 10920
},
{
"epoch": 8.224228743416102,
"grad_norm": 3.331028699874878,
"learning_rate": 6.542095142546611e-06,
"loss": 0.1155,
"step": 10930
},
{
"epoch": 8.231753197893152,
"grad_norm": 5.483583927154541,
"learning_rate": 6.53791489005936e-06,
"loss": 0.1507,
"step": 10940
},
{
"epoch": 8.239277652370204,
"grad_norm": 7.282931327819824,
"learning_rate": 6.53373463757211e-06,
"loss": 0.1207,
"step": 10950
},
{
"epoch": 8.246802106847253,
"grad_norm": 5.5015435218811035,
"learning_rate": 6.52955438508486e-06,
"loss": 0.1597,
"step": 10960
},
{
"epoch": 8.254326561324303,
"grad_norm": 7.288213729858398,
"learning_rate": 6.52537413259761e-06,
"loss": 0.1198,
"step": 10970
},
{
"epoch": 8.261851015801355,
"grad_norm": 5.3646745681762695,
"learning_rate": 6.521193880110359e-06,
"loss": 0.1356,
"step": 10980
},
{
"epoch": 8.269375470278405,
"grad_norm": 10.937861442565918,
"learning_rate": 6.517013627623109e-06,
"loss": 0.1249,
"step": 10990
},
{
"epoch": 8.276899924755455,
"grad_norm": 9.01375961303711,
"learning_rate": 6.512833375135858e-06,
"loss": 0.1456,
"step": 11000
},
{
"epoch": 8.284424379232506,
"grad_norm": 8.625186920166016,
"learning_rate": 6.508653122648608e-06,
"loss": 0.1496,
"step": 11010
},
{
"epoch": 8.291948833709556,
"grad_norm": 7.765505313873291,
"learning_rate": 6.504472870161358e-06,
"loss": 0.108,
"step": 11020
},
{
"epoch": 8.299473288186606,
"grad_norm": 10.0932035446167,
"learning_rate": 6.500292617674108e-06,
"loss": 0.1414,
"step": 11030
},
{
"epoch": 8.306997742663658,
"grad_norm": 9.906752586364746,
"learning_rate": 6.496112365186857e-06,
"loss": 0.136,
"step": 11040
},
{
"epoch": 8.314522197140708,
"grad_norm": 3.120361328125,
"learning_rate": 6.491932112699607e-06,
"loss": 0.095,
"step": 11050
},
{
"epoch": 8.322046651617757,
"grad_norm": 13.94308090209961,
"learning_rate": 6.4877518602123565e-06,
"loss": 0.1316,
"step": 11060
},
{
"epoch": 8.329571106094807,
"grad_norm": 9.929215431213379,
"learning_rate": 6.483571607725108e-06,
"loss": 0.1213,
"step": 11070
},
{
"epoch": 8.337095560571859,
"grad_norm": 7.462619781494141,
"learning_rate": 6.479391355237857e-06,
"loss": 0.1175,
"step": 11080
},
{
"epoch": 8.344620015048909,
"grad_norm": 7.091246128082275,
"learning_rate": 6.475211102750607e-06,
"loss": 0.1412,
"step": 11090
},
{
"epoch": 8.352144469525959,
"grad_norm": 7.457467555999756,
"learning_rate": 6.471030850263356e-06,
"loss": 0.1843,
"step": 11100
},
{
"epoch": 8.35966892400301,
"grad_norm": 5.405447959899902,
"learning_rate": 6.466850597776107e-06,
"loss": 0.1798,
"step": 11110
},
{
"epoch": 8.36719337848006,
"grad_norm": 8.421366691589355,
"learning_rate": 6.462670345288856e-06,
"loss": 0.1321,
"step": 11120
},
{
"epoch": 8.37471783295711,
"grad_norm": 9.620317459106445,
"learning_rate": 6.458490092801606e-06,
"loss": 0.1528,
"step": 11130
},
{
"epoch": 8.382242287434162,
"grad_norm": 6.394067764282227,
"learning_rate": 6.454309840314355e-06,
"loss": 0.1401,
"step": 11140
},
{
"epoch": 8.389766741911211,
"grad_norm": 5.651474952697754,
"learning_rate": 6.450129587827105e-06,
"loss": 0.1092,
"step": 11150
},
{
"epoch": 8.397291196388261,
"grad_norm": 8.883152961730957,
"learning_rate": 6.445949335339855e-06,
"loss": 0.1388,
"step": 11160
},
{
"epoch": 8.404815650865313,
"grad_norm": 2.737121820449829,
"learning_rate": 6.441769082852605e-06,
"loss": 0.1235,
"step": 11170
},
{
"epoch": 8.412340105342363,
"grad_norm": 3.6822330951690674,
"learning_rate": 6.4375888303653545e-06,
"loss": 0.0854,
"step": 11180
},
{
"epoch": 8.419864559819413,
"grad_norm": 12.290861129760742,
"learning_rate": 6.433408577878104e-06,
"loss": 0.1463,
"step": 11190
},
{
"epoch": 8.427389014296464,
"grad_norm": 2.9195001125335693,
"learning_rate": 6.4292283253908536e-06,
"loss": 0.1494,
"step": 11200
},
{
"epoch": 8.434913468773514,
"grad_norm": 6.521566867828369,
"learning_rate": 6.425048072903604e-06,
"loss": 0.1082,
"step": 11210
},
{
"epoch": 8.442437923250564,
"grad_norm": 5.775163650512695,
"learning_rate": 6.4208678204163535e-06,
"loss": 0.1312,
"step": 11220
},
{
"epoch": 8.449962377727616,
"grad_norm": 6.358558177947998,
"learning_rate": 6.416687567929103e-06,
"loss": 0.1382,
"step": 11230
},
{
"epoch": 8.457486832204665,
"grad_norm": 9.709700584411621,
"learning_rate": 6.412507315441853e-06,
"loss": 0.1261,
"step": 11240
},
{
"epoch": 8.465011286681715,
"grad_norm": 7.283178329467773,
"learning_rate": 6.408327062954604e-06,
"loss": 0.1698,
"step": 11250
},
{
"epoch": 8.472535741158765,
"grad_norm": 10.301934242248535,
"learning_rate": 6.404146810467353e-06,
"loss": 0.1708,
"step": 11260
},
{
"epoch": 8.480060195635817,
"grad_norm": 10.021185874938965,
"learning_rate": 6.399966557980103e-06,
"loss": 0.0992,
"step": 11270
},
{
"epoch": 8.487584650112867,
"grad_norm": 10.817153930664062,
"learning_rate": 6.3957863054928525e-06,
"loss": 0.1722,
"step": 11280
},
{
"epoch": 8.495109104589917,
"grad_norm": 8.073854446411133,
"learning_rate": 6.391606053005602e-06,
"loss": 0.1598,
"step": 11290
},
{
"epoch": 8.502633559066968,
"grad_norm": 10.416369438171387,
"learning_rate": 6.3874258005183524e-06,
"loss": 0.108,
"step": 11300
},
{
"epoch": 8.510158013544018,
"grad_norm": 6.909933567047119,
"learning_rate": 6.383245548031102e-06,
"loss": 0.1307,
"step": 11310
},
{
"epoch": 8.517682468021068,
"grad_norm": 6.960227966308594,
"learning_rate": 6.3790652955438515e-06,
"loss": 0.1179,
"step": 11320
},
{
"epoch": 8.52520692249812,
"grad_norm": 8.188361167907715,
"learning_rate": 6.374885043056601e-06,
"loss": 0.1314,
"step": 11330
},
{
"epoch": 8.53273137697517,
"grad_norm": 7.911260604858398,
"learning_rate": 6.370704790569351e-06,
"loss": 0.1338,
"step": 11340
},
{
"epoch": 8.54025583145222,
"grad_norm": 5.278087139129639,
"learning_rate": 6.3665245380821e-06,
"loss": 0.0957,
"step": 11350
},
{
"epoch": 8.54778028592927,
"grad_norm": 8.951353073120117,
"learning_rate": 6.3623442855948506e-06,
"loss": 0.126,
"step": 11360
},
{
"epoch": 8.55530474040632,
"grad_norm": 5.146210670471191,
"learning_rate": 6.3581640331076e-06,
"loss": 0.1068,
"step": 11370
},
{
"epoch": 8.56282919488337,
"grad_norm": 9.495402336120605,
"learning_rate": 6.35398378062035e-06,
"loss": 0.0963,
"step": 11380
},
{
"epoch": 8.57035364936042,
"grad_norm": 10.180619239807129,
"learning_rate": 6.349803528133099e-06,
"loss": 0.1814,
"step": 11390
},
{
"epoch": 8.577878103837472,
"grad_norm": 5.210803508758545,
"learning_rate": 6.345623275645849e-06,
"loss": 0.1345,
"step": 11400
},
{
"epoch": 8.585402558314522,
"grad_norm": 4.041356563568115,
"learning_rate": 6.341443023158599e-06,
"loss": 0.0962,
"step": 11410
},
{
"epoch": 8.592927012791572,
"grad_norm": 12.49039077758789,
"learning_rate": 6.337262770671349e-06,
"loss": 0.1345,
"step": 11420
},
{
"epoch": 8.600451467268623,
"grad_norm": 1.2120592594146729,
"learning_rate": 6.333082518184099e-06,
"loss": 0.1211,
"step": 11430
},
{
"epoch": 8.607975921745673,
"grad_norm": 6.367912292480469,
"learning_rate": 6.328902265696849e-06,
"loss": 0.0813,
"step": 11440
},
{
"epoch": 8.615500376222723,
"grad_norm": 2.9278907775878906,
"learning_rate": 6.324722013209599e-06,
"loss": 0.1357,
"step": 11450
},
{
"epoch": 8.623024830699775,
"grad_norm": 12.494004249572754,
"learning_rate": 6.320541760722349e-06,
"loss": 0.1315,
"step": 11460
},
{
"epoch": 8.630549285176825,
"grad_norm": 9.539612770080566,
"learning_rate": 6.316361508235098e-06,
"loss": 0.1288,
"step": 11470
},
{
"epoch": 8.638073739653874,
"grad_norm": 9.347472190856934,
"learning_rate": 6.312181255747848e-06,
"loss": 0.1031,
"step": 11480
},
{
"epoch": 8.645598194130926,
"grad_norm": 7.605668067932129,
"learning_rate": 6.308001003260597e-06,
"loss": 0.1058,
"step": 11490
},
{
"epoch": 8.653122648607976,
"grad_norm": 10.471912384033203,
"learning_rate": 6.303820750773348e-06,
"loss": 0.1145,
"step": 11500
},
{
"epoch": 8.660647103085026,
"grad_norm": 4.992396354675293,
"learning_rate": 6.299640498286097e-06,
"loss": 0.1255,
"step": 11510
},
{
"epoch": 8.668171557562077,
"grad_norm": 11.861599922180176,
"learning_rate": 6.295460245798847e-06,
"loss": 0.1305,
"step": 11520
},
{
"epoch": 8.675696012039127,
"grad_norm": 1.827236294746399,
"learning_rate": 6.291279993311596e-06,
"loss": 0.1086,
"step": 11530
},
{
"epoch": 8.683220466516177,
"grad_norm": 9.01653003692627,
"learning_rate": 6.287099740824346e-06,
"loss": 0.1412,
"step": 11540
},
{
"epoch": 8.690744920993229,
"grad_norm": 3.50248122215271,
"learning_rate": 6.282919488337096e-06,
"loss": 0.1062,
"step": 11550
},
{
"epoch": 8.698269375470279,
"grad_norm": 5.9312567710876465,
"learning_rate": 6.278739235849846e-06,
"loss": 0.1023,
"step": 11560
},
{
"epoch": 8.705793829947329,
"grad_norm": 7.17442512512207,
"learning_rate": 6.274558983362595e-06,
"loss": 0.1337,
"step": 11570
},
{
"epoch": 8.713318284424378,
"grad_norm": 10.798535346984863,
"learning_rate": 6.270378730875345e-06,
"loss": 0.1254,
"step": 11580
},
{
"epoch": 8.72084273890143,
"grad_norm": 9.032575607299805,
"learning_rate": 6.266198478388094e-06,
"loss": 0.1385,
"step": 11590
},
{
"epoch": 8.72836719337848,
"grad_norm": 7.200125694274902,
"learning_rate": 6.262018225900845e-06,
"loss": 0.1088,
"step": 11600
},
{
"epoch": 8.73589164785553,
"grad_norm": 6.259599685668945,
"learning_rate": 6.257837973413595e-06,
"loss": 0.1281,
"step": 11610
},
{
"epoch": 8.743416102332581,
"grad_norm": 12.889073371887207,
"learning_rate": 6.253657720926345e-06,
"loss": 0.1614,
"step": 11620
},
{
"epoch": 8.750940556809631,
"grad_norm": 6.001211643218994,
"learning_rate": 6.249477468439094e-06,
"loss": 0.1526,
"step": 11630
},
{
"epoch": 8.758465011286681,
"grad_norm": 5.817295074462891,
"learning_rate": 6.245297215951845e-06,
"loss": 0.1266,
"step": 11640
},
{
"epoch": 8.765989465763733,
"grad_norm": 6.75652551651001,
"learning_rate": 6.241116963464594e-06,
"loss": 0.1105,
"step": 11650
},
{
"epoch": 8.773513920240783,
"grad_norm": 6.990323543548584,
"learning_rate": 6.236936710977344e-06,
"loss": 0.1269,
"step": 11660
},
{
"epoch": 8.781038374717832,
"grad_norm": 9.484482765197754,
"learning_rate": 6.232756458490093e-06,
"loss": 0.1584,
"step": 11670
},
{
"epoch": 8.788562829194884,
"grad_norm": 6.757113456726074,
"learning_rate": 6.228576206002843e-06,
"loss": 0.1423,
"step": 11680
},
{
"epoch": 8.796087283671934,
"grad_norm": 10.376631736755371,
"learning_rate": 6.224395953515593e-06,
"loss": 0.1294,
"step": 11690
},
{
"epoch": 8.803611738148984,
"grad_norm": 9.998665809631348,
"learning_rate": 6.220215701028343e-06,
"loss": 0.1348,
"step": 11700
},
{
"epoch": 8.811136192626035,
"grad_norm": 5.319802284240723,
"learning_rate": 6.216035448541092e-06,
"loss": 0.1504,
"step": 11710
},
{
"epoch": 8.818660647103085,
"grad_norm": 6.685969352722168,
"learning_rate": 6.211855196053842e-06,
"loss": 0.0937,
"step": 11720
},
{
"epoch": 8.826185101580135,
"grad_norm": 11.51011848449707,
"learning_rate": 6.2076749435665915e-06,
"loss": 0.1496,
"step": 11730
},
{
"epoch": 8.833709556057187,
"grad_norm": 7.66890287399292,
"learning_rate": 6.203494691079341e-06,
"loss": 0.1438,
"step": 11740
},
{
"epoch": 8.841234010534237,
"grad_norm": 8.105353355407715,
"learning_rate": 6.1993144385920914e-06,
"loss": 0.1031,
"step": 11750
},
{
"epoch": 8.848758465011286,
"grad_norm": 9.982457160949707,
"learning_rate": 6.195134186104841e-06,
"loss": 0.126,
"step": 11760
},
{
"epoch": 8.856282919488336,
"grad_norm": 6.818399429321289,
"learning_rate": 6.1909539336175905e-06,
"loss": 0.1298,
"step": 11770
},
{
"epoch": 8.863807373965388,
"grad_norm": 6.4164299964904785,
"learning_rate": 6.18677368113034e-06,
"loss": 0.1298,
"step": 11780
},
{
"epoch": 8.871331828442438,
"grad_norm": 9.575235366821289,
"learning_rate": 6.182593428643091e-06,
"loss": 0.1284,
"step": 11790
},
{
"epoch": 8.878856282919488,
"grad_norm": 5.450047492980957,
"learning_rate": 6.178413176155841e-06,
"loss": 0.1272,
"step": 11800
},
{
"epoch": 8.88638073739654,
"grad_norm": 10.198444366455078,
"learning_rate": 6.17423292366859e-06,
"loss": 0.0938,
"step": 11810
},
{
"epoch": 8.89390519187359,
"grad_norm": 6.001352310180664,
"learning_rate": 6.17005267118134e-06,
"loss": 0.0924,
"step": 11820
},
{
"epoch": 8.901429646350639,
"grad_norm": 6.211798191070557,
"learning_rate": 6.1658724186940895e-06,
"loss": 0.1193,
"step": 11830
},
{
"epoch": 8.90895410082769,
"grad_norm": 4.294961929321289,
"learning_rate": 6.16169216620684e-06,
"loss": 0.1484,
"step": 11840
},
{
"epoch": 8.91647855530474,
"grad_norm": 7.163177013397217,
"learning_rate": 6.1575119137195895e-06,
"loss": 0.0928,
"step": 11850
},
{
"epoch": 8.92400300978179,
"grad_norm": 7.8336591720581055,
"learning_rate": 6.153331661232339e-06,
"loss": 0.1004,
"step": 11860
},
{
"epoch": 8.931527464258842,
"grad_norm": 6.162928104400635,
"learning_rate": 6.1491514087450886e-06,
"loss": 0.1435,
"step": 11870
},
{
"epoch": 8.939051918735892,
"grad_norm": 7.090447902679443,
"learning_rate": 6.144971156257838e-06,
"loss": 0.1394,
"step": 11880
},
{
"epoch": 8.946576373212942,
"grad_norm": 6.38063907623291,
"learning_rate": 6.1407909037705885e-06,
"loss": 0.1458,
"step": 11890
},
{
"epoch": 8.954100827689992,
"grad_norm": 8.057353019714355,
"learning_rate": 6.136610651283338e-06,
"loss": 0.1522,
"step": 11900
},
{
"epoch": 8.961625282167043,
"grad_norm": 6.432565689086914,
"learning_rate": 6.132430398796088e-06,
"loss": 0.1279,
"step": 11910
},
{
"epoch": 8.969149736644093,
"grad_norm": 9.430665969848633,
"learning_rate": 6.128250146308837e-06,
"loss": 0.1376,
"step": 11920
},
{
"epoch": 8.976674191121143,
"grad_norm": 6.492280960083008,
"learning_rate": 6.124069893821587e-06,
"loss": 0.0997,
"step": 11930
},
{
"epoch": 8.984198645598195,
"grad_norm": 7.028443813323975,
"learning_rate": 6.119889641334337e-06,
"loss": 0.1294,
"step": 11940
},
{
"epoch": 8.991723100075244,
"grad_norm": 8.465397834777832,
"learning_rate": 6.115709388847087e-06,
"loss": 0.1444,
"step": 11950
},
{
"epoch": 8.999247554552294,
"grad_norm": 9.109210968017578,
"learning_rate": 6.111529136359836e-06,
"loss": 0.1535,
"step": 11960
},
{
"epoch": 9.006772009029346,
"grad_norm": 5.075557708740234,
"learning_rate": 6.107348883872587e-06,
"loss": 0.068,
"step": 11970
},
{
"epoch": 9.014296463506396,
"grad_norm": 4.022278785705566,
"learning_rate": 6.103168631385337e-06,
"loss": 0.1538,
"step": 11980
},
{
"epoch": 9.021820917983446,
"grad_norm": 7.8916850090026855,
"learning_rate": 6.0989883788980865e-06,
"loss": 0.1137,
"step": 11990
},
{
"epoch": 9.029345372460497,
"grad_norm": 8.236541748046875,
"learning_rate": 6.094808126410836e-06,
"loss": 0.1207,
"step": 12000
},
{
"epoch": 9.036869826937547,
"grad_norm": 5.704281806945801,
"learning_rate": 6.090627873923586e-06,
"loss": 0.1249,
"step": 12010
},
{
"epoch": 9.044394281414597,
"grad_norm": 4.6430277824401855,
"learning_rate": 6.086447621436335e-06,
"loss": 0.1156,
"step": 12020
},
{
"epoch": 9.051918735891649,
"grad_norm": 11.825648307800293,
"learning_rate": 6.082267368949086e-06,
"loss": 0.1083,
"step": 12030
},
{
"epoch": 9.059443190368698,
"grad_norm": 10.094679832458496,
"learning_rate": 6.078087116461835e-06,
"loss": 0.1102,
"step": 12040
},
{
"epoch": 9.066967644845748,
"grad_norm": 9.167254447937012,
"learning_rate": 6.073906863974585e-06,
"loss": 0.1148,
"step": 12050
},
{
"epoch": 9.0744920993228,
"grad_norm": 5.975985050201416,
"learning_rate": 6.069726611487334e-06,
"loss": 0.1222,
"step": 12060
},
{
"epoch": 9.08201655379985,
"grad_norm": 11.157576560974121,
"learning_rate": 6.065546359000084e-06,
"loss": 0.1458,
"step": 12070
},
{
"epoch": 9.0895410082769,
"grad_norm": 3.7663228511810303,
"learning_rate": 6.061366106512834e-06,
"loss": 0.1524,
"step": 12080
},
{
"epoch": 9.09706546275395,
"grad_norm": 4.96702766418457,
"learning_rate": 6.057185854025584e-06,
"loss": 0.1043,
"step": 12090
},
{
"epoch": 9.104589917231001,
"grad_norm": 9.441642761230469,
"learning_rate": 6.053005601538333e-06,
"loss": 0.1026,
"step": 12100
},
{
"epoch": 9.112114371708051,
"grad_norm": 5.106550693511963,
"learning_rate": 6.048825349051083e-06,
"loss": 0.111,
"step": 12110
},
{
"epoch": 9.119638826185101,
"grad_norm": 12.366744995117188,
"learning_rate": 6.044645096563832e-06,
"loss": 0.1293,
"step": 12120
},
{
"epoch": 9.127163280662153,
"grad_norm": 10.476715087890625,
"learning_rate": 6.040464844076582e-06,
"loss": 0.1035,
"step": 12130
},
{
"epoch": 9.134687735139202,
"grad_norm": 11.363449096679688,
"learning_rate": 6.036284591589332e-06,
"loss": 0.1441,
"step": 12140
},
{
"epoch": 9.142212189616252,
"grad_norm": 4.014682292938232,
"learning_rate": 6.032104339102082e-06,
"loss": 0.1294,
"step": 12150
},
{
"epoch": 9.149736644093304,
"grad_norm": 7.98138952255249,
"learning_rate": 6.027924086614832e-06,
"loss": 0.096,
"step": 12160
},
{
"epoch": 9.157261098570354,
"grad_norm": 8.680583000183105,
"learning_rate": 6.023743834127583e-06,
"loss": 0.1076,
"step": 12170
},
{
"epoch": 9.164785553047404,
"grad_norm": 8.325934410095215,
"learning_rate": 6.019563581640332e-06,
"loss": 0.0796,
"step": 12180
},
{
"epoch": 9.172310007524455,
"grad_norm": 5.094624996185303,
"learning_rate": 6.015383329153082e-06,
"loss": 0.0804,
"step": 12190
},
{
"epoch": 9.179834462001505,
"grad_norm": 4.378869533538818,
"learning_rate": 6.011203076665831e-06,
"loss": 0.111,
"step": 12200
},
{
"epoch": 9.187358916478555,
"grad_norm": 8.656767845153809,
"learning_rate": 6.007022824178581e-06,
"loss": 0.1048,
"step": 12210
},
{
"epoch": 9.194883370955605,
"grad_norm": 4.8737592697143555,
"learning_rate": 6.00284257169133e-06,
"loss": 0.1315,
"step": 12220
},
{
"epoch": 9.202407825432656,
"grad_norm": 7.291375637054443,
"learning_rate": 5.998662319204081e-06,
"loss": 0.1414,
"step": 12230
},
{
"epoch": 9.209932279909706,
"grad_norm": 6.14476203918457,
"learning_rate": 5.99448206671683e-06,
"loss": 0.1087,
"step": 12240
},
{
"epoch": 9.217456734386756,
"grad_norm": 9.752476692199707,
"learning_rate": 5.99030181422958e-06,
"loss": 0.1536,
"step": 12250
},
{
"epoch": 9.224981188863808,
"grad_norm": 8.39211368560791,
"learning_rate": 5.9861215617423294e-06,
"loss": 0.1192,
"step": 12260
},
{
"epoch": 9.232505643340858,
"grad_norm": 10.023391723632812,
"learning_rate": 5.981941309255079e-06,
"loss": 0.1397,
"step": 12270
},
{
"epoch": 9.240030097817908,
"grad_norm": 7.0415730476379395,
"learning_rate": 5.977761056767829e-06,
"loss": 0.1331,
"step": 12280
},
{
"epoch": 9.24755455229496,
"grad_norm": 7.50435733795166,
"learning_rate": 5.973580804280579e-06,
"loss": 0.1104,
"step": 12290
},
{
"epoch": 9.255079006772009,
"grad_norm": 2.4626896381378174,
"learning_rate": 5.9694005517933285e-06,
"loss": 0.1293,
"step": 12300
},
{
"epoch": 9.262603461249059,
"grad_norm": 8.800031661987305,
"learning_rate": 5.965220299306078e-06,
"loss": 0.0948,
"step": 12310
},
{
"epoch": 9.27012791572611,
"grad_norm": 5.665508270263672,
"learning_rate": 5.9610400468188276e-06,
"loss": 0.082,
"step": 12320
},
{
"epoch": 9.27765237020316,
"grad_norm": 6.743051052093506,
"learning_rate": 5.956859794331578e-06,
"loss": 0.1377,
"step": 12330
},
{
"epoch": 9.28517682468021,
"grad_norm": 2.95204496383667,
"learning_rate": 5.952679541844328e-06,
"loss": 0.1225,
"step": 12340
},
{
"epoch": 9.292701279157262,
"grad_norm": 8.680118560791016,
"learning_rate": 5.948499289357078e-06,
"loss": 0.1105,
"step": 12350
},
{
"epoch": 9.300225733634312,
"grad_norm": 7.5288615226745605,
"learning_rate": 5.9443190368698275e-06,
"loss": 0.1021,
"step": 12360
},
{
"epoch": 9.307750188111362,
"grad_norm": 5.6351637840271,
"learning_rate": 5.940138784382578e-06,
"loss": 0.0972,
"step": 12370
},
{
"epoch": 9.315274642588413,
"grad_norm": 6.20802640914917,
"learning_rate": 5.935958531895327e-06,
"loss": 0.1096,
"step": 12380
},
{
"epoch": 9.322799097065463,
"grad_norm": 12.660746574401855,
"learning_rate": 5.931778279408077e-06,
"loss": 0.1233,
"step": 12390
},
{
"epoch": 9.330323551542513,
"grad_norm": 6.66998291015625,
"learning_rate": 5.9275980269208265e-06,
"loss": 0.1361,
"step": 12400
},
{
"epoch": 9.337848006019563,
"grad_norm": 4.0259318351745605,
"learning_rate": 5.923417774433576e-06,
"loss": 0.1011,
"step": 12410
},
{
"epoch": 9.345372460496614,
"grad_norm": 4.9495744705200195,
"learning_rate": 5.9192375219463264e-06,
"loss": 0.1337,
"step": 12420
},
{
"epoch": 9.352896914973664,
"grad_norm": 7.259575366973877,
"learning_rate": 5.915057269459076e-06,
"loss": 0.0662,
"step": 12430
},
{
"epoch": 9.360421369450714,
"grad_norm": 3.8569116592407227,
"learning_rate": 5.9108770169718255e-06,
"loss": 0.1211,
"step": 12440
},
{
"epoch": 9.367945823927766,
"grad_norm": 2.6672203540802,
"learning_rate": 5.906696764484575e-06,
"loss": 0.1019,
"step": 12450
},
{
"epoch": 9.375470278404816,
"grad_norm": 6.545354843139648,
"learning_rate": 5.902516511997325e-06,
"loss": 0.0804,
"step": 12460
},
{
"epoch": 9.382994732881865,
"grad_norm": 10.449742317199707,
"learning_rate": 5.898336259510074e-06,
"loss": 0.1515,
"step": 12470
},
{
"epoch": 9.390519187358917,
"grad_norm": 5.685015678405762,
"learning_rate": 5.894156007022825e-06,
"loss": 0.1063,
"step": 12480
},
{
"epoch": 9.398043641835967,
"grad_norm": 7.485517978668213,
"learning_rate": 5.889975754535574e-06,
"loss": 0.1443,
"step": 12490
},
{
"epoch": 9.405568096313017,
"grad_norm": 6.86757230758667,
"learning_rate": 5.885795502048324e-06,
"loss": 0.1088,
"step": 12500
},
{
"epoch": 9.413092550790068,
"grad_norm": 9.321928977966309,
"learning_rate": 5.881615249561073e-06,
"loss": 0.1346,
"step": 12510
},
{
"epoch": 9.420617005267118,
"grad_norm": 4.986440181732178,
"learning_rate": 5.8774349970738245e-06,
"loss": 0.1582,
"step": 12520
},
{
"epoch": 9.428141459744168,
"grad_norm": 7.16530179977417,
"learning_rate": 5.873254744586574e-06,
"loss": 0.0851,
"step": 12530
},
{
"epoch": 9.43566591422122,
"grad_norm": 8.802489280700684,
"learning_rate": 5.8690744920993236e-06,
"loss": 0.117,
"step": 12540
},
{
"epoch": 9.44319036869827,
"grad_norm": 7.261384010314941,
"learning_rate": 5.864894239612073e-06,
"loss": 0.184,
"step": 12550
},
{
"epoch": 9.45071482317532,
"grad_norm": 11.29928207397461,
"learning_rate": 5.860713987124823e-06,
"loss": 0.0921,
"step": 12560
},
{
"epoch": 9.45823927765237,
"grad_norm": 9.241240501403809,
"learning_rate": 5.856533734637573e-06,
"loss": 0.1232,
"step": 12570
},
{
"epoch": 9.465763732129421,
"grad_norm": 8.743525505065918,
"learning_rate": 5.852353482150323e-06,
"loss": 0.109,
"step": 12580
},
{
"epoch": 9.47328818660647,
"grad_norm": 7.939570426940918,
"learning_rate": 5.848173229663072e-06,
"loss": 0.1403,
"step": 12590
},
{
"epoch": 9.48081264108352,
"grad_norm": 11.364997863769531,
"learning_rate": 5.843992977175822e-06,
"loss": 0.182,
"step": 12600
},
{
"epoch": 9.488337095560572,
"grad_norm": 6.514659881591797,
"learning_rate": 5.839812724688571e-06,
"loss": 0.1144,
"step": 12610
},
{
"epoch": 9.495861550037622,
"grad_norm": 5.120135307312012,
"learning_rate": 5.835632472201322e-06,
"loss": 0.1045,
"step": 12620
},
{
"epoch": 9.503386004514672,
"grad_norm": 6.3493733406066895,
"learning_rate": 5.831452219714071e-06,
"loss": 0.0998,
"step": 12630
},
{
"epoch": 9.510910458991724,
"grad_norm": 10.599377632141113,
"learning_rate": 5.827271967226821e-06,
"loss": 0.1201,
"step": 12640
},
{
"epoch": 9.518434913468774,
"grad_norm": 5.005101203918457,
"learning_rate": 5.82309171473957e-06,
"loss": 0.0909,
"step": 12650
},
{
"epoch": 9.525959367945823,
"grad_norm": 5.786133289337158,
"learning_rate": 5.81891146225232e-06,
"loss": 0.1231,
"step": 12660
},
{
"epoch": 9.533483822422875,
"grad_norm": 8.286248207092285,
"learning_rate": 5.81473120976507e-06,
"loss": 0.0995,
"step": 12670
},
{
"epoch": 9.541008276899925,
"grad_norm": 6.8975653648376465,
"learning_rate": 5.81055095727782e-06,
"loss": 0.1253,
"step": 12680
},
{
"epoch": 9.548532731376975,
"grad_norm": 7.303131580352783,
"learning_rate": 5.806370704790569e-06,
"loss": 0.1029,
"step": 12690
},
{
"epoch": 9.556057185854026,
"grad_norm": 3.548121213912964,
"learning_rate": 5.80219045230332e-06,
"loss": 0.106,
"step": 12700
},
{
"epoch": 9.563581640331076,
"grad_norm": 5.769835472106934,
"learning_rate": 5.79801019981607e-06,
"loss": 0.1213,
"step": 12710
},
{
"epoch": 9.571106094808126,
"grad_norm": 5.858428955078125,
"learning_rate": 5.79382994732882e-06,
"loss": 0.1673,
"step": 12720
},
{
"epoch": 9.578630549285176,
"grad_norm": 8.080878257751465,
"learning_rate": 5.789649694841569e-06,
"loss": 0.1614,
"step": 12730
},
{
"epoch": 9.586155003762228,
"grad_norm": 3.0256166458129883,
"learning_rate": 5.785469442354319e-06,
"loss": 0.1086,
"step": 12740
},
{
"epoch": 9.593679458239277,
"grad_norm": 9.315362930297852,
"learning_rate": 5.781289189867068e-06,
"loss": 0.1029,
"step": 12750
},
{
"epoch": 9.601203912716327,
"grad_norm": 11.079126358032227,
"learning_rate": 5.777108937379819e-06,
"loss": 0.1252,
"step": 12760
},
{
"epoch": 9.608728367193379,
"grad_norm": 5.017213821411133,
"learning_rate": 5.772928684892568e-06,
"loss": 0.1493,
"step": 12770
},
{
"epoch": 9.616252821670429,
"grad_norm": 7.71808385848999,
"learning_rate": 5.768748432405318e-06,
"loss": 0.1598,
"step": 12780
},
{
"epoch": 9.623777276147479,
"grad_norm": 6.4492363929748535,
"learning_rate": 5.764568179918067e-06,
"loss": 0.1126,
"step": 12790
},
{
"epoch": 9.63130173062453,
"grad_norm": 9.06157112121582,
"learning_rate": 5.760387927430817e-06,
"loss": 0.1009,
"step": 12800
},
{
"epoch": 9.63882618510158,
"grad_norm": 7.8551154136657715,
"learning_rate": 5.756207674943567e-06,
"loss": 0.1184,
"step": 12810
},
{
"epoch": 9.64635063957863,
"grad_norm": 5.052379608154297,
"learning_rate": 5.752027422456317e-06,
"loss": 0.0983,
"step": 12820
},
{
"epoch": 9.653875094055682,
"grad_norm": 6.5726518630981445,
"learning_rate": 5.747847169969066e-06,
"loss": 0.088,
"step": 12830
},
{
"epoch": 9.661399548532732,
"grad_norm": 7.628342628479004,
"learning_rate": 5.743666917481816e-06,
"loss": 0.0957,
"step": 12840
},
{
"epoch": 9.668924003009781,
"grad_norm": 9.543529510498047,
"learning_rate": 5.7394866649945655e-06,
"loss": 0.1356,
"step": 12850
},
{
"epoch": 9.676448457486833,
"grad_norm": 7.6209397315979,
"learning_rate": 5.735306412507315e-06,
"loss": 0.1546,
"step": 12860
},
{
"epoch": 9.683972911963883,
"grad_norm": 7.236486434936523,
"learning_rate": 5.7311261600200655e-06,
"loss": 0.1506,
"step": 12870
},
{
"epoch": 9.691497366440933,
"grad_norm": 7.329847812652588,
"learning_rate": 5.726945907532816e-06,
"loss": 0.1615,
"step": 12880
},
{
"epoch": 9.699021820917984,
"grad_norm": 4.845203399658203,
"learning_rate": 5.722765655045565e-06,
"loss": 0.1202,
"step": 12890
},
{
"epoch": 9.706546275395034,
"grad_norm": 9.867792129516602,
"learning_rate": 5.718585402558316e-06,
"loss": 0.1367,
"step": 12900
},
{
"epoch": 9.714070729872084,
"grad_norm": 11.250028610229492,
"learning_rate": 5.714405150071065e-06,
"loss": 0.1792,
"step": 12910
},
{
"epoch": 9.721595184349134,
"grad_norm": 5.1288886070251465,
"learning_rate": 5.710224897583815e-06,
"loss": 0.1051,
"step": 12920
},
{
"epoch": 9.729119638826186,
"grad_norm": 7.018002033233643,
"learning_rate": 5.7060446450965644e-06,
"loss": 0.0986,
"step": 12930
},
{
"epoch": 9.736644093303235,
"grad_norm": 4.279899597167969,
"learning_rate": 5.701864392609314e-06,
"loss": 0.1085,
"step": 12940
},
{
"epoch": 9.744168547780285,
"grad_norm": 3.208705186843872,
"learning_rate": 5.6976841401220635e-06,
"loss": 0.1091,
"step": 12950
},
{
"epoch": 9.751693002257337,
"grad_norm": 8.875226974487305,
"learning_rate": 5.693503887634814e-06,
"loss": 0.1267,
"step": 12960
},
{
"epoch": 9.759217456734387,
"grad_norm": 8.384805679321289,
"learning_rate": 5.6893236351475635e-06,
"loss": 0.1048,
"step": 12970
},
{
"epoch": 9.766741911211437,
"grad_norm": 6.142811298370361,
"learning_rate": 5.685143382660313e-06,
"loss": 0.1153,
"step": 12980
},
{
"epoch": 9.774266365688488,
"grad_norm": 9.738375663757324,
"learning_rate": 5.680963130173063e-06,
"loss": 0.1123,
"step": 12990
},
{
"epoch": 9.781790820165538,
"grad_norm": 5.795185089111328,
"learning_rate": 5.676782877685812e-06,
"loss": 0.1143,
"step": 13000
},
{
"epoch": 9.789315274642588,
"grad_norm": 8.338446617126465,
"learning_rate": 5.6726026251985625e-06,
"loss": 0.1388,
"step": 13010
},
{
"epoch": 9.79683972911964,
"grad_norm": 3.7553884983062744,
"learning_rate": 5.668422372711312e-06,
"loss": 0.1147,
"step": 13020
},
{
"epoch": 9.80436418359669,
"grad_norm": 4.050734043121338,
"learning_rate": 5.664242120224062e-06,
"loss": 0.1068,
"step": 13030
},
{
"epoch": 9.81188863807374,
"grad_norm": 9.254073143005371,
"learning_rate": 5.660061867736811e-06,
"loss": 0.1267,
"step": 13040
},
{
"epoch": 9.81941309255079,
"grad_norm": 10.481110572814941,
"learning_rate": 5.655881615249561e-06,
"loss": 0.1394,
"step": 13050
},
{
"epoch": 9.82693754702784,
"grad_norm": 5.782297611236572,
"learning_rate": 5.651701362762312e-06,
"loss": 0.094,
"step": 13060
},
{
"epoch": 9.83446200150489,
"grad_norm": 4.115938663482666,
"learning_rate": 5.6475211102750615e-06,
"loss": 0.1009,
"step": 13070
},
{
"epoch": 9.84198645598194,
"grad_norm": 9.126145362854004,
"learning_rate": 5.643340857787811e-06,
"loss": 0.1249,
"step": 13080
},
{
"epoch": 9.849510910458992,
"grad_norm": 5.1168131828308105,
"learning_rate": 5.639160605300561e-06,
"loss": 0.1231,
"step": 13090
},
{
"epoch": 9.857035364936042,
"grad_norm": 3.9374754428863525,
"learning_rate": 5.634980352813311e-06,
"loss": 0.1204,
"step": 13100
},
{
"epoch": 9.864559819413092,
"grad_norm": 6.21238899230957,
"learning_rate": 5.6308001003260606e-06,
"loss": 0.1009,
"step": 13110
},
{
"epoch": 9.872084273890144,
"grad_norm": 3.8661928176879883,
"learning_rate": 5.62661984783881e-06,
"loss": 0.1324,
"step": 13120
},
{
"epoch": 9.879608728367193,
"grad_norm": 9.107396125793457,
"learning_rate": 5.62243959535156e-06,
"loss": 0.12,
"step": 13130
},
{
"epoch": 9.887133182844243,
"grad_norm": 9.252467155456543,
"learning_rate": 5.618259342864309e-06,
"loss": 0.1414,
"step": 13140
},
{
"epoch": 9.894657637321295,
"grad_norm": 5.15799617767334,
"learning_rate": 5.61407909037706e-06,
"loss": 0.0885,
"step": 13150
},
{
"epoch": 9.902182091798345,
"grad_norm": 7.307941913604736,
"learning_rate": 5.609898837889809e-06,
"loss": 0.1298,
"step": 13160
},
{
"epoch": 9.909706546275395,
"grad_norm": 7.1728644371032715,
"learning_rate": 5.605718585402559e-06,
"loss": 0.082,
"step": 13170
},
{
"epoch": 9.917231000752446,
"grad_norm": 6.501073360443115,
"learning_rate": 5.601538332915308e-06,
"loss": 0.1097,
"step": 13180
},
{
"epoch": 9.924755455229496,
"grad_norm": 5.394291877746582,
"learning_rate": 5.597358080428058e-06,
"loss": 0.1422,
"step": 13190
},
{
"epoch": 9.932279909706546,
"grad_norm": 3.971874952316284,
"learning_rate": 5.593177827940808e-06,
"loss": 0.094,
"step": 13200
},
{
"epoch": 9.939804364183598,
"grad_norm": 6.758007526397705,
"learning_rate": 5.588997575453558e-06,
"loss": 0.1511,
"step": 13210
},
{
"epoch": 9.947328818660647,
"grad_norm": 12.515239715576172,
"learning_rate": 5.584817322966307e-06,
"loss": 0.1404,
"step": 13220
},
{
"epoch": 9.954853273137697,
"grad_norm": 10.174310684204102,
"learning_rate": 5.580637070479057e-06,
"loss": 0.1463,
"step": 13230
},
{
"epoch": 9.962377727614747,
"grad_norm": 6.401690483093262,
"learning_rate": 5.576456817991808e-06,
"loss": 0.1249,
"step": 13240
},
{
"epoch": 9.969902182091799,
"grad_norm": 9.405531883239746,
"learning_rate": 5.572276565504558e-06,
"loss": 0.1145,
"step": 13250
},
{
"epoch": 9.977426636568849,
"grad_norm": 8.298341751098633,
"learning_rate": 5.568096313017307e-06,
"loss": 0.1074,
"step": 13260
},
{
"epoch": 9.984951091045899,
"grad_norm": 6.206060409545898,
"learning_rate": 5.563916060530057e-06,
"loss": 0.1025,
"step": 13270
},
{
"epoch": 9.99247554552295,
"grad_norm": 6.96009635925293,
"learning_rate": 5.559735808042806e-06,
"loss": 0.12,
"step": 13280
},
{
"epoch": 10.0,
"grad_norm": 0.6092808842658997,
"learning_rate": 5.555555555555557e-06,
"loss": 0.1308,
"step": 13290
}
],
"logging_steps": 10,
"max_steps": 26580,
"num_input_tokens_seen": 0,
"num_train_epochs": 20,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}