AgPerry's picture
Upload folder using huggingface_hub
ff7cf8d verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 3295,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0030350165029022344,
"grad_norm": 2.2979175122091453,
"learning_rate": 2.7272727272727274e-07,
"loss": 0.789,
"step": 10
},
{
"epoch": 0.006070033005804469,
"grad_norm": 1.7852801501566888,
"learning_rate": 5.757575757575758e-07,
"loss": 0.7904,
"step": 20
},
{
"epoch": 0.009105049508706704,
"grad_norm": 1.1632800870309477,
"learning_rate": 8.787878787878788e-07,
"loss": 0.744,
"step": 30
},
{
"epoch": 0.012140066011608937,
"grad_norm": 0.8818693207084786,
"learning_rate": 1.181818181818182e-06,
"loss": 0.693,
"step": 40
},
{
"epoch": 0.015175082514511173,
"grad_norm": 0.657250412764622,
"learning_rate": 1.484848484848485e-06,
"loss": 0.6656,
"step": 50
},
{
"epoch": 0.018210099017413408,
"grad_norm": 0.5316521889428266,
"learning_rate": 1.787878787878788e-06,
"loss": 0.6313,
"step": 60
},
{
"epoch": 0.021245115520315643,
"grad_norm": 0.4575711828032985,
"learning_rate": 2.090909090909091e-06,
"loss": 0.6098,
"step": 70
},
{
"epoch": 0.024280132023217875,
"grad_norm": 0.4548384308100045,
"learning_rate": 2.393939393939394e-06,
"loss": 0.5885,
"step": 80
},
{
"epoch": 0.02731514852612011,
"grad_norm": 0.4458364948040082,
"learning_rate": 2.6969696969696972e-06,
"loss": 0.5916,
"step": 90
},
{
"epoch": 0.030350165029022345,
"grad_norm": 0.4794166169125783,
"learning_rate": 3e-06,
"loss": 0.5753,
"step": 100
},
{
"epoch": 0.03338518153192458,
"grad_norm": 0.4567719754046465,
"learning_rate": 3.3030303030303033e-06,
"loss": 0.5597,
"step": 110
},
{
"epoch": 0.036420198034826816,
"grad_norm": 0.4307419720442758,
"learning_rate": 3.606060606060606e-06,
"loss": 0.5684,
"step": 120
},
{
"epoch": 0.03945521453772905,
"grad_norm": 0.4458642819812865,
"learning_rate": 3.90909090909091e-06,
"loss": 0.5569,
"step": 130
},
{
"epoch": 0.042490231040631286,
"grad_norm": 0.45568817442378473,
"learning_rate": 4.212121212121212e-06,
"loss": 0.5551,
"step": 140
},
{
"epoch": 0.045525247543533515,
"grad_norm": 0.4950300651709512,
"learning_rate": 4.5151515151515155e-06,
"loss": 0.5565,
"step": 150
},
{
"epoch": 0.04856026404643575,
"grad_norm": 0.4441786811205893,
"learning_rate": 4.818181818181819e-06,
"loss": 0.5482,
"step": 160
},
{
"epoch": 0.051595280549337985,
"grad_norm": 0.4926559856436913,
"learning_rate": 5.121212121212121e-06,
"loss": 0.5482,
"step": 170
},
{
"epoch": 0.05463029705224022,
"grad_norm": 0.46505669754342027,
"learning_rate": 5.424242424242425e-06,
"loss": 0.5397,
"step": 180
},
{
"epoch": 0.057665313555142456,
"grad_norm": 0.4862932493508676,
"learning_rate": 5.727272727272728e-06,
"loss": 0.5379,
"step": 190
},
{
"epoch": 0.06070033005804469,
"grad_norm": 0.5633080158535704,
"learning_rate": 6.030303030303031e-06,
"loss": 0.5487,
"step": 200
},
{
"epoch": 0.06373534656094693,
"grad_norm": 0.5144964810224658,
"learning_rate": 6.333333333333333e-06,
"loss": 0.5354,
"step": 210
},
{
"epoch": 0.06677036306384916,
"grad_norm": 0.623858354223414,
"learning_rate": 6.6363636363636375e-06,
"loss": 0.5321,
"step": 220
},
{
"epoch": 0.0698053795667514,
"grad_norm": 0.5384794963785807,
"learning_rate": 6.93939393939394e-06,
"loss": 0.5286,
"step": 230
},
{
"epoch": 0.07284039606965363,
"grad_norm": 0.535110770579217,
"learning_rate": 7.242424242424243e-06,
"loss": 0.5277,
"step": 240
},
{
"epoch": 0.07587541257255587,
"grad_norm": 0.5036293665682129,
"learning_rate": 7.545454545454546e-06,
"loss": 0.5333,
"step": 250
},
{
"epoch": 0.0789104290754581,
"grad_norm": 0.47084801964860584,
"learning_rate": 7.848484848484849e-06,
"loss": 0.5272,
"step": 260
},
{
"epoch": 0.08194544557836034,
"grad_norm": 0.6020093296202623,
"learning_rate": 8.151515151515152e-06,
"loss": 0.5283,
"step": 270
},
{
"epoch": 0.08498046208126257,
"grad_norm": 0.5024737305172274,
"learning_rate": 8.454545454545455e-06,
"loss": 0.5212,
"step": 280
},
{
"epoch": 0.08801547858416481,
"grad_norm": 0.4994326809453112,
"learning_rate": 8.757575757575759e-06,
"loss": 0.5161,
"step": 290
},
{
"epoch": 0.09105049508706703,
"grad_norm": 0.5231140922773104,
"learning_rate": 9.06060606060606e-06,
"loss": 0.5281,
"step": 300
},
{
"epoch": 0.09408551158996926,
"grad_norm": 0.5098295511036418,
"learning_rate": 9.363636363636365e-06,
"loss": 0.5154,
"step": 310
},
{
"epoch": 0.0971205280928715,
"grad_norm": 0.5892467794961541,
"learning_rate": 9.666666666666667e-06,
"loss": 0.5218,
"step": 320
},
{
"epoch": 0.10015554459577374,
"grad_norm": 0.4951705896851936,
"learning_rate": 9.96969696969697e-06,
"loss": 0.5169,
"step": 330
},
{
"epoch": 0.10319056109867597,
"grad_norm": 0.6364271310354288,
"learning_rate": 9.999772661973056e-06,
"loss": 0.5133,
"step": 340
},
{
"epoch": 0.1062255776015782,
"grad_norm": 0.5028738085648048,
"learning_rate": 9.99898682866784e-06,
"loss": 0.52,
"step": 350
},
{
"epoch": 0.10926059410448044,
"grad_norm": 0.5217859360497503,
"learning_rate": 9.997639781643002e-06,
"loss": 0.5008,
"step": 360
},
{
"epoch": 0.11229561060738268,
"grad_norm": 0.5157314231456531,
"learning_rate": 9.99573167212544e-06,
"loss": 0.517,
"step": 370
},
{
"epoch": 0.11533062711028491,
"grad_norm": 0.524618846108783,
"learning_rate": 9.993262714330009e-06,
"loss": 0.5092,
"step": 380
},
{
"epoch": 0.11836564361318715,
"grad_norm": 0.5191412167795787,
"learning_rate": 9.990233185435473e-06,
"loss": 0.513,
"step": 390
},
{
"epoch": 0.12140066011608938,
"grad_norm": 0.5047990336080693,
"learning_rate": 9.986643425553386e-06,
"loss": 0.5129,
"step": 400
},
{
"epoch": 0.12443567661899162,
"grad_norm": 0.49678214945335275,
"learning_rate": 9.98249383768991e-06,
"loss": 0.5073,
"step": 410
},
{
"epoch": 0.12747069312189385,
"grad_norm": 0.5341967716757828,
"learning_rate": 9.977784887700572e-06,
"loss": 0.5088,
"step": 420
},
{
"epoch": 0.13050570962479607,
"grad_norm": 0.49917946146850173,
"learning_rate": 9.972517104237961e-06,
"loss": 0.4991,
"step": 430
},
{
"epoch": 0.13354072612769832,
"grad_norm": 0.49221501327780853,
"learning_rate": 9.966691078692386e-06,
"loss": 0.506,
"step": 440
},
{
"epoch": 0.13657574263060054,
"grad_norm": 0.5448393355182506,
"learning_rate": 9.960307465125472e-06,
"loss": 0.5025,
"step": 450
},
{
"epoch": 0.1396107591335028,
"grad_norm": 0.9165522381667764,
"learning_rate": 9.953366980196746e-06,
"loss": 0.4976,
"step": 460
},
{
"epoch": 0.14264577563640501,
"grad_norm": 0.5122649698502958,
"learning_rate": 9.945870403083164e-06,
"loss": 0.503,
"step": 470
},
{
"epoch": 0.14568079213930726,
"grad_norm": 0.48003844926471007,
"learning_rate": 9.937818575391654e-06,
"loss": 0.5044,
"step": 480
},
{
"epoch": 0.14871580864220948,
"grad_norm": 0.5083116253946615,
"learning_rate": 9.929212401064616e-06,
"loss": 0.505,
"step": 490
},
{
"epoch": 0.15175082514511173,
"grad_norm": 0.4997990195713387,
"learning_rate": 9.920052846278455e-06,
"loss": 0.4991,
"step": 500
},
{
"epoch": 0.15478584164801396,
"grad_norm": 0.4863244820808186,
"learning_rate": 9.910340939335098e-06,
"loss": 0.4889,
"step": 510
},
{
"epoch": 0.1578208581509162,
"grad_norm": 0.519835159370644,
"learning_rate": 9.900077770546567e-06,
"loss": 0.488,
"step": 520
},
{
"epoch": 0.16085587465381843,
"grad_norm": 0.5247955372337217,
"learning_rate": 9.889264492112563e-06,
"loss": 0.5025,
"step": 530
},
{
"epoch": 0.16389089115672067,
"grad_norm": 0.47839854894729905,
"learning_rate": 9.877902317991116e-06,
"loss": 0.4946,
"step": 540
},
{
"epoch": 0.1669259076596229,
"grad_norm": 0.4889470867795177,
"learning_rate": 9.865992523762306e-06,
"loss": 0.4989,
"step": 550
},
{
"epoch": 0.16996092416252515,
"grad_norm": 0.4753575663441252,
"learning_rate": 9.853536446485048e-06,
"loss": 0.503,
"step": 560
},
{
"epoch": 0.17299594066542737,
"grad_norm": 0.5422582939397118,
"learning_rate": 9.840535484546996e-06,
"loss": 0.4903,
"step": 570
},
{
"epoch": 0.17603095716832962,
"grad_norm": 0.4706321475205836,
"learning_rate": 9.826991097507548e-06,
"loss": 0.4958,
"step": 580
},
{
"epoch": 0.17906597367123184,
"grad_norm": 0.4839096225021675,
"learning_rate": 9.812904805933989e-06,
"loss": 0.4922,
"step": 590
},
{
"epoch": 0.18210099017413406,
"grad_norm": 0.4732593571246822,
"learning_rate": 9.798278191230783e-06,
"loss": 0.5004,
"step": 600
},
{
"epoch": 0.1851360066770363,
"grad_norm": 0.4505029818283895,
"learning_rate": 9.78311289546204e-06,
"loss": 0.4802,
"step": 610
},
{
"epoch": 0.18817102317993853,
"grad_norm": 0.5055436622019127,
"learning_rate": 9.76741062116716e-06,
"loss": 0.4945,
"step": 620
},
{
"epoch": 0.19120603968284078,
"grad_norm": 0.49572149753193984,
"learning_rate": 9.751173131169705e-06,
"loss": 0.4906,
"step": 630
},
{
"epoch": 0.194241056185743,
"grad_norm": 0.4448394645138839,
"learning_rate": 9.73440224837949e-06,
"loss": 0.496,
"step": 640
},
{
"epoch": 0.19727607268864525,
"grad_norm": 0.49316009769902025,
"learning_rate": 9.717099855587935e-06,
"loss": 0.486,
"step": 650
},
{
"epoch": 0.20031108919154747,
"grad_norm": 0.47797040662070445,
"learning_rate": 9.699267895256695e-06,
"loss": 0.4769,
"step": 660
},
{
"epoch": 0.20334610569444972,
"grad_norm": 0.5092952568059224,
"learning_rate": 9.68090836929958e-06,
"loss": 0.4918,
"step": 670
},
{
"epoch": 0.20638112219735194,
"grad_norm": 0.45202984420347087,
"learning_rate": 9.662023338857822e-06,
"loss": 0.485,
"step": 680
},
{
"epoch": 0.2094161387002542,
"grad_norm": 0.4983093319634588,
"learning_rate": 9.642614924068667e-06,
"loss": 0.4902,
"step": 690
},
{
"epoch": 0.2124511552031564,
"grad_norm": 0.492174959560037,
"learning_rate": 9.622685303827366e-06,
"loss": 0.4881,
"step": 700
},
{
"epoch": 0.21548617170605866,
"grad_norm": 0.47815379215317105,
"learning_rate": 9.602236715542557e-06,
"loss": 0.4848,
"step": 710
},
{
"epoch": 0.21852118820896088,
"grad_norm": 0.48589573091096694,
"learning_rate": 9.581271454885077e-06,
"loss": 0.4903,
"step": 720
},
{
"epoch": 0.22155620471186313,
"grad_norm": 0.46478604800112194,
"learning_rate": 9.559791875530247e-06,
"loss": 0.489,
"step": 730
},
{
"epoch": 0.22459122121476535,
"grad_norm": 0.5035008932669235,
"learning_rate": 9.537800388893628e-06,
"loss": 0.4864,
"step": 740
},
{
"epoch": 0.2276262377176676,
"grad_norm": 0.49713803635487736,
"learning_rate": 9.515299463860301e-06,
"loss": 0.4858,
"step": 750
},
{
"epoch": 0.23066125422056982,
"grad_norm": 0.48728829894131276,
"learning_rate": 9.492291626507705e-06,
"loss": 0.4874,
"step": 760
},
{
"epoch": 0.23369627072347207,
"grad_norm": 0.5654089866669252,
"learning_rate": 9.468779459822034e-06,
"loss": 0.4865,
"step": 770
},
{
"epoch": 0.2367312872263743,
"grad_norm": 0.456617843862989,
"learning_rate": 9.444765603408273e-06,
"loss": 0.4834,
"step": 780
},
{
"epoch": 0.23976630372927651,
"grad_norm": 0.7108327075954629,
"learning_rate": 9.420252753193842e-06,
"loss": 0.4725,
"step": 790
},
{
"epoch": 0.24280132023217876,
"grad_norm": 0.49021009503467355,
"learning_rate": 9.395243661125948e-06,
"loss": 0.4882,
"step": 800
},
{
"epoch": 0.24583633673508098,
"grad_norm": 0.48115706251944507,
"learning_rate": 9.369741134862636e-06,
"loss": 0.4752,
"step": 810
},
{
"epoch": 0.24887135323798323,
"grad_norm": 0.5131051402999688,
"learning_rate": 9.343748037457585e-06,
"loss": 0.4869,
"step": 820
},
{
"epoch": 0.2519063697408855,
"grad_norm": 0.4658252213252816,
"learning_rate": 9.317267287038682e-06,
"loss": 0.4884,
"step": 830
},
{
"epoch": 0.2549413862437877,
"grad_norm": 0.4751568661396783,
"learning_rate": 9.290301856480425e-06,
"loss": 0.4797,
"step": 840
},
{
"epoch": 0.2579764027466899,
"grad_norm": 0.47930138340752015,
"learning_rate": 9.262854773070157e-06,
"loss": 0.4869,
"step": 850
},
{
"epoch": 0.26101141924959215,
"grad_norm": 0.47733793569696137,
"learning_rate": 9.234929118168228e-06,
"loss": 0.4712,
"step": 860
},
{
"epoch": 0.2640464357524944,
"grad_norm": 0.4566798290905438,
"learning_rate": 9.206528026862043e-06,
"loss": 0.4765,
"step": 870
},
{
"epoch": 0.26708145225539665,
"grad_norm": 0.4466757545533174,
"learning_rate": 9.177654687614112e-06,
"loss": 0.4824,
"step": 880
},
{
"epoch": 0.27011646875829887,
"grad_norm": 0.47012964956941894,
"learning_rate": 9.148312341904095e-06,
"loss": 0.4768,
"step": 890
},
{
"epoch": 0.2731514852612011,
"grad_norm": 0.4650672010975063,
"learning_rate": 9.118504283864891e-06,
"loss": 0.4763,
"step": 900
},
{
"epoch": 0.27618650176410336,
"grad_norm": 0.4522875610929074,
"learning_rate": 9.088233859912823e-06,
"loss": 0.4774,
"step": 910
},
{
"epoch": 0.2792215182670056,
"grad_norm": 0.9218754990609884,
"learning_rate": 9.057504468371954e-06,
"loss": 0.4774,
"step": 920
},
{
"epoch": 0.2822565347699078,
"grad_norm": 0.4446080760553245,
"learning_rate": 9.026319559092566e-06,
"loss": 0.4822,
"step": 930
},
{
"epoch": 0.28529155127281003,
"grad_norm": 0.45974314278861356,
"learning_rate": 8.994682633063868e-06,
"loss": 0.4737,
"step": 940
},
{
"epoch": 0.2883265677757123,
"grad_norm": 0.46640835738246217,
"learning_rate": 8.962597242020947e-06,
"loss": 0.4772,
"step": 950
},
{
"epoch": 0.2913615842786145,
"grad_norm": 0.45954793881402833,
"learning_rate": 8.930066988046042e-06,
"loss": 0.4688,
"step": 960
},
{
"epoch": 0.29439660078151675,
"grad_norm": 0.5238237149613728,
"learning_rate": 8.897095523164141e-06,
"loss": 0.4742,
"step": 970
},
{
"epoch": 0.29743161728441897,
"grad_norm": 0.45541775923959604,
"learning_rate": 8.863686548933001e-06,
"loss": 0.4786,
"step": 980
},
{
"epoch": 0.3004666337873212,
"grad_norm": 0.46811147224647004,
"learning_rate": 8.829843816027575e-06,
"loss": 0.4706,
"step": 990
},
{
"epoch": 0.30350165029022347,
"grad_norm": 0.47480506009090023,
"learning_rate": 8.795571123818948e-06,
"loss": 0.4733,
"step": 1000
},
{
"epoch": 0.3065366667931257,
"grad_norm": 0.46579935990440124,
"learning_rate": 8.760872319947796e-06,
"loss": 0.467,
"step": 1010
},
{
"epoch": 0.3095716832960279,
"grad_norm": 0.4713650285222608,
"learning_rate": 8.72575129989244e-06,
"loss": 0.4714,
"step": 1020
},
{
"epoch": 0.31260669979893013,
"grad_norm": 0.5172958915739383,
"learning_rate": 8.690212006531498e-06,
"loss": 0.4778,
"step": 1030
},
{
"epoch": 0.3156417163018324,
"grad_norm": 0.4636306902344978,
"learning_rate": 8.654258429701254e-06,
"loss": 0.4766,
"step": 1040
},
{
"epoch": 0.31867673280473463,
"grad_norm": 0.4630632755815923,
"learning_rate": 8.617894605747728e-06,
"loss": 0.471,
"step": 1050
},
{
"epoch": 0.32171174930763685,
"grad_norm": 0.4726631111043291,
"learning_rate": 8.581124617073531e-06,
"loss": 0.4754,
"step": 1060
},
{
"epoch": 0.3247467658105391,
"grad_norm": 0.449173460165581,
"learning_rate": 8.543952591679565e-06,
"loss": 0.4757,
"step": 1070
},
{
"epoch": 0.32778178231344135,
"grad_norm": 0.4593115406956091,
"learning_rate": 8.506382702701575e-06,
"loss": 0.4682,
"step": 1080
},
{
"epoch": 0.33081679881634357,
"grad_norm": 0.49121579493660045,
"learning_rate": 8.468419167941658e-06,
"loss": 0.4631,
"step": 1090
},
{
"epoch": 0.3338518153192458,
"grad_norm": 0.4799330378295981,
"learning_rate": 8.430066249394754e-06,
"loss": 0.4786,
"step": 1100
},
{
"epoch": 0.336886831822148,
"grad_norm": 0.46743240892192656,
"learning_rate": 8.391328252770165e-06,
"loss": 0.4648,
"step": 1110
},
{
"epoch": 0.3399218483250503,
"grad_norm": 0.49238589329434995,
"learning_rate": 8.352209527008164e-06,
"loss": 0.4785,
"step": 1120
},
{
"epoch": 0.3429568648279525,
"grad_norm": 0.463123088401655,
"learning_rate": 8.31271446379178e-06,
"loss": 0.4684,
"step": 1130
},
{
"epoch": 0.34599188133085473,
"grad_norm": 0.45804967626699783,
"learning_rate": 8.272847497053745e-06,
"loss": 0.467,
"step": 1140
},
{
"epoch": 0.34902689783375695,
"grad_norm": 0.49208327340861874,
"learning_rate": 8.232613102478722e-06,
"loss": 0.4734,
"step": 1150
},
{
"epoch": 0.35206191433665923,
"grad_norm": 0.4834727766246894,
"learning_rate": 8.192015797000849e-06,
"loss": 0.4634,
"step": 1160
},
{
"epoch": 0.35509693083956145,
"grad_norm": 0.45802821271745225,
"learning_rate": 8.151060138296624e-06,
"loss": 0.4769,
"step": 1170
},
{
"epoch": 0.3581319473424637,
"grad_norm": 0.47069229829560316,
"learning_rate": 8.10975072427326e-06,
"loss": 0.4631,
"step": 1180
},
{
"epoch": 0.3611669638453659,
"grad_norm": 0.4638785009109008,
"learning_rate": 8.068092192552473e-06,
"loss": 0.4621,
"step": 1190
},
{
"epoch": 0.3642019803482681,
"grad_norm": 0.5267368501556567,
"learning_rate": 8.026089219949856e-06,
"loss": 0.4707,
"step": 1200
},
{
"epoch": 0.3672369968511704,
"grad_norm": 0.4603778748004369,
"learning_rate": 7.983746521949822e-06,
"loss": 0.4691,
"step": 1210
},
{
"epoch": 0.3702720133540726,
"grad_norm": 0.5044371212314646,
"learning_rate": 7.941068852176233e-06,
"loss": 0.4673,
"step": 1220
},
{
"epoch": 0.37330702985697484,
"grad_norm": 0.5023326078920469,
"learning_rate": 7.898061001858712e-06,
"loss": 0.4652,
"step": 1230
},
{
"epoch": 0.37634204635987706,
"grad_norm": 0.5465382069712866,
"learning_rate": 7.854727799294768e-06,
"loss": 0.4648,
"step": 1240
},
{
"epoch": 0.37937706286277934,
"grad_norm": 0.46811731936405077,
"learning_rate": 7.81107410930774e-06,
"loss": 0.474,
"step": 1250
},
{
"epoch": 0.38241207936568156,
"grad_norm": 0.4582110078248361,
"learning_rate": 7.767104832700645e-06,
"loss": 0.4557,
"step": 1260
},
{
"epoch": 0.3854470958685838,
"grad_norm": 0.47816100008054285,
"learning_rate": 7.72282490570599e-06,
"loss": 0.4655,
"step": 1270
},
{
"epoch": 0.388482112371486,
"grad_norm": 0.4587552509577914,
"learning_rate": 7.678239299431594e-06,
"loss": 0.4675,
"step": 1280
},
{
"epoch": 0.3915171288743883,
"grad_norm": 0.48955107625039307,
"learning_rate": 7.633353019302519e-06,
"loss": 0.4628,
"step": 1290
},
{
"epoch": 0.3945521453772905,
"grad_norm": 0.4697573855941327,
"learning_rate": 7.58817110449912e-06,
"loss": 0.4705,
"step": 1300
},
{
"epoch": 0.3975871618801927,
"grad_norm": 0.4586083541834714,
"learning_rate": 7.5426986273913275e-06,
"loss": 0.4633,
"step": 1310
},
{
"epoch": 0.40062217838309494,
"grad_norm": 0.4943303423222579,
"learning_rate": 7.496940692969188e-06,
"loss": 0.4664,
"step": 1320
},
{
"epoch": 0.4036571948859972,
"grad_norm": 0.4472430460309261,
"learning_rate": 7.450902438269761e-06,
"loss": 0.466,
"step": 1330
},
{
"epoch": 0.40669221138889944,
"grad_norm": 0.45151069440310115,
"learning_rate": 7.404589031800395e-06,
"loss": 0.466,
"step": 1340
},
{
"epoch": 0.40972722789180166,
"grad_norm": 0.4931709679843594,
"learning_rate": 7.358005672958488e-06,
"loss": 0.4638,
"step": 1350
},
{
"epoch": 0.4127622443947039,
"grad_norm": 0.4458955481322604,
"learning_rate": 7.311157591447775e-06,
"loss": 0.4574,
"step": 1360
},
{
"epoch": 0.41579726089760616,
"grad_norm": 0.5037362623251065,
"learning_rate": 7.264050046691211e-06,
"loss": 0.4631,
"step": 1370
},
{
"epoch": 0.4188322774005084,
"grad_norm": 0.5401075737360693,
"learning_rate": 7.216688327240523e-06,
"loss": 0.4672,
"step": 1380
},
{
"epoch": 0.4218672939034106,
"grad_norm": 0.46823354166024683,
"learning_rate": 7.16907775018248e-06,
"loss": 0.4613,
"step": 1390
},
{
"epoch": 0.4249023104063128,
"grad_norm": 0.4780876891839522,
"learning_rate": 7.1212236605419795e-06,
"loss": 0.4666,
"step": 1400
},
{
"epoch": 0.42793732690921504,
"grad_norm": 0.5119425577262322,
"learning_rate": 7.0731314306819725e-06,
"loss": 0.454,
"step": 1410
},
{
"epoch": 0.4309723434121173,
"grad_norm": 0.47436698182963066,
"learning_rate": 7.024806459700344e-06,
"loss": 0.4745,
"step": 1420
},
{
"epoch": 0.43400735991501954,
"grad_norm": 0.47009240919947903,
"learning_rate": 6.976254172823773e-06,
"loss": 0.4578,
"step": 1430
},
{
"epoch": 0.43704237641792176,
"grad_norm": 0.4505894481894143,
"learning_rate": 6.92748002079867e-06,
"loss": 0.4652,
"step": 1440
},
{
"epoch": 0.440077392920824,
"grad_norm": 0.4739852546862808,
"learning_rate": 6.878489479279248e-06,
"loss": 0.4634,
"step": 1450
},
{
"epoch": 0.44311240942372626,
"grad_norm": 0.43339777098222865,
"learning_rate": 6.829288048212789e-06,
"loss": 0.4583,
"step": 1460
},
{
"epoch": 0.4461474259266285,
"grad_norm": 0.48444826933680063,
"learning_rate": 6.779881251222198e-06,
"loss": 0.4654,
"step": 1470
},
{
"epoch": 0.4491824424295307,
"grad_norm": 0.43409574874106044,
"learning_rate": 6.730274634985883e-06,
"loss": 0.4671,
"step": 1480
},
{
"epoch": 0.4522174589324329,
"grad_norm": 0.4532708590504662,
"learning_rate": 6.6804737686150615e-06,
"loss": 0.4698,
"step": 1490
},
{
"epoch": 0.4552524754353352,
"grad_norm": 0.473135305256464,
"learning_rate": 6.630484243028534e-06,
"loss": 0.4737,
"step": 1500
},
{
"epoch": 0.4582874919382374,
"grad_norm": 0.4685015310135646,
"learning_rate": 6.580311670325029e-06,
"loss": 0.4556,
"step": 1510
},
{
"epoch": 0.46132250844113964,
"grad_norm": 0.46358839605143787,
"learning_rate": 6.529961683153136e-06,
"loss": 0.4604,
"step": 1520
},
{
"epoch": 0.46435752494404187,
"grad_norm": 0.45388131225935224,
"learning_rate": 6.479439934078983e-06,
"loss": 0.4559,
"step": 1530
},
{
"epoch": 0.46739254144694414,
"grad_norm": 0.435672967522485,
"learning_rate": 6.428752094951621e-06,
"loss": 0.4589,
"step": 1540
},
{
"epoch": 0.47042755794984636,
"grad_norm": 0.4681448507359377,
"learning_rate": 6.377903856266285e-06,
"loss": 0.4656,
"step": 1550
},
{
"epoch": 0.4734625744527486,
"grad_norm": 0.4866196593408997,
"learning_rate": 6.326900926525552e-06,
"loss": 0.4587,
"step": 1560
},
{
"epoch": 0.4764975909556508,
"grad_norm": 0.5334135903943417,
"learning_rate": 6.275749031598457e-06,
"loss": 0.4596,
"step": 1570
},
{
"epoch": 0.47953260745855303,
"grad_norm": 0.545053303736673,
"learning_rate": 6.224453914077691e-06,
"loss": 0.4599,
"step": 1580
},
{
"epoch": 0.4825676239614553,
"grad_norm": 0.43049803862728125,
"learning_rate": 6.173021332634899e-06,
"loss": 0.4609,
"step": 1590
},
{
"epoch": 0.4856026404643575,
"grad_norm": 0.4679235070374552,
"learning_rate": 6.121457061374182e-06,
"loss": 0.4659,
"step": 1600
},
{
"epoch": 0.48863765696725975,
"grad_norm": 0.48596466698858914,
"learning_rate": 6.06976688918386e-06,
"loss": 0.4552,
"step": 1610
},
{
"epoch": 0.49167267347016197,
"grad_norm": 0.4654168221541189,
"learning_rate": 6.017956619086585e-06,
"loss": 0.4652,
"step": 1620
},
{
"epoch": 0.49470768997306425,
"grad_norm": 0.43813429165646467,
"learning_rate": 5.966032067587862e-06,
"loss": 0.4596,
"step": 1630
},
{
"epoch": 0.49774270647596647,
"grad_norm": 0.6681280851162651,
"learning_rate": 5.913999064023046e-06,
"loss": 0.4572,
"step": 1640
},
{
"epoch": 0.5007777229788687,
"grad_norm": 0.5204323449742513,
"learning_rate": 5.861863449902926e-06,
"loss": 0.4628,
"step": 1650
},
{
"epoch": 0.503812739481771,
"grad_norm": 0.4296041756151093,
"learning_rate": 5.80963107825791e-06,
"loss": 0.4568,
"step": 1660
},
{
"epoch": 0.5068477559846731,
"grad_norm": 0.4687304772565422,
"learning_rate": 5.7573078129809386e-06,
"loss": 0.4604,
"step": 1670
},
{
"epoch": 0.5098827724875754,
"grad_norm": 0.4554317567297939,
"learning_rate": 5.704899528169175e-06,
"loss": 0.4698,
"step": 1680
},
{
"epoch": 0.5129177889904777,
"grad_norm": 0.4550611945171018,
"learning_rate": 5.652412107464532e-06,
"loss": 0.4559,
"step": 1690
},
{
"epoch": 0.5159528054933799,
"grad_norm": 0.4737844991440093,
"learning_rate": 5.5998514433931636e-06,
"loss": 0.4657,
"step": 1700
},
{
"epoch": 0.5189878219962821,
"grad_norm": 0.44176755708702975,
"learning_rate": 5.547223436703919e-06,
"loss": 0.4555,
"step": 1710
},
{
"epoch": 0.5220228384991843,
"grad_norm": 0.4602868804162559,
"learning_rate": 5.494533995705904e-06,
"loss": 0.4587,
"step": 1720
},
{
"epoch": 0.5250578550020866,
"grad_norm": 0.4619636659852323,
"learning_rate": 5.441789035605174e-06,
"loss": 0.4605,
"step": 1730
},
{
"epoch": 0.5280928715049888,
"grad_norm": 0.4918743050797947,
"learning_rate": 5.3889944778406656e-06,
"loss": 0.4601,
"step": 1740
},
{
"epoch": 0.531127888007891,
"grad_norm": 0.5108035906034613,
"learning_rate": 5.336156249419422e-06,
"loss": 0.4583,
"step": 1750
},
{
"epoch": 0.5341629045107933,
"grad_norm": 0.4644351963738344,
"learning_rate": 5.283280282251192e-06,
"loss": 0.451,
"step": 1760
},
{
"epoch": 0.5371979210136956,
"grad_norm": 0.466505558488539,
"learning_rate": 5.230372512482485e-06,
"loss": 0.4569,
"step": 1770
},
{
"epoch": 0.5402329375165977,
"grad_norm": 0.4395984838919295,
"learning_rate": 5.177438879830148e-06,
"loss": 0.4546,
"step": 1780
},
{
"epoch": 0.5432679540195,
"grad_norm": 0.5081543080746838,
"learning_rate": 5.1244853269145315e-06,
"loss": 0.4522,
"step": 1790
},
{
"epoch": 0.5463029705224022,
"grad_norm": 0.4739999458440851,
"learning_rate": 5.0715177985923454e-06,
"loss": 0.4575,
"step": 1800
},
{
"epoch": 0.5493379870253045,
"grad_norm": 0.44986425269809655,
"learning_rate": 5.0185422412892615e-06,
"loss": 0.4504,
"step": 1810
},
{
"epoch": 0.5523730035282067,
"grad_norm": 0.493838430904318,
"learning_rate": 4.96556460233232e-06,
"loss": 0.4565,
"step": 1820
},
{
"epoch": 0.5554080200311089,
"grad_norm": 0.4597123498494703,
"learning_rate": 4.912590829282269e-06,
"loss": 0.4552,
"step": 1830
},
{
"epoch": 0.5584430365340112,
"grad_norm": 0.45249866862931715,
"learning_rate": 4.859626869265838e-06,
"loss": 0.4646,
"step": 1840
},
{
"epoch": 0.5614780530369133,
"grad_norm": 0.47693139933323253,
"learning_rate": 4.806678668308102e-06,
"loss": 0.4593,
"step": 1850
},
{
"epoch": 0.5645130695398156,
"grad_norm": 0.4441745555089401,
"learning_rate": 4.753752170664926e-06,
"loss": 0.4518,
"step": 1860
},
{
"epoch": 0.5675480860427179,
"grad_norm": 0.5407791277506199,
"learning_rate": 4.700853318155655e-06,
"loss": 0.4537,
"step": 1870
},
{
"epoch": 0.5705831025456201,
"grad_norm": 0.43766170865780535,
"learning_rate": 4.647988049496026e-06,
"loss": 0.456,
"step": 1880
},
{
"epoch": 0.5736181190485223,
"grad_norm": 0.4381830382862563,
"learning_rate": 4.5951622996314785e-06,
"loss": 0.4544,
"step": 1890
},
{
"epoch": 0.5766531355514246,
"grad_norm": 0.5008063102656403,
"learning_rate": 4.542381999070851e-06,
"loss": 0.4576,
"step": 1900
},
{
"epoch": 0.5796881520543268,
"grad_norm": 0.4449319997966233,
"learning_rate": 4.489653073220593e-06,
"loss": 0.4479,
"step": 1910
},
{
"epoch": 0.582723168557229,
"grad_norm": 0.4325405045901816,
"learning_rate": 4.43698144171955e-06,
"loss": 0.4566,
"step": 1920
},
{
"epoch": 0.5857581850601312,
"grad_norm": 0.4558072435464798,
"learning_rate": 4.3843730177743835e-06,
"loss": 0.4522,
"step": 1930
},
{
"epoch": 0.5887932015630335,
"grad_norm": 0.46993480403343113,
"learning_rate": 4.331833707495735e-06,
"loss": 0.4497,
"step": 1940
},
{
"epoch": 0.5918282180659358,
"grad_norm": 0.48201210672220995,
"learning_rate": 4.279369409235159e-06,
"loss": 0.4557,
"step": 1950
},
{
"epoch": 0.5948632345688379,
"grad_norm": 0.48886046384215476,
"learning_rate": 4.226986012922954e-06,
"loss": 0.4527,
"step": 1960
},
{
"epoch": 0.5978982510717402,
"grad_norm": 0.496378890478507,
"learning_rate": 4.174689399406917e-06,
"loss": 0.4474,
"step": 1970
},
{
"epoch": 0.6009332675746424,
"grad_norm": 0.5586162522102414,
"learning_rate": 4.122485439792139e-06,
"loss": 0.4525,
"step": 1980
},
{
"epoch": 0.6039682840775447,
"grad_norm": 0.446752242572371,
"learning_rate": 4.070379994781865e-06,
"loss": 0.446,
"step": 1990
},
{
"epoch": 0.6070033005804469,
"grad_norm": 0.47706411203661847,
"learning_rate": 4.018378914019556e-06,
"loss": 0.4596,
"step": 2000
},
{
"epoch": 0.6100383170833491,
"grad_norm": 0.538309740370942,
"learning_rate": 3.966488035432169e-06,
"loss": 0.4421,
"step": 2010
},
{
"epoch": 0.6130733335862514,
"grad_norm": 0.5230632035555158,
"learning_rate": 3.914713184574759e-06,
"loss": 0.4569,
"step": 2020
},
{
"epoch": 0.6161083500891537,
"grad_norm": 0.4338338834185748,
"learning_rate": 3.863060173976466e-06,
"loss": 0.4541,
"step": 2030
},
{
"epoch": 0.6191433665920558,
"grad_norm": 0.5603785317621145,
"learning_rate": 3.811534802487983e-06,
"loss": 0.4551,
"step": 2040
},
{
"epoch": 0.6221783830949581,
"grad_norm": 0.47642244684394447,
"learning_rate": 3.7601428546305246e-06,
"loss": 0.4523,
"step": 2050
},
{
"epoch": 0.6252133995978603,
"grad_norm": 0.46992960142430185,
"learning_rate": 3.7088900999464432e-06,
"loss": 0.446,
"step": 2060
},
{
"epoch": 0.6282484161007625,
"grad_norm": 0.4845504298042459,
"learning_rate": 3.657782292351501e-06,
"loss": 0.4566,
"step": 2070
},
{
"epoch": 0.6312834326036648,
"grad_norm": 0.47017322226992764,
"learning_rate": 3.6068251694888973e-06,
"loss": 0.4508,
"step": 2080
},
{
"epoch": 0.634318449106567,
"grad_norm": 0.5121234005197363,
"learning_rate": 3.556024452085144e-06,
"loss": 0.4431,
"step": 2090
},
{
"epoch": 0.6373534656094693,
"grad_norm": 0.487255052511626,
"learning_rate": 3.505385843307809e-06,
"loss": 0.4473,
"step": 2100
},
{
"epoch": 0.6403884821123715,
"grad_norm": 0.4564465966378128,
"learning_rate": 3.4549150281252635e-06,
"loss": 0.4472,
"step": 2110
},
{
"epoch": 0.6434234986152737,
"grad_norm": 0.4806890404068221,
"learning_rate": 3.404617672668441e-06,
"loss": 0.4536,
"step": 2120
},
{
"epoch": 0.646458515118176,
"grad_norm": 0.45235070380959885,
"learning_rate": 3.354499423594737e-06,
"loss": 0.4522,
"step": 2130
},
{
"epoch": 0.6494935316210781,
"grad_norm": 0.45955408837797435,
"learning_rate": 3.3045659074540797e-06,
"loss": 0.4441,
"step": 2140
},
{
"epoch": 0.6525285481239804,
"grad_norm": 0.4923322956992551,
"learning_rate": 3.254822730057266e-06,
"loss": 0.4551,
"step": 2150
},
{
"epoch": 0.6555635646268827,
"grad_norm": 0.48682070386464155,
"learning_rate": 3.205275475846614e-06,
"loss": 0.4496,
"step": 2160
},
{
"epoch": 0.6585985811297849,
"grad_norm": 0.47952708737796373,
"learning_rate": 3.1559297072690376e-06,
"loss": 0.4509,
"step": 2170
},
{
"epoch": 0.6616335976326871,
"grad_norm": 0.4543123589314356,
"learning_rate": 3.106790964151556e-06,
"loss": 0.4469,
"step": 2180
},
{
"epoch": 0.6646686141355893,
"grad_norm": 0.4820597535972369,
"learning_rate": 3.0578647630793845e-06,
"loss": 0.45,
"step": 2190
},
{
"epoch": 0.6677036306384916,
"grad_norm": 0.47560501988050274,
"learning_rate": 3.0091565967765903e-06,
"loss": 0.4506,
"step": 2200
},
{
"epoch": 0.6707386471413939,
"grad_norm": 0.4590471436665147,
"learning_rate": 2.9606719334894673e-06,
"loss": 0.4411,
"step": 2210
},
{
"epoch": 0.673773663644296,
"grad_norm": 0.4871582819595622,
"learning_rate": 2.9124162163726333e-06,
"loss": 0.4581,
"step": 2220
},
{
"epoch": 0.6768086801471983,
"grad_norm": 0.4621936812853514,
"learning_rate": 2.864394862877945e-06,
"loss": 0.4392,
"step": 2230
},
{
"epoch": 0.6798436966501006,
"grad_norm": 0.5106734882471429,
"learning_rate": 2.8166132641463174e-06,
"loss": 0.4514,
"step": 2240
},
{
"epoch": 0.6828787131530027,
"grad_norm": 0.4872901241863599,
"learning_rate": 2.7690767844024757e-06,
"loss": 0.456,
"step": 2250
},
{
"epoch": 0.685913729655905,
"grad_norm": 0.5327159140172049,
"learning_rate": 2.7217907603527425e-06,
"loss": 0.4502,
"step": 2260
},
{
"epoch": 0.6889487461588072,
"grad_norm": 0.46596161967531874,
"learning_rate": 2.67476050058591e-06,
"loss": 0.4368,
"step": 2270
},
{
"epoch": 0.6919837626617095,
"grad_norm": 0.4756057088646994,
"learning_rate": 2.627991284977265e-06,
"loss": 0.4427,
"step": 2280
},
{
"epoch": 0.6950187791646117,
"grad_norm": 0.457554867967829,
"learning_rate": 2.5814883640958425e-06,
"loss": 0.4492,
"step": 2290
},
{
"epoch": 0.6980537956675139,
"grad_norm": 0.4914134735789525,
"learning_rate": 2.535256958614972e-06,
"loss": 0.4521,
"step": 2300
},
{
"epoch": 0.7010888121704162,
"grad_norm": 0.4514454510170551,
"learning_rate": 2.489302258726169e-06,
"loss": 0.445,
"step": 2310
},
{
"epoch": 0.7041238286733185,
"grad_norm": 0.5541195252480908,
"learning_rate": 2.4436294235564616e-06,
"loss": 0.4487,
"step": 2320
},
{
"epoch": 0.7071588451762206,
"grad_norm": 0.4701102512963565,
"learning_rate": 2.398243580589197e-06,
"loss": 0.4467,
"step": 2330
},
{
"epoch": 0.7101938616791229,
"grad_norm": 0.5033348530844566,
"learning_rate": 2.353149825088401e-06,
"loss": 0.4424,
"step": 2340
},
{
"epoch": 0.7132288781820251,
"grad_norm": 0.4896392329694068,
"learning_rate": 2.30835321952675e-06,
"loss": 0.4492,
"step": 2350
},
{
"epoch": 0.7162638946849273,
"grad_norm": 0.45357241678052895,
"learning_rate": 2.263858793017247e-06,
"loss": 0.4399,
"step": 2360
},
{
"epoch": 0.7192989111878296,
"grad_norm": 0.47254537181545697,
"learning_rate": 2.219671540748607e-06,
"loss": 0.4486,
"step": 2370
},
{
"epoch": 0.7223339276907318,
"grad_norm": 1.0513237554990393,
"learning_rate": 2.1757964234244806e-06,
"loss": 0.4516,
"step": 2380
},
{
"epoch": 0.7253689441936341,
"grad_norm": 1.211615701259343,
"learning_rate": 2.1322383667065328e-06,
"loss": 0.4459,
"step": 2390
},
{
"epoch": 0.7284039606965362,
"grad_norm": 0.49549117242754526,
"learning_rate": 2.0890022606614658e-06,
"loss": 0.4519,
"step": 2400
},
{
"epoch": 0.7314389771994385,
"grad_norm": 0.441302787437587,
"learning_rate": 2.0460929592120286e-06,
"loss": 0.4421,
"step": 2410
},
{
"epoch": 0.7344739937023408,
"grad_norm": 0.47459997274968196,
"learning_rate": 2.0035152795920943e-06,
"loss": 0.4474,
"step": 2420
},
{
"epoch": 0.737509010205243,
"grad_norm": 0.5534394858546283,
"learning_rate": 1.961274001805844e-06,
"loss": 0.4506,
"step": 2430
},
{
"epoch": 0.7405440267081452,
"grad_norm": 0.5940641703049439,
"learning_rate": 1.9193738680911444e-06,
"loss": 0.4435,
"step": 2440
},
{
"epoch": 0.7435790432110475,
"grad_norm": 0.47749388420290173,
"learning_rate": 1.8778195823871537e-06,
"loss": 0.4473,
"step": 2450
},
{
"epoch": 0.7466140597139497,
"grad_norm": 0.46649043860109973,
"learning_rate": 1.836615809806232e-06,
"loss": 0.441,
"step": 2460
},
{
"epoch": 0.749649076216852,
"grad_norm": 0.47765577041447577,
"learning_rate": 1.7957671761102142e-06,
"loss": 0.4394,
"step": 2470
},
{
"epoch": 0.7526840927197541,
"grad_norm": 0.4609425591065501,
"learning_rate": 1.7552782671910845e-06,
"loss": 0.4491,
"step": 2480
},
{
"epoch": 0.7557191092226564,
"grad_norm": 0.4952386048466494,
"learning_rate": 1.715153628556162e-06,
"loss": 0.4429,
"step": 2490
},
{
"epoch": 0.7587541257255587,
"grad_norm": 0.45327322111640855,
"learning_rate": 1.6753977648177682e-06,
"loss": 0.452,
"step": 2500
},
{
"epoch": 0.7617891422284608,
"grad_norm": 0.560152662522095,
"learning_rate": 1.6360151391875395e-06,
"loss": 0.4482,
"step": 2510
},
{
"epoch": 0.7648241587313631,
"grad_norm": 0.5588616468957155,
"learning_rate": 1.5970101729753485e-06,
"loss": 0.4411,
"step": 2520
},
{
"epoch": 0.7678591752342654,
"grad_norm": 0.5012865331807312,
"learning_rate": 1.5583872450929455e-06,
"loss": 0.4466,
"step": 2530
},
{
"epoch": 0.7708941917371676,
"grad_norm": 0.4458865832841834,
"learning_rate": 1.5201506915623621e-06,
"loss": 0.443,
"step": 2540
},
{
"epoch": 0.7739292082400698,
"grad_norm": 0.5048770346419175,
"learning_rate": 1.4823048050291211e-06,
"loss": 0.452,
"step": 2550
},
{
"epoch": 0.776964224742972,
"grad_norm": 0.46420339110163,
"learning_rate": 1.4448538342803242e-06,
"loss": 0.4405,
"step": 2560
},
{
"epoch": 0.7799992412458743,
"grad_norm": 0.5148070470946807,
"learning_rate": 1.407801983767656e-06,
"loss": 0.4452,
"step": 2570
},
{
"epoch": 0.7830342577487766,
"grad_norm": 0.4919925143595241,
"learning_rate": 1.3711534131353738e-06,
"loss": 0.4481,
"step": 2580
},
{
"epoch": 0.7860692742516787,
"grad_norm": 0.8295554704623347,
"learning_rate": 1.3349122367533135e-06,
"loss": 0.4443,
"step": 2590
},
{
"epoch": 0.789104290754581,
"grad_norm": 0.4715835221582969,
"learning_rate": 1.2990825232550065e-06,
"loss": 0.4441,
"step": 2600
},
{
"epoch": 0.7921393072574832,
"grad_norm": 0.4888383711595245,
"learning_rate": 1.2636682950808882e-06,
"loss": 0.4414,
"step": 2610
},
{
"epoch": 0.7951743237603854,
"grad_norm": 0.46367852635576223,
"learning_rate": 1.228673528026741e-06,
"loss": 0.443,
"step": 2620
},
{
"epoch": 0.7982093402632877,
"grad_norm": 0.458931669471863,
"learning_rate": 1.194102150797326e-06,
"loss": 0.445,
"step": 2630
},
{
"epoch": 0.8012443567661899,
"grad_norm": 0.5805342428508335,
"learning_rate": 1.1599580445653496e-06,
"loss": 0.4416,
"step": 2640
},
{
"epoch": 0.8042793732690922,
"grad_norm": 0.47446300706665095,
"learning_rate": 1.1262450425357175e-06,
"loss": 0.4527,
"step": 2650
},
{
"epoch": 0.8073143897719944,
"grad_norm": 0.4585780912736001,
"learning_rate": 1.092966929515218e-06,
"loss": 0.44,
"step": 2660
},
{
"epoch": 0.8103494062748966,
"grad_norm": 0.48233598242441533,
"learning_rate": 1.0601274414876067e-06,
"loss": 0.4455,
"step": 2670
},
{
"epoch": 0.8133844227777989,
"grad_norm": 0.46200584695129754,
"learning_rate": 1.0277302651941894e-06,
"loss": 0.4446,
"step": 2680
},
{
"epoch": 0.816419439280701,
"grad_norm": 0.4874246650348082,
"learning_rate": 9.95779037719926e-07,
"loss": 0.4397,
"step": 2690
},
{
"epoch": 0.8194544557836033,
"grad_norm": 0.4547919796548769,
"learning_rate": 9.642773460851141e-07,
"loss": 0.4473,
"step": 2700
},
{
"epoch": 0.8224894722865056,
"grad_norm": 0.4972539970599672,
"learning_rate": 9.332287268426881e-07,
"loss": 0.4425,
"step": 2710
},
{
"epoch": 0.8255244887894078,
"grad_norm": 0.43865232063488374,
"learning_rate": 9.026366656811835e-07,
"loss": 0.4401,
"step": 2720
},
{
"epoch": 0.82855950529231,
"grad_norm": 1.1602238104916722,
"learning_rate": 8.725045970334262e-07,
"loss": 0.4504,
"step": 2730
},
{
"epoch": 0.8315945217952123,
"grad_norm": 0.4663648778218256,
"learning_rate": 8.428359036909455e-07,
"loss": 0.4391,
"step": 2740
},
{
"epoch": 0.8346295382981145,
"grad_norm": 0.48682417123785116,
"learning_rate": 8.136339164242241e-07,
"loss": 0.4467,
"step": 2750
},
{
"epoch": 0.8376645548010168,
"grad_norm": 0.4673166286188927,
"learning_rate": 7.849019136087477e-07,
"loss": 0.4398,
"step": 2760
},
{
"epoch": 0.8406995713039189,
"grad_norm": 0.4903812570342262,
"learning_rate": 7.566431208569747e-07,
"loss": 0.4413,
"step": 2770
},
{
"epoch": 0.8437345878068212,
"grad_norm": 0.485844612436569,
"learning_rate": 7.288607106561935e-07,
"loss": 0.4451,
"step": 2780
},
{
"epoch": 0.8467696043097235,
"grad_norm": 0.4585651698156433,
"learning_rate": 7.015578020123804e-07,
"loss": 0.439,
"step": 2790
},
{
"epoch": 0.8498046208126256,
"grad_norm": 0.4839043585723541,
"learning_rate": 6.747374601000229e-07,
"loss": 0.4451,
"step": 2800
},
{
"epoch": 0.8528396373155279,
"grad_norm": 0.45692832107739795,
"learning_rate": 6.484026959180256e-07,
"loss": 0.439,
"step": 2810
},
{
"epoch": 0.8558746538184301,
"grad_norm": 0.8934971320404392,
"learning_rate": 6.225564659516653e-07,
"loss": 0.4427,
"step": 2820
},
{
"epoch": 0.8589096703213324,
"grad_norm": 0.42447350665857003,
"learning_rate": 5.972016718406832e-07,
"loss": 0.445,
"step": 2830
},
{
"epoch": 0.8619446868242346,
"grad_norm": 0.5221390912623963,
"learning_rate": 5.723411600535378e-07,
"loss": 0.4493,
"step": 2840
},
{
"epoch": 0.8649797033271368,
"grad_norm": 0.470728180227603,
"learning_rate": 5.4797772156783e-07,
"loss": 0.4436,
"step": 2850
},
{
"epoch": 0.8680147198300391,
"grad_norm": 0.47672096853939294,
"learning_rate": 5.24114091556992e-07,
"loss": 0.4405,
"step": 2860
},
{
"epoch": 0.8710497363329414,
"grad_norm": 0.4546129075040933,
"learning_rate": 5.00752949083202e-07,
"loss": 0.4534,
"step": 2870
},
{
"epoch": 0.8740847528358435,
"grad_norm": 0.463106848164873,
"learning_rate": 4.778969167966346e-07,
"loss": 0.444,
"step": 2880
},
{
"epoch": 0.8771197693387458,
"grad_norm": 0.4491761462781034,
"learning_rate": 4.5554856064101314e-07,
"loss": 0.436,
"step": 2890
},
{
"epoch": 0.880154785841648,
"grad_norm": 0.4675366494918407,
"learning_rate": 4.337103895655581e-07,
"loss": 0.4531,
"step": 2900
},
{
"epoch": 0.8831898023445502,
"grad_norm": 0.44541163156936503,
"learning_rate": 4.123848552433019e-07,
"loss": 0.4375,
"step": 2910
},
{
"epoch": 0.8862248188474525,
"grad_norm": 0.49240823007466994,
"learning_rate": 3.9157435179586756e-07,
"loss": 0.4374,
"step": 2920
},
{
"epoch": 0.8892598353503547,
"grad_norm": 0.4733511579541881,
"learning_rate": 3.712812155246759e-07,
"loss": 0.4441,
"step": 2930
},
{
"epoch": 0.892294851853257,
"grad_norm": 0.5930159408214484,
"learning_rate": 3.5150772464867314e-07,
"loss": 0.4441,
"step": 2940
},
{
"epoch": 0.8953298683561592,
"grad_norm": 0.4915855850221476,
"learning_rate": 3.322560990485535e-07,
"loss": 0.4475,
"step": 2950
},
{
"epoch": 0.8983648848590614,
"grad_norm": 0.4400578392570043,
"learning_rate": 3.135285000175531e-07,
"loss": 0.437,
"step": 2960
},
{
"epoch": 0.9013999013619637,
"grad_norm": 1.0625536086809468,
"learning_rate": 2.953270300188038e-07,
"loss": 0.4461,
"step": 2970
},
{
"epoch": 0.9044349178648659,
"grad_norm": 0.47898803434881615,
"learning_rate": 2.776537324493045e-07,
"loss": 0.4411,
"step": 2980
},
{
"epoch": 0.9074699343677681,
"grad_norm": 0.48013757465246243,
"learning_rate": 2.6051059141051713e-07,
"loss": 0.4463,
"step": 2990
},
{
"epoch": 0.9105049508706704,
"grad_norm": 0.46792544413322756,
"learning_rate": 2.4389953148561574e-07,
"loss": 0.4541,
"step": 3000
},
{
"epoch": 0.9135399673735726,
"grad_norm": 0.47509286467451683,
"learning_rate": 2.2782241752343004e-07,
"loss": 0.4392,
"step": 3010
},
{
"epoch": 0.9165749838764748,
"grad_norm": 0.5355760986925688,
"learning_rate": 2.122810544290782e-07,
"loss": 0.4459,
"step": 3020
},
{
"epoch": 0.919610000379377,
"grad_norm": 0.4481951370049728,
"learning_rate": 1.972771869613499e-07,
"loss": 0.4408,
"step": 3030
},
{
"epoch": 0.9226450168822793,
"grad_norm": 1.212596637597613,
"learning_rate": 1.8281249953681633e-07,
"loss": 0.4524,
"step": 3040
},
{
"epoch": 0.9256800333851816,
"grad_norm": 0.4653062821594877,
"learning_rate": 1.6888861604074158e-07,
"loss": 0.4547,
"step": 3050
},
{
"epoch": 0.9287150498880837,
"grad_norm": 0.4417864126009544,
"learning_rate": 1.5550709964476606e-07,
"loss": 0.4343,
"step": 3060
},
{
"epoch": 0.931750066390986,
"grad_norm": 0.4636627611118036,
"learning_rate": 1.4266945263142152e-07,
"loss": 0.4442,
"step": 3070
},
{
"epoch": 0.9347850828938883,
"grad_norm": 0.4750602636819015,
"learning_rate": 1.3037711622547633e-07,
"loss": 0.4402,
"step": 3080
},
{
"epoch": 0.9378200993967905,
"grad_norm": 0.5641879257431062,
"learning_rate": 1.1863147043213453e-07,
"loss": 0.4463,
"step": 3090
},
{
"epoch": 0.9408551158996927,
"grad_norm": 0.48298472160583994,
"learning_rate": 1.0743383388210849e-07,
"loss": 0.4509,
"step": 3100
},
{
"epoch": 0.9438901324025949,
"grad_norm": 0.4709867848354663,
"learning_rate": 9.678546368358299e-08,
"loss": 0.4469,
"step": 3110
},
{
"epoch": 0.9469251489054972,
"grad_norm": 0.45111687792423893,
"learning_rate": 8.668755528108586e-08,
"loss": 0.446,
"step": 3120
},
{
"epoch": 0.9499601654083994,
"grad_norm": 0.4640080663242888,
"learning_rate": 7.714124232127974e-08,
"loss": 0.447,
"step": 3130
},
{
"epoch": 0.9529951819113016,
"grad_norm": 0.4835533725431675,
"learning_rate": 6.814759652569391e-08,
"loss": 0.4471,
"step": 3140
},
{
"epoch": 0.9560301984142039,
"grad_norm": 0.4775784576850005,
"learning_rate": 5.970762757040339e-08,
"loss": 0.4581,
"step": 3150
},
{
"epoch": 0.9590652149171061,
"grad_norm": 0.4406150853474837,
"learning_rate": 5.182228297268388e-08,
"loss": 0.4377,
"step": 3160
},
{
"epoch": 0.9621002314200083,
"grad_norm": 0.4875065749326578,
"learning_rate": 4.449244798463037e-08,
"loss": 0.4466,
"step": 3170
},
{
"epoch": 0.9651352479229106,
"grad_norm": 0.5331920537623118,
"learning_rate": 3.7718945493781523e-08,
"loss": 0.4457,
"step": 3180
},
{
"epoch": 0.9681702644258128,
"grad_norm": 0.4829553299763894,
"learning_rate": 3.150253593073027e-08,
"loss": 0.4373,
"step": 3190
},
{
"epoch": 0.971205280928715,
"grad_norm": 0.4683361415791196,
"learning_rate": 2.5843917183761002e-08,
"loss": 0.4541,
"step": 3200
},
{
"epoch": 0.9742402974316173,
"grad_norm": 0.4647148561695759,
"learning_rate": 2.0743724520495e-08,
"loss": 0.4372,
"step": 3210
},
{
"epoch": 0.9772753139345195,
"grad_norm": 0.45464384300389377,
"learning_rate": 1.6202530516574165e-08,
"loss": 0.4495,
"step": 3220
},
{
"epoch": 0.9803103304374218,
"grad_norm": 0.4823468219123769,
"learning_rate": 1.222084499138243e-08,
"loss": 0.4408,
"step": 3230
},
{
"epoch": 0.9833453469403239,
"grad_norm": 0.4802790220017127,
"learning_rate": 8.799114950806542e-09,
"loss": 0.4419,
"step": 3240
},
{
"epoch": 0.9863803634432262,
"grad_norm": 0.46706355617957335,
"learning_rate": 5.9377245370551005e-09,
"loss": 0.4453,
"step": 3250
},
{
"epoch": 0.9894153799461285,
"grad_norm": 0.440626466158207,
"learning_rate": 3.636994985534159e-09,
"loss": 0.4345,
"step": 3260
},
{
"epoch": 0.9924503964490307,
"grad_norm": 0.4555769223211312,
"learning_rate": 1.8971845887794105e-09,
"loss": 0.4468,
"step": 3270
},
{
"epoch": 0.9954854129519329,
"grad_norm": 0.4398220412821882,
"learning_rate": 7.184886674627134e-10,
"loss": 0.4384,
"step": 3280
},
{
"epoch": 0.9985204294548352,
"grad_norm": 0.45149244572807423,
"learning_rate": 1.010395484624116e-10,
"loss": 0.44,
"step": 3290
},
{
"epoch": 1.0,
"step": 3295,
"total_flos": 8.530187773318005e+18,
"train_loss": 0.4728999632081421,
"train_runtime": 62323.8697,
"train_samples_per_second": 6.767,
"train_steps_per_second": 0.053
}
],
"logging_steps": 10,
"max_steps": 3295,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 8.530187773318005e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}