{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0855545301558005,
"eval_steps": 500,
"global_step": 5000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00217143477552793,
"grad_norm": 0.9825782179832458,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.7137,
"step": 10
},
{
"epoch": 0.00434286955105586,
"grad_norm": 1.4503270387649536,
"learning_rate": 7.600000000000001e-06,
"loss": 1.7044,
"step": 20
},
{
"epoch": 0.006514304326583791,
"grad_norm": 0.7030352354049683,
"learning_rate": 1.16e-05,
"loss": 1.5721,
"step": 30
},
{
"epoch": 0.00868573910211172,
"grad_norm": 0.9047777652740479,
"learning_rate": 1.5600000000000003e-05,
"loss": 1.4224,
"step": 40
},
{
"epoch": 0.01085717387763965,
"grad_norm": 0.6958425641059875,
"learning_rate": 1.9600000000000002e-05,
"loss": 1.3783,
"step": 50
},
{
"epoch": 0.013028608653167581,
"grad_norm": 0.683045506477356,
"learning_rate": 1.9980353634577606e-05,
"loss": 1.3706,
"step": 60
},
{
"epoch": 0.015200043428695511,
"grad_norm": 0.5452519655227661,
"learning_rate": 1.9958524339663828e-05,
"loss": 1.3022,
"step": 70
},
{
"epoch": 0.01737147820422344,
"grad_norm": 0.8496165871620178,
"learning_rate": 1.9936695044750056e-05,
"loss": 1.2474,
"step": 80
},
{
"epoch": 0.01954291297975137,
"grad_norm": 0.5976231098175049,
"learning_rate": 1.991486574983628e-05,
"loss": 1.2646,
"step": 90
},
{
"epoch": 0.0217143477552793,
"grad_norm": 0.7484721541404724,
"learning_rate": 1.9893036454922506e-05,
"loss": 1.2051,
"step": 100
},
{
"epoch": 0.02388578253080723,
"grad_norm": 0.6440810561180115,
"learning_rate": 1.9871207160008735e-05,
"loss": 1.2289,
"step": 110
},
{
"epoch": 0.026057217306335163,
"grad_norm": 0.7614450454711914,
"learning_rate": 1.984937786509496e-05,
"loss": 1.26,
"step": 120
},
{
"epoch": 0.028228652081863093,
"grad_norm": 0.7417937517166138,
"learning_rate": 1.9827548570181185e-05,
"loss": 1.2034,
"step": 130
},
{
"epoch": 0.030400086857391023,
"grad_norm": 0.7169002890586853,
"learning_rate": 1.980571927526741e-05,
"loss": 1.2561,
"step": 140
},
{
"epoch": 0.03257152163291895,
"grad_norm": 0.6170061826705933,
"learning_rate": 1.9783889980353638e-05,
"loss": 1.1907,
"step": 150
},
{
"epoch": 0.03474295640844688,
"grad_norm": 0.7643230557441711,
"learning_rate": 1.976206068543986e-05,
"loss": 1.2109,
"step": 160
},
{
"epoch": 0.03691439118397481,
"grad_norm": 0.8660950660705566,
"learning_rate": 1.9740231390526088e-05,
"loss": 1.3197,
"step": 170
},
{
"epoch": 0.03908582595950274,
"grad_norm": 0.7613770961761475,
"learning_rate": 1.9718402095612313e-05,
"loss": 1.2072,
"step": 180
},
{
"epoch": 0.04125726073503067,
"grad_norm": 0.880974531173706,
"learning_rate": 1.9696572800698538e-05,
"loss": 1.226,
"step": 190
},
{
"epoch": 0.0434286955105586,
"grad_norm": 0.8629663586616516,
"learning_rate": 1.9674743505784766e-05,
"loss": 1.2298,
"step": 200
},
{
"epoch": 0.04560013028608653,
"grad_norm": 0.7981083393096924,
"learning_rate": 1.965291421087099e-05,
"loss": 1.145,
"step": 210
},
{
"epoch": 0.04777156506161446,
"grad_norm": 0.8174938559532166,
"learning_rate": 1.9631084915957216e-05,
"loss": 1.1745,
"step": 220
},
{
"epoch": 0.04994299983714239,
"grad_norm": 0.7619920969009399,
"learning_rate": 1.960925562104344e-05,
"loss": 1.1557,
"step": 230
},
{
"epoch": 0.052114434612670325,
"grad_norm": 0.8618036508560181,
"learning_rate": 1.958742632612967e-05,
"loss": 1.2093,
"step": 240
},
{
"epoch": 0.05428586938819825,
"grad_norm": 0.9684587717056274,
"learning_rate": 1.956559703121589e-05,
"loss": 1.1358,
"step": 250
},
{
"epoch": 0.056457304163726185,
"grad_norm": 0.856431245803833,
"learning_rate": 1.954376773630212e-05,
"loss": 1.0864,
"step": 260
},
{
"epoch": 0.05862873893925411,
"grad_norm": 0.8133667707443237,
"learning_rate": 1.9521938441388345e-05,
"loss": 1.1005,
"step": 270
},
{
"epoch": 0.060800173714782045,
"grad_norm": 1.0199098587036133,
"learning_rate": 1.950010914647457e-05,
"loss": 1.1349,
"step": 280
},
{
"epoch": 0.06297160849030997,
"grad_norm": 0.8546782732009888,
"learning_rate": 1.9478279851560794e-05,
"loss": 1.1272,
"step": 290
},
{
"epoch": 0.0651430432658379,
"grad_norm": 1.0497276782989502,
"learning_rate": 1.9456450556647023e-05,
"loss": 1.176,
"step": 300
},
{
"epoch": 0.06731447804136584,
"grad_norm": 0.9524215459823608,
"learning_rate": 1.9434621261733248e-05,
"loss": 1.1281,
"step": 310
},
{
"epoch": 0.06948591281689376,
"grad_norm": 0.9029881954193115,
"learning_rate": 1.9412791966819473e-05,
"loss": 1.0668,
"step": 320
},
{
"epoch": 0.07165734759242169,
"grad_norm": 1.0050421953201294,
"learning_rate": 1.93909626719057e-05,
"loss": 1.0993,
"step": 330
},
{
"epoch": 0.07382878236794962,
"grad_norm": 0.8202849626541138,
"learning_rate": 1.9369133376991923e-05,
"loss": 1.1794,
"step": 340
},
{
"epoch": 0.07600021714347756,
"grad_norm": 0.8110634684562683,
"learning_rate": 1.934730408207815e-05,
"loss": 1.1452,
"step": 350
},
{
"epoch": 0.07817165191900548,
"grad_norm": 0.9648256301879883,
"learning_rate": 1.9325474787164376e-05,
"loss": 1.1503,
"step": 360
},
{
"epoch": 0.08034308669453341,
"grad_norm": 0.969715416431427,
"learning_rate": 1.93036454922506e-05,
"loss": 1.129,
"step": 370
},
{
"epoch": 0.08251452147006134,
"grad_norm": 1.0881967544555664,
"learning_rate": 1.9281816197336826e-05,
"loss": 1.1217,
"step": 380
},
{
"epoch": 0.08468595624558928,
"grad_norm": 0.9472118616104126,
"learning_rate": 1.9259986902423054e-05,
"loss": 1.1206,
"step": 390
},
{
"epoch": 0.0868573910211172,
"grad_norm": 1.0082671642303467,
"learning_rate": 1.923815760750928e-05,
"loss": 1.1371,
"step": 400
},
{
"epoch": 0.08902882579664513,
"grad_norm": 1.0587445497512817,
"learning_rate": 1.9216328312595504e-05,
"loss": 1.0499,
"step": 410
},
{
"epoch": 0.09120026057217306,
"grad_norm": 0.869490385055542,
"learning_rate": 1.9194499017681733e-05,
"loss": 1.0992,
"step": 420
},
{
"epoch": 0.093371695347701,
"grad_norm": 1.024477243423462,
"learning_rate": 1.9172669722767954e-05,
"loss": 1.1033,
"step": 430
},
{
"epoch": 0.09554313012322892,
"grad_norm": 0.7851136326789856,
"learning_rate": 1.9150840427854183e-05,
"loss": 1.1261,
"step": 440
},
{
"epoch": 0.09771456489875685,
"grad_norm": 1.0576775074005127,
"learning_rate": 1.9129011132940408e-05,
"loss": 1.0772,
"step": 450
},
{
"epoch": 0.09988599967428478,
"grad_norm": 0.9781667590141296,
"learning_rate": 1.9107181838026633e-05,
"loss": 1.0995,
"step": 460
},
{
"epoch": 0.10205743444981272,
"grad_norm": 1.0188452005386353,
"learning_rate": 1.9085352543112858e-05,
"loss": 1.1518,
"step": 470
},
{
"epoch": 0.10422886922534065,
"grad_norm": 1.052553653717041,
"learning_rate": 1.9063523248199086e-05,
"loss": 1.1514,
"step": 480
},
{
"epoch": 0.10640030400086857,
"grad_norm": 0.9977424144744873,
"learning_rate": 1.904169395328531e-05,
"loss": 1.1605,
"step": 490
},
{
"epoch": 0.1085717387763965,
"grad_norm": 0.9981403946876526,
"learning_rate": 1.9019864658371536e-05,
"loss": 1.0996,
"step": 500
},
{
"epoch": 0.11074317355192444,
"grad_norm": 0.9761925339698792,
"learning_rate": 1.899803536345776e-05,
"loss": 1.048,
"step": 510
},
{
"epoch": 0.11291460832745237,
"grad_norm": 0.9788073301315308,
"learning_rate": 1.8976206068543986e-05,
"loss": 1.1175,
"step": 520
},
{
"epoch": 0.11508604310298029,
"grad_norm": 0.9808152914047241,
"learning_rate": 1.8954376773630214e-05,
"loss": 1.1271,
"step": 530
},
{
"epoch": 0.11725747787850822,
"grad_norm": 0.9630600214004517,
"learning_rate": 1.893254747871644e-05,
"loss": 1.0699,
"step": 540
},
{
"epoch": 0.11942891265403616,
"grad_norm": 1.1894537210464478,
"learning_rate": 1.8910718183802664e-05,
"loss": 1.0589,
"step": 550
},
{
"epoch": 0.12160034742956409,
"grad_norm": 0.967409074306488,
"learning_rate": 1.888888888888889e-05,
"loss": 1.1132,
"step": 560
},
{
"epoch": 0.12377178220509201,
"grad_norm": 0.9783412218093872,
"learning_rate": 1.8867059593975117e-05,
"loss": 1.0887,
"step": 570
},
{
"epoch": 0.12594321698061994,
"grad_norm": 0.9031311869621277,
"learning_rate": 1.8845230299061342e-05,
"loss": 1.0836,
"step": 580
},
{
"epoch": 0.12811465175614786,
"grad_norm": 1.2321075201034546,
"learning_rate": 1.8823401004147567e-05,
"loss": 1.084,
"step": 590
},
{
"epoch": 0.1302860865316758,
"grad_norm": 0.9194741249084473,
"learning_rate": 1.8801571709233792e-05,
"loss": 1.0999,
"step": 600
},
{
"epoch": 0.13245752130720373,
"grad_norm": 1.2474993467330933,
"learning_rate": 1.877974241432002e-05,
"loss": 1.0497,
"step": 610
},
{
"epoch": 0.13462895608273168,
"grad_norm": 1.0515743494033813,
"learning_rate": 1.8757913119406246e-05,
"loss": 1.0983,
"step": 620
},
{
"epoch": 0.1368003908582596,
"grad_norm": 1.2497025728225708,
"learning_rate": 1.873608382449247e-05,
"loss": 1.118,
"step": 630
},
{
"epoch": 0.13897182563378752,
"grad_norm": 1.1340830326080322,
"learning_rate": 1.8714254529578696e-05,
"loss": 1.0629,
"step": 640
},
{
"epoch": 0.14114326040931546,
"grad_norm": 1.1488502025604248,
"learning_rate": 1.869242523466492e-05,
"loss": 1.1057,
"step": 650
},
{
"epoch": 0.14331469518484338,
"grad_norm": 1.1718027591705322,
"learning_rate": 1.867059593975115e-05,
"loss": 1.0895,
"step": 660
},
{
"epoch": 0.1454861299603713,
"grad_norm": 0.8492761850357056,
"learning_rate": 1.8648766644837374e-05,
"loss": 1.0919,
"step": 670
},
{
"epoch": 0.14765756473589925,
"grad_norm": 1.0783703327178955,
"learning_rate": 1.86269373499236e-05,
"loss": 1.0929,
"step": 680
},
{
"epoch": 0.14982899951142717,
"grad_norm": 1.0920681953430176,
"learning_rate": 1.8605108055009824e-05,
"loss": 1.0545,
"step": 690
},
{
"epoch": 0.15200043428695512,
"grad_norm": 1.0387171506881714,
"learning_rate": 1.8583278760096052e-05,
"loss": 1.0386,
"step": 700
},
{
"epoch": 0.15417186906248304,
"grad_norm": 1.2252532243728638,
"learning_rate": 1.8561449465182274e-05,
"loss": 1.0743,
"step": 710
},
{
"epoch": 0.15634330383801096,
"grad_norm": 1.0585488080978394,
"learning_rate": 1.8539620170268502e-05,
"loss": 1.1129,
"step": 720
},
{
"epoch": 0.1585147386135389,
"grad_norm": 0.9711065292358398,
"learning_rate": 1.8517790875354727e-05,
"loss": 1.1111,
"step": 730
},
{
"epoch": 0.16068617338906682,
"grad_norm": 1.1681485176086426,
"learning_rate": 1.8495961580440952e-05,
"loss": 1.072,
"step": 740
},
{
"epoch": 0.16285760816459477,
"grad_norm": 0.9218672513961792,
"learning_rate": 1.847413228552718e-05,
"loss": 1.0748,
"step": 750
},
{
"epoch": 0.1650290429401227,
"grad_norm": 0.9746413230895996,
"learning_rate": 1.8452302990613406e-05,
"loss": 1.0946,
"step": 760
},
{
"epoch": 0.1672004777156506,
"grad_norm": 1.1038978099822998,
"learning_rate": 1.843047369569963e-05,
"loss": 1.1346,
"step": 770
},
{
"epoch": 0.16937191249117856,
"grad_norm": 0.9651903510093689,
"learning_rate": 1.8408644400785856e-05,
"loss": 1.0662,
"step": 780
},
{
"epoch": 0.17154334726670648,
"grad_norm": 1.1864938735961914,
"learning_rate": 1.8386815105872084e-05,
"loss": 1.1023,
"step": 790
},
{
"epoch": 0.1737147820422344,
"grad_norm": 0.9629665017127991,
"learning_rate": 1.8364985810958305e-05,
"loss": 1.0739,
"step": 800
},
{
"epoch": 0.17588621681776234,
"grad_norm": 1.2128831148147583,
"learning_rate": 1.8343156516044534e-05,
"loss": 1.0484,
"step": 810
},
{
"epoch": 0.17805765159329026,
"grad_norm": 1.0595309734344482,
"learning_rate": 1.832132722113076e-05,
"loss": 1.0829,
"step": 820
},
{
"epoch": 0.1802290863688182,
"grad_norm": 1.1851084232330322,
"learning_rate": 1.8299497926216984e-05,
"loss": 1.0791,
"step": 830
},
{
"epoch": 0.18240052114434613,
"grad_norm": 0.9105240702629089,
"learning_rate": 1.8277668631303212e-05,
"loss": 1.0909,
"step": 840
},
{
"epoch": 0.18457195591987405,
"grad_norm": 1.0232548713684082,
"learning_rate": 1.8255839336389437e-05,
"loss": 1.0585,
"step": 850
},
{
"epoch": 0.186743390695402,
"grad_norm": 1.0697710514068604,
"learning_rate": 1.8234010041475662e-05,
"loss": 1.1133,
"step": 860
},
{
"epoch": 0.18891482547092991,
"grad_norm": 0.9465317130088806,
"learning_rate": 1.8212180746561887e-05,
"loss": 1.0755,
"step": 870
},
{
"epoch": 0.19108626024645783,
"grad_norm": 1.0849310159683228,
"learning_rate": 1.8190351451648115e-05,
"loss": 1.1369,
"step": 880
},
{
"epoch": 0.19325769502198578,
"grad_norm": 1.0284308195114136,
"learning_rate": 1.8168522156734337e-05,
"loss": 1.0504,
"step": 890
},
{
"epoch": 0.1954291297975137,
"grad_norm": 1.000159740447998,
"learning_rate": 1.8146692861820565e-05,
"loss": 1.0658,
"step": 900
},
{
"epoch": 0.19760056457304165,
"grad_norm": 1.0055243968963623,
"learning_rate": 1.812486356690679e-05,
"loss": 1.0563,
"step": 910
},
{
"epoch": 0.19977199934856957,
"grad_norm": 1.1526374816894531,
"learning_rate": 1.8103034271993015e-05,
"loss": 1.0802,
"step": 920
},
{
"epoch": 0.2019434341240975,
"grad_norm": 0.8575794696807861,
"learning_rate": 1.808120497707924e-05,
"loss": 1.0893,
"step": 930
},
{
"epoch": 0.20411486889962543,
"grad_norm": 0.985564649105072,
"learning_rate": 1.805937568216547e-05,
"loss": 1.0543,
"step": 940
},
{
"epoch": 0.20628630367515335,
"grad_norm": 1.2791037559509277,
"learning_rate": 1.8037546387251694e-05,
"loss": 1.0984,
"step": 950
},
{
"epoch": 0.2084577384506813,
"grad_norm": 1.1033849716186523,
"learning_rate": 1.801571709233792e-05,
"loss": 1.0456,
"step": 960
},
{
"epoch": 0.21062917322620922,
"grad_norm": 1.1214113235473633,
"learning_rate": 1.7993887797424147e-05,
"loss": 1.0519,
"step": 970
},
{
"epoch": 0.21280060800173714,
"grad_norm": 1.1759611368179321,
"learning_rate": 1.797205850251037e-05,
"loss": 1.0646,
"step": 980
},
{
"epoch": 0.2149720427772651,
"grad_norm": 1.0244547128677368,
"learning_rate": 1.7950229207596597e-05,
"loss": 1.0995,
"step": 990
},
{
"epoch": 0.217143477552793,
"grad_norm": 1.134796142578125,
"learning_rate": 1.7928399912682822e-05,
"loss": 1.0889,
"step": 1000
},
{
"epoch": 0.21931491232832093,
"grad_norm": 1.0857653617858887,
"learning_rate": 1.7906570617769047e-05,
"loss": 1.0283,
"step": 1010
},
{
"epoch": 0.22148634710384887,
"grad_norm": 1.1252498626708984,
"learning_rate": 1.7884741322855272e-05,
"loss": 1.0462,
"step": 1020
},
{
"epoch": 0.2236577818793768,
"grad_norm": 1.0542049407958984,
"learning_rate": 1.78629120279415e-05,
"loss": 1.0499,
"step": 1030
},
{
"epoch": 0.22582921665490474,
"grad_norm": 1.1074199676513672,
"learning_rate": 1.7841082733027725e-05,
"loss": 1.0394,
"step": 1040
},
{
"epoch": 0.22800065143043266,
"grad_norm": 1.0936591625213623,
"learning_rate": 1.781925343811395e-05,
"loss": 1.0529,
"step": 1050
},
{
"epoch": 0.23017208620596058,
"grad_norm": 1.032329797744751,
"learning_rate": 1.779742414320018e-05,
"loss": 1.0311,
"step": 1060
},
{
"epoch": 0.23234352098148853,
"grad_norm": 1.6111783981323242,
"learning_rate": 1.77755948482864e-05,
"loss": 1.0481,
"step": 1070
},
{
"epoch": 0.23451495575701645,
"grad_norm": 1.1454813480377197,
"learning_rate": 1.775376555337263e-05,
"loss": 1.0231,
"step": 1080
},
{
"epoch": 0.23668639053254437,
"grad_norm": 1.0079035758972168,
"learning_rate": 1.7731936258458853e-05,
"loss": 1.0907,
"step": 1090
},
{
"epoch": 0.2388578253080723,
"grad_norm": 1.0366803407669067,
"learning_rate": 1.771010696354508e-05,
"loss": 1.0368,
"step": 1100
},
{
"epoch": 0.24102926008360023,
"grad_norm": 1.057990550994873,
"learning_rate": 1.7688277668631303e-05,
"loss": 1.0462,
"step": 1110
},
{
"epoch": 0.24320069485912818,
"grad_norm": 0.9940240383148193,
"learning_rate": 1.7666448373717532e-05,
"loss": 1.089,
"step": 1120
},
{
"epoch": 0.2453721296346561,
"grad_norm": 1.0284287929534912,
"learning_rate": 1.7644619078803757e-05,
"loss": 1.044,
"step": 1130
},
{
"epoch": 0.24754356441018402,
"grad_norm": 1.2615009546279907,
"learning_rate": 1.7622789783889982e-05,
"loss": 1.0978,
"step": 1140
},
{
"epoch": 0.24971499918571197,
"grad_norm": 1.1974271535873413,
"learning_rate": 1.7600960488976207e-05,
"loss": 1.1405,
"step": 1150
},
{
"epoch": 0.2518864339612399,
"grad_norm": 1.1542342901229858,
"learning_rate": 1.757913119406243e-05,
"loss": 1.0874,
"step": 1160
},
{
"epoch": 0.25405786873676783,
"grad_norm": 1.2193187475204468,
"learning_rate": 1.755730189914866e-05,
"loss": 1.06,
"step": 1170
},
{
"epoch": 0.2562293035122957,
"grad_norm": 0.8851369619369507,
"learning_rate": 1.7535472604234885e-05,
"loss": 1.0793,
"step": 1180
},
{
"epoch": 0.2584007382878237,
"grad_norm": 0.9751698970794678,
"learning_rate": 1.751364330932111e-05,
"loss": 1.085,
"step": 1190
},
{
"epoch": 0.2605721730633516,
"grad_norm": 1.1651514768600464,
"learning_rate": 1.7491814014407335e-05,
"loss": 1.0548,
"step": 1200
},
{
"epoch": 0.26274360783887957,
"grad_norm": 1.0106171369552612,
"learning_rate": 1.7469984719493563e-05,
"loss": 1.0162,
"step": 1210
},
{
"epoch": 0.26491504261440746,
"grad_norm": 1.0185978412628174,
"learning_rate": 1.7448155424579788e-05,
"loss": 1.0557,
"step": 1220
},
{
"epoch": 0.2670864773899354,
"grad_norm": 1.0883762836456299,
"learning_rate": 1.7426326129666013e-05,
"loss": 1.0507,
"step": 1230
},
{
"epoch": 0.26925791216546335,
"grad_norm": 1.1618812084197998,
"learning_rate": 1.7404496834752238e-05,
"loss": 1.0452,
"step": 1240
},
{
"epoch": 0.27142934694099125,
"grad_norm": 1.1427685022354126,
"learning_rate": 1.7382667539838463e-05,
"loss": 1.0772,
"step": 1250
},
{
"epoch": 0.2736007817165192,
"grad_norm": 1.1353427171707153,
"learning_rate": 1.7360838244924688e-05,
"loss": 1.0152,
"step": 1260
},
{
"epoch": 0.27577221649204714,
"grad_norm": 1.0538302659988403,
"learning_rate": 1.7339008950010917e-05,
"loss": 1.0453,
"step": 1270
},
{
"epoch": 0.27794365126757503,
"grad_norm": 0.9475343823432922,
"learning_rate": 1.731717965509714e-05,
"loss": 1.0159,
"step": 1280
},
{
"epoch": 0.280115086043103,
"grad_norm": 1.1043903827667236,
"learning_rate": 1.7295350360183367e-05,
"loss": 1.0767,
"step": 1290
},
{
"epoch": 0.2822865208186309,
"grad_norm": 1.2597566843032837,
"learning_rate": 1.7273521065269595e-05,
"loss": 1.0976,
"step": 1300
},
{
"epoch": 0.2844579555941588,
"grad_norm": 1.2252488136291504,
"learning_rate": 1.725169177035582e-05,
"loss": 1.1019,
"step": 1310
},
{
"epoch": 0.28662939036968677,
"grad_norm": 1.4177309274673462,
"learning_rate": 1.7229862475442045e-05,
"loss": 1.1039,
"step": 1320
},
{
"epoch": 0.2888008251452147,
"grad_norm": 1.2762172222137451,
"learning_rate": 1.720803318052827e-05,
"loss": 1.0195,
"step": 1330
},
{
"epoch": 0.2909722599207426,
"grad_norm": 1.3187838792800903,
"learning_rate": 1.7186203885614495e-05,
"loss": 1.0851,
"step": 1340
},
{
"epoch": 0.29314369469627055,
"grad_norm": 1.2178953886032104,
"learning_rate": 1.716437459070072e-05,
"loss": 1.0503,
"step": 1350
},
{
"epoch": 0.2953151294717985,
"grad_norm": 0.9904911518096924,
"learning_rate": 1.7142545295786948e-05,
"loss": 1.0683,
"step": 1360
},
{
"epoch": 0.29748656424732645,
"grad_norm": 0.9594365358352661,
"learning_rate": 1.7120716000873173e-05,
"loss": 1.052,
"step": 1370
},
{
"epoch": 0.29965799902285434,
"grad_norm": 1.218839406967163,
"learning_rate": 1.7098886705959398e-05,
"loss": 1.0563,
"step": 1380
},
{
"epoch": 0.3018294337983823,
"grad_norm": 1.2965632677078247,
"learning_rate": 1.7077057411045626e-05,
"loss": 1.1087,
"step": 1390
},
{
"epoch": 0.30400086857391023,
"grad_norm": 1.0554800033569336,
"learning_rate": 1.705522811613185e-05,
"loss": 1.036,
"step": 1400
},
{
"epoch": 0.3061723033494381,
"grad_norm": 1.1262216567993164,
"learning_rate": 1.7033398821218076e-05,
"loss": 1.0489,
"step": 1410
},
{
"epoch": 0.30834373812496607,
"grad_norm": 1.044252872467041,
"learning_rate": 1.70115695263043e-05,
"loss": 1.048,
"step": 1420
},
{
"epoch": 0.310515172900494,
"grad_norm": 1.4321969747543335,
"learning_rate": 1.6989740231390526e-05,
"loss": 1.0712,
"step": 1430
},
{
"epoch": 0.3126866076760219,
"grad_norm": 0.9649491310119629,
"learning_rate": 1.696791093647675e-05,
"loss": 1.0366,
"step": 1440
},
{
"epoch": 0.31485804245154986,
"grad_norm": 1.0629323720932007,
"learning_rate": 1.694608164156298e-05,
"loss": 1.0527,
"step": 1450
},
{
"epoch": 0.3170294772270778,
"grad_norm": 1.1887277364730835,
"learning_rate": 1.6924252346649205e-05,
"loss": 1.0597,
"step": 1460
},
{
"epoch": 0.3192009120026057,
"grad_norm": 1.0008701086044312,
"learning_rate": 1.690242305173543e-05,
"loss": 1.0733,
"step": 1470
},
{
"epoch": 0.32137234677813364,
"grad_norm": 1.1184202432632446,
"learning_rate": 1.6880593756821658e-05,
"loss": 1.0093,
"step": 1480
},
{
"epoch": 0.3235437815536616,
"grad_norm": 1.2822941541671753,
"learning_rate": 1.6858764461907883e-05,
"loss": 1.0505,
"step": 1490
},
{
"epoch": 0.32571521632918954,
"grad_norm": 1.365919828414917,
"learning_rate": 1.6836935166994108e-05,
"loss": 1.0616,
"step": 1500
},
{
"epoch": 0.32788665110471743,
"grad_norm": 1.4940375089645386,
"learning_rate": 1.6815105872080333e-05,
"loss": 1.0189,
"step": 1510
},
{
"epoch": 0.3300580858802454,
"grad_norm": 1.443363070487976,
"learning_rate": 1.679327657716656e-05,
"loss": 1.0591,
"step": 1520
},
{
"epoch": 0.3322295206557733,
"grad_norm": 1.0023658275604248,
"learning_rate": 1.6771447282252783e-05,
"loss": 1.0784,
"step": 1530
},
{
"epoch": 0.3344009554313012,
"grad_norm": 1.0569523572921753,
"learning_rate": 1.674961798733901e-05,
"loss": 1.0167,
"step": 1540
},
{
"epoch": 0.33657239020682916,
"grad_norm": 1.5533829927444458,
"learning_rate": 1.6727788692425236e-05,
"loss": 1.059,
"step": 1550
},
{
"epoch": 0.3387438249823571,
"grad_norm": 1.2175540924072266,
"learning_rate": 1.670595939751146e-05,
"loss": 1.0566,
"step": 1560
},
{
"epoch": 0.340915259757885,
"grad_norm": 1.2406116724014282,
"learning_rate": 1.6684130102597686e-05,
"loss": 1.0284,
"step": 1570
},
{
"epoch": 0.34308669453341295,
"grad_norm": 1.1116441488265991,
"learning_rate": 1.6662300807683914e-05,
"loss": 1.0686,
"step": 1580
},
{
"epoch": 0.3452581293089409,
"grad_norm": 0.9541231989860535,
"learning_rate": 1.664047151277014e-05,
"loss": 1.0569,
"step": 1590
},
{
"epoch": 0.3474295640844688,
"grad_norm": 0.9048693180084229,
"learning_rate": 1.6618642217856364e-05,
"loss": 1.0503,
"step": 1600
},
{
"epoch": 0.34960099885999674,
"grad_norm": 1.2782031297683716,
"learning_rate": 1.6596812922942593e-05,
"loss": 1.0462,
"step": 1610
},
{
"epoch": 0.3517724336355247,
"grad_norm": 1.0912036895751953,
"learning_rate": 1.6574983628028814e-05,
"loss": 1.0488,
"step": 1620
},
{
"epoch": 0.3539438684110526,
"grad_norm": 1.2449527978897095,
"learning_rate": 1.6553154333115043e-05,
"loss": 1.0283,
"step": 1630
},
{
"epoch": 0.3561153031865805,
"grad_norm": 1.367113471031189,
"learning_rate": 1.6531325038201268e-05,
"loss": 1.0687,
"step": 1640
},
{
"epoch": 0.35828673796210847,
"grad_norm": 1.4987077713012695,
"learning_rate": 1.6509495743287493e-05,
"loss": 1.0419,
"step": 1650
},
{
"epoch": 0.3604581727376364,
"grad_norm": 1.0947941541671753,
"learning_rate": 1.6487666448373718e-05,
"loss": 1.0666,
"step": 1660
},
{
"epoch": 0.3626296075131643,
"grad_norm": 1.199379801750183,
"learning_rate": 1.6465837153459946e-05,
"loss": 1.0062,
"step": 1670
},
{
"epoch": 0.36480104228869226,
"grad_norm": 1.281281590461731,
"learning_rate": 1.6444007858546168e-05,
"loss": 1.0054,
"step": 1680
},
{
"epoch": 0.3669724770642202,
"grad_norm": 1.00531804561615,
"learning_rate": 1.6422178563632396e-05,
"loss": 1.0951,
"step": 1690
},
{
"epoch": 0.3691439118397481,
"grad_norm": 1.2501758337020874,
"learning_rate": 1.6400349268718624e-05,
"loss": 1.0289,
"step": 1700
},
{
"epoch": 0.37131534661527604,
"grad_norm": 1.1288474798202515,
"learning_rate": 1.6378519973804846e-05,
"loss": 1.0539,
"step": 1710
},
{
"epoch": 0.373486781390804,
"grad_norm": 1.2077093124389648,
"learning_rate": 1.6356690678891074e-05,
"loss": 1.0112,
"step": 1720
},
{
"epoch": 0.3756582161663319,
"grad_norm": 1.0771955251693726,
"learning_rate": 1.63348613839773e-05,
"loss": 1.0527,
"step": 1730
},
{
"epoch": 0.37782965094185983,
"grad_norm": 1.5062224864959717,
"learning_rate": 1.6313032089063524e-05,
"loss": 1.0023,
"step": 1740
},
{
"epoch": 0.3800010857173878,
"grad_norm": 1.4642319679260254,
"learning_rate": 1.629120279414975e-05,
"loss": 1.0253,
"step": 1750
},
{
"epoch": 0.38217252049291567,
"grad_norm": 1.17564058303833,
"learning_rate": 1.6269373499235978e-05,
"loss": 1.0378,
"step": 1760
},
{
"epoch": 0.3843439552684436,
"grad_norm": 1.155928134918213,
"learning_rate": 1.6247544204322203e-05,
"loss": 1.0797,
"step": 1770
},
{
"epoch": 0.38651539004397156,
"grad_norm": 1.161272406578064,
"learning_rate": 1.6225714909408428e-05,
"loss": 1.0645,
"step": 1780
},
{
"epoch": 0.3886868248194995,
"grad_norm": 1.1190975904464722,
"learning_rate": 1.6203885614494653e-05,
"loss": 0.9895,
"step": 1790
},
{
"epoch": 0.3908582595950274,
"grad_norm": 1.0364742279052734,
"learning_rate": 1.6182056319580877e-05,
"loss": 0.993,
"step": 1800
},
{
"epoch": 0.39302969437055535,
"grad_norm": 0.9722704887390137,
"learning_rate": 1.6160227024667106e-05,
"loss": 1.074,
"step": 1810
},
{
"epoch": 0.3952011291460833,
"grad_norm": 1.196349024772644,
"learning_rate": 1.613839772975333e-05,
"loss": 1.0192,
"step": 1820
},
{
"epoch": 0.3973725639216112,
"grad_norm": 1.2496604919433594,
"learning_rate": 1.6116568434839556e-05,
"loss": 1.0539,
"step": 1830
},
{
"epoch": 0.39954399869713914,
"grad_norm": 1.273461937904358,
"learning_rate": 1.609473913992578e-05,
"loss": 1.0283,
"step": 1840
},
{
"epoch": 0.4017154334726671,
"grad_norm": 1.267354965209961,
"learning_rate": 1.607290984501201e-05,
"loss": 1.0265,
"step": 1850
},
{
"epoch": 0.403886868248195,
"grad_norm": 1.1388341188430786,
"learning_rate": 1.6051080550098234e-05,
"loss": 1.0998,
"step": 1860
},
{
"epoch": 0.4060583030237229,
"grad_norm": 1.7409948110580444,
"learning_rate": 1.602925125518446e-05,
"loss": 1.0246,
"step": 1870
},
{
"epoch": 0.40822973779925087,
"grad_norm": 1.0280303955078125,
"learning_rate": 1.6007421960270684e-05,
"loss": 1.075,
"step": 1880
},
{
"epoch": 0.41040117257477876,
"grad_norm": 1.093042016029358,
"learning_rate": 1.598559266535691e-05,
"loss": 1.018,
"step": 1890
},
{
"epoch": 0.4125726073503067,
"grad_norm": 0.9621986746788025,
"learning_rate": 1.5963763370443134e-05,
"loss": 1.1104,
"step": 1900
},
{
"epoch": 0.41474404212583466,
"grad_norm": 1.0159006118774414,
"learning_rate": 1.5941934075529362e-05,
"loss": 1.0231,
"step": 1910
},
{
"epoch": 0.4169154769013626,
"grad_norm": 1.2041517496109009,
"learning_rate": 1.5920104780615587e-05,
"loss": 1.0426,
"step": 1920
},
{
"epoch": 0.4190869116768905,
"grad_norm": 1.2012951374053955,
"learning_rate": 1.5898275485701812e-05,
"loss": 1.0376,
"step": 1930
},
{
"epoch": 0.42125834645241844,
"grad_norm": 1.207979679107666,
"learning_rate": 1.587644619078804e-05,
"loss": 1.0054,
"step": 1940
},
{
"epoch": 0.4234297812279464,
"grad_norm": 1.1251835823059082,
"learning_rate": 1.5854616895874266e-05,
"loss": 1.068,
"step": 1950
},
{
"epoch": 0.4256012160034743,
"grad_norm": 1.2626091241836548,
"learning_rate": 1.583278760096049e-05,
"loss": 1.0705,
"step": 1960
},
{
"epoch": 0.42777265077900223,
"grad_norm": 1.203305959701538,
"learning_rate": 1.5810958306046716e-05,
"loss": 1.0319,
"step": 1970
},
{
"epoch": 0.4299440855545302,
"grad_norm": 1.3643816709518433,
"learning_rate": 1.578912901113294e-05,
"loss": 1.0383,
"step": 1980
},
{
"epoch": 0.43211552033005807,
"grad_norm": 1.3260042667388916,
"learning_rate": 1.5767299716219166e-05,
"loss": 1.0949,
"step": 1990
},
{
"epoch": 0.434286955105586,
"grad_norm": 1.0160613059997559,
"learning_rate": 1.5745470421305394e-05,
"loss": 1.0717,
"step": 2000
},
{
"epoch": 0.43645838988111396,
"grad_norm": 0.9759429693222046,
"learning_rate": 1.572364112639162e-05,
"loss": 1.023,
"step": 2010
},
{
"epoch": 0.43862982465664185,
"grad_norm": 1.268486738204956,
"learning_rate": 1.5701811831477844e-05,
"loss": 0.986,
"step": 2020
},
{
"epoch": 0.4408012594321698,
"grad_norm": 1.326611876487732,
"learning_rate": 1.5679982536564072e-05,
"loss": 1.0641,
"step": 2030
},
{
"epoch": 0.44297269420769775,
"grad_norm": 1.638113021850586,
"learning_rate": 1.5658153241650297e-05,
"loss": 1.0302,
"step": 2040
},
{
"epoch": 0.44514412898322564,
"grad_norm": 1.5037381649017334,
"learning_rate": 1.5636323946736522e-05,
"loss": 1.0291,
"step": 2050
},
{
"epoch": 0.4473155637587536,
"grad_norm": 1.1574699878692627,
"learning_rate": 1.5614494651822747e-05,
"loss": 1.0268,
"step": 2060
},
{
"epoch": 0.44948699853428153,
"grad_norm": 1.1230093240737915,
"learning_rate": 1.5592665356908972e-05,
"loss": 1.0471,
"step": 2070
},
{
"epoch": 0.4516584333098095,
"grad_norm": 1.134092092514038,
"learning_rate": 1.5570836061995197e-05,
"loss": 1.045,
"step": 2080
},
{
"epoch": 0.4538298680853374,
"grad_norm": 1.4253817796707153,
"learning_rate": 1.5549006767081425e-05,
"loss": 1.0575,
"step": 2090
},
{
"epoch": 0.4560013028608653,
"grad_norm": 1.322679877281189,
"learning_rate": 1.552717747216765e-05,
"loss": 1.005,
"step": 2100
},
{
"epoch": 0.45817273763639327,
"grad_norm": 1.190661072731018,
"learning_rate": 1.5505348177253875e-05,
"loss": 1.0392,
"step": 2110
},
{
"epoch": 0.46034417241192116,
"grad_norm": 1.10509192943573,
"learning_rate": 1.54835188823401e-05,
"loss": 1.046,
"step": 2120
},
{
"epoch": 0.4625156071874491,
"grad_norm": 1.305440902709961,
"learning_rate": 1.546168958742633e-05,
"loss": 1.067,
"step": 2130
},
{
"epoch": 0.46468704196297705,
"grad_norm": 0.9714760184288025,
"learning_rate": 1.5439860292512554e-05,
"loss": 1.0648,
"step": 2140
},
{
"epoch": 0.46685847673850495,
"grad_norm": 1.249341368675232,
"learning_rate": 1.541803099759878e-05,
"loss": 1.0326,
"step": 2150
},
{
"epoch": 0.4690299115140329,
"grad_norm": 1.182078242301941,
"learning_rate": 1.5396201702685004e-05,
"loss": 0.9649,
"step": 2160
},
{
"epoch": 0.47120134628956084,
"grad_norm": 1.1989095211029053,
"learning_rate": 1.537437240777123e-05,
"loss": 1.0324,
"step": 2170
},
{
"epoch": 0.47337278106508873,
"grad_norm": 1.1520359516143799,
"learning_rate": 1.5352543112857457e-05,
"loss": 1.0453,
"step": 2180
},
{
"epoch": 0.4755442158406167,
"grad_norm": 0.9840512871742249,
"learning_rate": 1.5330713817943682e-05,
"loss": 1.028,
"step": 2190
},
{
"epoch": 0.4777156506161446,
"grad_norm": 1.2729812860488892,
"learning_rate": 1.5308884523029907e-05,
"loss": 1.0866,
"step": 2200
},
{
"epoch": 0.4798870853916726,
"grad_norm": 1.2075546979904175,
"learning_rate": 1.5287055228116132e-05,
"loss": 1.0633,
"step": 2210
},
{
"epoch": 0.48205852016720047,
"grad_norm": 1.6592689752578735,
"learning_rate": 1.526522593320236e-05,
"loss": 1.0107,
"step": 2220
},
{
"epoch": 0.4842299549427284,
"grad_norm": 1.2771036624908447,
"learning_rate": 1.5243396638288585e-05,
"loss": 0.9855,
"step": 2230
},
{
"epoch": 0.48640138971825636,
"grad_norm": 1.4246838092803955,
"learning_rate": 1.522156734337481e-05,
"loss": 1.0405,
"step": 2240
},
{
"epoch": 0.48857282449378425,
"grad_norm": 1.1746619939804077,
"learning_rate": 1.5199738048461037e-05,
"loss": 0.9615,
"step": 2250
},
{
"epoch": 0.4907442592693122,
"grad_norm": 0.9854568243026733,
"learning_rate": 1.5177908753547262e-05,
"loss": 1.0748,
"step": 2260
},
{
"epoch": 0.49291569404484015,
"grad_norm": 1.2437796592712402,
"learning_rate": 1.5156079458633489e-05,
"loss": 1.0428,
"step": 2270
},
{
"epoch": 0.49508712882036804,
"grad_norm": 1.4417718648910522,
"learning_rate": 1.5134250163719712e-05,
"loss": 1.0339,
"step": 2280
},
{
"epoch": 0.497258563595896,
"grad_norm": 1.5475140810012817,
"learning_rate": 1.5112420868805939e-05,
"loss": 1.0054,
"step": 2290
},
{
"epoch": 0.49942999837142393,
"grad_norm": 1.2441282272338867,
"learning_rate": 1.5090591573892164e-05,
"loss": 1.0406,
"step": 2300
},
{
"epoch": 0.5016014331469518,
"grad_norm": 1.3758796453475952,
"learning_rate": 1.506876227897839e-05,
"loss": 1.0709,
"step": 2310
},
{
"epoch": 0.5037728679224798,
"grad_norm": 1.412845492362976,
"learning_rate": 1.5046932984064615e-05,
"loss": 1.0111,
"step": 2320
},
{
"epoch": 0.5059443026980077,
"grad_norm": 1.2830110788345337,
"learning_rate": 1.5025103689150842e-05,
"loss": 1.0142,
"step": 2330
},
{
"epoch": 0.5081157374735357,
"grad_norm": 1.1173603534698486,
"learning_rate": 1.5003274394237068e-05,
"loss": 1.0425,
"step": 2340
},
{
"epoch": 0.5102871722490636,
"grad_norm": 1.3206751346588135,
"learning_rate": 1.4981445099323293e-05,
"loss": 1.0147,
"step": 2350
},
{
"epoch": 0.5124586070245915,
"grad_norm": 1.1469355821609497,
"learning_rate": 1.495961580440952e-05,
"loss": 1.1042,
"step": 2360
},
{
"epoch": 0.5146300418001194,
"grad_norm": 1.5979527235031128,
"learning_rate": 1.4937786509495743e-05,
"loss": 1.0293,
"step": 2370
},
{
"epoch": 0.5168014765756473,
"grad_norm": 1.1847728490829468,
"learning_rate": 1.491595721458197e-05,
"loss": 1.0589,
"step": 2380
},
{
"epoch": 0.5189729113511753,
"grad_norm": 1.1340694427490234,
"learning_rate": 1.4894127919668195e-05,
"loss": 1.008,
"step": 2390
},
{
"epoch": 0.5211443461267032,
"grad_norm": 1.3193624019622803,
"learning_rate": 1.4872298624754422e-05,
"loss": 1.0455,
"step": 2400
},
{
"epoch": 0.5233157809022312,
"grad_norm": 1.3973023891448975,
"learning_rate": 1.4850469329840647e-05,
"loss": 1.0455,
"step": 2410
},
{
"epoch": 0.5254872156777591,
"grad_norm": 1.4529467821121216,
"learning_rate": 1.4828640034926873e-05,
"loss": 1.0726,
"step": 2420
},
{
"epoch": 0.527658650453287,
"grad_norm": 1.2769255638122559,
"learning_rate": 1.4806810740013098e-05,
"loss": 1.0646,
"step": 2430
},
{
"epoch": 0.5298300852288149,
"grad_norm": 1.4367311000823975,
"learning_rate": 1.4784981445099325e-05,
"loss": 0.9821,
"step": 2440
},
{
"epoch": 0.5320015200043429,
"grad_norm": 1.6156344413757324,
"learning_rate": 1.4763152150185552e-05,
"loss": 1.0581,
"step": 2450
},
{
"epoch": 0.5341729547798708,
"grad_norm": 1.093217372894287,
"learning_rate": 1.4741322855271775e-05,
"loss": 1.0125,
"step": 2460
},
{
"epoch": 0.5363443895553988,
"grad_norm": 1.3095054626464844,
"learning_rate": 1.4719493560358002e-05,
"loss": 1.0268,
"step": 2470
},
{
"epoch": 0.5385158243309267,
"grad_norm": 1.3275405168533325,
"learning_rate": 1.4697664265444227e-05,
"loss": 1.036,
"step": 2480
},
{
"epoch": 0.5406872591064545,
"grad_norm": 1.1585111618041992,
"learning_rate": 1.4675834970530453e-05,
"loss": 1.0893,
"step": 2490
},
{
"epoch": 0.5428586938819825,
"grad_norm": 1.1334049701690674,
"learning_rate": 1.4654005675616678e-05,
"loss": 1.0356,
"step": 2500
},
{
"epoch": 0.5450301286575104,
"grad_norm": 1.1498132944107056,
"learning_rate": 1.4632176380702905e-05,
"loss": 1.0297,
"step": 2510
},
{
"epoch": 0.5472015634330384,
"grad_norm": 1.3892987966537476,
"learning_rate": 1.461034708578913e-05,
"loss": 0.9885,
"step": 2520
},
{
"epoch": 0.5493729982085663,
"grad_norm": 1.1444848775863647,
"learning_rate": 1.4588517790875357e-05,
"loss": 1.0221,
"step": 2530
},
{
"epoch": 0.5515444329840943,
"grad_norm": 1.0999592542648315,
"learning_rate": 1.456668849596158e-05,
"loss": 1.0053,
"step": 2540
},
{
"epoch": 0.5537158677596221,
"grad_norm": 1.2366653680801392,
"learning_rate": 1.4544859201047807e-05,
"loss": 0.9872,
"step": 2550
},
{
"epoch": 0.5558873025351501,
"grad_norm": 1.066278100013733,
"learning_rate": 1.4523029906134035e-05,
"loss": 1.0833,
"step": 2560
},
{
"epoch": 0.558058737310678,
"grad_norm": 1.418614149093628,
"learning_rate": 1.4501200611220258e-05,
"loss": 1.0316,
"step": 2570
},
{
"epoch": 0.560230172086206,
"grad_norm": 1.2488312721252441,
"learning_rate": 1.4479371316306485e-05,
"loss": 1.0261,
"step": 2580
},
{
"epoch": 0.5624016068617339,
"grad_norm": 1.1262556314468384,
"learning_rate": 1.445754202139271e-05,
"loss": 0.9926,
"step": 2590
},
{
"epoch": 0.5645730416372619,
"grad_norm": 1.2547680139541626,
"learning_rate": 1.4435712726478936e-05,
"loss": 1.0211,
"step": 2600
},
{
"epoch": 0.5667444764127898,
"grad_norm": 1.3836477994918823,
"learning_rate": 1.4413883431565161e-05,
"loss": 0.9906,
"step": 2610
},
{
"epoch": 0.5689159111883176,
"grad_norm": 1.0498002767562866,
"learning_rate": 1.4392054136651388e-05,
"loss": 0.9824,
"step": 2620
},
{
"epoch": 0.5710873459638456,
"grad_norm": 0.9665150046348572,
"learning_rate": 1.4370224841737611e-05,
"loss": 1.0113,
"step": 2630
},
{
"epoch": 0.5732587807393735,
"grad_norm": 1.2889072895050049,
"learning_rate": 1.434839554682384e-05,
"loss": 0.9909,
"step": 2640
},
{
"epoch": 0.5754302155149015,
"grad_norm": 1.5180598497390747,
"learning_rate": 1.4326566251910063e-05,
"loss": 1.0092,
"step": 2650
},
{
"epoch": 0.5776016502904294,
"grad_norm": 1.6388850212097168,
"learning_rate": 1.430473695699629e-05,
"loss": 1.0116,
"step": 2660
},
{
"epoch": 0.5797730850659574,
"grad_norm": 1.2516218423843384,
"learning_rate": 1.4282907662082516e-05,
"loss": 0.9775,
"step": 2670
},
{
"epoch": 0.5819445198414852,
"grad_norm": 1.1634091138839722,
"learning_rate": 1.4261078367168741e-05,
"loss": 0.9898,
"step": 2680
},
{
"epoch": 0.5841159546170132,
"grad_norm": 1.1283944845199585,
"learning_rate": 1.4239249072254968e-05,
"loss": 1.0265,
"step": 2690
},
{
"epoch": 0.5862873893925411,
"grad_norm": 1.3887890577316284,
"learning_rate": 1.4217419777341193e-05,
"loss": 1.0433,
"step": 2700
},
{
"epoch": 0.588458824168069,
"grad_norm": 1.2213870286941528,
"learning_rate": 1.419559048242742e-05,
"loss": 1.0116,
"step": 2710
},
{
"epoch": 0.590630258943597,
"grad_norm": 1.2879663705825806,
"learning_rate": 1.4173761187513645e-05,
"loss": 0.9866,
"step": 2720
},
{
"epoch": 0.592801693719125,
"grad_norm": 1.3769855499267578,
"learning_rate": 1.4151931892599871e-05,
"loss": 0.9735,
"step": 2730
},
{
"epoch": 0.5949731284946529,
"grad_norm": 1.347123146057129,
"learning_rate": 1.4130102597686095e-05,
"loss": 1.0169,
"step": 2740
},
{
"epoch": 0.5971445632701807,
"grad_norm": 1.093166708946228,
"learning_rate": 1.4108273302772321e-05,
"loss": 1.0288,
"step": 2750
},
{
"epoch": 0.5993159980457087,
"grad_norm": 1.3573272228240967,
"learning_rate": 1.4086444007858546e-05,
"loss": 1.0042,
"step": 2760
},
{
"epoch": 0.6014874328212366,
"grad_norm": 1.400972604751587,
"learning_rate": 1.4064614712944773e-05,
"loss": 1.0181,
"step": 2770
},
{
"epoch": 0.6036588675967646,
"grad_norm": 1.1371185779571533,
"learning_rate": 1.4042785418031e-05,
"loss": 1.0504,
"step": 2780
},
{
"epoch": 0.6058303023722925,
"grad_norm": 1.31002676486969,
"learning_rate": 1.4020956123117225e-05,
"loss": 1.0615,
"step": 2790
},
{
"epoch": 0.6080017371478205,
"grad_norm": 1.557403326034546,
"learning_rate": 1.3999126828203451e-05,
"loss": 1.0286,
"step": 2800
},
{
"epoch": 0.6101731719233483,
"grad_norm": 1.2506225109100342,
"learning_rate": 1.3977297533289676e-05,
"loss": 1.0316,
"step": 2810
},
{
"epoch": 0.6123446066988762,
"grad_norm": 1.2750262022018433,
"learning_rate": 1.3955468238375903e-05,
"loss": 1.0691,
"step": 2820
},
{
"epoch": 0.6145160414744042,
"grad_norm": 1.3119608163833618,
"learning_rate": 1.3933638943462126e-05,
"loss": 1.0107,
"step": 2830
},
{
"epoch": 0.6166874762499321,
"grad_norm": 1.269987940788269,
"learning_rate": 1.3911809648548353e-05,
"loss": 1.0298,
"step": 2840
},
{
"epoch": 0.6188589110254601,
"grad_norm": 1.1371833086013794,
"learning_rate": 1.3889980353634578e-05,
"loss": 1.0541,
"step": 2850
},
{
"epoch": 0.621030345800988,
"grad_norm": 1.2296518087387085,
"learning_rate": 1.3868151058720804e-05,
"loss": 1.0225,
"step": 2860
},
{
"epoch": 0.623201780576516,
"grad_norm": 1.5416007041931152,
"learning_rate": 1.384632176380703e-05,
"loss": 0.9838,
"step": 2870
},
{
"epoch": 0.6253732153520438,
"grad_norm": 1.2770878076553345,
"learning_rate": 1.3824492468893256e-05,
"loss": 0.9917,
"step": 2880
},
{
"epoch": 0.6275446501275718,
"grad_norm": 1.3633027076721191,
"learning_rate": 1.3802663173979483e-05,
"loss": 1.0636,
"step": 2890
},
{
"epoch": 0.6297160849030997,
"grad_norm": 1.2924447059631348,
"learning_rate": 1.3780833879065708e-05,
"loss": 1.0151,
"step": 2900
},
{
"epoch": 0.6318875196786277,
"grad_norm": 1.3453025817871094,
"learning_rate": 1.3759004584151934e-05,
"loss": 1.0053,
"step": 2910
},
{
"epoch": 0.6340589544541556,
"grad_norm": 1.497462511062622,
"learning_rate": 1.3737175289238158e-05,
"loss": 0.9936,
"step": 2920
},
{
"epoch": 0.6362303892296836,
"grad_norm": 1.0469037294387817,
"learning_rate": 1.3715345994324384e-05,
"loss": 1.0465,
"step": 2930
},
{
"epoch": 0.6384018240052114,
"grad_norm": 1.4272680282592773,
"learning_rate": 1.369351669941061e-05,
"loss": 1.0634,
"step": 2940
},
{
"epoch": 0.6405732587807393,
"grad_norm": 1.065047264099121,
"learning_rate": 1.3671687404496836e-05,
"loss": 1.0464,
"step": 2950
},
{
"epoch": 0.6427446935562673,
"grad_norm": 1.3233064413070679,
"learning_rate": 1.3649858109583061e-05,
"loss": 1.07,
"step": 2960
},
{
"epoch": 0.6449161283317952,
"grad_norm": 1.792734980583191,
"learning_rate": 1.3628028814669288e-05,
"loss": 0.9722,
"step": 2970
},
{
"epoch": 0.6470875631073232,
"grad_norm": 1.7977020740509033,
"learning_rate": 1.3606199519755514e-05,
"loss": 0.9811,
"step": 2980
},
{
"epoch": 0.6492589978828511,
"grad_norm": 1.2973439693450928,
"learning_rate": 1.358437022484174e-05,
"loss": 0.9958,
"step": 2990
},
{
"epoch": 0.6514304326583791,
"grad_norm": 1.249764323234558,
"learning_rate": 1.3562540929927966e-05,
"loss": 1.0675,
"step": 3000
},
{
"epoch": 0.6536018674339069,
"grad_norm": 1.343056559562683,
"learning_rate": 1.354071163501419e-05,
"loss": 1.0493,
"step": 3010
},
{
"epoch": 0.6557733022094349,
"grad_norm": 1.6171714067459106,
"learning_rate": 1.3518882340100416e-05,
"loss": 1.044,
"step": 3020
},
{
"epoch": 0.6579447369849628,
"grad_norm": 1.2323534488677979,
"learning_rate": 1.3497053045186641e-05,
"loss": 1.0386,
"step": 3030
},
{
"epoch": 0.6601161717604908,
"grad_norm": 1.1134217977523804,
"learning_rate": 1.3475223750272868e-05,
"loss": 1.0225,
"step": 3040
},
{
"epoch": 0.6622876065360187,
"grad_norm": 1.6027779579162598,
"learning_rate": 1.3453394455359093e-05,
"loss": 1.0195,
"step": 3050
},
{
"epoch": 0.6644590413115466,
"grad_norm": 1.3403127193450928,
"learning_rate": 1.343156516044532e-05,
"loss": 0.9529,
"step": 3060
},
{
"epoch": 0.6666304760870745,
"grad_norm": 1.3543404340744019,
"learning_rate": 1.3409735865531544e-05,
"loss": 0.9783,
"step": 3070
},
{
"epoch": 0.6688019108626024,
"grad_norm": 1.1751652956008911,
"learning_rate": 1.3387906570617771e-05,
"loss": 1.0199,
"step": 3080
},
{
"epoch": 0.6709733456381304,
"grad_norm": 1.44953453540802,
"learning_rate": 1.3366077275703998e-05,
"loss": 1.04,
"step": 3090
},
{
"epoch": 0.6731447804136583,
"grad_norm": 1.2177844047546387,
"learning_rate": 1.334424798079022e-05,
"loss": 1.0228,
"step": 3100
},
{
"epoch": 0.6753162151891863,
"grad_norm": 1.2051985263824463,
"learning_rate": 1.3322418685876447e-05,
"loss": 0.9834,
"step": 3110
},
{
"epoch": 0.6774876499647142,
"grad_norm": 1.249619483947754,
"learning_rate": 1.3300589390962672e-05,
"loss": 1.0089,
"step": 3120
},
{
"epoch": 0.6796590847402422,
"grad_norm": 1.3662210702896118,
"learning_rate": 1.3278760096048899e-05,
"loss": 1.0312,
"step": 3130
},
{
"epoch": 0.68183051951577,
"grad_norm": 1.2683398723602295,
"learning_rate": 1.3256930801135124e-05,
"loss": 0.9806,
"step": 3140
},
{
"epoch": 0.684001954291298,
"grad_norm": 1.3069689273834229,
"learning_rate": 1.323510150622135e-05,
"loss": 1.009,
"step": 3150
},
{
"epoch": 0.6861733890668259,
"grad_norm": 1.4314109086990356,
"learning_rate": 1.3213272211307576e-05,
"loss": 0.9918,
"step": 3160
},
{
"epoch": 0.6883448238423538,
"grad_norm": 1.2950971126556396,
"learning_rate": 1.3191442916393802e-05,
"loss": 1.0356,
"step": 3170
},
{
"epoch": 0.6905162586178818,
"grad_norm": 1.4553663730621338,
"learning_rate": 1.3169613621480026e-05,
"loss": 1.0255,
"step": 3180
},
{
"epoch": 0.6926876933934097,
"grad_norm": 1.439324140548706,
"learning_rate": 1.3147784326566252e-05,
"loss": 1.0561,
"step": 3190
},
{
"epoch": 0.6948591281689376,
"grad_norm": 1.1153829097747803,
"learning_rate": 1.3125955031652479e-05,
"loss": 1.017,
"step": 3200
},
{
"epoch": 0.6970305629444655,
"grad_norm": 1.1670260429382324,
"learning_rate": 1.3104125736738704e-05,
"loss": 1.0096,
"step": 3210
},
{
"epoch": 0.6992019977199935,
"grad_norm": 1.400228500366211,
"learning_rate": 1.308229644182493e-05,
"loss": 1.0309,
"step": 3220
},
{
"epoch": 0.7013734324955214,
"grad_norm": 1.1673344373703003,
"learning_rate": 1.3060467146911156e-05,
"loss": 1.0515,
"step": 3230
},
{
"epoch": 0.7035448672710494,
"grad_norm": 1.152686357498169,
"learning_rate": 1.3038637851997382e-05,
"loss": 0.9687,
"step": 3240
},
{
"epoch": 0.7057163020465773,
"grad_norm": 1.3322280645370483,
"learning_rate": 1.3016808557083607e-05,
"loss": 1.0255,
"step": 3250
},
{
"epoch": 0.7078877368221052,
"grad_norm": 1.0270700454711914,
"learning_rate": 1.2994979262169834e-05,
"loss": 1.0124,
"step": 3260
},
{
"epoch": 0.7100591715976331,
"grad_norm": 1.0481797456741333,
"learning_rate": 1.2973149967256057e-05,
"loss": 1.0298,
"step": 3270
},
{
"epoch": 0.712230606373161,
"grad_norm": 1.358763337135315,
"learning_rate": 1.2951320672342284e-05,
"loss": 1.0009,
"step": 3280
},
{
"epoch": 0.714402041148689,
"grad_norm": 1.3017981052398682,
"learning_rate": 1.2929491377428509e-05,
"loss": 1.0362,
"step": 3290
},
{
"epoch": 0.7165734759242169,
"grad_norm": 1.4643291234970093,
"learning_rate": 1.2907662082514736e-05,
"loss": 0.96,
"step": 3300
},
{
"epoch": 0.7187449106997449,
"grad_norm": 1.158682942390442,
"learning_rate": 1.2885832787600962e-05,
"loss": 0.9807,
"step": 3310
},
{
"epoch": 0.7209163454752728,
"grad_norm": 1.2945632934570312,
"learning_rate": 1.2864003492687187e-05,
"loss": 0.977,
"step": 3320
},
{
"epoch": 0.7230877802508007,
"grad_norm": 1.6654890775680542,
"learning_rate": 1.2842174197773414e-05,
"loss": 1.0128,
"step": 3330
},
{
"epoch": 0.7252592150263286,
"grad_norm": 1.2067387104034424,
"learning_rate": 1.2820344902859639e-05,
"loss": 1.0261,
"step": 3340
},
{
"epoch": 0.7274306498018566,
"grad_norm": 1.4484736919403076,
"learning_rate": 1.2798515607945866e-05,
"loss": 1.0055,
"step": 3350
},
{
"epoch": 0.7296020845773845,
"grad_norm": 1.428499698638916,
"learning_rate": 1.2776686313032089e-05,
"loss": 1.0584,
"step": 3360
},
{
"epoch": 0.7317735193529125,
"grad_norm": 1.454953670501709,
"learning_rate": 1.2754857018118315e-05,
"loss": 1.0327,
"step": 3370
},
{
"epoch": 0.7339449541284404,
"grad_norm": 1.1868793964385986,
"learning_rate": 1.273302772320454e-05,
"loss": 1.019,
"step": 3380
},
{
"epoch": 0.7361163889039682,
"grad_norm": 1.2822529077529907,
"learning_rate": 1.2711198428290767e-05,
"loss": 0.9966,
"step": 3390
},
{
"epoch": 0.7382878236794962,
"grad_norm": 1.2787412405014038,
"learning_rate": 1.2689369133376992e-05,
"loss": 1.0473,
"step": 3400
},
{
"epoch": 0.7404592584550241,
"grad_norm": 1.3772400617599487,
"learning_rate": 1.2667539838463219e-05,
"loss": 0.9689,
"step": 3410
},
{
"epoch": 0.7426306932305521,
"grad_norm": 1.2161903381347656,
"learning_rate": 1.2645710543549445e-05,
"loss": 1.0082,
"step": 3420
},
{
"epoch": 0.74480212800608,
"grad_norm": 1.489033579826355,
"learning_rate": 1.262388124863567e-05,
"loss": 1.0139,
"step": 3430
},
{
"epoch": 0.746973562781608,
"grad_norm": 1.3982605934143066,
"learning_rate": 1.2602051953721897e-05,
"loss": 0.9921,
"step": 3440
},
{
"epoch": 0.7491449975571359,
"grad_norm": 1.378158450126648,
"learning_rate": 1.258022265880812e-05,
"loss": 0.9678,
"step": 3450
},
{
"epoch": 0.7513164323326638,
"grad_norm": 1.4947155714035034,
"learning_rate": 1.2558393363894347e-05,
"loss": 1.0051,
"step": 3460
},
{
"epoch": 0.7534878671081917,
"grad_norm": 1.1531239748001099,
"learning_rate": 1.2536564068980572e-05,
"loss": 1.1186,
"step": 3470
},
{
"epoch": 0.7556593018837197,
"grad_norm": 1.38021981716156,
"learning_rate": 1.2514734774066799e-05,
"loss": 0.9771,
"step": 3480
},
{
"epoch": 0.7578307366592476,
"grad_norm": 1.2459088563919067,
"learning_rate": 1.2492905479153024e-05,
"loss": 1.0216,
"step": 3490
},
{
"epoch": 0.7600021714347756,
"grad_norm": 2.1082191467285156,
"learning_rate": 1.247107618423925e-05,
"loss": 0.9956,
"step": 3500
},
{
"epoch": 0.7621736062103035,
"grad_norm": 1.1670981645584106,
"learning_rate": 1.2449246889325475e-05,
"loss": 1.0518,
"step": 3510
},
{
"epoch": 0.7643450409858313,
"grad_norm": 1.453430414199829,
"learning_rate": 1.2427417594411702e-05,
"loss": 0.986,
"step": 3520
},
{
"epoch": 0.7665164757613593,
"grad_norm": 0.9967979788780212,
"learning_rate": 1.2405588299497929e-05,
"loss": 1.0468,
"step": 3530
},
{
"epoch": 0.7686879105368872,
"grad_norm": 1.5002816915512085,
"learning_rate": 1.2383759004584152e-05,
"loss": 1.0078,
"step": 3540
},
{
"epoch": 0.7708593453124152,
"grad_norm": 1.4501177072525024,
"learning_rate": 1.236192970967038e-05,
"loss": 0.9615,
"step": 3550
},
{
"epoch": 0.7730307800879431,
"grad_norm": 1.377883791923523,
"learning_rate": 1.2340100414756604e-05,
"loss": 0.9432,
"step": 3560
},
{
"epoch": 0.7752022148634711,
"grad_norm": 1.2856801748275757,
"learning_rate": 1.231827111984283e-05,
"loss": 0.9958,
"step": 3570
},
{
"epoch": 0.777373649638999,
"grad_norm": 1.2119390964508057,
"learning_rate": 1.2296441824929055e-05,
"loss": 1.0029,
"step": 3580
},
{
"epoch": 0.7795450844145269,
"grad_norm": 1.4396144151687622,
"learning_rate": 1.2274612530015282e-05,
"loss": 1.0159,
"step": 3590
},
{
"epoch": 0.7817165191900548,
"grad_norm": 1.6045223474502563,
"learning_rate": 1.2252783235101507e-05,
"loss": 0.9767,
"step": 3600
},
{
"epoch": 0.7838879539655828,
"grad_norm": 1.2426387071609497,
"learning_rate": 1.2230953940187733e-05,
"loss": 1.0287,
"step": 3610
},
{
"epoch": 0.7860593887411107,
"grad_norm": 1.2435184717178345,
"learning_rate": 1.2209124645273957e-05,
"loss": 1.0052,
"step": 3620
},
{
"epoch": 0.7882308235166386,
"grad_norm": 1.6939178705215454,
"learning_rate": 1.2187295350360185e-05,
"loss": 1.0216,
"step": 3630
},
{
"epoch": 0.7904022582921666,
"grad_norm": 1.1843641996383667,
"learning_rate": 1.2165466055446412e-05,
"loss": 0.9738,
"step": 3640
},
{
"epoch": 0.7925736930676944,
"grad_norm": 1.3802050352096558,
"learning_rate": 1.2143636760532635e-05,
"loss": 0.9216,
"step": 3650
},
{
"epoch": 0.7947451278432224,
"grad_norm": 1.2471121549606323,
"learning_rate": 1.2121807465618862e-05,
"loss": 0.977,
"step": 3660
},
{
"epoch": 0.7969165626187503,
"grad_norm": 1.3608779907226562,
"learning_rate": 1.2099978170705087e-05,
"loss": 1.009,
"step": 3670
},
{
"epoch": 0.7990879973942783,
"grad_norm": 1.4472932815551758,
"learning_rate": 1.2078148875791313e-05,
"loss": 0.9946,
"step": 3680
},
{
"epoch": 0.8012594321698062,
"grad_norm": 1.7036590576171875,
"learning_rate": 1.2056319580877538e-05,
"loss": 0.999,
"step": 3690
},
{
"epoch": 0.8034308669453342,
"grad_norm": 1.258748173713684,
"learning_rate": 1.2034490285963765e-05,
"loss": 0.9968,
"step": 3700
},
{
"epoch": 0.8056023017208621,
"grad_norm": 1.191994547843933,
"learning_rate": 1.2012660991049988e-05,
"loss": 0.9941,
"step": 3710
},
{
"epoch": 0.80777373649639,
"grad_norm": 1.9393503665924072,
"learning_rate": 1.1990831696136217e-05,
"loss": 1.0167,
"step": 3720
},
{
"epoch": 0.8099451712719179,
"grad_norm": 1.5484780073165894,
"learning_rate": 1.196900240122244e-05,
"loss": 0.9962,
"step": 3730
},
{
"epoch": 0.8121166060474458,
"grad_norm": 1.2578662633895874,
"learning_rate": 1.1947173106308667e-05,
"loss": 1.0057,
"step": 3740
},
{
"epoch": 0.8142880408229738,
"grad_norm": 1.3208587169647217,
"learning_rate": 1.1925343811394893e-05,
"loss": 1.0086,
"step": 3750
},
{
"epoch": 0.8164594755985017,
"grad_norm": 1.2795675992965698,
"learning_rate": 1.1903514516481118e-05,
"loss": 1.051,
"step": 3760
},
{
"epoch": 0.8186309103740297,
"grad_norm": 1.1958470344543457,
"learning_rate": 1.1881685221567345e-05,
"loss": 0.9974,
"step": 3770
},
{
"epoch": 0.8208023451495575,
"grad_norm": 1.1479548215866089,
"learning_rate": 1.185985592665357e-05,
"loss": 0.9678,
"step": 3780
},
{
"epoch": 0.8229737799250855,
"grad_norm": 1.616144061088562,
"learning_rate": 1.1838026631739797e-05,
"loss": 1.0586,
"step": 3790
},
{
"epoch": 0.8251452147006134,
"grad_norm": 1.3224166631698608,
"learning_rate": 1.1816197336826022e-05,
"loss": 1.0533,
"step": 3800
},
{
"epoch": 0.8273166494761414,
"grad_norm": 1.6158727407455444,
"learning_rate": 1.1794368041912248e-05,
"loss": 1.0128,
"step": 3810
},
{
"epoch": 0.8294880842516693,
"grad_norm": 1.3982148170471191,
"learning_rate": 1.1772538746998472e-05,
"loss": 1.0249,
"step": 3820
},
{
"epoch": 0.8316595190271973,
"grad_norm": 1.2071058750152588,
"learning_rate": 1.1750709452084698e-05,
"loss": 0.965,
"step": 3830
},
{
"epoch": 0.8338309538027252,
"grad_norm": 1.3230708837509155,
"learning_rate": 1.1728880157170925e-05,
"loss": 0.9441,
"step": 3840
},
{
"epoch": 0.836002388578253,
"grad_norm": 1.106053113937378,
"learning_rate": 1.170705086225715e-05,
"loss": 1.0488,
"step": 3850
},
{
"epoch": 0.838173823353781,
"grad_norm": 1.5212702751159668,
"learning_rate": 1.1685221567343377e-05,
"loss": 1.0445,
"step": 3860
},
{
"epoch": 0.8403452581293089,
"grad_norm": 1.3804950714111328,
"learning_rate": 1.1663392272429601e-05,
"loss": 1.0183,
"step": 3870
},
{
"epoch": 0.8425166929048369,
"grad_norm": 1.3932008743286133,
"learning_rate": 1.1641562977515828e-05,
"loss": 1.0027,
"step": 3880
},
{
"epoch": 0.8446881276803648,
"grad_norm": 1.3928159475326538,
"learning_rate": 1.1619733682602053e-05,
"loss": 0.9646,
"step": 3890
},
{
"epoch": 0.8468595624558928,
"grad_norm": 1.3050851821899414,
"learning_rate": 1.159790438768828e-05,
"loss": 1.0099,
"step": 3900
},
{
"epoch": 0.8490309972314206,
"grad_norm": 1.2780051231384277,
"learning_rate": 1.1576075092774503e-05,
"loss": 0.9859,
"step": 3910
},
{
"epoch": 0.8512024320069486,
"grad_norm": 1.317460060119629,
"learning_rate": 1.155424579786073e-05,
"loss": 1.0021,
"step": 3920
},
{
"epoch": 0.8533738667824765,
"grad_norm": 1.4765187501907349,
"learning_rate": 1.1532416502946955e-05,
"loss": 1.0305,
"step": 3930
},
{
"epoch": 0.8555453015580045,
"grad_norm": 1.1514675617218018,
"learning_rate": 1.1510587208033181e-05,
"loss": 0.946,
"step": 3940
},
{
"epoch": 0.8577167363335324,
"grad_norm": 1.3265900611877441,
"learning_rate": 1.1488757913119408e-05,
"loss": 0.9427,
"step": 3950
},
{
"epoch": 0.8598881711090604,
"grad_norm": 1.4531445503234863,
"learning_rate": 1.1466928618205633e-05,
"loss": 1.0272,
"step": 3960
},
{
"epoch": 0.8620596058845882,
"grad_norm": 1.0620979070663452,
"learning_rate": 1.144509932329186e-05,
"loss": 1.0114,
"step": 3970
},
{
"epoch": 0.8642310406601161,
"grad_norm": 1.086349606513977,
"learning_rate": 1.1423270028378085e-05,
"loss": 0.9946,
"step": 3980
},
{
"epoch": 0.8664024754356441,
"grad_norm": 1.3090065717697144,
"learning_rate": 1.1401440733464311e-05,
"loss": 0.9915,
"step": 3990
},
{
"epoch": 0.868573910211172,
"grad_norm": 1.1086080074310303,
"learning_rate": 1.1379611438550535e-05,
"loss": 0.9599,
"step": 4000
},
{
"epoch": 0.8707453449867,
"grad_norm": 1.4512288570404053,
"learning_rate": 1.1357782143636761e-05,
"loss": 1.0143,
"step": 4010
},
{
"epoch": 0.8729167797622279,
"grad_norm": 1.2470262050628662,
"learning_rate": 1.1335952848722986e-05,
"loss": 0.9715,
"step": 4020
},
{
"epoch": 0.8750882145377559,
"grad_norm": 1.5051038265228271,
"learning_rate": 1.1314123553809213e-05,
"loss": 1.0206,
"step": 4030
},
{
"epoch": 0.8772596493132837,
"grad_norm": 1.607826828956604,
"learning_rate": 1.1292294258895438e-05,
"loss": 0.9833,
"step": 4040
},
{
"epoch": 0.8794310840888117,
"grad_norm": 1.431874394416809,
"learning_rate": 1.1270464963981665e-05,
"loss": 1.0264,
"step": 4050
},
{
"epoch": 0.8816025188643396,
"grad_norm": 1.440034031867981,
"learning_rate": 1.1248635669067891e-05,
"loss": 1.0013,
"step": 4060
},
{
"epoch": 0.8837739536398675,
"grad_norm": 1.4963476657867432,
"learning_rate": 1.1226806374154116e-05,
"loss": 0.9861,
"step": 4070
},
{
"epoch": 0.8859453884153955,
"grad_norm": 1.5683997869491577,
"learning_rate": 1.1204977079240343e-05,
"loss": 1.0247,
"step": 4080
},
{
"epoch": 0.8881168231909234,
"grad_norm": 1.4047991037368774,
"learning_rate": 1.1183147784326566e-05,
"loss": 0.9966,
"step": 4090
},
{
"epoch": 0.8902882579664513,
"grad_norm": 1.3178616762161255,
"learning_rate": 1.1161318489412793e-05,
"loss": 1.0107,
"step": 4100
},
{
"epoch": 0.8924596927419792,
"grad_norm": 1.5227705240249634,
"learning_rate": 1.1139489194499018e-05,
"loss": 0.9826,
"step": 4110
},
{
"epoch": 0.8946311275175072,
"grad_norm": 1.4800081253051758,
"learning_rate": 1.1117659899585244e-05,
"loss": 1.0544,
"step": 4120
},
{
"epoch": 0.8968025622930351,
"grad_norm": 1.3340637683868408,
"learning_rate": 1.109583060467147e-05,
"loss": 1.0342,
"step": 4130
},
{
"epoch": 0.8989739970685631,
"grad_norm": 1.6699985265731812,
"learning_rate": 1.1074001309757696e-05,
"loss": 0.9726,
"step": 4140
},
{
"epoch": 0.901145431844091,
"grad_norm": 1.466199517250061,
"learning_rate": 1.1052172014843921e-05,
"loss": 0.9623,
"step": 4150
},
{
"epoch": 0.903316866619619,
"grad_norm": 1.6779991388320923,
"learning_rate": 1.1030342719930148e-05,
"loss": 1.033,
"step": 4160
},
{
"epoch": 0.9054883013951468,
"grad_norm": 1.338218331336975,
"learning_rate": 1.1008513425016374e-05,
"loss": 0.983,
"step": 4170
},
{
"epoch": 0.9076597361706747,
"grad_norm": 1.430690884590149,
"learning_rate": 1.0986684130102598e-05,
"loss": 0.9673,
"step": 4180
},
{
"epoch": 0.9098311709462027,
"grad_norm": 1.381343126296997,
"learning_rate": 1.0964854835188824e-05,
"loss": 0.9552,
"step": 4190
},
{
"epoch": 0.9120026057217306,
"grad_norm": 1.2798620462417603,
"learning_rate": 1.094302554027505e-05,
"loss": 1.0115,
"step": 4200
},
{
"epoch": 0.9141740404972586,
"grad_norm": 1.5903421640396118,
"learning_rate": 1.0921196245361276e-05,
"loss": 0.9903,
"step": 4210
},
{
"epoch": 0.9163454752727865,
"grad_norm": 1.1908365488052368,
"learning_rate": 1.0899366950447501e-05,
"loss": 1.0046,
"step": 4220
},
{
"epoch": 0.9185169100483144,
"grad_norm": 1.1967812776565552,
"learning_rate": 1.0877537655533728e-05,
"loss": 0.9842,
"step": 4230
},
{
"epoch": 0.9206883448238423,
"grad_norm": 1.2975422143936157,
"learning_rate": 1.0855708360619953e-05,
"loss": 1.1223,
"step": 4240
},
{
"epoch": 0.9228597795993703,
"grad_norm": 1.213766098022461,
"learning_rate": 1.083387906570618e-05,
"loss": 1.0106,
"step": 4250
},
{
"epoch": 0.9250312143748982,
"grad_norm": 1.301695704460144,
"learning_rate": 1.0812049770792403e-05,
"loss": 0.9959,
"step": 4260
},
{
"epoch": 0.9272026491504262,
"grad_norm": 1.3527394533157349,
"learning_rate": 1.079022047587863e-05,
"loss": 1.0124,
"step": 4270
},
{
"epoch": 0.9293740839259541,
"grad_norm": 1.3432750701904297,
"learning_rate": 1.0768391180964856e-05,
"loss": 1.0047,
"step": 4280
},
{
"epoch": 0.9315455187014821,
"grad_norm": 1.329483151435852,
"learning_rate": 1.0746561886051081e-05,
"loss": 1.0124,
"step": 4290
},
{
"epoch": 0.9337169534770099,
"grad_norm": 1.430738091468811,
"learning_rate": 1.0724732591137308e-05,
"loss": 0.9462,
"step": 4300
},
{
"epoch": 0.9358883882525378,
"grad_norm": 1.491452693939209,
"learning_rate": 1.0702903296223533e-05,
"loss": 0.9885,
"step": 4310
},
{
"epoch": 0.9380598230280658,
"grad_norm": 1.4353605508804321,
"learning_rate": 1.068107400130976e-05,
"loss": 1.0131,
"step": 4320
},
{
"epoch": 0.9402312578035937,
"grad_norm": 1.1809788942337036,
"learning_rate": 1.0659244706395984e-05,
"loss": 0.9926,
"step": 4330
},
{
"epoch": 0.9424026925791217,
"grad_norm": 1.2355526685714722,
"learning_rate": 1.0637415411482211e-05,
"loss": 0.9945,
"step": 4340
},
{
"epoch": 0.9445741273546496,
"grad_norm": 1.3314152956008911,
"learning_rate": 1.0615586116568434e-05,
"loss": 1.0037,
"step": 4350
},
{
"epoch": 0.9467455621301775,
"grad_norm": 1.2427114248275757,
"learning_rate": 1.059375682165466e-05,
"loss": 1.0048,
"step": 4360
},
{
"epoch": 0.9489169969057054,
"grad_norm": 1.298858642578125,
"learning_rate": 1.0571927526740886e-05,
"loss": 1.068,
"step": 4370
},
{
"epoch": 0.9510884316812334,
"grad_norm": 1.432786226272583,
"learning_rate": 1.0550098231827112e-05,
"loss": 0.9922,
"step": 4380
},
{
"epoch": 0.9532598664567613,
"grad_norm": 1.3567193746566772,
"learning_rate": 1.0528268936913339e-05,
"loss": 1.0097,
"step": 4390
},
{
"epoch": 0.9554313012322893,
"grad_norm": 1.4737164974212646,
"learning_rate": 1.0506439641999564e-05,
"loss": 1.0053,
"step": 4400
},
{
"epoch": 0.9576027360078172,
"grad_norm": 1.1993675231933594,
"learning_rate": 1.048461034708579e-05,
"loss": 1.0553,
"step": 4410
},
{
"epoch": 0.9597741707833451,
"grad_norm": 1.483333945274353,
"learning_rate": 1.0462781052172016e-05,
"loss": 1.0163,
"step": 4420
},
{
"epoch": 0.961945605558873,
"grad_norm": 1.4248449802398682,
"learning_rate": 1.0440951757258242e-05,
"loss": 1.0013,
"step": 4430
},
{
"epoch": 0.9641170403344009,
"grad_norm": 1.4888718128204346,
"learning_rate": 1.0419122462344466e-05,
"loss": 1.0428,
"step": 4440
},
{
"epoch": 0.9662884751099289,
"grad_norm": 1.2882726192474365,
"learning_rate": 1.0397293167430692e-05,
"loss": 0.9764,
"step": 4450
},
{
"epoch": 0.9684599098854568,
"grad_norm": 1.3666644096374512,
"learning_rate": 1.0375463872516917e-05,
"loss": 1.0235,
"step": 4460
},
{
"epoch": 0.9706313446609848,
"grad_norm": 1.5665643215179443,
"learning_rate": 1.0353634577603144e-05,
"loss": 0.9966,
"step": 4470
},
{
"epoch": 0.9728027794365127,
"grad_norm": 1.4171271324157715,
"learning_rate": 1.0331805282689369e-05,
"loss": 1.021,
"step": 4480
},
{
"epoch": 0.9749742142120406,
"grad_norm": 1.4926506280899048,
"learning_rate": 1.0309975987775596e-05,
"loss": 0.9794,
"step": 4490
},
{
"epoch": 0.9771456489875685,
"grad_norm": 1.1166307926177979,
"learning_rate": 1.0288146692861822e-05,
"loss": 1.0138,
"step": 4500
},
{
"epoch": 0.9793170837630965,
"grad_norm": 1.515855312347412,
"learning_rate": 1.0266317397948047e-05,
"loss": 0.9691,
"step": 4510
},
{
"epoch": 0.9814885185386244,
"grad_norm": 1.421080231666565,
"learning_rate": 1.0244488103034274e-05,
"loss": 0.9646,
"step": 4520
},
{
"epoch": 0.9836599533141523,
"grad_norm": 1.4241400957107544,
"learning_rate": 1.0222658808120497e-05,
"loss": 0.9447,
"step": 4530
},
{
"epoch": 0.9858313880896803,
"grad_norm": 1.6205312013626099,
"learning_rate": 1.0200829513206724e-05,
"loss": 0.9843,
"step": 4540
},
{
"epoch": 0.9880028228652082,
"grad_norm": 1.3039618730545044,
"learning_rate": 1.0179000218292949e-05,
"loss": 1.0065,
"step": 4550
},
{
"epoch": 0.9901742576407361,
"grad_norm": 1.4685053825378418,
"learning_rate": 1.0157170923379176e-05,
"loss": 0.9925,
"step": 4560
},
{
"epoch": 0.992345692416264,
"grad_norm": 1.2964003086090088,
"learning_rate": 1.01353416284654e-05,
"loss": 1.0104,
"step": 4570
},
{
"epoch": 0.994517127191792,
"grad_norm": 1.4937127828598022,
"learning_rate": 1.0113512333551627e-05,
"loss": 1.0642,
"step": 4580
},
{
"epoch": 0.9966885619673199,
"grad_norm": 1.2731589078903198,
"learning_rate": 1.0091683038637854e-05,
"loss": 1.0486,
"step": 4590
},
{
"epoch": 0.9988599967428479,
"grad_norm": 1.3573518991470337,
"learning_rate": 1.0069853743724079e-05,
"loss": 0.9839,
"step": 4600
},
{
"epoch": 1.0008685739102112,
"grad_norm": 1.6150940656661987,
"learning_rate": 1.0048024448810306e-05,
"loss": 1.0595,
"step": 4610
},
{
"epoch": 1.0030400086857392,
"grad_norm": 1.435672640800476,
"learning_rate": 1.0026195153896529e-05,
"loss": 1.0018,
"step": 4620
},
{
"epoch": 1.0052114434612671,
"grad_norm": 1.3522926568984985,
"learning_rate": 1.0004365858982757e-05,
"loss": 0.9845,
"step": 4630
},
{
"epoch": 1.007382878236795,
"grad_norm": 1.327671766281128,
"learning_rate": 9.98253656406898e-06,
"loss": 0.947,
"step": 4640
},
{
"epoch": 1.0095543130123228,
"grad_norm": 1.40632164478302,
"learning_rate": 9.960707269155207e-06,
"loss": 0.955,
"step": 4650
},
{
"epoch": 1.0117257477878507,
"grad_norm": 1.7449159622192383,
"learning_rate": 9.938877974241434e-06,
"loss": 0.9406,
"step": 4660
},
{
"epoch": 1.0138971825633787,
"grad_norm": 1.410897135734558,
"learning_rate": 9.917048679327659e-06,
"loss": 1.0253,
"step": 4670
},
{
"epoch": 1.0160686173389066,
"grad_norm": 1.3368771076202393,
"learning_rate": 9.895219384413884e-06,
"loss": 0.9922,
"step": 4680
},
{
"epoch": 1.0182400521144346,
"grad_norm": 1.2922542095184326,
"learning_rate": 9.87339008950011e-06,
"loss": 1.0153,
"step": 4690
},
{
"epoch": 1.0204114868899625,
"grad_norm": 1.4930267333984375,
"learning_rate": 9.851560794586335e-06,
"loss": 0.9725,
"step": 4700
},
{
"epoch": 1.0225829216654905,
"grad_norm": 1.2955012321472168,
"learning_rate": 9.829731499672562e-06,
"loss": 0.9858,
"step": 4710
},
{
"epoch": 1.0247543564410184,
"grad_norm": 1.5806477069854736,
"learning_rate": 9.807902204758787e-06,
"loss": 0.9046,
"step": 4720
},
{
"epoch": 1.0269257912165464,
"grad_norm": 1.3869348764419556,
"learning_rate": 9.786072909845012e-06,
"loss": 1.0147,
"step": 4730
},
{
"epoch": 1.0290972259920743,
"grad_norm": 1.4592316150665283,
"learning_rate": 9.764243614931239e-06,
"loss": 0.9497,
"step": 4740
},
{
"epoch": 1.0312686607676023,
"grad_norm": 1.9150491952896118,
"learning_rate": 9.742414320017464e-06,
"loss": 0.9584,
"step": 4750
},
{
"epoch": 1.0334400955431302,
"grad_norm": 1.2069435119628906,
"learning_rate": 9.72058502510369e-06,
"loss": 0.9853,
"step": 4760
},
{
"epoch": 1.0356115303186582,
"grad_norm": 1.521933674812317,
"learning_rate": 9.698755730189915e-06,
"loss": 0.9611,
"step": 4770
},
{
"epoch": 1.0377829650941859,
"grad_norm": 1.9448108673095703,
"learning_rate": 9.676926435276142e-06,
"loss": 1.0092,
"step": 4780
},
{
"epoch": 1.0399543998697138,
"grad_norm": 1.577696442604065,
"learning_rate": 9.655097140362367e-06,
"loss": 1.0072,
"step": 4790
},
{
"epoch": 1.0421258346452418,
"grad_norm": 1.9846240282058716,
"learning_rate": 9.633267845448594e-06,
"loss": 0.9533,
"step": 4800
},
{
"epoch": 1.0442972694207697,
"grad_norm": 1.4275234937667847,
"learning_rate": 9.611438550534819e-06,
"loss": 0.9662,
"step": 4810
},
{
"epoch": 1.0464687041962977,
"grad_norm": 1.548954963684082,
"learning_rate": 9.589609255621044e-06,
"loss": 0.9454,
"step": 4820
},
{
"epoch": 1.0486401389718256,
"grad_norm": 1.8117595911026,
"learning_rate": 9.56777996070727e-06,
"loss": 0.9623,
"step": 4830
},
{
"epoch": 1.0508115737473536,
"grad_norm": 1.417375087738037,
"learning_rate": 9.545950665793495e-06,
"loss": 0.9791,
"step": 4840
},
{
"epoch": 1.0529830085228815,
"grad_norm": 1.2770414352416992,
"learning_rate": 9.52412137087972e-06,
"loss": 0.8828,
"step": 4850
},
{
"epoch": 1.0551544432984095,
"grad_norm": 1.3013825416564941,
"learning_rate": 9.502292075965947e-06,
"loss": 1.029,
"step": 4860
},
{
"epoch": 1.0573258780739374,
"grad_norm": 1.5322422981262207,
"learning_rate": 9.480462781052174e-06,
"loss": 0.9592,
"step": 4870
},
{
"epoch": 1.0594973128494654,
"grad_norm": 1.7801984548568726,
"learning_rate": 9.458633486138398e-06,
"loss": 0.9531,
"step": 4880
},
{
"epoch": 1.0616687476249933,
"grad_norm": 2.0160224437713623,
"learning_rate": 9.436804191224625e-06,
"loss": 0.9471,
"step": 4890
},
{
"epoch": 1.0638401824005213,
"grad_norm": 1.4919092655181885,
"learning_rate": 9.41497489631085e-06,
"loss": 0.9719,
"step": 4900
},
{
"epoch": 1.066011617176049,
"grad_norm": 1.379225730895996,
"learning_rate": 9.393145601397075e-06,
"loss": 0.9406,
"step": 4910
},
{
"epoch": 1.068183051951577,
"grad_norm": 1.4247862100601196,
"learning_rate": 9.371316306483302e-06,
"loss": 0.975,
"step": 4920
},
{
"epoch": 1.0703544867271049,
"grad_norm": 1.290443778038025,
"learning_rate": 9.349487011569527e-06,
"loss": 0.9636,
"step": 4930
},
{
"epoch": 1.0725259215026328,
"grad_norm": 1.2737443447113037,
"learning_rate": 9.327657716655752e-06,
"loss": 0.9779,
"step": 4940
},
{
"epoch": 1.0746973562781608,
"grad_norm": 1.1298906803131104,
"learning_rate": 9.305828421741978e-06,
"loss": 0.9705,
"step": 4950
},
{
"epoch": 1.0768687910536887,
"grad_norm": 1.368236780166626,
"learning_rate": 9.283999126828203e-06,
"loss": 0.9791,
"step": 4960
},
{
"epoch": 1.0790402258292167,
"grad_norm": 1.3343724012374878,
"learning_rate": 9.26216983191443e-06,
"loss": 1.0074,
"step": 4970
},
{
"epoch": 1.0812116606047446,
"grad_norm": 1.547235369682312,
"learning_rate": 9.240340537000657e-06,
"loss": 0.9545,
"step": 4980
},
{
"epoch": 1.0833830953802726,
"grad_norm": 1.8547582626342773,
"learning_rate": 9.218511242086882e-06,
"loss": 0.9981,
"step": 4990
},
{
"epoch": 1.0855545301558005,
"grad_norm": 1.3031221628189087,
"learning_rate": 9.196681947173107e-06,
"loss": 0.9537,
"step": 5000
}
],
"logging_steps": 10,
"max_steps": 9212,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.20553175074816e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}