feedback-dropout-even / trainer_state.json
etiennebamas's picture
Upload folder using huggingface_hub
fe77dcb verified
Raw
History Blame Contribute Delete
143 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 1000,
"global_step": 8029,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0012454851164528584,
"grad_norm": 32.3415608390714,
"learning_rate": 9.950248756218906e-08,
"loss": 3.4644,
"step": 10
},
{
"epoch": 0.0024909702329057168,
"grad_norm": 25.480025620708208,
"learning_rate": 3.4825870646766175e-07,
"loss": 3.2929,
"step": 20
},
{
"epoch": 0.003736455349358575,
"grad_norm": 12.863917253019975,
"learning_rate": 5.970149253731343e-07,
"loss": 2.7645,
"step": 30
},
{
"epoch": 0.0049819404658114335,
"grad_norm": 8.661769957642282,
"learning_rate": 8.457711442786071e-07,
"loss": 1.9337,
"step": 40
},
{
"epoch": 0.006227425582264292,
"grad_norm": 4.547851186090259,
"learning_rate": 1.0945273631840796e-06,
"loss": 1.2927,
"step": 50
},
{
"epoch": 0.00747291069871715,
"grad_norm": 6.988951275950853,
"learning_rate": 1.3432835820895524e-06,
"loss": 0.8596,
"step": 60
},
{
"epoch": 0.008718395815170008,
"grad_norm": 4.536503979583195,
"learning_rate": 1.592039800995025e-06,
"loss": 0.6875,
"step": 70
},
{
"epoch": 0.009963880931622867,
"grad_norm": 2.346121168881654,
"learning_rate": 1.8407960199004975e-06,
"loss": 0.4645,
"step": 80
},
{
"epoch": 0.011209366048075726,
"grad_norm": 2.1671318696701634,
"learning_rate": 2.08955223880597e-06,
"loss": 0.3597,
"step": 90
},
{
"epoch": 0.012454851164528585,
"grad_norm": 1.4808764283998557,
"learning_rate": 2.338308457711443e-06,
"loss": 0.3404,
"step": 100
},
{
"epoch": 0.013700336280981442,
"grad_norm": 2.6469110933498814,
"learning_rate": 2.5870646766169156e-06,
"loss": 0.2943,
"step": 110
},
{
"epoch": 0.0149458213974343,
"grad_norm": 1.3279656822564871,
"learning_rate": 2.835820895522388e-06,
"loss": 0.247,
"step": 120
},
{
"epoch": 0.016191306513887158,
"grad_norm": 1.302836510685337,
"learning_rate": 3.0845771144278608e-06,
"loss": 0.2096,
"step": 130
},
{
"epoch": 0.017436791630340016,
"grad_norm": 1.6249274804656253,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.2025,
"step": 140
},
{
"epoch": 0.018682276746792875,
"grad_norm": 0.576179870318832,
"learning_rate": 3.582089552238806e-06,
"loss": 0.18,
"step": 150
},
{
"epoch": 0.019927761863245734,
"grad_norm": 0.47200629179365866,
"learning_rate": 3.8308457711442784e-06,
"loss": 0.1537,
"step": 160
},
{
"epoch": 0.021173246979698593,
"grad_norm": 0.4438513005336629,
"learning_rate": 4.079601990049751e-06,
"loss": 0.1416,
"step": 170
},
{
"epoch": 0.02241873209615145,
"grad_norm": 0.5119712526129353,
"learning_rate": 4.3283582089552236e-06,
"loss": 0.1355,
"step": 180
},
{
"epoch": 0.02366421721260431,
"grad_norm": 0.32603135016250945,
"learning_rate": 4.577114427860697e-06,
"loss": 0.1308,
"step": 190
},
{
"epoch": 0.02490970232905717,
"grad_norm": 0.6505900620341762,
"learning_rate": 4.8258706467661695e-06,
"loss": 0.1191,
"step": 200
},
{
"epoch": 0.026155187445510025,
"grad_norm": 0.7492762783220748,
"learning_rate": 5.074626865671642e-06,
"loss": 0.1126,
"step": 210
},
{
"epoch": 0.027400672561962883,
"grad_norm": 0.31871727876603,
"learning_rate": 5.323383084577115e-06,
"loss": 0.115,
"step": 220
},
{
"epoch": 0.028646157678415742,
"grad_norm": 0.35982305102159207,
"learning_rate": 5.572139303482588e-06,
"loss": 0.1065,
"step": 230
},
{
"epoch": 0.0298916427948686,
"grad_norm": 1.1496609573283096,
"learning_rate": 5.820895522388061e-06,
"loss": 0.0991,
"step": 240
},
{
"epoch": 0.03113712791132146,
"grad_norm": 0.33707215880834224,
"learning_rate": 6.069651741293533e-06,
"loss": 0.1008,
"step": 250
},
{
"epoch": 0.032382613027774315,
"grad_norm": 0.6415190743111118,
"learning_rate": 6.318407960199006e-06,
"loss": 0.095,
"step": 260
},
{
"epoch": 0.033628098144227174,
"grad_norm": 0.2926220853782081,
"learning_rate": 6.567164179104478e-06,
"loss": 0.0891,
"step": 270
},
{
"epoch": 0.03487358326068003,
"grad_norm": 1.0821127561427188,
"learning_rate": 6.815920398009951e-06,
"loss": 0.0926,
"step": 280
},
{
"epoch": 0.03611906837713289,
"grad_norm": 0.2223395687156767,
"learning_rate": 7.064676616915423e-06,
"loss": 0.0887,
"step": 290
},
{
"epoch": 0.03736455349358575,
"grad_norm": 0.7983248688393392,
"learning_rate": 7.313432835820896e-06,
"loss": 0.0871,
"step": 300
},
{
"epoch": 0.03861003861003861,
"grad_norm": 0.33593570329622213,
"learning_rate": 7.5621890547263685e-06,
"loss": 0.082,
"step": 310
},
{
"epoch": 0.03985552372649147,
"grad_norm": 0.2641752043712419,
"learning_rate": 7.810945273631842e-06,
"loss": 0.082,
"step": 320
},
{
"epoch": 0.04110100884294433,
"grad_norm": 0.2489578420678032,
"learning_rate": 8.059701492537314e-06,
"loss": 0.0807,
"step": 330
},
{
"epoch": 0.042346493959397186,
"grad_norm": 0.26255217591168595,
"learning_rate": 8.308457711442787e-06,
"loss": 0.0785,
"step": 340
},
{
"epoch": 0.043591979075850044,
"grad_norm": 0.23946518388509047,
"learning_rate": 8.557213930348259e-06,
"loss": 0.075,
"step": 350
},
{
"epoch": 0.0448374641923029,
"grad_norm": 0.48978889854780566,
"learning_rate": 8.805970149253732e-06,
"loss": 0.0703,
"step": 360
},
{
"epoch": 0.04608294930875576,
"grad_norm": 0.2962946028664091,
"learning_rate": 9.054726368159204e-06,
"loss": 0.0728,
"step": 370
},
{
"epoch": 0.04732843442520862,
"grad_norm": 2.561942986560748,
"learning_rate": 9.303482587064677e-06,
"loss": 0.072,
"step": 380
},
{
"epoch": 0.04857391954166148,
"grad_norm": 0.19974610955113598,
"learning_rate": 9.552238805970149e-06,
"loss": 0.0726,
"step": 390
},
{
"epoch": 0.04981940465811434,
"grad_norm": 0.2590515690991322,
"learning_rate": 9.800995024875622e-06,
"loss": 0.0699,
"step": 400
},
{
"epoch": 0.0510648897745672,
"grad_norm": 0.23290141366261857,
"learning_rate": 9.99999830335013e-06,
"loss": 0.0703,
"step": 410
},
{
"epoch": 0.05231037489102005,
"grad_norm": 0.40462475733847536,
"learning_rate": 9.9999389207256e-06,
"loss": 0.0671,
"step": 420
},
{
"epoch": 0.05355586000747291,
"grad_norm": 0.1992853939667153,
"learning_rate": 9.999794706759033e-06,
"loss": 0.0642,
"step": 430
},
{
"epoch": 0.05480134512392577,
"grad_norm": 0.25005263193683325,
"learning_rate": 9.99956566389724e-06,
"loss": 0.0651,
"step": 440
},
{
"epoch": 0.056046830240378626,
"grad_norm": 0.19810910279996163,
"learning_rate": 9.999251796026267e-06,
"loss": 0.0636,
"step": 450
},
{
"epoch": 0.057292315356831484,
"grad_norm": 0.29240057935413744,
"learning_rate": 9.998853108471344e-06,
"loss": 0.0636,
"step": 460
},
{
"epoch": 0.05853780047328434,
"grad_norm": 0.1984892241286391,
"learning_rate": 9.998369607996798e-06,
"loss": 0.066,
"step": 470
},
{
"epoch": 0.0597832855897372,
"grad_norm": 0.2947006490171397,
"learning_rate": 9.997801302805926e-06,
"loss": 0.0594,
"step": 480
},
{
"epoch": 0.06102877070619006,
"grad_norm": 0.18665953295143467,
"learning_rate": 9.99714820254086e-06,
"loss": 0.0689,
"step": 490
},
{
"epoch": 0.06227425582264292,
"grad_norm": 0.15776324665440072,
"learning_rate": 9.996410318282419e-06,
"loss": 0.0629,
"step": 500
},
{
"epoch": 0.06351974093909578,
"grad_norm": 0.19893409117531088,
"learning_rate": 9.995587662549889e-06,
"loss": 0.0625,
"step": 510
},
{
"epoch": 0.06476522605554863,
"grad_norm": 0.1735255827587707,
"learning_rate": 9.994680249300844e-06,
"loss": 0.0589,
"step": 520
},
{
"epoch": 0.0660107111720015,
"grad_norm": 0.19646006827520265,
"learning_rate": 9.993688093930886e-06,
"loss": 0.0652,
"step": 530
},
{
"epoch": 0.06725619628845435,
"grad_norm": 0.22623430709009726,
"learning_rate": 9.992611213273399e-06,
"loss": 0.0597,
"step": 540
},
{
"epoch": 0.06850168140490721,
"grad_norm": 0.18327306390120063,
"learning_rate": 9.991449625599248e-06,
"loss": 0.0588,
"step": 550
},
{
"epoch": 0.06974716652136007,
"grad_norm": 0.1580054956829609,
"learning_rate": 9.990203350616484e-06,
"loss": 0.0619,
"step": 560
},
{
"epoch": 0.07099265163781293,
"grad_norm": 0.19834896954819606,
"learning_rate": 9.988872409470001e-06,
"loss": 0.0611,
"step": 570
},
{
"epoch": 0.07223813675426578,
"grad_norm": 0.16314415947022498,
"learning_rate": 9.98745682474118e-06,
"loss": 0.0604,
"step": 580
},
{
"epoch": 0.07348362187071865,
"grad_norm": 0.16290843158832854,
"learning_rate": 9.985956620447504e-06,
"loss": 0.0576,
"step": 590
},
{
"epoch": 0.0747291069871715,
"grad_norm": 0.19591924007546974,
"learning_rate": 9.984371822042151e-06,
"loss": 0.057,
"step": 600
},
{
"epoch": 0.07597459210362437,
"grad_norm": 0.2320859070446132,
"learning_rate": 9.982702456413569e-06,
"loss": 0.0583,
"step": 610
},
{
"epoch": 0.07722007722007722,
"grad_norm": 0.24472879145819784,
"learning_rate": 9.980948551885005e-06,
"loss": 0.0578,
"step": 620
},
{
"epoch": 0.07846556233653008,
"grad_norm": 0.1669725544164473,
"learning_rate": 9.97911013821404e-06,
"loss": 0.0575,
"step": 630
},
{
"epoch": 0.07971104745298294,
"grad_norm": 0.17241984682854564,
"learning_rate": 9.977187246592077e-06,
"loss": 0.0603,
"step": 640
},
{
"epoch": 0.0809565325694358,
"grad_norm": 0.16131121027720133,
"learning_rate": 9.975179909643805e-06,
"loss": 0.0562,
"step": 650
},
{
"epoch": 0.08220201768588865,
"grad_norm": 0.1604328826857812,
"learning_rate": 9.973088161426658e-06,
"loss": 0.0576,
"step": 660
},
{
"epoch": 0.0834475028023415,
"grad_norm": 0.1629200393652728,
"learning_rate": 9.970912037430234e-06,
"loss": 0.0572,
"step": 670
},
{
"epoch": 0.08469298791879437,
"grad_norm": 0.1461978920145311,
"learning_rate": 9.968651574575687e-06,
"loss": 0.0553,
"step": 680
},
{
"epoch": 0.08593847303524722,
"grad_norm": 0.16209938995070383,
"learning_rate": 9.966306811215105e-06,
"loss": 0.0536,
"step": 690
},
{
"epoch": 0.08718395815170009,
"grad_norm": 0.19541525812908397,
"learning_rate": 9.963877787130859e-06,
"loss": 0.0539,
"step": 700
},
{
"epoch": 0.08842944326815294,
"grad_norm": 0.1545173625361575,
"learning_rate": 9.961364543534924e-06,
"loss": 0.0553,
"step": 710
},
{
"epoch": 0.0896749283846058,
"grad_norm": 0.16095801919296102,
"learning_rate": 9.95876712306819e-06,
"loss": 0.0537,
"step": 720
},
{
"epoch": 0.09092041350105866,
"grad_norm": 0.14887807294672145,
"learning_rate": 9.956085569799724e-06,
"loss": 0.0504,
"step": 730
},
{
"epoch": 0.09216589861751152,
"grad_norm": 0.1698520709293453,
"learning_rate": 9.95331992922604e-06,
"loss": 0.0529,
"step": 740
},
{
"epoch": 0.09341138373396438,
"grad_norm": 0.16598062442974584,
"learning_rate": 9.950470248270307e-06,
"loss": 0.0564,
"step": 750
},
{
"epoch": 0.09465686885041724,
"grad_norm": 0.17737770942501244,
"learning_rate": 9.947536575281568e-06,
"loss": 0.0512,
"step": 760
},
{
"epoch": 0.0959023539668701,
"grad_norm": 0.16015622463667809,
"learning_rate": 9.944518960033917e-06,
"loss": 0.0551,
"step": 770
},
{
"epoch": 0.09714783908332296,
"grad_norm": 0.14721291686225765,
"learning_rate": 9.941417453725649e-06,
"loss": 0.0507,
"step": 780
},
{
"epoch": 0.09839332419977581,
"grad_norm": 0.15025959809228565,
"learning_rate": 9.938232108978393e-06,
"loss": 0.0502,
"step": 790
},
{
"epoch": 0.09963880931622868,
"grad_norm": 0.15529305685637196,
"learning_rate": 9.934962979836224e-06,
"loss": 0.0504,
"step": 800
},
{
"epoch": 0.10088429443268153,
"grad_norm": 0.15689543890929084,
"learning_rate": 9.931610121764744e-06,
"loss": 0.0508,
"step": 810
},
{
"epoch": 0.1021297795491344,
"grad_norm": 0.1344138827444646,
"learning_rate": 9.928173591650137e-06,
"loss": 0.0519,
"step": 820
},
{
"epoch": 0.10337526466558725,
"grad_norm": 0.7979415465164218,
"learning_rate": 9.924653447798204e-06,
"loss": 0.054,
"step": 830
},
{
"epoch": 0.1046207497820401,
"grad_norm": 0.18879786339120036,
"learning_rate": 9.921049749933384e-06,
"loss": 0.0539,
"step": 840
},
{
"epoch": 0.10586623489849296,
"grad_norm": 0.14405857504328634,
"learning_rate": 9.91736255919773e-06,
"loss": 0.0533,
"step": 850
},
{
"epoch": 0.10711172001494582,
"grad_norm": 0.13950191715821708,
"learning_rate": 9.913591938149872e-06,
"loss": 0.0505,
"step": 860
},
{
"epoch": 0.10835720513139868,
"grad_norm": 0.14169146418069137,
"learning_rate": 9.909737950763958e-06,
"loss": 0.0508,
"step": 870
},
{
"epoch": 0.10960269024785153,
"grad_norm": 0.12924275214745518,
"learning_rate": 9.905800662428575e-06,
"loss": 0.0482,
"step": 880
},
{
"epoch": 0.1108481753643044,
"grad_norm": 0.1470760362836568,
"learning_rate": 9.901780139945627e-06,
"loss": 0.0482,
"step": 890
},
{
"epoch": 0.11209366048075725,
"grad_norm": 0.13291195502690506,
"learning_rate": 9.897676451529215e-06,
"loss": 0.0517,
"step": 900
},
{
"epoch": 0.11333914559721012,
"grad_norm": 0.13390278563858438,
"learning_rate": 9.893489666804463e-06,
"loss": 0.0509,
"step": 910
},
{
"epoch": 0.11458463071366297,
"grad_norm": 0.1671697925970578,
"learning_rate": 9.889219856806356e-06,
"loss": 0.0494,
"step": 920
},
{
"epoch": 0.11583011583011583,
"grad_norm": 0.12304313859255972,
"learning_rate": 9.88486709397852e-06,
"loss": 0.0486,
"step": 930
},
{
"epoch": 0.11707560094656869,
"grad_norm": 0.13908830833667438,
"learning_rate": 9.880431452171999e-06,
"loss": 0.0488,
"step": 940
},
{
"epoch": 0.11832108606302155,
"grad_norm": 0.12296731852388516,
"learning_rate": 9.875913006644005e-06,
"loss": 0.0479,
"step": 950
},
{
"epoch": 0.1195665711794744,
"grad_norm": 0.1691596906562094,
"learning_rate": 9.87131183405663e-06,
"loss": 0.051,
"step": 960
},
{
"epoch": 0.12081205629592727,
"grad_norm": 0.130152721836032,
"learning_rate": 9.866628012475558e-06,
"loss": 0.0487,
"step": 970
},
{
"epoch": 0.12205754141238012,
"grad_norm": 0.11267290434698013,
"learning_rate": 9.861861621368736e-06,
"loss": 0.0507,
"step": 980
},
{
"epoch": 0.12330302652883297,
"grad_norm": 0.11331354524627157,
"learning_rate": 9.857012741605021e-06,
"loss": 0.0468,
"step": 990
},
{
"epoch": 0.12454851164528584,
"grad_norm": 0.15865708412965962,
"learning_rate": 9.852081455452814e-06,
"loss": 0.0489,
"step": 1000
},
{
"epoch": 0.12454851164528584,
"eval_loss": 0.05067430064082146,
"eval_runtime": 837.2581,
"eval_samples_per_second": 4.779,
"eval_steps_per_second": 0.3,
"step": 1000
},
{
"epoch": 0.1257939967617387,
"grad_norm": 0.12008518001971867,
"learning_rate": 9.847067846578659e-06,
"loss": 0.0484,
"step": 1010
},
{
"epoch": 0.12703948187819156,
"grad_norm": 0.12646879015730683,
"learning_rate": 9.841972000045835e-06,
"loss": 0.0591,
"step": 1020
},
{
"epoch": 0.12828496699464442,
"grad_norm": 0.11364990085331482,
"learning_rate": 9.83679400231289e-06,
"loss": 0.0494,
"step": 1030
},
{
"epoch": 0.12953045211109726,
"grad_norm": 0.11021349917771965,
"learning_rate": 9.831533941232204e-06,
"loss": 0.0481,
"step": 1040
},
{
"epoch": 0.13077593722755013,
"grad_norm": 0.13551575516221423,
"learning_rate": 9.826191906048472e-06,
"loss": 0.0474,
"step": 1050
},
{
"epoch": 0.132021422344003,
"grad_norm": 0.14122398348644333,
"learning_rate": 9.820767987397203e-06,
"loss": 0.0459,
"step": 1060
},
{
"epoch": 0.13326690746045586,
"grad_norm": 0.12736875628169675,
"learning_rate": 9.815262277303183e-06,
"loss": 0.0477,
"step": 1070
},
{
"epoch": 0.1345123925769087,
"grad_norm": 0.23110936390101788,
"learning_rate": 9.809674869178907e-06,
"loss": 0.0462,
"step": 1080
},
{
"epoch": 0.13575787769336156,
"grad_norm": 0.14834027930375704,
"learning_rate": 9.804005857822998e-06,
"loss": 0.0511,
"step": 1090
},
{
"epoch": 0.13700336280981443,
"grad_norm": 0.15740511829453357,
"learning_rate": 9.798255339418602e-06,
"loss": 0.0465,
"step": 1100
},
{
"epoch": 0.1382488479262673,
"grad_norm": 0.6994208642724673,
"learning_rate": 9.792423411531748e-06,
"loss": 0.0452,
"step": 1110
},
{
"epoch": 0.13949433304272013,
"grad_norm": 0.10523328058081761,
"learning_rate": 9.786510173109698e-06,
"loss": 0.0473,
"step": 1120
},
{
"epoch": 0.140739818159173,
"grad_norm": 0.1237911423948465,
"learning_rate": 9.780515724479269e-06,
"loss": 0.0478,
"step": 1130
},
{
"epoch": 0.14198530327562586,
"grad_norm": 0.10909368404610931,
"learning_rate": 9.774440167345128e-06,
"loss": 0.0519,
"step": 1140
},
{
"epoch": 0.14323078839207873,
"grad_norm": 0.11941370989825983,
"learning_rate": 9.768283604788066e-06,
"loss": 0.0448,
"step": 1150
},
{
"epoch": 0.14447627350853157,
"grad_norm": 0.10569604500692853,
"learning_rate": 9.762046141263252e-06,
"loss": 0.0473,
"step": 1160
},
{
"epoch": 0.14572175862498443,
"grad_norm": 0.11930885115843079,
"learning_rate": 9.75572788259846e-06,
"loss": 0.0454,
"step": 1170
},
{
"epoch": 0.1469672437414373,
"grad_norm": 0.1529042882405941,
"learning_rate": 9.749328935992272e-06,
"loss": 0.0479,
"step": 1180
},
{
"epoch": 0.14821272885789014,
"grad_norm": 0.12662782458494234,
"learning_rate": 9.742849410012258e-06,
"loss": 0.0487,
"step": 1190
},
{
"epoch": 0.149458213974343,
"grad_norm": 0.13041132917045617,
"learning_rate": 9.736289414593141e-06,
"loss": 0.0457,
"step": 1200
},
{
"epoch": 0.15070369909079587,
"grad_norm": 0.164274078618184,
"learning_rate": 9.72964906103492e-06,
"loss": 0.0485,
"step": 1210
},
{
"epoch": 0.15194918420724873,
"grad_norm": 0.10818673095804575,
"learning_rate": 9.722928462000995e-06,
"loss": 0.0456,
"step": 1220
},
{
"epoch": 0.15319466932370157,
"grad_norm": 0.1552950433826579,
"learning_rate": 9.716127731516244e-06,
"loss": 0.045,
"step": 1230
},
{
"epoch": 0.15444015444015444,
"grad_norm": 0.12126428490890664,
"learning_rate": 9.709246984965096e-06,
"loss": 0.0455,
"step": 1240
},
{
"epoch": 0.1556856395566073,
"grad_norm": 0.11864472718798152,
"learning_rate": 9.702286339089571e-06,
"loss": 0.0481,
"step": 1250
},
{
"epoch": 0.15693112467306017,
"grad_norm": 0.10464973543054351,
"learning_rate": 9.695245911987296e-06,
"loss": 0.0445,
"step": 1260
},
{
"epoch": 0.158176609789513,
"grad_norm": 0.1326030396019517,
"learning_rate": 9.68812582310951e-06,
"loss": 0.045,
"step": 1270
},
{
"epoch": 0.15942209490596587,
"grad_norm": 0.10871250003075815,
"learning_rate": 9.68092619325902e-06,
"loss": 0.0445,
"step": 1280
},
{
"epoch": 0.16066758002241874,
"grad_norm": 0.12614974726798006,
"learning_rate": 9.673647144588179e-06,
"loss": 0.0468,
"step": 1290
},
{
"epoch": 0.1619130651388716,
"grad_norm": 0.12395416843875896,
"learning_rate": 9.666288800596783e-06,
"loss": 0.0459,
"step": 1300
},
{
"epoch": 0.16315855025532444,
"grad_norm": 0.12469526881318954,
"learning_rate": 9.65885128613e-06,
"loss": 0.0479,
"step": 1310
},
{
"epoch": 0.1644040353717773,
"grad_norm": 0.10888988016342659,
"learning_rate": 9.651334727376238e-06,
"loss": 0.0438,
"step": 1320
},
{
"epoch": 0.16564952048823017,
"grad_norm": 0.11437633567324614,
"learning_rate": 9.643739251865007e-06,
"loss": 0.0461,
"step": 1330
},
{
"epoch": 0.166895005604683,
"grad_norm": 0.11243100463309648,
"learning_rate": 9.636064988464758e-06,
"loss": 0.0477,
"step": 1340
},
{
"epoch": 0.16814049072113588,
"grad_norm": 0.11775229353678274,
"learning_rate": 9.628312067380692e-06,
"loss": 0.0439,
"step": 1350
},
{
"epoch": 0.16938597583758874,
"grad_norm": 0.10576070226559706,
"learning_rate": 9.62048062015256e-06,
"loss": 0.0421,
"step": 1360
},
{
"epoch": 0.1706314609540416,
"grad_norm": 0.1225756664014832,
"learning_rate": 9.612570779652416e-06,
"loss": 0.0464,
"step": 1370
},
{
"epoch": 0.17187694607049445,
"grad_norm": 0.2526510367759425,
"learning_rate": 9.604582680082381e-06,
"loss": 0.0473,
"step": 1380
},
{
"epoch": 0.1731224311869473,
"grad_norm": 0.17121671255735518,
"learning_rate": 9.59651645697235e-06,
"loss": 0.0469,
"step": 1390
},
{
"epoch": 0.17436791630340018,
"grad_norm": 0.13788045784235398,
"learning_rate": 9.5883722471777e-06,
"loss": 0.0449,
"step": 1400
},
{
"epoch": 0.17561340141985304,
"grad_norm": 0.11515249340982896,
"learning_rate": 9.580150188876972e-06,
"loss": 0.0448,
"step": 1410
},
{
"epoch": 0.17685888653630588,
"grad_norm": 0.1253470560419917,
"learning_rate": 9.571850421569513e-06,
"loss": 0.0443,
"step": 1420
},
{
"epoch": 0.17810437165275875,
"grad_norm": 0.10614690655604767,
"learning_rate": 9.563473086073125e-06,
"loss": 0.0427,
"step": 1430
},
{
"epoch": 0.1793498567692116,
"grad_norm": 0.10078492497548407,
"learning_rate": 9.55501832452167e-06,
"loss": 0.044,
"step": 1440
},
{
"epoch": 0.18059534188566448,
"grad_norm": 0.11636102799380885,
"learning_rate": 9.54648628036265e-06,
"loss": 0.0436,
"step": 1450
},
{
"epoch": 0.18184082700211732,
"grad_norm": 0.1251116625684969,
"learning_rate": 9.537877098354787e-06,
"loss": 0.0541,
"step": 1460
},
{
"epoch": 0.18308631211857018,
"grad_norm": 0.1419915597237492,
"learning_rate": 9.529190924565555e-06,
"loss": 0.0431,
"step": 1470
},
{
"epoch": 0.18433179723502305,
"grad_norm": 0.11419668667392972,
"learning_rate": 9.520427906368715e-06,
"loss": 0.0449,
"step": 1480
},
{
"epoch": 0.1855772823514759,
"grad_norm": 0.11598298797722469,
"learning_rate": 9.511588192441799e-06,
"loss": 0.0444,
"step": 1490
},
{
"epoch": 0.18682276746792875,
"grad_norm": 0.11089301562266411,
"learning_rate": 9.502671932763598e-06,
"loss": 0.0441,
"step": 1500
},
{
"epoch": 0.18806825258438162,
"grad_norm": 0.10698032319807455,
"learning_rate": 9.493679278611616e-06,
"loss": 0.0438,
"step": 1510
},
{
"epoch": 0.18931373770083448,
"grad_norm": 0.12527021698193896,
"learning_rate": 9.484610382559501e-06,
"loss": 0.0445,
"step": 1520
},
{
"epoch": 0.19055922281728732,
"grad_norm": 0.10900615710064696,
"learning_rate": 9.475465398474455e-06,
"loss": 0.0445,
"step": 1530
},
{
"epoch": 0.1918047079337402,
"grad_norm": 0.1403468055385648,
"learning_rate": 9.46624448151463e-06,
"loss": 0.0456,
"step": 1540
},
{
"epoch": 0.19305019305019305,
"grad_norm": 0.1869277922181907,
"learning_rate": 9.45694778812649e-06,
"loss": 0.0458,
"step": 1550
},
{
"epoch": 0.19429567816664592,
"grad_norm": 0.157447180906341,
"learning_rate": 9.447575476042155e-06,
"loss": 0.044,
"step": 1560
},
{
"epoch": 0.19554116328309876,
"grad_norm": 0.12324051539101406,
"learning_rate": 9.43812770427673e-06,
"loss": 0.0427,
"step": 1570
},
{
"epoch": 0.19678664839955162,
"grad_norm": 0.11563356170107901,
"learning_rate": 9.428604633125606e-06,
"loss": 0.0408,
"step": 1580
},
{
"epoch": 0.1980321335160045,
"grad_norm": 0.09761737865487308,
"learning_rate": 9.419006424161739e-06,
"loss": 0.0427,
"step": 1590
},
{
"epoch": 0.19927761863245735,
"grad_norm": 0.10249368675553559,
"learning_rate": 9.409333240232905e-06,
"loss": 0.0428,
"step": 1600
},
{
"epoch": 0.2005231037489102,
"grad_norm": 0.11669776086288003,
"learning_rate": 9.399585245458947e-06,
"loss": 0.0443,
"step": 1610
},
{
"epoch": 0.20176858886536306,
"grad_norm": 0.09468584586364824,
"learning_rate": 9.389762605228975e-06,
"loss": 0.0413,
"step": 1620
},
{
"epoch": 0.20301407398181592,
"grad_norm": 0.11968306874997592,
"learning_rate": 9.379865486198584e-06,
"loss": 0.0437,
"step": 1630
},
{
"epoch": 0.2042595590982688,
"grad_norm": 0.09494979516841215,
"learning_rate": 9.369894056286996e-06,
"loss": 0.0419,
"step": 1640
},
{
"epoch": 0.20550504421472163,
"grad_norm": 0.09623346347367455,
"learning_rate": 9.359848484674239e-06,
"loss": 0.0426,
"step": 1650
},
{
"epoch": 0.2067505293311745,
"grad_norm": 0.12130237072433238,
"learning_rate": 9.349728941798258e-06,
"loss": 0.041,
"step": 1660
},
{
"epoch": 0.20799601444762736,
"grad_norm": 0.12871340456904928,
"learning_rate": 9.339535599352028e-06,
"loss": 0.0434,
"step": 1670
},
{
"epoch": 0.2092414995640802,
"grad_norm": 0.09171151878863659,
"learning_rate": 9.329268630280648e-06,
"loss": 0.0429,
"step": 1680
},
{
"epoch": 0.21048698468053306,
"grad_norm": 0.09982291501970612,
"learning_rate": 9.318928208778398e-06,
"loss": 0.0393,
"step": 1690
},
{
"epoch": 0.21173246979698593,
"grad_norm": 0.102908470033454,
"learning_rate": 9.308514510285789e-06,
"loss": 0.0437,
"step": 1700
},
{
"epoch": 0.2129779549134388,
"grad_norm": 0.10809759008020647,
"learning_rate": 9.298027711486583e-06,
"loss": 0.0414,
"step": 1710
},
{
"epoch": 0.21422344002989163,
"grad_norm": 0.09922481615525773,
"learning_rate": 9.287467990304796e-06,
"loss": 0.0405,
"step": 1720
},
{
"epoch": 0.2154689251463445,
"grad_norm": 0.2454857877766523,
"learning_rate": 9.278967829598777e-06,
"loss": 0.0642,
"step": 1730
},
{
"epoch": 0.21671441026279736,
"grad_norm": 0.12679122997721115,
"learning_rate": 9.268277300444182e-06,
"loss": 0.0466,
"step": 1740
},
{
"epoch": 0.21795989537925023,
"grad_norm": 0.10334949659338302,
"learning_rate": 9.257514353666632e-06,
"loss": 0.0422,
"step": 1750
},
{
"epoch": 0.21920538049570307,
"grad_norm": 0.10766550798989903,
"learning_rate": 9.2466791718754e-06,
"loss": 0.0426,
"step": 1760
},
{
"epoch": 0.22045086561215593,
"grad_norm": 0.09498401264455936,
"learning_rate": 9.235771938905337e-06,
"loss": 0.0449,
"step": 1770
},
{
"epoch": 0.2216963507286088,
"grad_norm": 0.09915101652344138,
"learning_rate": 9.224792839813743e-06,
"loss": 0.0435,
"step": 1780
},
{
"epoch": 0.22294183584506166,
"grad_norm": 0.16658667419219628,
"learning_rate": 9.213742060877235e-06,
"loss": 0.0522,
"step": 1790
},
{
"epoch": 0.2241873209615145,
"grad_norm": 0.11676226870887034,
"learning_rate": 9.202619789588587e-06,
"loss": 0.0419,
"step": 1800
},
{
"epoch": 0.22543280607796737,
"grad_norm": 0.11632456369300531,
"learning_rate": 9.191426214653545e-06,
"loss": 0.0422,
"step": 1810
},
{
"epoch": 0.22667829119442023,
"grad_norm": 0.13094132334044692,
"learning_rate": 9.180161525987622e-06,
"loss": 0.0432,
"step": 1820
},
{
"epoch": 0.22792377631087307,
"grad_norm": 0.5133186348285592,
"learning_rate": 9.168825914712887e-06,
"loss": 0.0439,
"step": 1830
},
{
"epoch": 0.22916926142732594,
"grad_norm": 0.1126566913085879,
"learning_rate": 9.157419573154712e-06,
"loss": 0.0424,
"step": 1840
},
{
"epoch": 0.2304147465437788,
"grad_norm": 0.11872246307002708,
"learning_rate": 9.145942694838514e-06,
"loss": 0.0452,
"step": 1850
},
{
"epoch": 0.23166023166023167,
"grad_norm": 0.11638084946068121,
"learning_rate": 9.134395474486471e-06,
"loss": 0.043,
"step": 1860
},
{
"epoch": 0.2329057167766845,
"grad_norm": 0.09781456820229469,
"learning_rate": 9.122778108014212e-06,
"loss": 0.0405,
"step": 1870
},
{
"epoch": 0.23415120189313737,
"grad_norm": 0.10584584559462579,
"learning_rate": 9.111090792527505e-06,
"loss": 0.0423,
"step": 1880
},
{
"epoch": 0.23539668700959024,
"grad_norm": 0.10925593174245034,
"learning_rate": 9.099333726318904e-06,
"loss": 0.047,
"step": 1890
},
{
"epoch": 0.2366421721260431,
"grad_norm": 0.10367090384152275,
"learning_rate": 9.087507108864388e-06,
"loss": 0.0426,
"step": 1900
},
{
"epoch": 0.23788765724249594,
"grad_norm": 0.09506359091665009,
"learning_rate": 9.075611140819971e-06,
"loss": 0.0401,
"step": 1910
},
{
"epoch": 0.2391331423589488,
"grad_norm": 0.1111594741008404,
"learning_rate": 9.063646024018309e-06,
"loss": 0.0451,
"step": 1920
},
{
"epoch": 0.24037862747540167,
"grad_norm": 0.11301927478560299,
"learning_rate": 9.051611961465264e-06,
"loss": 0.0421,
"step": 1930
},
{
"epoch": 0.24162411259185454,
"grad_norm": 0.10789281571638606,
"learning_rate": 9.039509157336461e-06,
"loss": 0.0401,
"step": 1940
},
{
"epoch": 0.24286959770830738,
"grad_norm": 0.1089623526464556,
"learning_rate": 9.027337816973839e-06,
"loss": 0.0433,
"step": 1950
},
{
"epoch": 0.24411508282476024,
"grad_norm": 0.11849781991781926,
"learning_rate": 9.015098146882143e-06,
"loss": 0.0429,
"step": 1960
},
{
"epoch": 0.2453605679412131,
"grad_norm": 0.10702936405408302,
"learning_rate": 9.002790354725439e-06,
"loss": 0.0407,
"step": 1970
},
{
"epoch": 0.24660605305766595,
"grad_norm": 0.1009623884471384,
"learning_rate": 8.990414649323581e-06,
"loss": 0.0411,
"step": 1980
},
{
"epoch": 0.2478515381741188,
"grad_norm": 0.098165132502255,
"learning_rate": 8.977971240648683e-06,
"loss": 0.0407,
"step": 1990
},
{
"epoch": 0.24909702329057168,
"grad_norm": 0.09911720442830722,
"learning_rate": 8.965460339821524e-06,
"loss": 0.0413,
"step": 2000
},
{
"epoch": 0.24909702329057168,
"eval_loss": 0.04383137822151184,
"eval_runtime": 837.2466,
"eval_samples_per_second": 4.779,
"eval_steps_per_second": 0.3,
"step": 2000
},
{
"epoch": 0.25034250840702454,
"grad_norm": 0.09733221424301261,
"learning_rate": 8.952882159108001e-06,
"loss": 0.0462,
"step": 2010
},
{
"epoch": 0.2515879935234774,
"grad_norm": 0.08904028592951219,
"learning_rate": 8.940236911915517e-06,
"loss": 0.0397,
"step": 2020
},
{
"epoch": 0.2528334786399303,
"grad_norm": 0.10223479310930707,
"learning_rate": 8.927524812789344e-06,
"loss": 0.0417,
"step": 2030
},
{
"epoch": 0.2540789637563831,
"grad_norm": 0.10176207129527515,
"learning_rate": 8.914746077409007e-06,
"loss": 0.0398,
"step": 2040
},
{
"epoch": 0.25532444887283595,
"grad_norm": 0.1345916238686809,
"learning_rate": 8.901900922584607e-06,
"loss": 0.0412,
"step": 2050
},
{
"epoch": 0.25656993398928885,
"grad_norm": 0.12462863862373187,
"learning_rate": 8.88898956625315e-06,
"loss": 0.0407,
"step": 2060
},
{
"epoch": 0.2578154191057417,
"grad_norm": 0.18923051207281882,
"learning_rate": 8.87601222747485e-06,
"loss": 0.04,
"step": 2070
},
{
"epoch": 0.2590609042221945,
"grad_norm": 0.12257338708335132,
"learning_rate": 8.86296912642941e-06,
"loss": 0.0411,
"step": 2080
},
{
"epoch": 0.2603063893386474,
"grad_norm": 0.11075532387556147,
"learning_rate": 8.849860484412286e-06,
"loss": 0.0451,
"step": 2090
},
{
"epoch": 0.26155187445510025,
"grad_norm": 0.09032535067048295,
"learning_rate": 8.836686523830932e-06,
"loss": 0.0404,
"step": 2100
},
{
"epoch": 0.26279735957155315,
"grad_norm": 0.09193144129394688,
"learning_rate": 8.823447468201034e-06,
"loss": 0.0401,
"step": 2110
},
{
"epoch": 0.264042844688006,
"grad_norm": 0.10362652954418632,
"learning_rate": 8.810143542142704e-06,
"loss": 0.0417,
"step": 2120
},
{
"epoch": 0.2652883298044588,
"grad_norm": 0.11300498286039973,
"learning_rate": 8.79677497137668e-06,
"loss": 0.0401,
"step": 2130
},
{
"epoch": 0.2665338149209117,
"grad_norm": 0.10397514267437184,
"learning_rate": 8.783341982720493e-06,
"loss": 0.0402,
"step": 2140
},
{
"epoch": 0.26777930003736455,
"grad_norm": 0.15892501392526515,
"learning_rate": 8.76984480408462e-06,
"loss": 0.0397,
"step": 2150
},
{
"epoch": 0.2690247851538174,
"grad_norm": 0.10218806129831098,
"learning_rate": 8.756283664468609e-06,
"loss": 0.0396,
"step": 2160
},
{
"epoch": 0.2702702702702703,
"grad_norm": 0.10968625770271137,
"learning_rate": 8.74265879395721e-06,
"loss": 0.0404,
"step": 2170
},
{
"epoch": 0.2715157553867231,
"grad_norm": 0.0941877475595611,
"learning_rate": 8.728970423716455e-06,
"loss": 0.0382,
"step": 2180
},
{
"epoch": 0.27276124050317596,
"grad_norm": 0.08706969977702901,
"learning_rate": 8.715218785989746e-06,
"loss": 0.0386,
"step": 2190
},
{
"epoch": 0.27400672561962885,
"grad_norm": 0.10169423226434647,
"learning_rate": 8.701404114093907e-06,
"loss": 0.0408,
"step": 2200
},
{
"epoch": 0.2752522107360817,
"grad_norm": 0.1088585215738465,
"learning_rate": 8.687526642415234e-06,
"loss": 0.0504,
"step": 2210
},
{
"epoch": 0.2764976958525346,
"grad_norm": 0.11049918001821324,
"learning_rate": 8.673586606405515e-06,
"loss": 0.0415,
"step": 2220
},
{
"epoch": 0.2777431809689874,
"grad_norm": 0.09280566997623665,
"learning_rate": 8.659584242578027e-06,
"loss": 0.0393,
"step": 2230
},
{
"epoch": 0.27898866608544026,
"grad_norm": 0.10571033510018531,
"learning_rate": 8.64551978850354e-06,
"loss": 0.041,
"step": 2240
},
{
"epoch": 0.28023415120189316,
"grad_norm": 0.11616105487029853,
"learning_rate": 8.63139348280627e-06,
"loss": 0.0407,
"step": 2250
},
{
"epoch": 0.281479636318346,
"grad_norm": 0.15087648013362626,
"learning_rate": 8.617205565159837e-06,
"loss": 0.041,
"step": 2260
},
{
"epoch": 0.28272512143479883,
"grad_norm": 0.12873729187439528,
"learning_rate": 8.602956276283203e-06,
"loss": 0.0427,
"step": 2270
},
{
"epoch": 0.2839706065512517,
"grad_norm": 0.11892192024900505,
"learning_rate": 8.58864585793658e-06,
"loss": 0.0399,
"step": 2280
},
{
"epoch": 0.28521609166770456,
"grad_norm": 0.10031768131797066,
"learning_rate": 8.574274552917332e-06,
"loss": 0.039,
"step": 2290
},
{
"epoch": 0.28646157678415746,
"grad_norm": 0.10020071796996806,
"learning_rate": 8.559842605055857e-06,
"loss": 0.0392,
"step": 2300
},
{
"epoch": 0.2877070619006103,
"grad_norm": 0.10436992009765807,
"learning_rate": 8.545350259211446e-06,
"loss": 0.0412,
"step": 2310
},
{
"epoch": 0.28895254701706313,
"grad_norm": 0.1110208366008346,
"learning_rate": 8.530797761268132e-06,
"loss": 0.0393,
"step": 2320
},
{
"epoch": 0.290198032133516,
"grad_norm": 0.12229516537117378,
"learning_rate": 8.516185358130521e-06,
"loss": 0.0397,
"step": 2330
},
{
"epoch": 0.29144351724996886,
"grad_norm": 0.09885538403138262,
"learning_rate": 8.501513297719591e-06,
"loss": 0.0401,
"step": 2340
},
{
"epoch": 0.2926890023664217,
"grad_norm": 0.08486977949008306,
"learning_rate": 8.486781828968497e-06,
"loss": 0.0384,
"step": 2350
},
{
"epoch": 0.2939344874828746,
"grad_norm": 0.09549355152979244,
"learning_rate": 8.47199120181835e-06,
"loss": 0.0387,
"step": 2360
},
{
"epoch": 0.29517997259932743,
"grad_norm": 0.09412138643820275,
"learning_rate": 8.457141667213964e-06,
"loss": 0.041,
"step": 2370
},
{
"epoch": 0.29642545771578027,
"grad_norm": 0.10599371085007579,
"learning_rate": 8.442233477099606e-06,
"loss": 0.0433,
"step": 2380
},
{
"epoch": 0.29767094283223317,
"grad_norm": 0.09226690222055461,
"learning_rate": 8.427266884414717e-06,
"loss": 0.0414,
"step": 2390
},
{
"epoch": 0.298916427948686,
"grad_norm": 0.10960059061019917,
"learning_rate": 8.412242143089634e-06,
"loss": 0.0392,
"step": 2400
},
{
"epoch": 0.3001619130651389,
"grad_norm": 0.09400507906095983,
"learning_rate": 8.397159508041259e-06,
"loss": 0.0394,
"step": 2410
},
{
"epoch": 0.30140739818159173,
"grad_norm": 0.0876337900908495,
"learning_rate": 8.38201923516876e-06,
"loss": 0.0402,
"step": 2420
},
{
"epoch": 0.3026528832980446,
"grad_norm": 0.10685042487637744,
"learning_rate": 8.3668215813492e-06,
"loss": 0.0379,
"step": 2430
},
{
"epoch": 0.30389836841449747,
"grad_norm": 0.09343466669123661,
"learning_rate": 8.351566804433207e-06,
"loss": 0.0392,
"step": 2440
},
{
"epoch": 0.3051438535309503,
"grad_norm": 0.11232771427051269,
"learning_rate": 8.336255163240583e-06,
"loss": 0.0442,
"step": 2450
},
{
"epoch": 0.30638933864740314,
"grad_norm": 0.120922739449198,
"learning_rate": 8.320886917555915e-06,
"loss": 0.0392,
"step": 2460
},
{
"epoch": 0.30763482376385604,
"grad_norm": 0.0982996904722228,
"learning_rate": 8.30546232812417e-06,
"loss": 0.0376,
"step": 2470
},
{
"epoch": 0.3088803088803089,
"grad_norm": 0.09473609450051207,
"learning_rate": 8.289981656646265e-06,
"loss": 0.0405,
"step": 2480
},
{
"epoch": 0.3101257939967617,
"grad_norm": 0.10866745221704315,
"learning_rate": 8.274445165774642e-06,
"loss": 0.0416,
"step": 2490
},
{
"epoch": 0.3113712791132146,
"grad_norm": 0.10297062161793576,
"learning_rate": 8.258853119108792e-06,
"loss": 0.039,
"step": 2500
},
{
"epoch": 0.31261676422966744,
"grad_norm": 0.08904243384684579,
"learning_rate": 8.243205781190796e-06,
"loss": 0.0405,
"step": 2510
},
{
"epoch": 0.31386224934612034,
"grad_norm": 0.09914719735887853,
"learning_rate": 8.227503417500832e-06,
"loss": 0.0409,
"step": 2520
},
{
"epoch": 0.3151077344625732,
"grad_norm": 0.09801174377656924,
"learning_rate": 8.211746294452671e-06,
"loss": 0.0402,
"step": 2530
},
{
"epoch": 0.316353219579026,
"grad_norm": 0.09207471394052044,
"learning_rate": 8.195934679389159e-06,
"loss": 0.0395,
"step": 2540
},
{
"epoch": 0.3175987046954789,
"grad_norm": 0.0998841361762494,
"learning_rate": 8.180068840577676e-06,
"loss": 0.0399,
"step": 2550
},
{
"epoch": 0.31884418981193174,
"grad_norm": 0.2734444046615027,
"learning_rate": 8.164149047205593e-06,
"loss": 0.039,
"step": 2560
},
{
"epoch": 0.3200896749283846,
"grad_norm": 0.09809929708143017,
"learning_rate": 8.148175569375696e-06,
"loss": 0.0398,
"step": 2570
},
{
"epoch": 0.3213351600448375,
"grad_norm": 0.09598861497985733,
"learning_rate": 8.132148678101605e-06,
"loss": 0.0387,
"step": 2580
},
{
"epoch": 0.3225806451612903,
"grad_norm": 0.10548719945871352,
"learning_rate": 8.116068645303185e-06,
"loss": 0.0392,
"step": 2590
},
{
"epoch": 0.3238261302777432,
"grad_norm": 0.1112917380798265,
"learning_rate": 8.099935743801919e-06,
"loss": 0.0405,
"step": 2600
},
{
"epoch": 0.32507161539419605,
"grad_norm": 0.09667326537617928,
"learning_rate": 8.083750247316287e-06,
"loss": 0.042,
"step": 2610
},
{
"epoch": 0.3263171005106489,
"grad_norm": 0.08565875844370581,
"learning_rate": 8.067512430457122e-06,
"loss": 0.0402,
"step": 2620
},
{
"epoch": 0.3275625856271018,
"grad_norm": 0.09463079394910894,
"learning_rate": 8.051222568722951e-06,
"loss": 0.0404,
"step": 2630
},
{
"epoch": 0.3288080707435546,
"grad_norm": 0.09671091654042781,
"learning_rate": 8.034880938495314e-06,
"loss": 0.039,
"step": 2640
},
{
"epoch": 0.33005355586000745,
"grad_norm": 0.1079506452875451,
"learning_rate": 8.018487817034083e-06,
"loss": 0.0391,
"step": 2650
},
{
"epoch": 0.33129904097646035,
"grad_norm": 0.1151332285179333,
"learning_rate": 8.002043482472755e-06,
"loss": 0.0399,
"step": 2660
},
{
"epoch": 0.3325445260929132,
"grad_norm": 0.09005373864469504,
"learning_rate": 7.985548213813731e-06,
"loss": 0.0386,
"step": 2670
},
{
"epoch": 0.333790011209366,
"grad_norm": 0.10997395130510185,
"learning_rate": 7.96900229092359e-06,
"loss": 0.0409,
"step": 2680
},
{
"epoch": 0.3350354963258189,
"grad_norm": 0.12491568601669467,
"learning_rate": 7.952405994528321e-06,
"loss": 0.0407,
"step": 2690
},
{
"epoch": 0.33628098144227175,
"grad_norm": 0.12759285808221393,
"learning_rate": 7.935759606208591e-06,
"loss": 0.0389,
"step": 2700
},
{
"epoch": 0.33752646655872465,
"grad_norm": 0.10047860560284863,
"learning_rate": 7.919063408394939e-06,
"loss": 0.0385,
"step": 2710
},
{
"epoch": 0.3387719516751775,
"grad_norm": 0.09488782333922151,
"learning_rate": 7.902317684363e-06,
"loss": 0.0373,
"step": 2720
},
{
"epoch": 0.3400174367916303,
"grad_norm": 0.10229240494191594,
"learning_rate": 7.88552271822869e-06,
"loss": 0.0404,
"step": 2730
},
{
"epoch": 0.3412629219080832,
"grad_norm": 0.10860330897269024,
"learning_rate": 7.8686787949434e-06,
"loss": 0.0452,
"step": 2740
},
{
"epoch": 0.34250840702453605,
"grad_norm": 0.12046253584694053,
"learning_rate": 7.851786200289138e-06,
"loss": 0.0383,
"step": 2750
},
{
"epoch": 0.3437538921409889,
"grad_norm": 0.09588233012721392,
"learning_rate": 7.834845220873704e-06,
"loss": 0.0379,
"step": 2760
},
{
"epoch": 0.3449993772574418,
"grad_norm": 0.11141514409699106,
"learning_rate": 7.817856144125812e-06,
"loss": 0.0404,
"step": 2770
},
{
"epoch": 0.3462448623738946,
"grad_norm": 0.0929166117688466,
"learning_rate": 7.800819258290217e-06,
"loss": 0.0385,
"step": 2780
},
{
"epoch": 0.3474903474903475,
"grad_norm": 0.12872377426032375,
"learning_rate": 7.783734852422832e-06,
"loss": 0.0369,
"step": 2790
},
{
"epoch": 0.34873583260680036,
"grad_norm": 0.10248060164902685,
"learning_rate": 7.766603216385811e-06,
"loss": 0.0383,
"step": 2800
},
{
"epoch": 0.3499813177232532,
"grad_norm": 0.09236553046935063,
"learning_rate": 7.74942464084264e-06,
"loss": 0.0385,
"step": 2810
},
{
"epoch": 0.3512268028397061,
"grad_norm": 0.09960999182950707,
"learning_rate": 7.732199417253205e-06,
"loss": 0.0378,
"step": 2820
},
{
"epoch": 0.3524722879561589,
"grad_norm": 0.08158158710183284,
"learning_rate": 7.714927837868839e-06,
"loss": 0.0419,
"step": 2830
},
{
"epoch": 0.35371777307261176,
"grad_norm": 0.10038517508781465,
"learning_rate": 7.697610195727376e-06,
"loss": 0.0411,
"step": 2840
},
{
"epoch": 0.35496325818906466,
"grad_norm": 0.11182473811936411,
"learning_rate": 7.680246784648168e-06,
"loss": 0.0386,
"step": 2850
},
{
"epoch": 0.3562087433055175,
"grad_norm": 0.10940388386133251,
"learning_rate": 7.66283789922711e-06,
"loss": 0.0402,
"step": 2860
},
{
"epoch": 0.35745422842197033,
"grad_norm": 0.09514114187943787,
"learning_rate": 7.645383834831632e-06,
"loss": 0.0361,
"step": 2870
},
{
"epoch": 0.3586997135384232,
"grad_norm": 0.12614933036066395,
"learning_rate": 7.627884887595691e-06,
"loss": 0.0418,
"step": 2880
},
{
"epoch": 0.35994519865487606,
"grad_norm": 0.08549408832523489,
"learning_rate": 7.6103413544147485e-06,
"loss": 0.0389,
"step": 2890
},
{
"epoch": 0.36119068377132896,
"grad_norm": 0.0775555820195004,
"learning_rate": 7.592753532940735e-06,
"loss": 0.0371,
"step": 2900
},
{
"epoch": 0.3624361688877818,
"grad_norm": 0.09559583490394649,
"learning_rate": 7.575121721576999e-06,
"loss": 0.0363,
"step": 2910
},
{
"epoch": 0.36368165400423463,
"grad_norm": 0.09082312459081106,
"learning_rate": 7.557446219473234e-06,
"loss": 0.039,
"step": 2920
},
{
"epoch": 0.3649271391206875,
"grad_norm": 0.08688457316413475,
"learning_rate": 7.5397273265204164e-06,
"loss": 0.0363,
"step": 2930
},
{
"epoch": 0.36617262423714037,
"grad_norm": 0.09279603811387303,
"learning_rate": 7.521965343345714e-06,
"loss": 0.04,
"step": 2940
},
{
"epoch": 0.3674181093535932,
"grad_norm": 0.09546109850007727,
"learning_rate": 7.504160571307384e-06,
"loss": 0.0373,
"step": 2950
},
{
"epoch": 0.3686635944700461,
"grad_norm": 0.0900380879214148,
"learning_rate": 7.486313312489655e-06,
"loss": 0.0375,
"step": 2960
},
{
"epoch": 0.36990907958649893,
"grad_norm": 0.08133584321860296,
"learning_rate": 7.468423869697608e-06,
"loss": 0.0403,
"step": 2970
},
{
"epoch": 0.3711545647029518,
"grad_norm": 0.09421374605290615,
"learning_rate": 7.450492546452044e-06,
"loss": 0.0402,
"step": 2980
},
{
"epoch": 0.37240004981940467,
"grad_norm": 0.10830645226608504,
"learning_rate": 7.43251964698432e-06,
"loss": 0.0382,
"step": 2990
},
{
"epoch": 0.3736455349358575,
"grad_norm": 0.10977667899434115,
"learning_rate": 7.414505476231197e-06,
"loss": 0.0394,
"step": 3000
},
{
"epoch": 0.3736455349358575,
"eval_loss": 0.040858492255210876,
"eval_runtime": 837.3599,
"eval_samples_per_second": 4.778,
"eval_steps_per_second": 0.3,
"step": 3000
},
{
"epoch": 0.3748910200523104,
"grad_norm": 0.09711498859085808,
"learning_rate": 7.396450339829663e-06,
"loss": 0.0399,
"step": 3010
},
{
"epoch": 0.37613650516876324,
"grad_norm": 0.0901346610564539,
"learning_rate": 7.378354544111754e-06,
"loss": 0.0406,
"step": 3020
},
{
"epoch": 0.3773819902852161,
"grad_norm": 0.09688866851727604,
"learning_rate": 7.360218396099346e-06,
"loss": 0.0365,
"step": 3030
},
{
"epoch": 0.37862747540166897,
"grad_norm": 0.23387727804118624,
"learning_rate": 7.342042203498952e-06,
"loss": 0.0399,
"step": 3040
},
{
"epoch": 0.3798729605181218,
"grad_norm": 0.11523194837485062,
"learning_rate": 7.323826274696502e-06,
"loss": 0.0379,
"step": 3050
},
{
"epoch": 0.38111844563457464,
"grad_norm": 0.11882080178623564,
"learning_rate": 7.30557091875211e-06,
"loss": 0.0391,
"step": 3060
},
{
"epoch": 0.38236393075102754,
"grad_norm": 0.08130643445119262,
"learning_rate": 7.287276445394829e-06,
"loss": 0.0408,
"step": 3070
},
{
"epoch": 0.3836094158674804,
"grad_norm": 0.09322150514308884,
"learning_rate": 7.268943165017393e-06,
"loss": 0.0387,
"step": 3080
},
{
"epoch": 0.38485490098393327,
"grad_norm": 0.08315906543611476,
"learning_rate": 7.250571388670958e-06,
"loss": 0.0389,
"step": 3090
},
{
"epoch": 0.3861003861003861,
"grad_norm": 0.09634973122982947,
"learning_rate": 7.232161428059824e-06,
"loss": 0.0383,
"step": 3100
},
{
"epoch": 0.38734587121683894,
"grad_norm": 0.09970447471105355,
"learning_rate": 7.213713595536135e-06,
"loss": 0.0382,
"step": 3110
},
{
"epoch": 0.38859135633329184,
"grad_norm": 0.10057346750293096,
"learning_rate": 7.195228204094596e-06,
"loss": 0.0381,
"step": 3120
},
{
"epoch": 0.3898368414497447,
"grad_norm": 0.08999580358587174,
"learning_rate": 7.17670556736715e-06,
"loss": 0.037,
"step": 3130
},
{
"epoch": 0.3910823265661975,
"grad_norm": 0.09136321882720792,
"learning_rate": 7.1581459996176605e-06,
"loss": 0.0467,
"step": 3140
},
{
"epoch": 0.3923278116826504,
"grad_norm": 0.12563106478394556,
"learning_rate": 7.139549815736586e-06,
"loss": 0.0387,
"step": 3150
},
{
"epoch": 0.39357329679910324,
"grad_norm": 0.09101166085701792,
"learning_rate": 7.1209173312356236e-06,
"loss": 0.0373,
"step": 3160
},
{
"epoch": 0.3948187819155561,
"grad_norm": 0.08122854516944507,
"learning_rate": 7.102248862242372e-06,
"loss": 0.0376,
"step": 3170
},
{
"epoch": 0.396064267032009,
"grad_norm": 0.09960511898606343,
"learning_rate": 7.083544725494952e-06,
"loss": 0.0359,
"step": 3180
},
{
"epoch": 0.3973097521484618,
"grad_norm": 0.09474808949631978,
"learning_rate": 7.064805238336648e-06,
"loss": 0.0374,
"step": 3190
},
{
"epoch": 0.3985552372649147,
"grad_norm": 0.10134253688315532,
"learning_rate": 7.046030718710512e-06,
"loss": 0.0398,
"step": 3200
},
{
"epoch": 0.39980072238136755,
"grad_norm": 0.07918405098273072,
"learning_rate": 7.027221485153973e-06,
"loss": 0.0369,
"step": 3210
},
{
"epoch": 0.4010462074978204,
"grad_norm": 0.09635090014215289,
"learning_rate": 7.008377856793438e-06,
"loss": 0.0392,
"step": 3220
},
{
"epoch": 0.4022916926142733,
"grad_norm": 0.09855638055068429,
"learning_rate": 6.9895001533388655e-06,
"loss": 0.0382,
"step": 3230
},
{
"epoch": 0.4035371777307261,
"grad_norm": 0.11203107568068171,
"learning_rate": 6.970588695078352e-06,
"loss": 0.0375,
"step": 3240
},
{
"epoch": 0.40478266284717895,
"grad_norm": 0.09400579660862664,
"learning_rate": 6.951643802872694e-06,
"loss": 0.0382,
"step": 3250
},
{
"epoch": 0.40602814796363185,
"grad_norm": 0.09007080595096814,
"learning_rate": 6.9326657981499455e-06,
"loss": 0.0375,
"step": 3260
},
{
"epoch": 0.4072736330800847,
"grad_norm": 0.08872251352854935,
"learning_rate": 6.913655002899963e-06,
"loss": 0.0379,
"step": 3270
},
{
"epoch": 0.4085191181965376,
"grad_norm": 0.08156717974302112,
"learning_rate": 6.894611739668938e-06,
"loss": 0.0383,
"step": 3280
},
{
"epoch": 0.4097646033129904,
"grad_norm": 0.07436587923812871,
"learning_rate": 6.875536331553936e-06,
"loss": 0.0356,
"step": 3290
},
{
"epoch": 0.41101008842944325,
"grad_norm": 0.1028753466102165,
"learning_rate": 6.856429102197402e-06,
"loss": 0.0393,
"step": 3300
},
{
"epoch": 0.41225557354589615,
"grad_norm": 0.08917324817657225,
"learning_rate": 6.8372903757816785e-06,
"loss": 0.0352,
"step": 3310
},
{
"epoch": 0.413501058662349,
"grad_norm": 0.08448940604648857,
"learning_rate": 6.818120477023502e-06,
"loss": 0.0387,
"step": 3320
},
{
"epoch": 0.4147465437788018,
"grad_norm": 0.0893460784893377,
"learning_rate": 6.798919731168494e-06,
"loss": 0.0378,
"step": 3330
},
{
"epoch": 0.4159920288952547,
"grad_norm": 0.09353075063275997,
"learning_rate": 6.779688463985641e-06,
"loss": 0.0369,
"step": 3340
},
{
"epoch": 0.41723751401170756,
"grad_norm": 0.09142697273532494,
"learning_rate": 6.76042700176177e-06,
"loss": 0.0388,
"step": 3350
},
{
"epoch": 0.4184829991281604,
"grad_norm": 0.09851205130893685,
"learning_rate": 6.741135671296011e-06,
"loss": 0.0363,
"step": 3360
},
{
"epoch": 0.4197284842446133,
"grad_norm": 0.09031023070478926,
"learning_rate": 6.721814799894253e-06,
"loss": 0.0375,
"step": 3370
},
{
"epoch": 0.4209739693610661,
"grad_norm": 0.0840573742921597,
"learning_rate": 6.70246471536359e-06,
"loss": 0.0398,
"step": 3380
},
{
"epoch": 0.422219454477519,
"grad_norm": 0.10066838375332304,
"learning_rate": 6.68308574600676e-06,
"loss": 0.0387,
"step": 3390
},
{
"epoch": 0.42346493959397186,
"grad_norm": 0.09550683351843423,
"learning_rate": 6.663678220616576e-06,
"loss": 0.0387,
"step": 3400
},
{
"epoch": 0.4247104247104247,
"grad_norm": 0.10723200453106543,
"learning_rate": 6.6442424684703465e-06,
"loss": 0.0379,
"step": 3410
},
{
"epoch": 0.4259559098268776,
"grad_norm": 0.09925503698863433,
"learning_rate": 6.624778819324285e-06,
"loss": 0.039,
"step": 3420
},
{
"epoch": 0.4272013949433304,
"grad_norm": 0.08675689051040839,
"learning_rate": 6.605287603407922e-06,
"loss": 0.0458,
"step": 3430
},
{
"epoch": 0.42844688005978326,
"grad_norm": 0.09760322646858284,
"learning_rate": 6.585769151418499e-06,
"loss": 0.0439,
"step": 3440
},
{
"epoch": 0.42969236517623616,
"grad_norm": 0.08833172310610418,
"learning_rate": 6.566223794515354e-06,
"loss": 0.0361,
"step": 3450
},
{
"epoch": 0.430937850292689,
"grad_norm": 0.10904819378533635,
"learning_rate": 6.5466518643143104e-06,
"loss": 0.0374,
"step": 3460
},
{
"epoch": 0.43218333540914183,
"grad_norm": 0.07692639646342093,
"learning_rate": 6.527053692882048e-06,
"loss": 0.0414,
"step": 3470
},
{
"epoch": 0.4334288205255947,
"grad_norm": 0.09853563227999187,
"learning_rate": 6.507429612730465e-06,
"loss": 0.0346,
"step": 3480
},
{
"epoch": 0.43467430564204756,
"grad_norm": 0.07701996602957163,
"learning_rate": 6.487779956811038e-06,
"loss": 0.0393,
"step": 3490
},
{
"epoch": 0.43591979075850046,
"grad_norm": 0.07580579829906375,
"learning_rate": 6.468105058509174e-06,
"loss": 0.0365,
"step": 3500
},
{
"epoch": 0.4371652758749533,
"grad_norm": 0.10290784742784614,
"learning_rate": 6.448405251638562e-06,
"loss": 0.04,
"step": 3510
},
{
"epoch": 0.43841076099140613,
"grad_norm": 0.08603442461481811,
"learning_rate": 6.428680870435492e-06,
"loss": 0.0379,
"step": 3520
},
{
"epoch": 0.43965624610785903,
"grad_norm": 0.096295774246373,
"learning_rate": 6.408932249553198e-06,
"loss": 0.0375,
"step": 3530
},
{
"epoch": 0.44090173122431187,
"grad_norm": 0.09699707334972303,
"learning_rate": 6.389159724056175e-06,
"loss": 0.0358,
"step": 3540
},
{
"epoch": 0.4421472163407647,
"grad_norm": 0.08164011122773947,
"learning_rate": 6.3693636294145e-06,
"loss": 0.0381,
"step": 3550
},
{
"epoch": 0.4433927014572176,
"grad_norm": 0.08751001930302163,
"learning_rate": 6.349544301498127e-06,
"loss": 0.0356,
"step": 3560
},
{
"epoch": 0.44463818657367044,
"grad_norm": 0.08965656172522919,
"learning_rate": 6.329702076571202e-06,
"loss": 0.0367,
"step": 3570
},
{
"epoch": 0.44588367169012333,
"grad_norm": 0.08872702673172018,
"learning_rate": 6.30983729128635e-06,
"loss": 0.0374,
"step": 3580
},
{
"epoch": 0.44712915680657617,
"grad_norm": 0.09919217277076049,
"learning_rate": 6.289950282678968e-06,
"loss": 0.0365,
"step": 3590
},
{
"epoch": 0.448374641923029,
"grad_norm": 0.1204688614416355,
"learning_rate": 6.2700413881615045e-06,
"loss": 0.0405,
"step": 3600
},
{
"epoch": 0.4496201270394819,
"grad_norm": 0.0886836308714703,
"learning_rate": 6.2501109455177335e-06,
"loss": 0.0376,
"step": 3610
},
{
"epoch": 0.45086561215593474,
"grad_norm": 0.07736114102152715,
"learning_rate": 6.230159292897024e-06,
"loss": 0.0384,
"step": 3620
},
{
"epoch": 0.4521110972723876,
"grad_norm": 0.08573745332344869,
"learning_rate": 6.210186768808608e-06,
"loss": 0.0359,
"step": 3630
},
{
"epoch": 0.45335658238884047,
"grad_norm": 0.0879163698710747,
"learning_rate": 6.190193712115826e-06,
"loss": 0.0429,
"step": 3640
},
{
"epoch": 0.4546020675052933,
"grad_norm": 0.08385409268772795,
"learning_rate": 6.1701804620303885e-06,
"loss": 0.0384,
"step": 3650
},
{
"epoch": 0.45584755262174614,
"grad_norm": 0.08913212132892431,
"learning_rate": 6.150147358106616e-06,
"loss": 0.0431,
"step": 3660
},
{
"epoch": 0.45709303773819904,
"grad_norm": 0.0866186587160168,
"learning_rate": 6.130094740235679e-06,
"loss": 0.0376,
"step": 3670
},
{
"epoch": 0.4583385228546519,
"grad_norm": 0.09083929180589548,
"learning_rate": 6.110022948639833e-06,
"loss": 0.0357,
"step": 3680
},
{
"epoch": 0.45958400797110477,
"grad_norm": 0.10433973917520964,
"learning_rate": 6.089932323866639e-06,
"loss": 0.0365,
"step": 3690
},
{
"epoch": 0.4608294930875576,
"grad_norm": 0.08693585315855797,
"learning_rate": 6.069823206783194e-06,
"loss": 0.0358,
"step": 3700
},
{
"epoch": 0.46207497820401044,
"grad_norm": 0.09506549061024662,
"learning_rate": 6.049695938570342e-06,
"loss": 0.0359,
"step": 3710
},
{
"epoch": 0.46332046332046334,
"grad_norm": 0.08650360999384969,
"learning_rate": 6.029550860716894e-06,
"loss": 0.0373,
"step": 3720
},
{
"epoch": 0.4645659484369162,
"grad_norm": 0.08774369289195169,
"learning_rate": 6.009388315013819e-06,
"loss": 0.035,
"step": 3730
},
{
"epoch": 0.465811433553369,
"grad_norm": 0.09281844863222019,
"learning_rate": 5.9892086435484575e-06,
"loss": 0.0355,
"step": 3740
},
{
"epoch": 0.4670569186698219,
"grad_norm": 0.08168130617977422,
"learning_rate": 5.969012188698717e-06,
"loss": 0.0366,
"step": 3750
},
{
"epoch": 0.46830240378627475,
"grad_norm": 0.09289124591513812,
"learning_rate": 5.948799293127258e-06,
"loss": 0.0368,
"step": 3760
},
{
"epoch": 0.46954788890272764,
"grad_norm": 0.08590808622934684,
"learning_rate": 5.9285702997756774e-06,
"loss": 0.0372,
"step": 3770
},
{
"epoch": 0.4707933740191805,
"grad_norm": 0.09587624744407341,
"learning_rate": 5.908325551858703e-06,
"loss": 0.0365,
"step": 3780
},
{
"epoch": 0.4720388591356333,
"grad_norm": 0.09008095094062744,
"learning_rate": 5.8880653928583555e-06,
"loss": 0.038,
"step": 3790
},
{
"epoch": 0.4732843442520862,
"grad_norm": 0.07615490717221561,
"learning_rate": 5.867790166518129e-06,
"loss": 0.0354,
"step": 3800
},
{
"epoch": 0.47452982936853905,
"grad_norm": 0.09473010524394446,
"learning_rate": 5.8475002168371585e-06,
"loss": 0.0374,
"step": 3810
},
{
"epoch": 0.4757753144849919,
"grad_norm": 0.08303526376060547,
"learning_rate": 5.827195888064383e-06,
"loss": 0.0371,
"step": 3820
},
{
"epoch": 0.4770207996014448,
"grad_norm": 0.08855714494970791,
"learning_rate": 5.806877524692699e-06,
"loss": 0.0358,
"step": 3830
},
{
"epoch": 0.4782662847178976,
"grad_norm": 0.07160984703351547,
"learning_rate": 5.786545471453129e-06,
"loss": 0.0352,
"step": 3840
},
{
"epoch": 0.47951176983435045,
"grad_norm": 0.0997131158733735,
"learning_rate": 5.766200073308957e-06,
"loss": 0.0364,
"step": 3850
},
{
"epoch": 0.48075725495080335,
"grad_norm": 0.09305720410336772,
"learning_rate": 5.7458416754498835e-06,
"loss": 0.0357,
"step": 3860
},
{
"epoch": 0.4820027400672562,
"grad_norm": 0.07178448066058737,
"learning_rate": 5.725470623286172e-06,
"loss": 0.0372,
"step": 3870
},
{
"epoch": 0.4832482251837091,
"grad_norm": 0.08008774183521761,
"learning_rate": 5.705087262442785e-06,
"loss": 0.0369,
"step": 3880
},
{
"epoch": 0.4844937103001619,
"grad_norm": 0.07477132401537798,
"learning_rate": 5.684691938753517e-06,
"loss": 0.0376,
"step": 3890
},
{
"epoch": 0.48573919541661476,
"grad_norm": 0.08712972267469035,
"learning_rate": 5.6642849982551315e-06,
"loss": 0.0551,
"step": 3900
},
{
"epoch": 0.48698468053306765,
"grad_norm": 0.09715561691541937,
"learning_rate": 5.643866787181486e-06,
"loss": 0.038,
"step": 3910
},
{
"epoch": 0.4882301656495205,
"grad_norm": 0.08465964641344985,
"learning_rate": 5.623437651957666e-06,
"loss": 0.0363,
"step": 3920
},
{
"epoch": 0.4894756507659733,
"grad_norm": 0.1084167071375196,
"learning_rate": 5.602997939194094e-06,
"loss": 0.0374,
"step": 3930
},
{
"epoch": 0.4907211358824262,
"grad_norm": 0.0813004465711618,
"learning_rate": 5.582547995680658e-06,
"loss": 0.037,
"step": 3940
},
{
"epoch": 0.49196662099887906,
"grad_norm": 0.0845492848679353,
"learning_rate": 5.5620881683808285e-06,
"loss": 0.0453,
"step": 3950
},
{
"epoch": 0.4932121061153319,
"grad_norm": 0.09095736782725126,
"learning_rate": 5.541618804425768e-06,
"loss": 0.0375,
"step": 3960
},
{
"epoch": 0.4944575912317848,
"grad_norm": 0.09077674512465791,
"learning_rate": 5.521140251108443e-06,
"loss": 0.0359,
"step": 3970
},
{
"epoch": 0.4957030763482376,
"grad_norm": 0.08095932810450854,
"learning_rate": 5.500652855877728e-06,
"loss": 0.0412,
"step": 3980
},
{
"epoch": 0.4969485614646905,
"grad_norm": 0.10046653085305575,
"learning_rate": 5.480156966332516e-06,
"loss": 0.0373,
"step": 3990
},
{
"epoch": 0.49819404658114336,
"grad_norm": 0.07731109541739078,
"learning_rate": 5.4596529302158195e-06,
"loss": 0.0379,
"step": 4000
},
{
"epoch": 0.49819404658114336,
"eval_loss": 0.03875109180808067,
"eval_runtime": 837.2143,
"eval_samples_per_second": 4.779,
"eval_steps_per_second": 0.3,
"step": 4000
},
{
"epoch": 0.4994395316975962,
"grad_norm": 0.08087609269303184,
"learning_rate": 5.439141095408868e-06,
"loss": 0.0401,
"step": 4010
},
{
"epoch": 0.5006850168140491,
"grad_norm": 0.08137909411365103,
"learning_rate": 5.4186218099252055e-06,
"loss": 0.0379,
"step": 4020
},
{
"epoch": 0.5019305019305019,
"grad_norm": 0.08950480563012517,
"learning_rate": 5.398095421904791e-06,
"loss": 0.0365,
"step": 4030
},
{
"epoch": 0.5031759870469548,
"grad_norm": 0.1057918321419654,
"learning_rate": 5.377562279608089e-06,
"loss": 0.0356,
"step": 4040
},
{
"epoch": 0.5044214721634076,
"grad_norm": 0.07251854316319703,
"learning_rate": 5.357022731410157e-06,
"loss": 0.0351,
"step": 4050
},
{
"epoch": 0.5056669572798606,
"grad_norm": 0.08066012398266963,
"learning_rate": 5.336477125794743e-06,
"loss": 0.0356,
"step": 4060
},
{
"epoch": 0.5069124423963134,
"grad_norm": 0.0761800355561372,
"learning_rate": 5.315925811348357e-06,
"loss": 0.0346,
"step": 4070
},
{
"epoch": 0.5081579275127662,
"grad_norm": 0.15590272248770423,
"learning_rate": 5.295369136754384e-06,
"loss": 0.0385,
"step": 4080
},
{
"epoch": 0.5094034126292191,
"grad_norm": 0.08073245707842704,
"learning_rate": 5.274807450787137e-06,
"loss": 0.0365,
"step": 4090
},
{
"epoch": 0.5106488977456719,
"grad_norm": 0.09757351217404539,
"learning_rate": 5.254241102305965e-06,
"loss": 0.0382,
"step": 4100
},
{
"epoch": 0.5118943828621249,
"grad_norm": 0.08013336393248688,
"learning_rate": 5.233670440249318e-06,
"loss": 0.0354,
"step": 4110
},
{
"epoch": 0.5131398679785777,
"grad_norm": 0.08500527243988966,
"learning_rate": 5.21309581362883e-06,
"loss": 0.0356,
"step": 4120
},
{
"epoch": 0.5143853530950305,
"grad_norm": 0.08246081379011684,
"learning_rate": 5.192517571523408e-06,
"loss": 0.038,
"step": 4130
},
{
"epoch": 0.5156308382114834,
"grad_norm": 0.08795733956897053,
"learning_rate": 5.171936063073292e-06,
"loss": 0.043,
"step": 4140
},
{
"epoch": 0.5168763233279362,
"grad_norm": 0.17188664091219194,
"learning_rate": 5.151351637474145e-06,
"loss": 0.0371,
"step": 4150
},
{
"epoch": 0.518121808444389,
"grad_norm": 0.08935192789639629,
"learning_rate": 5.130764643971123e-06,
"loss": 0.0364,
"step": 4160
},
{
"epoch": 0.519367293560842,
"grad_norm": 0.08254538202102524,
"learning_rate": 5.110175431852954e-06,
"loss": 0.0407,
"step": 4170
},
{
"epoch": 0.5206127786772948,
"grad_norm": 0.08501238116429123,
"learning_rate": 5.089584350446001e-06,
"loss": 0.0371,
"step": 4180
},
{
"epoch": 0.5218582637937477,
"grad_norm": 0.0846361975040117,
"learning_rate": 5.068991749108345e-06,
"loss": 0.0383,
"step": 4190
},
{
"epoch": 0.5231037489102005,
"grad_norm": 0.0826265348364115,
"learning_rate": 5.048397977223856e-06,
"loss": 0.0367,
"step": 4200
},
{
"epoch": 0.5243492340266533,
"grad_norm": 0.08860053794001904,
"learning_rate": 5.027803384196265e-06,
"loss": 0.0373,
"step": 4210
},
{
"epoch": 0.5255947191431063,
"grad_norm": 0.0800144716438524,
"learning_rate": 5.007208319443233e-06,
"loss": 0.0369,
"step": 4220
},
{
"epoch": 0.5268402042595591,
"grad_norm": 0.08494407990290043,
"learning_rate": 4.9866131323904225e-06,
"loss": 0.0356,
"step": 4230
},
{
"epoch": 0.528085689376012,
"grad_norm": 0.07571005703566845,
"learning_rate": 4.966018172465577e-06,
"loss": 0.0369,
"step": 4240
},
{
"epoch": 0.5293311744924648,
"grad_norm": 0.08285683201982978,
"learning_rate": 4.945423789092582e-06,
"loss": 0.0344,
"step": 4250
},
{
"epoch": 0.5305766596089176,
"grad_norm": 0.08546389296460612,
"learning_rate": 4.924830331685539e-06,
"loss": 0.0364,
"step": 4260
},
{
"epoch": 0.5318221447253705,
"grad_norm": 0.11128427091364633,
"learning_rate": 4.904238149642847e-06,
"loss": 0.0364,
"step": 4270
},
{
"epoch": 0.5330676298418234,
"grad_norm": 0.07670884663714952,
"learning_rate": 4.883647592341258e-06,
"loss": 0.0365,
"step": 4280
},
{
"epoch": 0.5343131149582763,
"grad_norm": 0.3176148785698488,
"learning_rate": 4.863059009129962e-06,
"loss": 0.0369,
"step": 4290
},
{
"epoch": 0.5355586000747291,
"grad_norm": 0.10713813466482028,
"learning_rate": 4.842472749324656e-06,
"loss": 0.0359,
"step": 4300
},
{
"epoch": 0.536804085191182,
"grad_norm": 0.07931238210250223,
"learning_rate": 4.821889162201615e-06,
"loss": 0.0358,
"step": 4310
},
{
"epoch": 0.5380495703076348,
"grad_norm": 0.08258895469888074,
"learning_rate": 4.801308596991769e-06,
"loss": 0.0346,
"step": 4320
},
{
"epoch": 0.5392950554240877,
"grad_norm": 0.1109589196940028,
"learning_rate": 4.780731402874778e-06,
"loss": 0.0363,
"step": 4330
},
{
"epoch": 0.5405405405405406,
"grad_norm": 0.0981222571494136,
"learning_rate": 4.760157928973105e-06,
"loss": 0.0386,
"step": 4340
},
{
"epoch": 0.5417860256569934,
"grad_norm": 0.08759331671597073,
"learning_rate": 4.739588524346093e-06,
"loss": 0.035,
"step": 4350
},
{
"epoch": 0.5430315107734462,
"grad_norm": 0.08247240433770839,
"learning_rate": 4.719023537984046e-06,
"loss": 0.0365,
"step": 4360
},
{
"epoch": 0.5442769958898991,
"grad_norm": 0.2538358357776023,
"learning_rate": 4.698463318802305e-06,
"loss": 0.046,
"step": 4370
},
{
"epoch": 0.5455224810063519,
"grad_norm": 0.0924003545309488,
"learning_rate": 4.677908215635328e-06,
"loss": 0.0348,
"step": 4380
},
{
"epoch": 0.5467679661228049,
"grad_norm": 0.07953047997851986,
"learning_rate": 4.657358577230772e-06,
"loss": 0.035,
"step": 4390
},
{
"epoch": 0.5480134512392577,
"grad_norm": 0.07615861201505639,
"learning_rate": 4.636814752243577e-06,
"loss": 0.0347,
"step": 4400
},
{
"epoch": 0.5492589363557105,
"grad_norm": 0.07306009452356657,
"learning_rate": 4.616277089230051e-06,
"loss": 0.0348,
"step": 4410
},
{
"epoch": 0.5505044214721634,
"grad_norm": 0.08089102430322212,
"learning_rate": 4.595745936641955e-06,
"loss": 0.0361,
"step": 4420
},
{
"epoch": 0.5517499065886162,
"grad_norm": 0.08217252158628566,
"learning_rate": 4.575221642820588e-06,
"loss": 0.0424,
"step": 4430
},
{
"epoch": 0.5529953917050692,
"grad_norm": 0.07762469038975436,
"learning_rate": 4.55470455599088e-06,
"loss": 0.0364,
"step": 4440
},
{
"epoch": 0.554240876821522,
"grad_norm": 0.0686081495172252,
"learning_rate": 4.534195024255488e-06,
"loss": 0.0351,
"step": 4450
},
{
"epoch": 0.5554863619379748,
"grad_norm": 0.08788638662330096,
"learning_rate": 4.513693395588883e-06,
"loss": 0.0341,
"step": 4460
},
{
"epoch": 0.5567318470544277,
"grad_norm": 0.07249731141269991,
"learning_rate": 4.493200017831448e-06,
"loss": 0.0368,
"step": 4470
},
{
"epoch": 0.5579773321708805,
"grad_norm": 0.08456421928370435,
"learning_rate": 4.472715238683577e-06,
"loss": 0.0359,
"step": 4480
},
{
"epoch": 0.5592228172873334,
"grad_norm": 0.08888692985996884,
"learning_rate": 4.45223940569978e-06,
"loss": 0.0355,
"step": 4490
},
{
"epoch": 0.5604683024037863,
"grad_norm": 0.0728067762448024,
"learning_rate": 4.431772866282776e-06,
"loss": 0.0359,
"step": 4500
},
{
"epoch": 0.5617137875202391,
"grad_norm": 0.09569939502128422,
"learning_rate": 4.41131596767761e-06,
"loss": 0.0401,
"step": 4510
},
{
"epoch": 0.562959272636692,
"grad_norm": 0.09245622279833991,
"learning_rate": 4.390869056965752e-06,
"loss": 0.0346,
"step": 4520
},
{
"epoch": 0.5642047577531448,
"grad_norm": 0.0856450746438228,
"learning_rate": 4.370432481059219e-06,
"loss": 0.0357,
"step": 4530
},
{
"epoch": 0.5654502428695977,
"grad_norm": 0.07864235155334423,
"learning_rate": 4.3500065866946736e-06,
"loss": 0.0354,
"step": 4540
},
{
"epoch": 0.5666957279860506,
"grad_norm": 0.07895524505876401,
"learning_rate": 4.329591720427561e-06,
"loss": 0.038,
"step": 4550
},
{
"epoch": 0.5679412131025035,
"grad_norm": 0.06919521619960296,
"learning_rate": 4.30918822862621e-06,
"loss": 0.0392,
"step": 4560
},
{
"epoch": 0.5691866982189563,
"grad_norm": 0.06988408853131772,
"learning_rate": 4.288796457465967e-06,
"loss": 0.0367,
"step": 4570
},
{
"epoch": 0.5704321833354091,
"grad_norm": 0.08530089083645045,
"learning_rate": 4.268416752923323e-06,
"loss": 0.034,
"step": 4580
},
{
"epoch": 0.571677668451862,
"grad_norm": 0.0866092035592873,
"learning_rate": 4.2480494607700394e-06,
"loss": 0.039,
"step": 4590
},
{
"epoch": 0.5729231535683149,
"grad_norm": 0.07426315591183634,
"learning_rate": 4.2276949265672815e-06,
"loss": 0.0355,
"step": 4600
},
{
"epoch": 0.5741686386847678,
"grad_norm": 0.07105220250355836,
"learning_rate": 4.207353495659758e-06,
"loss": 0.0348,
"step": 4610
},
{
"epoch": 0.5754141238012206,
"grad_norm": 0.0828502788807824,
"learning_rate": 4.1870255131698636e-06,
"loss": 0.036,
"step": 4620
},
{
"epoch": 0.5766596089176734,
"grad_norm": 0.08079433895964318,
"learning_rate": 4.166711323991818e-06,
"loss": 0.0435,
"step": 4630
},
{
"epoch": 0.5779050940341263,
"grad_norm": 0.08493675955532878,
"learning_rate": 4.146411272785815e-06,
"loss": 0.035,
"step": 4640
},
{
"epoch": 0.5791505791505791,
"grad_norm": 0.07728604940447889,
"learning_rate": 4.1261257039721845e-06,
"loss": 0.0362,
"step": 4650
},
{
"epoch": 0.580396064267032,
"grad_norm": 0.08914486315528271,
"learning_rate": 4.1058549617255315e-06,
"loss": 0.0362,
"step": 4660
},
{
"epoch": 0.5816415493834849,
"grad_norm": 0.084417951611929,
"learning_rate": 4.085599389968913e-06,
"loss": 0.035,
"step": 4670
},
{
"epoch": 0.5828870344999377,
"grad_norm": 0.08307682451486548,
"learning_rate": 4.065359332367992e-06,
"loss": 0.0535,
"step": 4680
},
{
"epoch": 0.5841325196163906,
"grad_norm": 0.07165685132824422,
"learning_rate": 4.045135132325216e-06,
"loss": 0.0342,
"step": 4690
},
{
"epoch": 0.5853780047328434,
"grad_norm": 0.08250346834846708,
"learning_rate": 4.024927132973984e-06,
"loss": 0.0375,
"step": 4700
},
{
"epoch": 0.5866234898492964,
"grad_norm": 0.09593473940955272,
"learning_rate": 4.004735677172823e-06,
"loss": 0.0352,
"step": 4710
},
{
"epoch": 0.5878689749657492,
"grad_norm": 0.08037620776646574,
"learning_rate": 3.984561107499576e-06,
"loss": 0.0376,
"step": 4720
},
{
"epoch": 0.589114460082202,
"grad_norm": 0.08192591510347322,
"learning_rate": 3.9644037662455904e-06,
"loss": 0.0368,
"step": 4730
},
{
"epoch": 0.5903599451986549,
"grad_norm": 0.08855109420448647,
"learning_rate": 3.944263995409905e-06,
"loss": 0.0368,
"step": 4740
},
{
"epoch": 0.5916054303151077,
"grad_norm": 0.07587131061896397,
"learning_rate": 3.924142136693452e-06,
"loss": 0.0362,
"step": 4750
},
{
"epoch": 0.5928509154315605,
"grad_norm": 0.0818796399275586,
"learning_rate": 3.904038531493257e-06,
"loss": 0.0367,
"step": 4760
},
{
"epoch": 0.5940964005480135,
"grad_norm": 0.06799608040292701,
"learning_rate": 3.8839535208966474e-06,
"loss": 0.0353,
"step": 4770
},
{
"epoch": 0.5953418856644663,
"grad_norm": 0.09285743533757898,
"learning_rate": 3.86388744567547e-06,
"loss": 0.0366,
"step": 4780
},
{
"epoch": 0.5965873707809192,
"grad_norm": 0.08503340805142413,
"learning_rate": 3.8438406462803e-06,
"loss": 0.0343,
"step": 4790
},
{
"epoch": 0.597832855897372,
"grad_norm": 0.08719236424372612,
"learning_rate": 3.82381346283467e-06,
"loss": 0.0345,
"step": 4800
},
{
"epoch": 0.5990783410138248,
"grad_norm": 0.07911759435664936,
"learning_rate": 3.8038062351293036e-06,
"loss": 0.0344,
"step": 4810
},
{
"epoch": 0.6003238261302778,
"grad_norm": 0.07129019359967453,
"learning_rate": 3.783819302616339e-06,
"loss": 0.0345,
"step": 4820
},
{
"epoch": 0.6015693112467306,
"grad_norm": 0.08169666397178237,
"learning_rate": 3.763853004403585e-06,
"loss": 0.0366,
"step": 4830
},
{
"epoch": 0.6028147963631835,
"grad_norm": 0.07816183273790703,
"learning_rate": 3.743907679248752e-06,
"loss": 0.0403,
"step": 4840
},
{
"epoch": 0.6040602814796363,
"grad_norm": 0.07799188265438552,
"learning_rate": 3.723983665553712e-06,
"loss": 0.0356,
"step": 4850
},
{
"epoch": 0.6053057665960891,
"grad_norm": 0.08453695233583738,
"learning_rate": 3.7040813013587617e-06,
"loss": 0.0352,
"step": 4860
},
{
"epoch": 0.606551251712542,
"grad_norm": 0.08582922327632445,
"learning_rate": 3.6842009243368783e-06,
"loss": 0.0391,
"step": 4870
},
{
"epoch": 0.6077967368289949,
"grad_norm": 0.07682972276297419,
"learning_rate": 3.6643428717879935e-06,
"loss": 0.036,
"step": 4880
},
{
"epoch": 0.6090422219454478,
"grad_norm": 0.07689096654430388,
"learning_rate": 3.6445074806332724e-06,
"loss": 0.0358,
"step": 4890
},
{
"epoch": 0.6102877070619006,
"grad_norm": 0.07295422485911567,
"learning_rate": 3.624695087409399e-06,
"loss": 0.0338,
"step": 4900
},
{
"epoch": 0.6115331921783534,
"grad_norm": 0.07075949046510298,
"learning_rate": 3.604906028262858e-06,
"loss": 0.0345,
"step": 4910
},
{
"epoch": 0.6127786772948063,
"grad_norm": 0.08032966979093947,
"learning_rate": 3.585140638944242e-06,
"loss": 0.0347,
"step": 4920
},
{
"epoch": 0.6140241624112592,
"grad_norm": 0.07888132297133613,
"learning_rate": 3.565399254802545e-06,
"loss": 0.034,
"step": 4930
},
{
"epoch": 0.6152696475277121,
"grad_norm": 0.07750218855014168,
"learning_rate": 3.545682210779484e-06,
"loss": 0.0345,
"step": 4940
},
{
"epoch": 0.6165151326441649,
"grad_norm": 0.07217659392974636,
"learning_rate": 3.5259898414038053e-06,
"loss": 0.0331,
"step": 4950
},
{
"epoch": 0.6177606177606177,
"grad_norm": 0.07462403764615293,
"learning_rate": 3.5063224807856116e-06,
"loss": 0.0349,
"step": 4960
},
{
"epoch": 0.6190061028770706,
"grad_norm": 0.09085399586194012,
"learning_rate": 3.486680462610704e-06,
"loss": 0.0362,
"step": 4970
},
{
"epoch": 0.6202515879935234,
"grad_norm": 0.08134490027796866,
"learning_rate": 3.467064120134902e-06,
"loss": 0.0416,
"step": 4980
},
{
"epoch": 0.6214970731099764,
"grad_norm": 0.07439865538712978,
"learning_rate": 3.447473786178405e-06,
"loss": 0.0337,
"step": 4990
},
{
"epoch": 0.6227425582264292,
"grad_norm": 0.08988556766682597,
"learning_rate": 3.4279097931201365e-06,
"loss": 0.0366,
"step": 5000
},
{
"epoch": 0.6227425582264292,
"eval_loss": 0.037423599511384964,
"eval_runtime": 838.985,
"eval_samples_per_second": 4.769,
"eval_steps_per_second": 0.299,
"step": 5000
},
{
"epoch": 0.623988043342882,
"grad_norm": 0.07368562348225001,
"learning_rate": 3.4083724728921086e-06,
"loss": 0.0379,
"step": 5010
},
{
"epoch": 0.6252335284593349,
"grad_norm": 0.08044415630307507,
"learning_rate": 3.388862156973789e-06,
"loss": 0.0335,
"step": 5020
},
{
"epoch": 0.6264790135757877,
"grad_norm": 0.07652489551491354,
"learning_rate": 3.3693791763864784e-06,
"loss": 0.0334,
"step": 5030
},
{
"epoch": 0.6277244986922407,
"grad_norm": 0.07059001987028125,
"learning_rate": 3.3499238616876937e-06,
"loss": 0.036,
"step": 5040
},
{
"epoch": 0.6289699838086935,
"grad_norm": 0.0789315683738395,
"learning_rate": 3.3304965429655567e-06,
"loss": 0.0348,
"step": 5050
},
{
"epoch": 0.6302154689251463,
"grad_norm": 0.0784458276562498,
"learning_rate": 3.3110975498331986e-06,
"loss": 0.0426,
"step": 5060
},
{
"epoch": 0.6314609540415992,
"grad_norm": 0.07596701292317404,
"learning_rate": 3.2917272114231653e-06,
"loss": 0.037,
"step": 5070
},
{
"epoch": 0.632706439158052,
"grad_norm": 0.07339182015190028,
"learning_rate": 3.272385856381831e-06,
"loss": 0.0347,
"step": 5080
},
{
"epoch": 0.633951924274505,
"grad_norm": 0.08699033400119037,
"learning_rate": 3.253073812863824e-06,
"loss": 0.0352,
"step": 5090
},
{
"epoch": 0.6351974093909578,
"grad_norm": 0.08139071221185196,
"learning_rate": 3.233791408526461e-06,
"loss": 0.0361,
"step": 5100
},
{
"epoch": 0.6364428945074107,
"grad_norm": 0.08719769563593989,
"learning_rate": 3.21453897052419e-06,
"loss": 0.0349,
"step": 5110
},
{
"epoch": 0.6376883796238635,
"grad_norm": 0.0708422951525975,
"learning_rate": 3.195316825503029e-06,
"loss": 0.0337,
"step": 5120
},
{
"epoch": 0.6389338647403163,
"grad_norm": 0.0690633791829835,
"learning_rate": 3.176125299595033e-06,
"loss": 0.0335,
"step": 5130
},
{
"epoch": 0.6401793498567692,
"grad_norm": 0.07564271868343764,
"learning_rate": 3.1569647184127606e-06,
"loss": 0.0346,
"step": 5140
},
{
"epoch": 0.6414248349732221,
"grad_norm": 0.08599533730869659,
"learning_rate": 3.1378354070437455e-06,
"loss": 0.0359,
"step": 5150
},
{
"epoch": 0.642670320089675,
"grad_norm": 0.07806729703434577,
"learning_rate": 3.118737690044984e-06,
"loss": 0.0369,
"step": 5160
},
{
"epoch": 0.6439158052061278,
"grad_norm": 0.07961361093653316,
"learning_rate": 3.099671891437427e-06,
"loss": 0.0331,
"step": 5170
},
{
"epoch": 0.6451612903225806,
"grad_norm": 0.07733227309637837,
"learning_rate": 3.0806383347004807e-06,
"loss": 0.0333,
"step": 5180
},
{
"epoch": 0.6464067754390335,
"grad_norm": 0.08695812998134722,
"learning_rate": 3.0616373427665256e-06,
"loss": 0.0379,
"step": 5190
},
{
"epoch": 0.6476522605554864,
"grad_norm": 0.08035052137063912,
"learning_rate": 3.042669238015427e-06,
"loss": 0.0348,
"step": 5200
},
{
"epoch": 0.6488977456719393,
"grad_norm": 0.07238695738490918,
"learning_rate": 3.0237343422690725e-06,
"loss": 0.0341,
"step": 5210
},
{
"epoch": 0.6501432307883921,
"grad_norm": 0.08224468669410057,
"learning_rate": 3.0048329767859104e-06,
"loss": 0.0345,
"step": 5220
},
{
"epoch": 0.6513887159048449,
"grad_norm": 0.07629162025553535,
"learning_rate": 2.985965462255499e-06,
"loss": 0.0359,
"step": 5230
},
{
"epoch": 0.6526342010212978,
"grad_norm": 0.0717956931175302,
"learning_rate": 2.9671321187930615e-06,
"loss": 0.0352,
"step": 5240
},
{
"epoch": 0.6538796861377506,
"grad_norm": 0.0740179945086292,
"learning_rate": 2.9483332659340652e-06,
"loss": 0.0359,
"step": 5250
},
{
"epoch": 0.6551251712542036,
"grad_norm": 0.08241315185207748,
"learning_rate": 2.9295692226287875e-06,
"loss": 0.035,
"step": 5260
},
{
"epoch": 0.6563706563706564,
"grad_norm": 0.07213980543546894,
"learning_rate": 2.910840307236915e-06,
"loss": 0.0342,
"step": 5270
},
{
"epoch": 0.6576161414871092,
"grad_norm": 0.09165264225787473,
"learning_rate": 2.8921468375221308e-06,
"loss": 0.0358,
"step": 5280
},
{
"epoch": 0.6588616266035621,
"grad_norm": 0.07828015809476291,
"learning_rate": 2.8734891306467383e-06,
"loss": 0.033,
"step": 5290
},
{
"epoch": 0.6601071117200149,
"grad_norm": 0.07430820316934614,
"learning_rate": 2.854867503166265e-06,
"loss": 0.0348,
"step": 5300
},
{
"epoch": 0.6613525968364679,
"grad_norm": 0.07466403058109475,
"learning_rate": 2.8362822710241002e-06,
"loss": 0.0343,
"step": 5310
},
{
"epoch": 0.6625980819529207,
"grad_norm": 0.08245007542150891,
"learning_rate": 2.817733749546134e-06,
"loss": 0.035,
"step": 5320
},
{
"epoch": 0.6638435670693735,
"grad_norm": 0.08359123302429086,
"learning_rate": 2.799222253435403e-06,
"loss": 0.0341,
"step": 5330
},
{
"epoch": 0.6650890521858264,
"grad_norm": 0.10350263178886938,
"learning_rate": 2.780748096766758e-06,
"loss": 0.0372,
"step": 5340
},
{
"epoch": 0.6663345373022792,
"grad_norm": 0.07256896882094224,
"learning_rate": 2.762311592981525e-06,
"loss": 0.0334,
"step": 5350
},
{
"epoch": 0.667580022418732,
"grad_norm": 0.06577753138989224,
"learning_rate": 2.7439130548821986e-06,
"loss": 0.0364,
"step": 5360
},
{
"epoch": 0.668825507535185,
"grad_norm": 0.07686341741378933,
"learning_rate": 2.7255527946271287e-06,
"loss": 0.0358,
"step": 5370
},
{
"epoch": 0.6700709926516378,
"grad_norm": 0.08122361668658674,
"learning_rate": 2.7072311237252224e-06,
"loss": 0.0354,
"step": 5380
},
{
"epoch": 0.6713164777680907,
"grad_norm": 0.07345421003800728,
"learning_rate": 2.6889483530306657e-06,
"loss": 0.0345,
"step": 5390
},
{
"epoch": 0.6725619628845435,
"grad_norm": 0.07687336721271233,
"learning_rate": 2.670704792737642e-06,
"loss": 0.0342,
"step": 5400
},
{
"epoch": 0.6738074480009963,
"grad_norm": 0.0762945284392139,
"learning_rate": 2.6525007523750723e-06,
"loss": 0.0355,
"step": 5410
},
{
"epoch": 0.6750529331174493,
"grad_norm": 0.07771757792483129,
"learning_rate": 2.6343365408013642e-06,
"loss": 0.0345,
"step": 5420
},
{
"epoch": 0.6762984182339021,
"grad_norm": 0.08056552324055927,
"learning_rate": 2.6162124661991715e-06,
"loss": 0.0405,
"step": 5430
},
{
"epoch": 0.677543903350355,
"grad_norm": 0.0881648990710348,
"learning_rate": 2.598128836070164e-06,
"loss": 0.0354,
"step": 5440
},
{
"epoch": 0.6787893884668078,
"grad_norm": 0.0729587992152256,
"learning_rate": 2.5800859572298147e-06,
"loss": 0.035,
"step": 5450
},
{
"epoch": 0.6800348735832606,
"grad_norm": 0.06861549815799306,
"learning_rate": 2.5620841358021874e-06,
"loss": 0.0371,
"step": 5460
},
{
"epoch": 0.6812803586997135,
"grad_norm": 0.07161887920463696,
"learning_rate": 2.544123677214747e-06,
"loss": 0.0355,
"step": 5470
},
{
"epoch": 0.6825258438161664,
"grad_norm": 0.08025793937642813,
"learning_rate": 2.5262048861931767e-06,
"loss": 0.0343,
"step": 5480
},
{
"epoch": 0.6837713289326193,
"grad_norm": 0.07592732953174938,
"learning_rate": 2.5083280667562087e-06,
"loss": 0.0337,
"step": 5490
},
{
"epoch": 0.6850168140490721,
"grad_norm": 0.08841842120417269,
"learning_rate": 2.4904935222104654e-06,
"loss": 0.0449,
"step": 5500
},
{
"epoch": 0.686262299165525,
"grad_norm": 0.06970957798997367,
"learning_rate": 2.472701555145312e-06,
"loss": 0.0325,
"step": 5510
},
{
"epoch": 0.6875077842819778,
"grad_norm": 0.07452151758517693,
"learning_rate": 2.454952467427723e-06,
"loss": 0.0369,
"step": 5520
},
{
"epoch": 0.6887532693984307,
"grad_norm": 0.07907233219809202,
"learning_rate": 2.437246560197166e-06,
"loss": 0.034,
"step": 5530
},
{
"epoch": 0.6899987545148836,
"grad_norm": 0.07972745109645156,
"learning_rate": 2.4195841338604864e-06,
"loss": 0.0337,
"step": 5540
},
{
"epoch": 0.6912442396313364,
"grad_norm": 0.07781076797157135,
"learning_rate": 2.4019654880868092e-06,
"loss": 0.0349,
"step": 5550
},
{
"epoch": 0.6924897247477892,
"grad_norm": 0.07140371730661917,
"learning_rate": 2.384390921802459e-06,
"loss": 0.0336,
"step": 5560
},
{
"epoch": 0.6937352098642421,
"grad_norm": 0.0853135196439208,
"learning_rate": 2.3668607331858872e-06,
"loss": 0.0341,
"step": 5570
},
{
"epoch": 0.694980694980695,
"grad_norm": 0.06966149731893141,
"learning_rate": 2.3493752196626126e-06,
"loss": 0.0335,
"step": 5580
},
{
"epoch": 0.6962261800971479,
"grad_norm": 0.07825025391893654,
"learning_rate": 2.331934677900174e-06,
"loss": 0.0343,
"step": 5590
},
{
"epoch": 0.6974716652136007,
"grad_norm": 0.07204338779249021,
"learning_rate": 2.3145394038030977e-06,
"loss": 0.0345,
"step": 5600
},
{
"epoch": 0.6987171503300535,
"grad_norm": 0.08198716232024462,
"learning_rate": 2.297189692507882e-06,
"loss": 0.0355,
"step": 5610
},
{
"epoch": 0.6999626354465064,
"grad_norm": 0.08676996469723806,
"learning_rate": 2.279885838377979e-06,
"loss": 0.0343,
"step": 5620
},
{
"epoch": 0.7012081205629592,
"grad_norm": 0.07800253672080891,
"learning_rate": 2.2626281349988106e-06,
"loss": 0.0343,
"step": 5630
},
{
"epoch": 0.7024536056794122,
"grad_norm": 0.09856175213052162,
"learning_rate": 2.245416875172779e-06,
"loss": 0.0368,
"step": 5640
},
{
"epoch": 0.703699090795865,
"grad_norm": 0.07902099079501247,
"learning_rate": 2.228252350914306e-06,
"loss": 0.0453,
"step": 5650
},
{
"epoch": 0.7049445759123179,
"grad_norm": 0.13763358144193752,
"learning_rate": 2.2111348534448736e-06,
"loss": 0.0581,
"step": 5660
},
{
"epoch": 0.7061900610287707,
"grad_norm": 0.06722867008673782,
"learning_rate": 2.1940646731880887e-06,
"loss": 0.0339,
"step": 5670
},
{
"epoch": 0.7074355461452235,
"grad_norm": 0.08888938596568086,
"learning_rate": 2.177042099764748e-06,
"loss": 0.0343,
"step": 5680
},
{
"epoch": 0.7086810312616765,
"grad_norm": 0.08088289502197214,
"learning_rate": 2.1600674219879277e-06,
"loss": 0.036,
"step": 5690
},
{
"epoch": 0.7099265163781293,
"grad_norm": 0.0724852977114884,
"learning_rate": 2.1431409278580878e-06,
"loss": 0.0332,
"step": 5700
},
{
"epoch": 0.7111720014945822,
"grad_norm": 0.07400827446136445,
"learning_rate": 2.1279485175330795e-06,
"loss": 0.034,
"step": 5710
},
{
"epoch": 0.712417486611035,
"grad_norm": 0.0669817077398449,
"learning_rate": 2.1111143628419344e-06,
"loss": 0.034,
"step": 5720
},
{
"epoch": 0.7136629717274878,
"grad_norm": 0.0675754646108786,
"learning_rate": 2.0943292223586536e-06,
"loss": 0.0343,
"step": 5730
},
{
"epoch": 0.7149084568439407,
"grad_norm": 0.07721647391309475,
"learning_rate": 2.077593380867918e-06,
"loss": 0.0335,
"step": 5740
},
{
"epoch": 0.7161539419603936,
"grad_norm": 0.08116065782751342,
"learning_rate": 2.060907122317974e-06,
"loss": 0.0408,
"step": 5750
},
{
"epoch": 0.7173994270768465,
"grad_norm": 0.0695172297111555,
"learning_rate": 2.0442707298158203e-06,
"loss": 0.038,
"step": 5760
},
{
"epoch": 0.7186449121932993,
"grad_norm": 0.07339425885257164,
"learning_rate": 2.0276844856224058e-06,
"loss": 0.0357,
"step": 5770
},
{
"epoch": 0.7198903973097521,
"grad_norm": 0.07839644345847782,
"learning_rate": 2.011148671147838e-06,
"loss": 0.0335,
"step": 5780
},
{
"epoch": 0.721135882426205,
"grad_norm": 0.07242668204963501,
"learning_rate": 1.994663566946613e-06,
"loss": 0.0349,
"step": 5790
},
{
"epoch": 0.7223813675426579,
"grad_norm": 0.06638249084357817,
"learning_rate": 1.97822945271285e-06,
"loss": 0.0342,
"step": 5800
},
{
"epoch": 0.7236268526591108,
"grad_norm": 0.0784823693702381,
"learning_rate": 1.9618466072755464e-06,
"loss": 0.0384,
"step": 5810
},
{
"epoch": 0.7248723377755636,
"grad_norm": 0.0785131020021455,
"learning_rate": 1.9455153085938517e-06,
"loss": 0.0315,
"step": 5820
},
{
"epoch": 0.7261178228920164,
"grad_norm": 0.07073482685372633,
"learning_rate": 1.9292358337523477e-06,
"loss": 0.0356,
"step": 5830
},
{
"epoch": 0.7273633080084693,
"grad_norm": 0.08165610699922707,
"learning_rate": 1.9130084589563473e-06,
"loss": 0.0338,
"step": 5840
},
{
"epoch": 0.7286087931249221,
"grad_norm": 0.06498902084786715,
"learning_rate": 1.8968334595272097e-06,
"loss": 0.0331,
"step": 5850
},
{
"epoch": 0.729854278241375,
"grad_norm": 0.07583702681271821,
"learning_rate": 1.8807111098976716e-06,
"loss": 0.0336,
"step": 5860
},
{
"epoch": 0.7310997633578279,
"grad_norm": 0.07197627691130293,
"learning_rate": 1.8646416836071822e-06,
"loss": 0.0336,
"step": 5870
},
{
"epoch": 0.7323452484742807,
"grad_norm": 0.07891173135399616,
"learning_rate": 1.8486254532972758e-06,
"loss": 0.0332,
"step": 5880
},
{
"epoch": 0.7335907335907336,
"grad_norm": 0.07356845117523034,
"learning_rate": 1.8326626907069316e-06,
"loss": 0.0363,
"step": 5890
},
{
"epoch": 0.7348362187071864,
"grad_norm": 0.07343814227481309,
"learning_rate": 1.8167536666679731e-06,
"loss": 0.0344,
"step": 5900
},
{
"epoch": 0.7360817038236394,
"grad_norm": 0.08483159827157047,
"learning_rate": 1.80089865110047e-06,
"loss": 0.035,
"step": 5910
},
{
"epoch": 0.7373271889400922,
"grad_norm": 0.07339981334410413,
"learning_rate": 1.7850979130081576e-06,
"loss": 0.0328,
"step": 5920
},
{
"epoch": 0.738572674056545,
"grad_norm": 0.06258653634322751,
"learning_rate": 1.7693517204738736e-06,
"loss": 0.0315,
"step": 5930
},
{
"epoch": 0.7398181591729979,
"grad_norm": 0.08163291026144019,
"learning_rate": 1.7536603406550101e-06,
"loss": 0.0332,
"step": 5940
},
{
"epoch": 0.7410636442894507,
"grad_norm": 0.07902800672185287,
"learning_rate": 1.7380240397789838e-06,
"loss": 0.0365,
"step": 5950
},
{
"epoch": 0.7423091294059035,
"grad_norm": 0.07995187879995147,
"learning_rate": 1.7224430831387107e-06,
"loss": 0.0344,
"step": 5960
},
{
"epoch": 0.7435546145223565,
"grad_norm": 0.0763305452070105,
"learning_rate": 1.7069177350881138e-06,
"loss": 0.0351,
"step": 5970
},
{
"epoch": 0.7448000996388093,
"grad_norm": 0.07471841853434173,
"learning_rate": 1.6914482590376318e-06,
"loss": 0.0339,
"step": 5980
},
{
"epoch": 0.7460455847552622,
"grad_norm": 0.06801475140677903,
"learning_rate": 1.6760349174497542e-06,
"loss": 0.0374,
"step": 5990
},
{
"epoch": 0.747291069871715,
"grad_norm": 0.06206446688924529,
"learning_rate": 1.6606779718345662e-06,
"loss": 0.0329,
"step": 6000
},
{
"epoch": 0.747291069871715,
"eval_loss": 0.0365239642560482,
"eval_runtime": 838.4523,
"eval_samples_per_second": 4.772,
"eval_steps_per_second": 0.299,
"step": 6000
},
{
"epoch": 0.7485365549881678,
"grad_norm": 0.07526072165078002,
"learning_rate": 1.6453776827453099e-06,
"loss": 0.0339,
"step": 6010
},
{
"epoch": 0.7497820401046208,
"grad_norm": 0.07043851968247505,
"learning_rate": 1.6301343097739714e-06,
"loss": 0.0344,
"step": 6020
},
{
"epoch": 0.7510275252210736,
"grad_norm": 0.0783487848180965,
"learning_rate": 1.6149481115468634e-06,
"loss": 0.0354,
"step": 6030
},
{
"epoch": 0.7522730103375265,
"grad_norm": 0.069822418915247,
"learning_rate": 1.5998193457202532e-06,
"loss": 0.0339,
"step": 6040
},
{
"epoch": 0.7535184954539793,
"grad_norm": 0.06528825794120303,
"learning_rate": 1.5847482689759747e-06,
"loss": 0.033,
"step": 6050
},
{
"epoch": 0.7547639805704321,
"grad_norm": 0.07313532691839966,
"learning_rate": 1.569735137017086e-06,
"loss": 0.0364,
"step": 6060
},
{
"epoch": 0.7560094656868851,
"grad_norm": 0.0747987256879236,
"learning_rate": 1.5547802045635241e-06,
"loss": 0.0366,
"step": 6070
},
{
"epoch": 0.7572549508033379,
"grad_norm": 0.07210718675492371,
"learning_rate": 1.5398837253477877e-06,
"loss": 0.0334,
"step": 6080
},
{
"epoch": 0.7585004359197908,
"grad_norm": 0.06919547631061922,
"learning_rate": 1.5250459521106287e-06,
"loss": 0.0352,
"step": 6090
},
{
"epoch": 0.7597459210362436,
"grad_norm": 0.07119325113096954,
"learning_rate": 1.510267136596763e-06,
"loss": 0.0346,
"step": 6100
},
{
"epoch": 0.7609914061526964,
"grad_norm": 0.07318013984700833,
"learning_rate": 1.4955475295506094e-06,
"loss": 0.0342,
"step": 6110
},
{
"epoch": 0.7622368912691493,
"grad_norm": 0.07616351276339006,
"learning_rate": 1.4808873807120193e-06,
"loss": 0.0336,
"step": 6120
},
{
"epoch": 0.7634823763856022,
"grad_norm": 0.0796790430809141,
"learning_rate": 1.466286938812052e-06,
"loss": 0.0346,
"step": 6130
},
{
"epoch": 0.7647278615020551,
"grad_norm": 0.07509216834289828,
"learning_rate": 1.45174645156875e-06,
"loss": 0.0345,
"step": 6140
},
{
"epoch": 0.7659733466185079,
"grad_norm": 0.07381476393416156,
"learning_rate": 1.4372661656829345e-06,
"loss": 0.0354,
"step": 6150
},
{
"epoch": 0.7672188317349607,
"grad_norm": 0.08480920007243628,
"learning_rate": 1.422846326834027e-06,
"loss": 0.0362,
"step": 6160
},
{
"epoch": 0.7684643168514136,
"grad_norm": 0.08367728643096982,
"learning_rate": 1.4084871796758697e-06,
"loss": 0.0346,
"step": 6170
},
{
"epoch": 0.7697098019678665,
"grad_norm": 0.06943587680204816,
"learning_rate": 1.394188967832585e-06,
"loss": 0.0355,
"step": 6180
},
{
"epoch": 0.7709552870843194,
"grad_norm": 0.06973678116522715,
"learning_rate": 1.379951933894434e-06,
"loss": 0.0333,
"step": 6190
},
{
"epoch": 0.7722007722007722,
"grad_norm": 0.08517060522280089,
"learning_rate": 1.3657763194137096e-06,
"loss": 0.0341,
"step": 6200
},
{
"epoch": 0.773446257317225,
"grad_norm": 0.06449083173655625,
"learning_rate": 1.3516623649006283e-06,
"loss": 0.0315,
"step": 6210
},
{
"epoch": 0.7746917424336779,
"grad_norm": 0.06900815834199764,
"learning_rate": 1.3376103098192561e-06,
"loss": 0.0339,
"step": 6220
},
{
"epoch": 0.7759372275501307,
"grad_norm": 0.1471776063205498,
"learning_rate": 1.3236203925834435e-06,
"loss": 0.0395,
"step": 6230
},
{
"epoch": 0.7771827126665837,
"grad_norm": 0.06884582924522074,
"learning_rate": 1.3096928505527812e-06,
"loss": 0.0351,
"step": 6240
},
{
"epoch": 0.7784281977830365,
"grad_norm": 0.06486078332241468,
"learning_rate": 1.2958279200285717e-06,
"loss": 0.0334,
"step": 6250
},
{
"epoch": 0.7796736828994894,
"grad_norm": 0.06824091168803899,
"learning_rate": 1.28202583624982e-06,
"loss": 0.0334,
"step": 6260
},
{
"epoch": 0.7809191680159422,
"grad_norm": 0.07118767063867026,
"learning_rate": 1.2682868333892478e-06,
"loss": 0.0348,
"step": 6270
},
{
"epoch": 0.782164653132395,
"grad_norm": 0.07944253692635138,
"learning_rate": 1.2546111445493115e-06,
"loss": 0.035,
"step": 6280
},
{
"epoch": 0.783410138248848,
"grad_norm": 0.09656962543872709,
"learning_rate": 1.2409990017582507e-06,
"loss": 0.0354,
"step": 6290
},
{
"epoch": 0.7846556233653008,
"grad_norm": 0.07502602214178333,
"learning_rate": 1.2274506359661587e-06,
"loss": 0.0375,
"step": 6300
},
{
"epoch": 0.7859011084817537,
"grad_norm": 0.0799756957228317,
"learning_rate": 1.2139662770410526e-06,
"loss": 0.0352,
"step": 6310
},
{
"epoch": 0.7871465935982065,
"grad_norm": 0.08452422194255815,
"learning_rate": 1.2005461537649788e-06,
"loss": 0.0347,
"step": 6320
},
{
"epoch": 0.7883920787146593,
"grad_norm": 0.07815588940216203,
"learning_rate": 1.1871904938301338e-06,
"loss": 0.0358,
"step": 6330
},
{
"epoch": 0.7896375638311122,
"grad_norm": 0.08213543731741436,
"learning_rate": 1.173899523834996e-06,
"loss": 0.0344,
"step": 6340
},
{
"epoch": 0.7908830489475651,
"grad_norm": 0.07754270311286994,
"learning_rate": 1.1606734692804833e-06,
"loss": 0.0334,
"step": 6350
},
{
"epoch": 0.792128534064018,
"grad_norm": 0.07612816225051539,
"learning_rate": 1.1475125545661308e-06,
"loss": 0.0354,
"step": 6360
},
{
"epoch": 0.7933740191804708,
"grad_norm": 0.06720925454007079,
"learning_rate": 1.1344170029862773e-06,
"loss": 0.0344,
"step": 6370
},
{
"epoch": 0.7946195042969236,
"grad_norm": 0.06632967939022859,
"learning_rate": 1.121387036726279e-06,
"loss": 0.0359,
"step": 6380
},
{
"epoch": 0.7958649894133765,
"grad_norm": 0.0671224246804576,
"learning_rate": 1.108422876858742e-06,
"loss": 0.0352,
"step": 6390
},
{
"epoch": 0.7971104745298294,
"grad_norm": 0.06375225401763122,
"learning_rate": 1.0955247433397693e-06,
"loss": 0.0345,
"step": 6400
},
{
"epoch": 0.7983559596462823,
"grad_norm": 0.07554971366582727,
"learning_rate": 1.0826928550052286e-06,
"loss": 0.0321,
"step": 6410
},
{
"epoch": 0.7996014447627351,
"grad_norm": 0.08051411691295206,
"learning_rate": 1.069927429567041e-06,
"loss": 0.0334,
"step": 6420
},
{
"epoch": 0.8008469298791879,
"grad_norm": 0.07633425405522394,
"learning_rate": 1.057228683609488e-06,
"loss": 0.0332,
"step": 6430
},
{
"epoch": 0.8020924149956408,
"grad_norm": 0.07031116683085095,
"learning_rate": 1.0445968325855315e-06,
"loss": 0.0345,
"step": 6440
},
{
"epoch": 0.8033379001120936,
"grad_norm": 0.07126571844908727,
"learning_rate": 1.0320320908131681e-06,
"loss": 0.0349,
"step": 6450
},
{
"epoch": 0.8045833852285466,
"grad_norm": 0.06854061650174853,
"learning_rate": 1.0195346714717813e-06,
"loss": 0.0329,
"step": 6460
},
{
"epoch": 0.8058288703449994,
"grad_norm": 0.07110914763011278,
"learning_rate": 1.0071047865985318e-06,
"loss": 0.0383,
"step": 6470
},
{
"epoch": 0.8070743554614522,
"grad_norm": 0.0712690327715076,
"learning_rate": 9.947426470847598e-07,
"loss": 0.0345,
"step": 6480
},
{
"epoch": 0.8083198405779051,
"grad_norm": 0.07063224017464265,
"learning_rate": 9.82448462672404e-07,
"loss": 0.0347,
"step": 6490
},
{
"epoch": 0.8095653256943579,
"grad_norm": 0.07361058725909074,
"learning_rate": 9.702224419504453e-07,
"loss": 0.0336,
"step": 6500
},
{
"epoch": 0.8108108108108109,
"grad_norm": 0.06933750107159053,
"learning_rate": 9.580647923513647e-07,
"loss": 0.0329,
"step": 6510
},
{
"epoch": 0.8120562959272637,
"grad_norm": 0.06908483991302168,
"learning_rate": 9.459757201476322e-07,
"loss": 0.0352,
"step": 6520
},
{
"epoch": 0.8133017810437165,
"grad_norm": 0.07893525507405788,
"learning_rate": 9.339554304481952e-07,
"loss": 0.0333,
"step": 6530
},
{
"epoch": 0.8145472661601694,
"grad_norm": 0.06864557485898842,
"learning_rate": 9.220041271950059e-07,
"loss": 0.0341,
"step": 6540
},
{
"epoch": 0.8157927512766222,
"grad_norm": 0.06645002526767543,
"learning_rate": 9.101220131595612e-07,
"loss": 0.0343,
"step": 6550
},
{
"epoch": 0.8170382363930752,
"grad_norm": 0.06922021736452175,
"learning_rate": 8.983092899394585e-07,
"loss": 0.0391,
"step": 6560
},
{
"epoch": 0.818283721509528,
"grad_norm": 0.06287165787203189,
"learning_rate": 8.865661579549784e-07,
"loss": 0.0369,
"step": 6570
},
{
"epoch": 0.8195292066259808,
"grad_norm": 0.08007108761184983,
"learning_rate": 8.748928164456866e-07,
"loss": 0.0344,
"step": 6580
},
{
"epoch": 0.8207746917424337,
"grad_norm": 0.07917196668748554,
"learning_rate": 8.632894634670463e-07,
"loss": 0.0348,
"step": 6590
},
{
"epoch": 0.8220201768588865,
"grad_norm": 0.06481402186032732,
"learning_rate": 8.517562958870624e-07,
"loss": 0.0329,
"step": 6600
},
{
"epoch": 0.8232656619753393,
"grad_norm": 0.06706391748518929,
"learning_rate": 8.40293509382944e-07,
"loss": 0.0341,
"step": 6610
},
{
"epoch": 0.8245111470917923,
"grad_norm": 0.07465582263708473,
"learning_rate": 8.289012984377781e-07,
"loss": 0.0363,
"step": 6620
},
{
"epoch": 0.8257566322082451,
"grad_norm": 0.06769000558134151,
"learning_rate": 8.17579856337235e-07,
"loss": 0.0334,
"step": 6630
},
{
"epoch": 0.827002117324698,
"grad_norm": 0.0736599723550588,
"learning_rate": 8.063293751662865e-07,
"loss": 0.0334,
"step": 6640
},
{
"epoch": 0.8282476024411508,
"grad_norm": 0.07564679944959335,
"learning_rate": 7.951500458059475e-07,
"loss": 0.0333,
"step": 6650
},
{
"epoch": 0.8294930875576036,
"grad_norm": 0.07611705351089817,
"learning_rate": 7.840420579300379e-07,
"loss": 0.0333,
"step": 6660
},
{
"epoch": 0.8307385726740566,
"grad_norm": 0.07192878132208759,
"learning_rate": 7.730056000019614e-07,
"loss": 0.0328,
"step": 6670
},
{
"epoch": 0.8319840577905094,
"grad_norm": 0.06292019065033208,
"learning_rate": 7.620408592715167e-07,
"loss": 0.0343,
"step": 6680
},
{
"epoch": 0.8332295429069623,
"grad_norm": 0.07416346158348398,
"learning_rate": 7.511480217717088e-07,
"loss": 0.0334,
"step": 6690
},
{
"epoch": 0.8344750280234151,
"grad_norm": 0.06463777918980437,
"learning_rate": 7.403272723156002e-07,
"loss": 0.0352,
"step": 6700
},
{
"epoch": 0.835720513139868,
"grad_norm": 0.08165169020726025,
"learning_rate": 7.295787944931715e-07,
"loss": 0.0354,
"step": 6710
},
{
"epoch": 0.8369659982563208,
"grad_norm": 0.07131556603635962,
"learning_rate": 7.189027706682128e-07,
"loss": 0.0336,
"step": 6720
},
{
"epoch": 0.8382114833727737,
"grad_norm": 0.0719099714750811,
"learning_rate": 7.082993819752215e-07,
"loss": 0.0342,
"step": 6730
},
{
"epoch": 0.8394569684892266,
"grad_norm": 0.07154371844407563,
"learning_rate": 6.977688083163342e-07,
"loss": 0.032,
"step": 6740
},
{
"epoch": 0.8407024536056794,
"grad_norm": 0.07312235337311399,
"learning_rate": 6.873112283582722e-07,
"loss": 0.0329,
"step": 6750
},
{
"epoch": 0.8419479387221322,
"grad_norm": 0.06850012770711321,
"learning_rate": 6.7692681952931e-07,
"loss": 0.0327,
"step": 6760
},
{
"epoch": 0.8431934238385851,
"grad_norm": 0.06977693532355342,
"learning_rate": 6.666157580162702e-07,
"loss": 0.0329,
"step": 6770
},
{
"epoch": 0.844438908955038,
"grad_norm": 0.0719753214839795,
"learning_rate": 6.563782187615253e-07,
"loss": 0.0339,
"step": 6780
},
{
"epoch": 0.8456843940714909,
"grad_norm": 0.07280895416767508,
"learning_rate": 6.462143754600359e-07,
"loss": 0.0325,
"step": 6790
},
{
"epoch": 0.8469298791879437,
"grad_norm": 0.07571464330534158,
"learning_rate": 6.361244005564033e-07,
"loss": 0.0348,
"step": 6800
},
{
"epoch": 0.8481753643043966,
"grad_norm": 0.0784979836153967,
"learning_rate": 6.261084652419396e-07,
"loss": 0.0328,
"step": 6810
},
{
"epoch": 0.8494208494208494,
"grad_norm": 0.08689547960009887,
"learning_rate": 6.161667394517684e-07,
"loss": 0.0343,
"step": 6820
},
{
"epoch": 0.8506663345373022,
"grad_norm": 0.06584632674297429,
"learning_rate": 6.06299391861937e-07,
"loss": 0.0323,
"step": 6830
},
{
"epoch": 0.8519118196537552,
"grad_norm": 0.06953629816129804,
"learning_rate": 5.965065898865607e-07,
"loss": 0.0335,
"step": 6840
},
{
"epoch": 0.853157304770208,
"grad_norm": 0.07939544549983875,
"learning_rate": 5.86788499674974e-07,
"loss": 0.035,
"step": 6850
},
{
"epoch": 0.8544027898866609,
"grad_norm": 0.08134197942470128,
"learning_rate": 5.771452861089188e-07,
"loss": 0.0333,
"step": 6860
},
{
"epoch": 0.8556482750031137,
"grad_norm": 0.07280175167550847,
"learning_rate": 5.675771127997442e-07,
"loss": 0.0342,
"step": 6870
},
{
"epoch": 0.8568937601195665,
"grad_norm": 0.07641298068874042,
"learning_rate": 5.580841420856286e-07,
"loss": 0.0333,
"step": 6880
},
{
"epoch": 0.8581392452360195,
"grad_norm": 0.06946607578947374,
"learning_rate": 5.48666535028829e-07,
"loss": 0.0351,
"step": 6890
},
{
"epoch": 0.8593847303524723,
"grad_norm": 0.08052086443622808,
"learning_rate": 5.393244514129459e-07,
"loss": 0.0341,
"step": 6900
},
{
"epoch": 0.8606302154689252,
"grad_norm": 0.08590426319330488,
"learning_rate": 5.300580497402141e-07,
"loss": 0.0327,
"step": 6910
},
{
"epoch": 0.861875700585378,
"grad_norm": 0.06389092214924613,
"learning_rate": 5.208674872288111e-07,
"loss": 0.0344,
"step": 6920
},
{
"epoch": 0.8631211857018308,
"grad_norm": 0.0732213114303925,
"learning_rate": 5.117529198101944e-07,
"loss": 0.0327,
"step": 6930
},
{
"epoch": 0.8643666708182837,
"grad_norm": 0.07779488412768652,
"learning_rate": 5.027145021264484e-07,
"loss": 0.0325,
"step": 6940
},
{
"epoch": 0.8656121559347366,
"grad_norm": 0.06643221408607018,
"learning_rate": 4.937523875276679e-07,
"loss": 0.0355,
"step": 6950
},
{
"epoch": 0.8668576410511895,
"grad_norm": 0.0749954905480658,
"learning_rate": 4.848667280693515e-07,
"loss": 0.0341,
"step": 6960
},
{
"epoch": 0.8681031261676423,
"grad_norm": 0.06878078966121552,
"learning_rate": 4.7605767450982386e-07,
"loss": 0.0331,
"step": 6970
},
{
"epoch": 0.8693486112840951,
"grad_norm": 0.07671105542204411,
"learning_rate": 4.6732537630767847e-07,
"loss": 0.0319,
"step": 6980
},
{
"epoch": 0.870594096400548,
"grad_norm": 0.0771166533771837,
"learning_rate": 4.5866998161923916e-07,
"loss": 0.0345,
"step": 6990
},
{
"epoch": 0.8718395815170009,
"grad_norm": 0.06325435618259866,
"learning_rate": 4.5009163729605123e-07,
"loss": 0.0358,
"step": 7000
},
{
"epoch": 0.8718395815170009,
"eval_loss": 0.03601476922631264,
"eval_runtime": 837.2719,
"eval_samples_per_second": 4.779,
"eval_steps_per_second": 0.3,
"step": 7000
},
{
"epoch": 0.8730850666334538,
"grad_norm": 0.07013513035528042,
"learning_rate": 4.4159048888238296e-07,
"loss": 0.0371,
"step": 7010
},
{
"epoch": 0.8743305517499066,
"grad_norm": 0.0796985853346479,
"learning_rate": 4.3316668061276437e-07,
"loss": 0.0358,
"step": 7020
},
{
"epoch": 0.8755760368663594,
"grad_norm": 0.07657422399818352,
"learning_rate": 4.2482035540953146e-07,
"loss": 0.0345,
"step": 7030
},
{
"epoch": 0.8768215219828123,
"grad_norm": 0.08303698297275255,
"learning_rate": 4.165516548804088e-07,
"loss": 0.0339,
"step": 7040
},
{
"epoch": 0.8780670070992652,
"grad_norm": 0.06701075185473475,
"learning_rate": 4.083607193161021e-07,
"loss": 0.0337,
"step": 7050
},
{
"epoch": 0.8793124922157181,
"grad_norm": 0.07440386317561543,
"learning_rate": 4.002476876879202e-07,
"loss": 0.0337,
"step": 7060
},
{
"epoch": 0.8805579773321709,
"grad_norm": 0.06559265895649592,
"learning_rate": 3.922126976454171e-07,
"loss": 0.0328,
"step": 7070
},
{
"epoch": 0.8818034624486237,
"grad_norm": 0.07360217409200592,
"learning_rate": 3.842558855140549e-07,
"loss": 0.033,
"step": 7080
},
{
"epoch": 0.8830489475650766,
"grad_norm": 0.07352757190270637,
"learning_rate": 3.7637738629289487e-07,
"loss": 0.0325,
"step": 7090
},
{
"epoch": 0.8842944326815294,
"grad_norm": 0.09092598561951667,
"learning_rate": 3.685773336523013e-07,
"loss": 0.0397,
"step": 7100
},
{
"epoch": 0.8855399177979824,
"grad_norm": 0.06933268688345073,
"learning_rate": 3.6085585993167804e-07,
"loss": 0.0338,
"step": 7110
},
{
"epoch": 0.8867854029144352,
"grad_norm": 0.07631042389502997,
"learning_rate": 3.532130961372199e-07,
"loss": 0.0344,
"step": 7120
},
{
"epoch": 0.888030888030888,
"grad_norm": 0.06438416491374326,
"learning_rate": 3.4564917193969315e-07,
"loss": 0.0329,
"step": 7130
},
{
"epoch": 0.8892763731473409,
"grad_norm": 0.06287657387840852,
"learning_rate": 3.381642156722348e-07,
"loss": 0.0338,
"step": 7140
},
{
"epoch": 0.8905218582637937,
"grad_norm": 0.073583377093947,
"learning_rate": 3.3075835432817226e-07,
"loss": 0.0345,
"step": 7150
},
{
"epoch": 0.8917673433802467,
"grad_norm": 0.07084202608928916,
"learning_rate": 3.234317135588721e-07,
"loss": 0.0353,
"step": 7160
},
{
"epoch": 0.8930128284966995,
"grad_norm": 0.0683170544172917,
"learning_rate": 3.161844176716056e-07,
"loss": 0.0335,
"step": 7170
},
{
"epoch": 0.8942583136131523,
"grad_norm": 0.07513088308413,
"learning_rate": 3.0901658962744385e-07,
"loss": 0.0335,
"step": 7180
},
{
"epoch": 0.8955037987296052,
"grad_norm": 0.08217480053437234,
"learning_rate": 3.019283510391663e-07,
"loss": 0.0344,
"step": 7190
},
{
"epoch": 0.896749283846058,
"grad_norm": 0.07491705413401013,
"learning_rate": 2.949198221692001e-07,
"loss": 0.033,
"step": 7200
},
{
"epoch": 0.8979947689625108,
"grad_norm": 0.06926357003461246,
"learning_rate": 2.879911219275805e-07,
"loss": 0.0326,
"step": 7210
},
{
"epoch": 0.8992402540789638,
"grad_norm": 0.07318044389849242,
"learning_rate": 2.811423678699304e-07,
"loss": 0.0339,
"step": 7220
},
{
"epoch": 0.9004857391954166,
"grad_norm": 0.07056617475435807,
"learning_rate": 2.743736761954696e-07,
"loss": 0.0341,
"step": 7230
},
{
"epoch": 0.9017312243118695,
"grad_norm": 0.0715107422245273,
"learning_rate": 2.676851617450399e-07,
"loss": 0.0361,
"step": 7240
},
{
"epoch": 0.9029767094283223,
"grad_norm": 0.0838238159782872,
"learning_rate": 2.6107693799916066e-07,
"loss": 0.0336,
"step": 7250
},
{
"epoch": 0.9042221945447751,
"grad_norm": 0.07576479304363574,
"learning_rate": 2.5454911707609774e-07,
"loss": 0.0372,
"step": 7260
},
{
"epoch": 0.9054676796612281,
"grad_norm": 0.07883180504697644,
"learning_rate": 2.4810180972996643e-07,
"loss": 0.0337,
"step": 7270
},
{
"epoch": 0.9067131647776809,
"grad_norm": 0.07220224003715747,
"learning_rate": 2.4173512534885035e-07,
"loss": 0.0346,
"step": 7280
},
{
"epoch": 0.9079586498941338,
"grad_norm": 0.07243548971532544,
"learning_rate": 2.3544917195294448e-07,
"loss": 0.0349,
"step": 7290
},
{
"epoch": 0.9092041350105866,
"grad_norm": 0.06773035160663944,
"learning_rate": 2.2924405619272504e-07,
"loss": 0.033,
"step": 7300
},
{
"epoch": 0.9104496201270394,
"grad_norm": 0.0745810384721887,
"learning_rate": 2.2311988334713652e-07,
"loss": 0.0323,
"step": 7310
},
{
"epoch": 0.9116951052434923,
"grad_norm": 0.09334001378278338,
"learning_rate": 2.170767573218091e-07,
"loss": 0.0335,
"step": 7320
},
{
"epoch": 0.9129405903599452,
"grad_norm": 0.06509794428143283,
"learning_rate": 2.1111478064729296e-07,
"loss": 0.0323,
"step": 7330
},
{
"epoch": 0.9141860754763981,
"grad_norm": 0.06801541543633163,
"learning_rate": 2.052340544773207e-07,
"loss": 0.0325,
"step": 7340
},
{
"epoch": 0.9154315605928509,
"grad_norm": 0.0684881277926702,
"learning_rate": 1.9943467858708987e-07,
"loss": 0.038,
"step": 7350
},
{
"epoch": 0.9166770457093038,
"grad_norm": 0.07156360624516987,
"learning_rate": 1.9371675137157041e-07,
"loss": 0.0353,
"step": 7360
},
{
"epoch": 0.9179225308257566,
"grad_norm": 0.07770709574833054,
"learning_rate": 1.8808036984383494e-07,
"loss": 0.036,
"step": 7370
},
{
"epoch": 0.9191680159422095,
"grad_norm": 0.08221487487909415,
"learning_rate": 1.825256296334138e-07,
"loss": 0.0338,
"step": 7380
},
{
"epoch": 0.9204135010586624,
"grad_norm": 0.06484900429613018,
"learning_rate": 1.7705262498467101e-07,
"loss": 0.0323,
"step": 7390
},
{
"epoch": 0.9216589861751152,
"grad_norm": 0.08944384334335836,
"learning_rate": 1.7166144875520762e-07,
"loss": 0.04,
"step": 7400
},
{
"epoch": 0.922904471291568,
"grad_norm": 0.06977381086844832,
"learning_rate": 1.66352192414283e-07,
"loss": 0.0331,
"step": 7410
},
{
"epoch": 0.9241499564080209,
"grad_norm": 0.06972307171771487,
"learning_rate": 1.6112494604126617e-07,
"loss": 0.0342,
"step": 7420
},
{
"epoch": 0.9253954415244737,
"grad_norm": 0.07621473599772897,
"learning_rate": 1.559797983241057e-07,
"loss": 0.0354,
"step": 7430
},
{
"epoch": 0.9266409266409267,
"grad_norm": 0.06949411673451975,
"learning_rate": 1.5091683655782563e-07,
"loss": 0.0322,
"step": 7440
},
{
"epoch": 0.9278864117573795,
"grad_norm": 0.07109337082909367,
"learning_rate": 1.4593614664304202e-07,
"loss": 0.0335,
"step": 7450
},
{
"epoch": 0.9291318968738324,
"grad_norm": 0.07040888774127627,
"learning_rate": 1.410378130845108e-07,
"loss": 0.0333,
"step": 7460
},
{
"epoch": 0.9303773819902852,
"grad_norm": 0.08008290533382605,
"learning_rate": 1.3622191898968794e-07,
"loss": 0.0378,
"step": 7470
},
{
"epoch": 0.931622867106738,
"grad_norm": 0.09568843705776367,
"learning_rate": 1.3148854606732375e-07,
"loss": 0.0345,
"step": 7480
},
{
"epoch": 0.932868352223191,
"grad_norm": 0.06672547669772788,
"learning_rate": 1.268377746260746e-07,
"loss": 0.0338,
"step": 7490
},
{
"epoch": 0.9341138373396438,
"grad_norm": 0.06819892775291685,
"learning_rate": 1.2226968357314128e-07,
"loss": 0.0344,
"step": 7500
},
{
"epoch": 0.9353593224560967,
"grad_norm": 0.08377122880684172,
"learning_rate": 1.1778435041292947e-07,
"loss": 0.0341,
"step": 7510
},
{
"epoch": 0.9366048075725495,
"grad_norm": 0.06632013597935872,
"learning_rate": 1.133818512457352e-07,
"loss": 0.0321,
"step": 7520
},
{
"epoch": 0.9378502926890023,
"grad_norm": 0.06804778518558732,
"learning_rate": 1.0906226076645266e-07,
"loss": 0.0338,
"step": 7530
},
{
"epoch": 0.9390957778054553,
"grad_norm": 0.07515421815726762,
"learning_rate": 1.0482565226330955e-07,
"loss": 0.034,
"step": 7540
},
{
"epoch": 0.9403412629219081,
"grad_norm": 0.07383191334773143,
"learning_rate": 1.0067209761662033e-07,
"loss": 0.0331,
"step": 7550
},
{
"epoch": 0.941586748038361,
"grad_norm": 0.0656525186197339,
"learning_rate": 9.660166729756892e-08,
"loss": 0.0339,
"step": 7560
},
{
"epoch": 0.9428322331548138,
"grad_norm": 0.08473468853272019,
"learning_rate": 9.26144303670129e-08,
"loss": 0.0347,
"step": 7570
},
{
"epoch": 0.9440777182712666,
"grad_norm": 0.06772453808502675,
"learning_rate": 8.871045447430948e-08,
"loss": 0.0333,
"step": 7580
},
{
"epoch": 0.9453232033877195,
"grad_norm": 0.07105872568274649,
"learning_rate": 8.488980585617202e-08,
"loss": 0.0364,
"step": 7590
},
{
"epoch": 0.9465686885041724,
"grad_norm": 0.07268210266497814,
"learning_rate": 8.11525493355414e-08,
"loss": 0.0367,
"step": 7600
},
{
"epoch": 0.9478141736206253,
"grad_norm": 0.06304480996299118,
"learning_rate": 7.749874832048976e-08,
"loss": 0.032,
"step": 7610
},
{
"epoch": 0.9490596587370781,
"grad_norm": 0.07028355086867726,
"learning_rate": 7.392846480314297e-08,
"loss": 0.0337,
"step": 7620
},
{
"epoch": 0.9503051438535309,
"grad_norm": 0.06809291178200033,
"learning_rate": 7.044175935862985e-08,
"loss": 0.0351,
"step": 7630
},
{
"epoch": 0.9515506289699838,
"grad_norm": 0.07135816971461638,
"learning_rate": 6.703869114405292e-08,
"loss": 0.0331,
"step": 7640
},
{
"epoch": 0.9527961140864367,
"grad_norm": 0.07846522105920983,
"learning_rate": 6.371931789748598e-08,
"loss": 0.0376,
"step": 7650
},
{
"epoch": 0.9540415992028896,
"grad_norm": 0.07016745603597967,
"learning_rate": 6.048369593699533e-08,
"loss": 0.0343,
"step": 7660
},
{
"epoch": 0.9552870843193424,
"grad_norm": 0.07623936730645538,
"learning_rate": 5.733188015968172e-08,
"loss": 0.033,
"step": 7670
},
{
"epoch": 0.9565325694357952,
"grad_norm": 0.074221653559916,
"learning_rate": 5.426392404075109e-08,
"loss": 0.0336,
"step": 7680
},
{
"epoch": 0.9577780545522481,
"grad_norm": 0.081984794751667,
"learning_rate": 5.127987963260583e-08,
"loss": 0.0341,
"step": 7690
},
{
"epoch": 0.9590235396687009,
"grad_norm": 0.062234312126669994,
"learning_rate": 4.837979756396327e-08,
"loss": 0.0327,
"step": 7700
},
{
"epoch": 0.9602690247851539,
"grad_norm": 0.0718672304470415,
"learning_rate": 4.556372703899525e-08,
"loss": 0.0324,
"step": 7710
},
{
"epoch": 0.9615145099016067,
"grad_norm": 0.06876702733799522,
"learning_rate": 4.2831715836493814e-08,
"loss": 0.0324,
"step": 7720
},
{
"epoch": 0.9627599950180595,
"grad_norm": 0.08240294881302636,
"learning_rate": 4.018381030906016e-08,
"loss": 0.0347,
"step": 7730
},
{
"epoch": 0.9640054801345124,
"grad_norm": 0.0738460586959311,
"learning_rate": 3.762005538231861e-08,
"loss": 0.0319,
"step": 7740
},
{
"epoch": 0.9652509652509652,
"grad_norm": 0.0743464723827432,
"learning_rate": 3.514049455415558e-08,
"loss": 0.0331,
"step": 7750
},
{
"epoch": 0.9664964503674182,
"grad_norm": 0.0695618078874069,
"learning_rate": 3.274516989397958e-08,
"loss": 0.0347,
"step": 7760
},
{
"epoch": 0.967741935483871,
"grad_norm": 0.07847632210065156,
"learning_rate": 3.043412204200791e-08,
"loss": 0.0333,
"step": 7770
},
{
"epoch": 0.9689874206003238,
"grad_norm": 0.06762044113316065,
"learning_rate": 2.8207390208577768e-08,
"loss": 0.0329,
"step": 7780
},
{
"epoch": 0.9702329057167767,
"grad_norm": 0.06421198468053373,
"learning_rate": 2.606501217348012e-08,
"loss": 0.038,
"step": 7790
},
{
"epoch": 0.9714783908332295,
"grad_norm": 0.07605307023985369,
"learning_rate": 2.4007024285320202e-08,
"loss": 0.0341,
"step": 7800
},
{
"epoch": 0.9727238759496823,
"grad_norm": 0.07720901420141715,
"learning_rate": 2.2033461460899685e-08,
"loss": 0.0342,
"step": 7810
},
{
"epoch": 0.9739693610661353,
"grad_norm": 0.0706246030278367,
"learning_rate": 2.0144357184623265e-08,
"loss": 0.0336,
"step": 7820
},
{
"epoch": 0.9752148461825881,
"grad_norm": 0.07575875269027028,
"learning_rate": 1.833974350793355e-08,
"loss": 0.0434,
"step": 7830
},
{
"epoch": 0.976460331299041,
"grad_norm": 0.06902760318668755,
"learning_rate": 1.661965104876484e-08,
"loss": 0.0349,
"step": 7840
},
{
"epoch": 0.9777058164154938,
"grad_norm": 0.06632820472757216,
"learning_rate": 1.4984108991022985e-08,
"loss": 0.0323,
"step": 7850
},
{
"epoch": 0.9789513015319466,
"grad_norm": 0.07955882222040188,
"learning_rate": 1.3433145084093547e-08,
"loss": 0.0371,
"step": 7860
},
{
"epoch": 0.9801967866483996,
"grad_norm": 0.06755112335574645,
"learning_rate": 1.1966785642367195e-08,
"loss": 0.034,
"step": 7870
},
{
"epoch": 0.9814422717648524,
"grad_norm": 0.07115529103369396,
"learning_rate": 1.0585055544795608e-08,
"loss": 0.0367,
"step": 7880
},
{
"epoch": 0.9826877568813053,
"grad_norm": 0.07314774971455064,
"learning_rate": 9.287978234469585e-09,
"loss": 0.0334,
"step": 7890
},
{
"epoch": 0.9839332419977581,
"grad_norm": 0.06746293247651736,
"learning_rate": 8.07557571821882e-09,
"loss": 0.034,
"step": 7900
},
{
"epoch": 0.985178727114211,
"grad_norm": 0.08210523419371389,
"learning_rate": 6.947868566242188e-09,
"loss": 0.0341,
"step": 7910
},
{
"epoch": 0.9864242122306638,
"grad_norm": 0.0689360972402286,
"learning_rate": 5.904875911754704e-09,
"loss": 0.0335,
"step": 7920
},
{
"epoch": 0.9876696973471167,
"grad_norm": 0.07186254814721327,
"learning_rate": 4.946615450666659e-09,
"loss": 0.0358,
"step": 7930
},
{
"epoch": 0.9889151824635696,
"grad_norm": 0.06520526448362651,
"learning_rate": 4.07310344127998e-09,
"loss": 0.0342,
"step": 7940
},
{
"epoch": 0.9901606675800224,
"grad_norm": 0.07531178940209358,
"learning_rate": 3.2843547040151136e-09,
"loss": 0.0348,
"step": 7950
},
{
"epoch": 0.9914061526964753,
"grad_norm": 0.06750753677525495,
"learning_rate": 2.5803826211590056e-09,
"loss": 0.0311,
"step": 7960
},
{
"epoch": 0.9926516378129281,
"grad_norm": 0.07832974054383601,
"learning_rate": 1.961199136636394e-09,
"loss": 0.0499,
"step": 7970
},
{
"epoch": 0.993897122929381,
"grad_norm": 0.08569431873882229,
"learning_rate": 1.4268147558088585e-09,
"loss": 0.0347,
"step": 7980
},
{
"epoch": 0.9951426080458339,
"grad_norm": 0.07167215689654272,
"learning_rate": 9.772385452955223e-10,
"loss": 0.0336,
"step": 7990
},
{
"epoch": 0.9963880931622867,
"grad_norm": 0.07528961214946711,
"learning_rate": 6.124781328215034e-10,
"loss": 0.0319,
"step": 8000
},
{
"epoch": 0.9963880931622867,
"eval_loss": 0.03589002415537834,
"eval_runtime": 837.1266,
"eval_samples_per_second": 4.779,
"eval_steps_per_second": 0.3,
"step": 8000
},
{
"epoch": 0.9976335782787396,
"grad_norm": 0.08293808161992938,
"learning_rate": 3.325397070841341e-10,
"loss": 0.0338,
"step": 8010
},
{
"epoch": 0.9988790633951924,
"grad_norm": 0.0731739534718667,
"learning_rate": 1.3742801765193048e-10,
"loss": 0.0365,
"step": 8020
},
{
"epoch": 1.0,
"step": 8029,
"total_flos": 7777072302784512.0,
"train_loss": 0.06085372139276492,
"train_runtime": 119385.8868,
"train_samples_per_second": 1.076,
"train_steps_per_second": 0.067
}
],
"logging_steps": 10,
"max_steps": 8029,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 7777072302784512.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}