vit-base-anime-e10 / trainer_state.json
fuji12345's picture
End of training
c58c3a0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 10190,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009813542688910697,
"grad_norm": 1.582729697227478,
"learning_rate": 4.9955839057899906e-05,
"loss": 0.6655,
"step": 10
},
{
"epoch": 0.019627085377821395,
"grad_norm": 2.231220006942749,
"learning_rate": 4.990677134445535e-05,
"loss": 0.6564,
"step": 20
},
{
"epoch": 0.029440628066732092,
"grad_norm": 2.0203068256378174,
"learning_rate": 4.98577036310108e-05,
"loss": 0.5558,
"step": 30
},
{
"epoch": 0.03925417075564279,
"grad_norm": 1.8266687393188477,
"learning_rate": 4.980863591756624e-05,
"loss": 0.3794,
"step": 40
},
{
"epoch": 0.04906771344455348,
"grad_norm": 0.9987813830375671,
"learning_rate": 4.9759568204121696e-05,
"loss": 0.3129,
"step": 50
},
{
"epoch": 0.058881256133464184,
"grad_norm": 6.515292644500732,
"learning_rate": 4.971050049067714e-05,
"loss": 0.2824,
"step": 60
},
{
"epoch": 0.06869479882237488,
"grad_norm": 12.534858703613281,
"learning_rate": 4.966143277723259e-05,
"loss": 0.2215,
"step": 70
},
{
"epoch": 0.07850834151128558,
"grad_norm": 1.54276704788208,
"learning_rate": 4.961236506378803e-05,
"loss": 0.0981,
"step": 80
},
{
"epoch": 0.08832188420019627,
"grad_norm": 1.2632302045822144,
"learning_rate": 4.956329735034347e-05,
"loss": 0.0815,
"step": 90
},
{
"epoch": 0.09813542688910697,
"grad_norm": 8.447662353515625,
"learning_rate": 4.951422963689892e-05,
"loss": 0.076,
"step": 100
},
{
"epoch": 0.10794896957801767,
"grad_norm": 0.18015748262405396,
"learning_rate": 4.946516192345437e-05,
"loss": 0.0877,
"step": 110
},
{
"epoch": 0.11776251226692837,
"grad_norm": 0.57831871509552,
"learning_rate": 4.941609421000982e-05,
"loss": 0.2572,
"step": 120
},
{
"epoch": 0.12757605495583907,
"grad_norm": 0.19590969383716583,
"learning_rate": 4.936702649656526e-05,
"loss": 0.1026,
"step": 130
},
{
"epoch": 0.13738959764474976,
"grad_norm": 0.1655094027519226,
"learning_rate": 4.931795878312071e-05,
"loss": 0.0515,
"step": 140
},
{
"epoch": 0.14720314033366044,
"grad_norm": 0.09949669986963272,
"learning_rate": 4.926889106967615e-05,
"loss": 0.1725,
"step": 150
},
{
"epoch": 0.15701668302257116,
"grad_norm": 0.10112312436103821,
"learning_rate": 4.92198233562316e-05,
"loss": 0.279,
"step": 160
},
{
"epoch": 0.16683022571148184,
"grad_norm": 0.08742330223321915,
"learning_rate": 4.917075564278705e-05,
"loss": 0.1379,
"step": 170
},
{
"epoch": 0.17664376840039253,
"grad_norm": 0.08159990608692169,
"learning_rate": 4.91216879293425e-05,
"loss": 0.0306,
"step": 180
},
{
"epoch": 0.18645731108930325,
"grad_norm": 0.06950151175260544,
"learning_rate": 4.907262021589794e-05,
"loss": 0.0121,
"step": 190
},
{
"epoch": 0.19627085377821393,
"grad_norm": 10.549564361572266,
"learning_rate": 4.902355250245339e-05,
"loss": 0.1062,
"step": 200
},
{
"epoch": 0.20608439646712462,
"grad_norm": 0.06549222767353058,
"learning_rate": 4.897448478900883e-05,
"loss": 0.0633,
"step": 210
},
{
"epoch": 0.21589793915603533,
"grad_norm": 15.829712867736816,
"learning_rate": 4.892541707556428e-05,
"loss": 0.0802,
"step": 220
},
{
"epoch": 0.22571148184494602,
"grad_norm": 1.9238972663879395,
"learning_rate": 4.887634936211973e-05,
"loss": 0.0688,
"step": 230
},
{
"epoch": 0.23552502453385674,
"grad_norm": 0.07546406239271164,
"learning_rate": 4.882728164867517e-05,
"loss": 0.0553,
"step": 240
},
{
"epoch": 0.24533856722276742,
"grad_norm": 0.3736225366592407,
"learning_rate": 4.877821393523062e-05,
"loss": 0.0661,
"step": 250
},
{
"epoch": 0.25515210991167814,
"grad_norm": 0.2823021709918976,
"learning_rate": 4.872914622178606e-05,
"loss": 0.0617,
"step": 260
},
{
"epoch": 0.2649656526005888,
"grad_norm": 4.213985919952393,
"learning_rate": 4.868007850834151e-05,
"loss": 0.0628,
"step": 270
},
{
"epoch": 0.2747791952894995,
"grad_norm": 0.09498825669288635,
"learning_rate": 4.863101079489696e-05,
"loss": 0.2317,
"step": 280
},
{
"epoch": 0.2845927379784102,
"grad_norm": 0.05033993721008301,
"learning_rate": 4.858194308145241e-05,
"loss": 0.008,
"step": 290
},
{
"epoch": 0.2944062806673209,
"grad_norm": 0.05174524709582329,
"learning_rate": 4.853287536800785e-05,
"loss": 0.0062,
"step": 300
},
{
"epoch": 0.3042198233562316,
"grad_norm": 0.05778587609529495,
"learning_rate": 4.84838076545633e-05,
"loss": 0.0728,
"step": 310
},
{
"epoch": 0.3140333660451423,
"grad_norm": 0.04121818020939827,
"learning_rate": 4.8434739941118744e-05,
"loss": 0.1023,
"step": 320
},
{
"epoch": 0.323846908734053,
"grad_norm": 0.049691397696733475,
"learning_rate": 4.838567222767419e-05,
"loss": 0.1547,
"step": 330
},
{
"epoch": 0.3336604514229637,
"grad_norm": 18.06907081604004,
"learning_rate": 4.833660451422964e-05,
"loss": 0.1047,
"step": 340
},
{
"epoch": 0.3434739941118744,
"grad_norm": 4.178352355957031,
"learning_rate": 4.828753680078509e-05,
"loss": 0.0453,
"step": 350
},
{
"epoch": 0.35328753680078506,
"grad_norm": 0.04574347659945488,
"learning_rate": 4.823846908734053e-05,
"loss": 0.0051,
"step": 360
},
{
"epoch": 0.3631010794896958,
"grad_norm": 0.0428372398018837,
"learning_rate": 4.818940137389598e-05,
"loss": 0.0606,
"step": 370
},
{
"epoch": 0.3729146221786065,
"grad_norm": 1.1302443742752075,
"learning_rate": 4.8140333660451424e-05,
"loss": 0.1306,
"step": 380
},
{
"epoch": 0.38272816486751715,
"grad_norm": 0.05571115016937256,
"learning_rate": 4.8091265947006866e-05,
"loss": 0.1686,
"step": 390
},
{
"epoch": 0.39254170755642787,
"grad_norm": 7.147359371185303,
"learning_rate": 4.804219823356232e-05,
"loss": 0.2399,
"step": 400
},
{
"epoch": 0.4023552502453386,
"grad_norm": 15.144064903259277,
"learning_rate": 4.7993130520117764e-05,
"loss": 0.1326,
"step": 410
},
{
"epoch": 0.41216879293424924,
"grad_norm": 36.820003509521484,
"learning_rate": 4.794406280667321e-05,
"loss": 0.0448,
"step": 420
},
{
"epoch": 0.42198233562315995,
"grad_norm": 0.14495636522769928,
"learning_rate": 4.7894995093228655e-05,
"loss": 0.1117,
"step": 430
},
{
"epoch": 0.43179587831207067,
"grad_norm": 0.046849966049194336,
"learning_rate": 4.7845927379784104e-05,
"loss": 0.0092,
"step": 440
},
{
"epoch": 0.44160942100098133,
"grad_norm": 11.208320617675781,
"learning_rate": 4.7796859666339546e-05,
"loss": 0.0612,
"step": 450
},
{
"epoch": 0.45142296368989204,
"grad_norm": 0.039328742772340775,
"learning_rate": 4.7747791952895e-05,
"loss": 0.0114,
"step": 460
},
{
"epoch": 0.46123650637880276,
"grad_norm": 0.06963516771793365,
"learning_rate": 4.7698724239450444e-05,
"loss": 0.0859,
"step": 470
},
{
"epoch": 0.47105004906771347,
"grad_norm": 0.0437813363969326,
"learning_rate": 4.764965652600589e-05,
"loss": 0.0734,
"step": 480
},
{
"epoch": 0.48086359175662413,
"grad_norm": 0.039823539555072784,
"learning_rate": 4.7600588812561336e-05,
"loss": 0.0085,
"step": 490
},
{
"epoch": 0.49067713444553485,
"grad_norm": 0.041767820715904236,
"learning_rate": 4.7551521099116785e-05,
"loss": 0.192,
"step": 500
},
{
"epoch": 0.5004906771344455,
"grad_norm": 0.06061721593141556,
"learning_rate": 4.750245338567223e-05,
"loss": 0.0775,
"step": 510
},
{
"epoch": 0.5103042198233563,
"grad_norm": 0.06972894817590714,
"learning_rate": 4.7453385672227676e-05,
"loss": 0.0186,
"step": 520
},
{
"epoch": 0.5201177625122669,
"grad_norm": 0.07340658456087112,
"learning_rate": 4.7404317958783125e-05,
"loss": 0.1116,
"step": 530
},
{
"epoch": 0.5299313052011776,
"grad_norm": 0.03600945696234703,
"learning_rate": 4.735525024533857e-05,
"loss": 0.0052,
"step": 540
},
{
"epoch": 0.5397448478900884,
"grad_norm": 0.029176561161875725,
"learning_rate": 4.7306182531894016e-05,
"loss": 0.0042,
"step": 550
},
{
"epoch": 0.549558390578999,
"grad_norm": 0.029673976823687553,
"learning_rate": 4.725711481844946e-05,
"loss": 0.0649,
"step": 560
},
{
"epoch": 0.5593719332679097,
"grad_norm": 0.028555743396282196,
"learning_rate": 4.720804710500491e-05,
"loss": 0.0639,
"step": 570
},
{
"epoch": 0.5691854759568205,
"grad_norm": 5.0781378746032715,
"learning_rate": 4.7158979391560356e-05,
"loss": 0.1382,
"step": 580
},
{
"epoch": 0.5789990186457311,
"grad_norm": 0.05270172283053398,
"learning_rate": 4.7109911678115805e-05,
"loss": 0.042,
"step": 590
},
{
"epoch": 0.5888125613346418,
"grad_norm": 0.03829975053668022,
"learning_rate": 4.706084396467125e-05,
"loss": 0.1485,
"step": 600
},
{
"epoch": 0.5986261040235525,
"grad_norm": 0.03773980960249901,
"learning_rate": 4.7011776251226696e-05,
"loss": 0.0435,
"step": 610
},
{
"epoch": 0.6084396467124632,
"grad_norm": 0.06550557911396027,
"learning_rate": 4.696270853778214e-05,
"loss": 0.1656,
"step": 620
},
{
"epoch": 0.6182531894013739,
"grad_norm": 0.033752694725990295,
"learning_rate": 4.691364082433759e-05,
"loss": 0.0613,
"step": 630
},
{
"epoch": 0.6280667320902846,
"grad_norm": 0.10115873068571091,
"learning_rate": 4.6864573110893036e-05,
"loss": 0.0622,
"step": 640
},
{
"epoch": 0.6378802747791953,
"grad_norm": 0.10119258612394333,
"learning_rate": 4.6815505397448485e-05,
"loss": 0.0049,
"step": 650
},
{
"epoch": 0.647693817468106,
"grad_norm": 0.029145225882530212,
"learning_rate": 4.676643768400393e-05,
"loss": 0.0446,
"step": 660
},
{
"epoch": 0.6575073601570167,
"grad_norm": 0.026971256360411644,
"learning_rate": 4.671736997055937e-05,
"loss": 0.0035,
"step": 670
},
{
"epoch": 0.6673209028459274,
"grad_norm": 0.05692035332322121,
"learning_rate": 4.666830225711482e-05,
"loss": 0.0051,
"step": 680
},
{
"epoch": 0.677134445534838,
"grad_norm": 0.023497162386775017,
"learning_rate": 4.661923454367027e-05,
"loss": 0.0031,
"step": 690
},
{
"epoch": 0.6869479882237488,
"grad_norm": 44.54225540161133,
"learning_rate": 4.657016683022572e-05,
"loss": 0.0392,
"step": 700
},
{
"epoch": 0.6967615309126595,
"grad_norm": 79.16250610351562,
"learning_rate": 4.652109911678116e-05,
"loss": 0.034,
"step": 710
},
{
"epoch": 0.7065750736015701,
"grad_norm": 7.570616722106934,
"learning_rate": 4.647203140333661e-05,
"loss": 0.5111,
"step": 720
},
{
"epoch": 0.7163886162904809,
"grad_norm": 0.05000855773687363,
"learning_rate": 4.642296368989205e-05,
"loss": 0.0211,
"step": 730
},
{
"epoch": 0.7262021589793916,
"grad_norm": 0.04305935651063919,
"learning_rate": 4.63738959764475e-05,
"loss": 0.1129,
"step": 740
},
{
"epoch": 0.7360157016683022,
"grad_norm": 0.03774946555495262,
"learning_rate": 4.632482826300295e-05,
"loss": 0.0963,
"step": 750
},
{
"epoch": 0.745829244357213,
"grad_norm": 0.04534028843045235,
"learning_rate": 4.62757605495584e-05,
"loss": 0.027,
"step": 760
},
{
"epoch": 0.7556427870461236,
"grad_norm": 0.03019464947283268,
"learning_rate": 4.622669283611384e-05,
"loss": 0.0148,
"step": 770
},
{
"epoch": 0.7654563297350343,
"grad_norm": 0.028817512094974518,
"learning_rate": 4.617762512266929e-05,
"loss": 0.1937,
"step": 780
},
{
"epoch": 0.7752698724239451,
"grad_norm": 13.481318473815918,
"learning_rate": 4.612855740922473e-05,
"loss": 0.0143,
"step": 790
},
{
"epoch": 0.7850834151128557,
"grad_norm": 0.029670100659132004,
"learning_rate": 4.607948969578018e-05,
"loss": 0.132,
"step": 800
},
{
"epoch": 0.7948969578017664,
"grad_norm": 0.02657695673406124,
"learning_rate": 4.603042198233563e-05,
"loss": 0.0469,
"step": 810
},
{
"epoch": 0.8047105004906772,
"grad_norm": 17.418657302856445,
"learning_rate": 4.598135426889107e-05,
"loss": 0.2565,
"step": 820
},
{
"epoch": 0.8145240431795878,
"grad_norm": 32.446815490722656,
"learning_rate": 4.593228655544652e-05,
"loss": 0.1315,
"step": 830
},
{
"epoch": 0.8243375858684985,
"grad_norm": 0.07365565747022629,
"learning_rate": 4.588321884200196e-05,
"loss": 0.0099,
"step": 840
},
{
"epoch": 0.8341511285574092,
"grad_norm": 0.027613624930381775,
"learning_rate": 4.583415112855741e-05,
"loss": 0.1194,
"step": 850
},
{
"epoch": 0.8439646712463199,
"grad_norm": 0.0382981114089489,
"learning_rate": 4.578508341511285e-05,
"loss": 0.0584,
"step": 860
},
{
"epoch": 0.8537782139352306,
"grad_norm": 0.03215600922703743,
"learning_rate": 4.573601570166831e-05,
"loss": 0.0699,
"step": 870
},
{
"epoch": 0.8635917566241413,
"grad_norm": 0.09949254989624023,
"learning_rate": 4.568694798822375e-05,
"loss": 0.0052,
"step": 880
},
{
"epoch": 0.873405299313052,
"grad_norm": 0.024233952164649963,
"learning_rate": 4.56378802747792e-05,
"loss": 0.0739,
"step": 890
},
{
"epoch": 0.8832188420019627,
"grad_norm": 0.04114871844649315,
"learning_rate": 4.558881256133464e-05,
"loss": 0.0629,
"step": 900
},
{
"epoch": 0.8930323846908734,
"grad_norm": 0.02779584936797619,
"learning_rate": 4.553974484789009e-05,
"loss": 0.0137,
"step": 910
},
{
"epoch": 0.9028459273797841,
"grad_norm": 0.02605428174138069,
"learning_rate": 4.549067713444553e-05,
"loss": 0.0669,
"step": 920
},
{
"epoch": 0.9126594700686947,
"grad_norm": 0.02386913076043129,
"learning_rate": 4.544160942100099e-05,
"loss": 0.1172,
"step": 930
},
{
"epoch": 0.9224730127576055,
"grad_norm": 7.601992130279541,
"learning_rate": 4.539254170755643e-05,
"loss": 0.0611,
"step": 940
},
{
"epoch": 0.9322865554465162,
"grad_norm": 37.712459564208984,
"learning_rate": 4.534347399411188e-05,
"loss": 0.0191,
"step": 950
},
{
"epoch": 0.9421000981354269,
"grad_norm": 0.02080320380628109,
"learning_rate": 4.529440628066732e-05,
"loss": 0.0026,
"step": 960
},
{
"epoch": 0.9519136408243376,
"grad_norm": 56.210121154785156,
"learning_rate": 4.5245338567222765e-05,
"loss": 0.0201,
"step": 970
},
{
"epoch": 0.9617271835132483,
"grad_norm": 0.020656289532780647,
"learning_rate": 4.5196270853778214e-05,
"loss": 0.0601,
"step": 980
},
{
"epoch": 0.971540726202159,
"grad_norm": 2.9532973766326904,
"learning_rate": 4.514720314033366e-05,
"loss": 0.2589,
"step": 990
},
{
"epoch": 0.9813542688910697,
"grad_norm": 0.6973829865455627,
"learning_rate": 4.509813542688911e-05,
"loss": 0.1333,
"step": 1000
},
{
"epoch": 0.9911678115799804,
"grad_norm": 0.07507435232400894,
"learning_rate": 4.5049067713444554e-05,
"loss": 0.033,
"step": 1010
},
{
"epoch": 1.000981354268891,
"grad_norm": 0.025814848020672798,
"learning_rate": 4.5e-05,
"loss": 0.0752,
"step": 1020
},
{
"epoch": 1.0107948969578018,
"grad_norm": 0.02597714029252529,
"learning_rate": 4.4950932286555445e-05,
"loss": 0.073,
"step": 1030
},
{
"epoch": 1.0206084396467126,
"grad_norm": 0.045735057443380356,
"learning_rate": 4.4901864573110894e-05,
"loss": 0.0649,
"step": 1040
},
{
"epoch": 1.030421982335623,
"grad_norm": 0.022413084283471107,
"learning_rate": 4.485279685966634e-05,
"loss": 0.1111,
"step": 1050
},
{
"epoch": 1.0402355250245339,
"grad_norm": 0.07954522222280502,
"learning_rate": 4.480372914622179e-05,
"loss": 0.1623,
"step": 1060
},
{
"epoch": 1.0500490677134446,
"grad_norm": 0.042877595871686935,
"learning_rate": 4.4754661432777234e-05,
"loss": 0.0954,
"step": 1070
},
{
"epoch": 1.0598626104023552,
"grad_norm": 0.3953896462917328,
"learning_rate": 4.470559371933268e-05,
"loss": 0.007,
"step": 1080
},
{
"epoch": 1.069676153091266,
"grad_norm": 0.023617839440703392,
"learning_rate": 4.4656526005888125e-05,
"loss": 0.0037,
"step": 1090
},
{
"epoch": 1.0794896957801767,
"grad_norm": 0.018734032288193703,
"learning_rate": 4.4607458292443574e-05,
"loss": 0.0046,
"step": 1100
},
{
"epoch": 1.0893032384690873,
"grad_norm": 0.0192360058426857,
"learning_rate": 4.455839057899902e-05,
"loss": 0.1574,
"step": 1110
},
{
"epoch": 1.099116781157998,
"grad_norm": 0.028046630322933197,
"learning_rate": 4.4509322865554466e-05,
"loss": 0.0033,
"step": 1120
},
{
"epoch": 1.1089303238469088,
"grad_norm": 0.01861531473696232,
"learning_rate": 4.4460255152109915e-05,
"loss": 0.0025,
"step": 1130
},
{
"epoch": 1.1187438665358194,
"grad_norm": 0.018167071044445038,
"learning_rate": 4.441118743866536e-05,
"loss": 0.0023,
"step": 1140
},
{
"epoch": 1.1285574092247301,
"grad_norm": 0.07722876965999603,
"learning_rate": 4.4362119725220806e-05,
"loss": 0.058,
"step": 1150
},
{
"epoch": 1.138370951913641,
"grad_norm": 0.018998106941580772,
"learning_rate": 4.4313052011776255e-05,
"loss": 0.0022,
"step": 1160
},
{
"epoch": 1.1481844946025515,
"grad_norm": 0.01866711676120758,
"learning_rate": 4.4263984298331704e-05,
"loss": 0.0025,
"step": 1170
},
{
"epoch": 1.1579980372914622,
"grad_norm": 0.017769252881407738,
"learning_rate": 4.4214916584887146e-05,
"loss": 0.0022,
"step": 1180
},
{
"epoch": 1.167811579980373,
"grad_norm": 0.019811883568763733,
"learning_rate": 4.4165848871442595e-05,
"loss": 0.0019,
"step": 1190
},
{
"epoch": 1.1776251226692835,
"grad_norm": 0.02496548742055893,
"learning_rate": 4.411678115799804e-05,
"loss": 0.0021,
"step": 1200
},
{
"epoch": 1.1874386653581943,
"grad_norm": 0.01511597540229559,
"learning_rate": 4.4067713444553486e-05,
"loss": 0.0019,
"step": 1210
},
{
"epoch": 1.197252208047105,
"grad_norm": 0.01455361396074295,
"learning_rate": 4.4018645731108935e-05,
"loss": 0.0019,
"step": 1220
},
{
"epoch": 1.2070657507360156,
"grad_norm": 0.0400017648935318,
"learning_rate": 4.3969578017664384e-05,
"loss": 0.0018,
"step": 1230
},
{
"epoch": 1.2168792934249264,
"grad_norm": 0.016889173537492752,
"learning_rate": 4.3920510304219826e-05,
"loss": 0.1328,
"step": 1240
},
{
"epoch": 1.2266928361138372,
"grad_norm": 0.07678048312664032,
"learning_rate": 4.3871442590775275e-05,
"loss": 0.002,
"step": 1250
},
{
"epoch": 1.2365063788027477,
"grad_norm": 0.022459661588072777,
"learning_rate": 4.382237487733072e-05,
"loss": 0.0419,
"step": 1260
},
{
"epoch": 1.2463199214916585,
"grad_norm": 0.015639062970876694,
"learning_rate": 4.377330716388616e-05,
"loss": 0.0021,
"step": 1270
},
{
"epoch": 1.2561334641805693,
"grad_norm": 0.014097293838858604,
"learning_rate": 4.372423945044161e-05,
"loss": 0.013,
"step": 1280
},
{
"epoch": 1.2659470068694798,
"grad_norm": 0.014198847115039825,
"learning_rate": 4.367517173699706e-05,
"loss": 0.0018,
"step": 1290
},
{
"epoch": 1.2757605495583906,
"grad_norm": 0.020636072382330894,
"learning_rate": 4.3626104023552507e-05,
"loss": 0.002,
"step": 1300
},
{
"epoch": 1.2855740922473013,
"grad_norm": 0.013957252725958824,
"learning_rate": 4.357703631010795e-05,
"loss": 0.0016,
"step": 1310
},
{
"epoch": 1.295387634936212,
"grad_norm": 0.8039536476135254,
"learning_rate": 4.35279685966634e-05,
"loss": 0.0174,
"step": 1320
},
{
"epoch": 1.3052011776251227,
"grad_norm": 0.034514885395765305,
"learning_rate": 4.347890088321884e-05,
"loss": 0.0018,
"step": 1330
},
{
"epoch": 1.3150147203140334,
"grad_norm": 0.0127074820920825,
"learning_rate": 4.342983316977429e-05,
"loss": 0.2055,
"step": 1340
},
{
"epoch": 1.324828263002944,
"grad_norm": 0.09654640406370163,
"learning_rate": 4.338076545632974e-05,
"loss": 0.0653,
"step": 1350
},
{
"epoch": 1.3346418056918548,
"grad_norm": 0.018491486087441444,
"learning_rate": 4.333169774288519e-05,
"loss": 0.0019,
"step": 1360
},
{
"epoch": 1.3444553483807655,
"grad_norm": 0.014405413530766964,
"learning_rate": 4.328263002944063e-05,
"loss": 0.0752,
"step": 1370
},
{
"epoch": 1.354268891069676,
"grad_norm": 0.3947644531726837,
"learning_rate": 4.323356231599608e-05,
"loss": 0.0651,
"step": 1380
},
{
"epoch": 1.3640824337585868,
"grad_norm": 0.027137773111462593,
"learning_rate": 4.318449460255152e-05,
"loss": 0.0833,
"step": 1390
},
{
"epoch": 1.3738959764474976,
"grad_norm": 0.03568737953901291,
"learning_rate": 4.313542688910697e-05,
"loss": 0.002,
"step": 1400
},
{
"epoch": 1.3837095191364082,
"grad_norm": 0.14877857267856598,
"learning_rate": 4.308635917566242e-05,
"loss": 0.0691,
"step": 1410
},
{
"epoch": 1.393523061825319,
"grad_norm": 0.018405767157673836,
"learning_rate": 4.303729146221786e-05,
"loss": 0.0029,
"step": 1420
},
{
"epoch": 1.4033366045142297,
"grad_norm": 0.5927426815032959,
"learning_rate": 4.298822374877331e-05,
"loss": 0.0024,
"step": 1430
},
{
"epoch": 1.4131501472031402,
"grad_norm": 0.018540162593126297,
"learning_rate": 4.293915603532875e-05,
"loss": 0.0125,
"step": 1440
},
{
"epoch": 1.422963689892051,
"grad_norm": 27.07039451599121,
"learning_rate": 4.28900883218842e-05,
"loss": 0.125,
"step": 1450
},
{
"epoch": 1.4327772325809618,
"grad_norm": 0.020999347791075706,
"learning_rate": 4.284102060843965e-05,
"loss": 0.0787,
"step": 1460
},
{
"epoch": 1.4425907752698723,
"grad_norm": 0.09069288522005081,
"learning_rate": 4.27919528949951e-05,
"loss": 0.0228,
"step": 1470
},
{
"epoch": 1.452404317958783,
"grad_norm": 0.014280487783253193,
"learning_rate": 4.274288518155054e-05,
"loss": 0.0406,
"step": 1480
},
{
"epoch": 1.4622178606476939,
"grad_norm": 0.014194531366229057,
"learning_rate": 4.269381746810599e-05,
"loss": 0.0024,
"step": 1490
},
{
"epoch": 1.4720314033366044,
"grad_norm": 0.019226528704166412,
"learning_rate": 4.264474975466143e-05,
"loss": 0.0019,
"step": 1500
},
{
"epoch": 1.4818449460255152,
"grad_norm": 0.015254977159202099,
"learning_rate": 4.259568204121688e-05,
"loss": 0.0022,
"step": 1510
},
{
"epoch": 1.491658488714426,
"grad_norm": 5.0018768310546875,
"learning_rate": 4.254661432777233e-05,
"loss": 0.1376,
"step": 1520
},
{
"epoch": 1.5014720314033365,
"grad_norm": 0.032981228083372116,
"learning_rate": 4.249754661432778e-05,
"loss": 0.0174,
"step": 1530
},
{
"epoch": 1.5112855740922473,
"grad_norm": 0.011964638717472553,
"learning_rate": 4.244847890088322e-05,
"loss": 0.0028,
"step": 1540
},
{
"epoch": 1.521099116781158,
"grad_norm": 0.011394723318517208,
"learning_rate": 4.239941118743867e-05,
"loss": 0.0514,
"step": 1550
},
{
"epoch": 1.5309126594700686,
"grad_norm": 0.01083845179527998,
"learning_rate": 4.235034347399411e-05,
"loss": 0.0428,
"step": 1560
},
{
"epoch": 1.5407262021589794,
"grad_norm": 0.017966322600841522,
"learning_rate": 4.230127576054956e-05,
"loss": 0.0015,
"step": 1570
},
{
"epoch": 1.5505397448478901,
"grad_norm": 0.029729802161455154,
"learning_rate": 4.225220804710501e-05,
"loss": 0.0021,
"step": 1580
},
{
"epoch": 1.5603532875368007,
"grad_norm": 0.01271316409111023,
"learning_rate": 4.220314033366045e-05,
"loss": 0.1416,
"step": 1590
},
{
"epoch": 1.5701668302257115,
"grad_norm": 0.01406879723072052,
"learning_rate": 4.21540726202159e-05,
"loss": 0.0022,
"step": 1600
},
{
"epoch": 1.5799803729146222,
"grad_norm": 0.01311685424298048,
"learning_rate": 4.2105004906771344e-05,
"loss": 0.0505,
"step": 1610
},
{
"epoch": 1.5897939156035328,
"grad_norm": 0.015997188165783882,
"learning_rate": 4.205593719332679e-05,
"loss": 0.1032,
"step": 1620
},
{
"epoch": 1.5996074582924436,
"grad_norm": 0.021411443129181862,
"learning_rate": 4.2006869479882235e-05,
"loss": 0.0024,
"step": 1630
},
{
"epoch": 1.6094210009813543,
"grad_norm": 0.011407027952373028,
"learning_rate": 4.195780176643769e-05,
"loss": 0.0021,
"step": 1640
},
{
"epoch": 1.6192345436702649,
"grad_norm": 0.03794229030609131,
"learning_rate": 4.190873405299313e-05,
"loss": 0.0521,
"step": 1650
},
{
"epoch": 1.6290480863591756,
"grad_norm": 0.012096689082682133,
"learning_rate": 4.185966633954858e-05,
"loss": 0.0017,
"step": 1660
},
{
"epoch": 1.6388616290480864,
"grad_norm": 0.013807197101414204,
"learning_rate": 4.1810598626104024e-05,
"loss": 0.0014,
"step": 1670
},
{
"epoch": 1.648675171736997,
"grad_norm": 0.010036585852503777,
"learning_rate": 4.176153091265947e-05,
"loss": 0.0016,
"step": 1680
},
{
"epoch": 1.6584887144259077,
"grad_norm": 0.009630713611841202,
"learning_rate": 4.1712463199214915e-05,
"loss": 0.0017,
"step": 1690
},
{
"epoch": 1.6683022571148185,
"grad_norm": 0.009277078323066235,
"learning_rate": 4.1663395485770364e-05,
"loss": 0.0421,
"step": 1700
},
{
"epoch": 1.678115799803729,
"grad_norm": 0.01374200638383627,
"learning_rate": 4.161432777232581e-05,
"loss": 0.0755,
"step": 1710
},
{
"epoch": 1.6879293424926398,
"grad_norm": 0.010488603264093399,
"learning_rate": 4.1565260058881255e-05,
"loss": 0.1944,
"step": 1720
},
{
"epoch": 1.6977428851815506,
"grad_norm": 1.660660982131958,
"learning_rate": 4.1516192345436704e-05,
"loss": 0.0907,
"step": 1730
},
{
"epoch": 1.7075564278704611,
"grad_norm": 0.1264234334230423,
"learning_rate": 4.1467124631992147e-05,
"loss": 0.0785,
"step": 1740
},
{
"epoch": 1.717369970559372,
"grad_norm": 0.1920449286699295,
"learning_rate": 4.1418056918547595e-05,
"loss": 0.1021,
"step": 1750
},
{
"epoch": 1.7271835132482827,
"grad_norm": 0.012796717695891857,
"learning_rate": 4.1368989205103044e-05,
"loss": 0.0559,
"step": 1760
},
{
"epoch": 1.7369970559371932,
"grad_norm": 27.7369384765625,
"learning_rate": 4.1319921491658493e-05,
"loss": 0.0922,
"step": 1770
},
{
"epoch": 1.746810598626104,
"grad_norm": 6.674612522125244,
"learning_rate": 4.1270853778213936e-05,
"loss": 0.0059,
"step": 1780
},
{
"epoch": 1.7566241413150148,
"grad_norm": 0.016583973541855812,
"learning_rate": 4.1221786064769385e-05,
"loss": 0.0692,
"step": 1790
},
{
"epoch": 1.7664376840039253,
"grad_norm": 9.963221549987793,
"learning_rate": 4.117271835132483e-05,
"loss": 0.1275,
"step": 1800
},
{
"epoch": 1.776251226692836,
"grad_norm": 0.011029438115656376,
"learning_rate": 4.1123650637880276e-05,
"loss": 0.0016,
"step": 1810
},
{
"epoch": 1.7860647693817469,
"grad_norm": 0.019155096262693405,
"learning_rate": 4.1074582924435725e-05,
"loss": 0.0013,
"step": 1820
},
{
"epoch": 1.7958783120706574,
"grad_norm": 0.011651580221951008,
"learning_rate": 4.1025515210991174e-05,
"loss": 0.0022,
"step": 1830
},
{
"epoch": 1.8056918547595682,
"grad_norm": 0.08313179016113281,
"learning_rate": 4.0976447497546616e-05,
"loss": 0.0016,
"step": 1840
},
{
"epoch": 1.815505397448479,
"grad_norm": 0.0103986244648695,
"learning_rate": 4.092737978410206e-05,
"loss": 0.0791,
"step": 1850
},
{
"epoch": 1.8253189401373895,
"grad_norm": 0.009013223461806774,
"learning_rate": 4.087831207065751e-05,
"loss": 0.0582,
"step": 1860
},
{
"epoch": 1.8351324828263003,
"grad_norm": 0.010367879644036293,
"learning_rate": 4.0829244357212956e-05,
"loss": 0.0111,
"step": 1870
},
{
"epoch": 1.844946025515211,
"grad_norm": 0.03960138186812401,
"learning_rate": 4.0780176643768405e-05,
"loss": 0.0027,
"step": 1880
},
{
"epoch": 1.8547595682041216,
"grad_norm": 8.385934829711914,
"learning_rate": 4.073110893032385e-05,
"loss": 0.0666,
"step": 1890
},
{
"epoch": 1.8645731108930323,
"grad_norm": 0.008534184657037258,
"learning_rate": 4.0682041216879296e-05,
"loss": 0.0012,
"step": 1900
},
{
"epoch": 1.8743866535819431,
"grad_norm": 0.009065428748726845,
"learning_rate": 4.063297350343474e-05,
"loss": 0.0017,
"step": 1910
},
{
"epoch": 1.8842001962708537,
"grad_norm": 0.6313449144363403,
"learning_rate": 4.058390578999019e-05,
"loss": 0.13,
"step": 1920
},
{
"epoch": 1.8940137389597644,
"grad_norm": 7.004214286804199,
"learning_rate": 4.0534838076545636e-05,
"loss": 0.1427,
"step": 1930
},
{
"epoch": 1.9038272816486752,
"grad_norm": 0.045783668756484985,
"learning_rate": 4.0485770363101085e-05,
"loss": 0.1027,
"step": 1940
},
{
"epoch": 1.9136408243375858,
"grad_norm": 0.28262466192245483,
"learning_rate": 4.043670264965653e-05,
"loss": 0.0067,
"step": 1950
},
{
"epoch": 1.9234543670264965,
"grad_norm": 0.010344293899834156,
"learning_rate": 4.038763493621198e-05,
"loss": 0.0525,
"step": 1960
},
{
"epoch": 1.9332679097154073,
"grad_norm": 0.06860088557004929,
"learning_rate": 4.033856722276742e-05,
"loss": 0.1299,
"step": 1970
},
{
"epoch": 1.9430814524043178,
"grad_norm": 1.1787910461425781,
"learning_rate": 4.028949950932287e-05,
"loss": 0.123,
"step": 1980
},
{
"epoch": 1.9528949950932286,
"grad_norm": 0.008300206623971462,
"learning_rate": 4.024043179587832e-05,
"loss": 0.0547,
"step": 1990
},
{
"epoch": 1.9627085377821394,
"grad_norm": 19.000707626342773,
"learning_rate": 4.019136408243376e-05,
"loss": 0.1152,
"step": 2000
},
{
"epoch": 1.97252208047105,
"grad_norm": 0.17480367422103882,
"learning_rate": 4.014229636898921e-05,
"loss": 0.0875,
"step": 2010
},
{
"epoch": 1.9823356231599607,
"grad_norm": 0.014936638996005058,
"learning_rate": 4.009322865554465e-05,
"loss": 0.0463,
"step": 2020
},
{
"epoch": 1.9921491658488715,
"grad_norm": 0.04136461392045021,
"learning_rate": 4.00441609421001e-05,
"loss": 0.0205,
"step": 2030
},
{
"epoch": 2.001962708537782,
"grad_norm": 0.008648752234876156,
"learning_rate": 3.999509322865554e-05,
"loss": 0.0015,
"step": 2040
},
{
"epoch": 2.011776251226693,
"grad_norm": 0.03223758190870285,
"learning_rate": 3.9946025515211e-05,
"loss": 0.0015,
"step": 2050
},
{
"epoch": 2.0215897939156036,
"grad_norm": 0.07674010843038559,
"learning_rate": 3.989695780176644e-05,
"loss": 0.0437,
"step": 2060
},
{
"epoch": 2.031403336604514,
"grad_norm": 0.00833066739141941,
"learning_rate": 3.984789008832189e-05,
"loss": 0.0073,
"step": 2070
},
{
"epoch": 2.041216879293425,
"grad_norm": 0.04498624801635742,
"learning_rate": 3.979882237487733e-05,
"loss": 0.001,
"step": 2080
},
{
"epoch": 2.0510304219823356,
"grad_norm": 0.00966518186032772,
"learning_rate": 3.974975466143278e-05,
"loss": 0.0009,
"step": 2090
},
{
"epoch": 2.060843964671246,
"grad_norm": 0.00706259673461318,
"learning_rate": 3.970068694798822e-05,
"loss": 0.0399,
"step": 2100
},
{
"epoch": 2.070657507360157,
"grad_norm": 0.8026844263076782,
"learning_rate": 3.965161923454368e-05,
"loss": 0.0022,
"step": 2110
},
{
"epoch": 2.0804710500490677,
"grad_norm": 0.006883909460157156,
"learning_rate": 3.960255152109912e-05,
"loss": 0.0477,
"step": 2120
},
{
"epoch": 2.0902845927379783,
"grad_norm": 0.007603227626532316,
"learning_rate": 3.955348380765457e-05,
"loss": 0.001,
"step": 2130
},
{
"epoch": 2.1000981354268893,
"grad_norm": 0.007116556167602539,
"learning_rate": 3.950441609421001e-05,
"loss": 0.1037,
"step": 2140
},
{
"epoch": 2.1099116781158,
"grad_norm": 0.01041839923709631,
"learning_rate": 3.945534838076545e-05,
"loss": 0.002,
"step": 2150
},
{
"epoch": 2.1197252208047104,
"grad_norm": 0.007161868270486593,
"learning_rate": 3.94062806673209e-05,
"loss": 0.0019,
"step": 2160
},
{
"epoch": 2.1295387634936214,
"grad_norm": 5.720839977264404,
"learning_rate": 3.935721295387635e-05,
"loss": 0.0342,
"step": 2170
},
{
"epoch": 2.139352306182532,
"grad_norm": 0.006590616423636675,
"learning_rate": 3.93081452404318e-05,
"loss": 0.0009,
"step": 2180
},
{
"epoch": 2.1491658488714425,
"grad_norm": 0.006484217941761017,
"learning_rate": 3.925907752698724e-05,
"loss": 0.0009,
"step": 2190
},
{
"epoch": 2.1589793915603535,
"grad_norm": 0.006214112509042025,
"learning_rate": 3.921000981354269e-05,
"loss": 0.0007,
"step": 2200
},
{
"epoch": 2.168792934249264,
"grad_norm": 0.006126193795353174,
"learning_rate": 3.9160942100098133e-05,
"loss": 0.0007,
"step": 2210
},
{
"epoch": 2.1786064769381746,
"grad_norm": 0.006230359897017479,
"learning_rate": 3.911187438665358e-05,
"loss": 0.0008,
"step": 2220
},
{
"epoch": 2.1884200196270855,
"grad_norm": 0.0061959754675626755,
"learning_rate": 3.906280667320903e-05,
"loss": 0.0008,
"step": 2230
},
{
"epoch": 2.198233562315996,
"grad_norm": 0.006188957951962948,
"learning_rate": 3.901373895976448e-05,
"loss": 0.0007,
"step": 2240
},
{
"epoch": 2.2080471050049066,
"grad_norm": 0.005858860444277525,
"learning_rate": 3.896467124631992e-05,
"loss": 0.0705,
"step": 2250
},
{
"epoch": 2.2178606476938176,
"grad_norm": 0.006062773987650871,
"learning_rate": 3.891560353287537e-05,
"loss": 0.0526,
"step": 2260
},
{
"epoch": 2.227674190382728,
"grad_norm": 0.005856741685420275,
"learning_rate": 3.8866535819430814e-05,
"loss": 0.0007,
"step": 2270
},
{
"epoch": 2.2374877330716387,
"grad_norm": 0.009888865053653717,
"learning_rate": 3.881746810598626e-05,
"loss": 0.0467,
"step": 2280
},
{
"epoch": 2.2473012757605497,
"grad_norm": 31.81001853942871,
"learning_rate": 3.876840039254171e-05,
"loss": 0.008,
"step": 2290
},
{
"epoch": 2.2571148184494603,
"grad_norm": 0.006301484536379576,
"learning_rate": 3.8719332679097154e-05,
"loss": 0.0007,
"step": 2300
},
{
"epoch": 2.266928361138371,
"grad_norm": 0.006280009169131517,
"learning_rate": 3.86702649656526e-05,
"loss": 0.001,
"step": 2310
},
{
"epoch": 2.276741903827282,
"grad_norm": 1.7603585720062256,
"learning_rate": 3.8621197252208045e-05,
"loss": 0.1061,
"step": 2320
},
{
"epoch": 2.2865554465161924,
"grad_norm": 0.006645340472459793,
"learning_rate": 3.8572129538763494e-05,
"loss": 0.0007,
"step": 2330
},
{
"epoch": 2.296368989205103,
"grad_norm": 0.008996882475912571,
"learning_rate": 3.852306182531894e-05,
"loss": 0.0857,
"step": 2340
},
{
"epoch": 2.306182531894014,
"grad_norm": 16.020790100097656,
"learning_rate": 3.847399411187439e-05,
"loss": 0.0797,
"step": 2350
},
{
"epoch": 2.3159960745829244,
"grad_norm": 30.399795532226562,
"learning_rate": 3.8424926398429834e-05,
"loss": 0.0316,
"step": 2360
},
{
"epoch": 2.325809617271835,
"grad_norm": 0.019369609653949738,
"learning_rate": 3.837585868498528e-05,
"loss": 0.0244,
"step": 2370
},
{
"epoch": 2.335623159960746,
"grad_norm": 0.009330210275948048,
"learning_rate": 3.8326790971540725e-05,
"loss": 0.0308,
"step": 2380
},
{
"epoch": 2.3454367026496565,
"grad_norm": 0.016002874821424484,
"learning_rate": 3.8277723258096174e-05,
"loss": 0.0056,
"step": 2390
},
{
"epoch": 2.355250245338567,
"grad_norm": 0.008927385322749615,
"learning_rate": 3.8228655544651623e-05,
"loss": 0.001,
"step": 2400
},
{
"epoch": 2.365063788027478,
"grad_norm": 0.010494639165699482,
"learning_rate": 3.817958783120707e-05,
"loss": 0.0014,
"step": 2410
},
{
"epoch": 2.3748773307163886,
"grad_norm": 0.007917719893157482,
"learning_rate": 3.8130520117762515e-05,
"loss": 0.0009,
"step": 2420
},
{
"epoch": 2.384690873405299,
"grad_norm": 0.005997834727168083,
"learning_rate": 3.8081452404317964e-05,
"loss": 0.001,
"step": 2430
},
{
"epoch": 2.39450441609421,
"grad_norm": 0.006399065256118774,
"learning_rate": 3.8032384690873406e-05,
"loss": 0.0008,
"step": 2440
},
{
"epoch": 2.4043179587831207,
"grad_norm": 0.010832864791154861,
"learning_rate": 3.798331697742885e-05,
"loss": 0.0016,
"step": 2450
},
{
"epoch": 2.4141315014720313,
"grad_norm": 0.007472364231944084,
"learning_rate": 3.7934249263984304e-05,
"loss": 0.0007,
"step": 2460
},
{
"epoch": 2.4239450441609423,
"grad_norm": 0.005750945303589106,
"learning_rate": 3.7885181550539746e-05,
"loss": 0.0007,
"step": 2470
},
{
"epoch": 2.433758586849853,
"grad_norm": 0.00842629000544548,
"learning_rate": 3.7836113837095195e-05,
"loss": 0.0009,
"step": 2480
},
{
"epoch": 2.4435721295387633,
"grad_norm": 1.4052761793136597,
"learning_rate": 3.778704612365064e-05,
"loss": 0.1304,
"step": 2490
},
{
"epoch": 2.4533856722276743,
"grad_norm": 0.007391956634819508,
"learning_rate": 3.7737978410206086e-05,
"loss": 0.0007,
"step": 2500
},
{
"epoch": 2.463199214916585,
"grad_norm": 20.43938446044922,
"learning_rate": 3.768891069676153e-05,
"loss": 0.1268,
"step": 2510
},
{
"epoch": 2.4730127576054954,
"grad_norm": 0.05450147017836571,
"learning_rate": 3.7639842983316984e-05,
"loss": 0.0139,
"step": 2520
},
{
"epoch": 2.4828263002944064,
"grad_norm": 0.03355271369218826,
"learning_rate": 3.7590775269872426e-05,
"loss": 0.0147,
"step": 2530
},
{
"epoch": 2.492639842983317,
"grad_norm": 0.007103215903043747,
"learning_rate": 3.7541707556427875e-05,
"loss": 0.1271,
"step": 2540
},
{
"epoch": 2.5024533856722275,
"grad_norm": 0.007379031740128994,
"learning_rate": 3.749263984298332e-05,
"loss": 0.0009,
"step": 2550
},
{
"epoch": 2.5122669283611385,
"grad_norm": 0.2919241786003113,
"learning_rate": 3.7443572129538766e-05,
"loss": 0.0025,
"step": 2560
},
{
"epoch": 2.522080471050049,
"grad_norm": 0.01704682782292366,
"learning_rate": 3.739450441609421e-05,
"loss": 0.0007,
"step": 2570
},
{
"epoch": 2.5318940137389596,
"grad_norm": 0.005663587246090174,
"learning_rate": 3.734543670264966e-05,
"loss": 0.001,
"step": 2580
},
{
"epoch": 2.5417075564278706,
"grad_norm": 0.006800634786486626,
"learning_rate": 3.7296368989205107e-05,
"loss": 0.0006,
"step": 2590
},
{
"epoch": 2.551521099116781,
"grad_norm": 0.0049642156809568405,
"learning_rate": 3.724730127576055e-05,
"loss": 0.0007,
"step": 2600
},
{
"epoch": 2.5613346418056917,
"grad_norm": 0.0051542771980166435,
"learning_rate": 3.7198233562316e-05,
"loss": 0.0006,
"step": 2610
},
{
"epoch": 2.5711481844946027,
"grad_norm": 0.005128064192831516,
"learning_rate": 3.714916584887144e-05,
"loss": 0.0006,
"step": 2620
},
{
"epoch": 2.5809617271835132,
"grad_norm": 0.005018030758947134,
"learning_rate": 3.710009813542689e-05,
"loss": 0.0007,
"step": 2630
},
{
"epoch": 2.590775269872424,
"grad_norm": 0.004934113007038832,
"learning_rate": 3.705103042198234e-05,
"loss": 0.0006,
"step": 2640
},
{
"epoch": 2.600588812561335,
"grad_norm": 0.004958492703735828,
"learning_rate": 3.700196270853779e-05,
"loss": 0.0006,
"step": 2650
},
{
"epoch": 2.6104023552502453,
"grad_norm": 0.0050879898481070995,
"learning_rate": 3.695289499509323e-05,
"loss": 0.0006,
"step": 2660
},
{
"epoch": 2.620215897939156,
"grad_norm": 0.004783379379659891,
"learning_rate": 3.690382728164868e-05,
"loss": 0.0006,
"step": 2670
},
{
"epoch": 2.630029440628067,
"grad_norm": 0.004975931718945503,
"learning_rate": 3.685475956820412e-05,
"loss": 0.0006,
"step": 2680
},
{
"epoch": 2.6398429833169774,
"grad_norm": 0.006240040063858032,
"learning_rate": 3.680569185475957e-05,
"loss": 0.0006,
"step": 2690
},
{
"epoch": 2.649656526005888,
"grad_norm": 0.0050759222358465195,
"learning_rate": 3.675662414131502e-05,
"loss": 0.0006,
"step": 2700
},
{
"epoch": 2.659470068694799,
"grad_norm": 0.004622638691216707,
"learning_rate": 3.670755642787047e-05,
"loss": 0.0005,
"step": 2710
},
{
"epoch": 2.6692836113837095,
"grad_norm": 0.005237213335931301,
"learning_rate": 3.665848871442591e-05,
"loss": 0.0526,
"step": 2720
},
{
"epoch": 2.67909715407262,
"grad_norm": 0.15502117574214935,
"learning_rate": 3.660942100098136e-05,
"loss": 0.0014,
"step": 2730
},
{
"epoch": 2.688910696761531,
"grad_norm": 0.004649411886930466,
"learning_rate": 3.65603532875368e-05,
"loss": 0.0327,
"step": 2740
},
{
"epoch": 2.6987242394504416,
"grad_norm": 0.004374220035970211,
"learning_rate": 3.651128557409225e-05,
"loss": 0.0019,
"step": 2750
},
{
"epoch": 2.708537782139352,
"grad_norm": 7.425215244293213,
"learning_rate": 3.64622178606477e-05,
"loss": 0.0813,
"step": 2760
},
{
"epoch": 2.718351324828263,
"grad_norm": 0.004420330747961998,
"learning_rate": 3.641315014720314e-05,
"loss": 0.0006,
"step": 2770
},
{
"epoch": 2.7281648675171737,
"grad_norm": 0.004426442552357912,
"learning_rate": 3.636408243375859e-05,
"loss": 0.0008,
"step": 2780
},
{
"epoch": 2.7379784102060842,
"grad_norm": 0.005173469893634319,
"learning_rate": 3.631501472031403e-05,
"loss": 0.0006,
"step": 2790
},
{
"epoch": 2.7477919528949952,
"grad_norm": 0.0050672367215156555,
"learning_rate": 3.626594700686948e-05,
"loss": 0.0853,
"step": 2800
},
{
"epoch": 2.7576054955839058,
"grad_norm": 0.005417036823928356,
"learning_rate": 3.621687929342493e-05,
"loss": 0.001,
"step": 2810
},
{
"epoch": 2.7674190382728163,
"grad_norm": 0.005575211253017187,
"learning_rate": 3.616781157998038e-05,
"loss": 0.0007,
"step": 2820
},
{
"epoch": 2.7772325809617273,
"grad_norm": 0.0057277195155620575,
"learning_rate": 3.611874386653582e-05,
"loss": 0.0006,
"step": 2830
},
{
"epoch": 2.787046123650638,
"grad_norm": 0.005416057072579861,
"learning_rate": 3.606967615309127e-05,
"loss": 0.0006,
"step": 2840
},
{
"epoch": 2.7968596663395484,
"grad_norm": 0.004573365673422813,
"learning_rate": 3.602060843964671e-05,
"loss": 0.0011,
"step": 2850
},
{
"epoch": 2.8066732090284594,
"grad_norm": 0.0056626503355801105,
"learning_rate": 3.597154072620216e-05,
"loss": 0.001,
"step": 2860
},
{
"epoch": 2.81648675171737,
"grad_norm": 0.006272735074162483,
"learning_rate": 3.592247301275761e-05,
"loss": 0.0005,
"step": 2870
},
{
"epoch": 2.8263002944062805,
"grad_norm": 0.004290241748094559,
"learning_rate": 3.587340529931306e-05,
"loss": 0.0006,
"step": 2880
},
{
"epoch": 2.8361138370951915,
"grad_norm": 0.0073272231966257095,
"learning_rate": 3.58243375858685e-05,
"loss": 0.059,
"step": 2890
},
{
"epoch": 2.845927379784102,
"grad_norm": 0.0045128497295081615,
"learning_rate": 3.5775269872423944e-05,
"loss": 0.0773,
"step": 2900
},
{
"epoch": 2.8557409224730126,
"grad_norm": 0.005028576590120792,
"learning_rate": 3.572620215897939e-05,
"loss": 0.0008,
"step": 2910
},
{
"epoch": 2.8655544651619236,
"grad_norm": 0.004786403849720955,
"learning_rate": 3.5677134445534835e-05,
"loss": 0.0855,
"step": 2920
},
{
"epoch": 2.875368007850834,
"grad_norm": 0.02878345362842083,
"learning_rate": 3.5628066732090284e-05,
"loss": 0.0006,
"step": 2930
},
{
"epoch": 2.8851815505397447,
"grad_norm": 32.582359313964844,
"learning_rate": 3.557899901864573e-05,
"loss": 0.0652,
"step": 2940
},
{
"epoch": 2.8949950932286557,
"grad_norm": 0.06951310485601425,
"learning_rate": 3.552993130520118e-05,
"loss": 0.033,
"step": 2950
},
{
"epoch": 2.904808635917566,
"grad_norm": 0.00533737288787961,
"learning_rate": 3.5480863591756624e-05,
"loss": 0.0057,
"step": 2960
},
{
"epoch": 2.9146221786064768,
"grad_norm": 0.005290019791573286,
"learning_rate": 3.543179587831207e-05,
"loss": 0.095,
"step": 2970
},
{
"epoch": 2.9244357212953878,
"grad_norm": 0.0044818902388215065,
"learning_rate": 3.5382728164867515e-05,
"loss": 0.0009,
"step": 2980
},
{
"epoch": 2.9342492639842983,
"grad_norm": 0.005349620245397091,
"learning_rate": 3.5333660451422964e-05,
"loss": 0.0829,
"step": 2990
},
{
"epoch": 2.944062806673209,
"grad_norm": 0.011460080742835999,
"learning_rate": 3.528459273797841e-05,
"loss": 0.0529,
"step": 3000
},
{
"epoch": 2.95387634936212,
"grad_norm": 0.0047313557006418705,
"learning_rate": 3.523552502453386e-05,
"loss": 0.0037,
"step": 3010
},
{
"epoch": 2.9636898920510304,
"grad_norm": 0.01534937135875225,
"learning_rate": 3.5186457311089304e-05,
"loss": 0.1236,
"step": 3020
},
{
"epoch": 2.973503434739941,
"grad_norm": 0.007522872183471918,
"learning_rate": 3.5137389597644747e-05,
"loss": 0.0115,
"step": 3030
},
{
"epoch": 2.983316977428852,
"grad_norm": 0.024374373257160187,
"learning_rate": 3.5088321884200196e-05,
"loss": 0.0008,
"step": 3040
},
{
"epoch": 2.9931305201177625,
"grad_norm": 0.08516921103000641,
"learning_rate": 3.5039254170755645e-05,
"loss": 0.0956,
"step": 3050
},
{
"epoch": 3.002944062806673,
"grad_norm": 0.005535255651921034,
"learning_rate": 3.4990186457311094e-05,
"loss": 0.0519,
"step": 3060
},
{
"epoch": 3.012757605495584,
"grad_norm": 0.015444884076714516,
"learning_rate": 3.4941118743866536e-05,
"loss": 0.0013,
"step": 3070
},
{
"epoch": 3.0225711481844946,
"grad_norm": 0.00661628320813179,
"learning_rate": 3.4892051030421985e-05,
"loss": 0.0008,
"step": 3080
},
{
"epoch": 3.032384690873405,
"grad_norm": 0.01968499645590782,
"learning_rate": 3.484298331697743e-05,
"loss": 0.0041,
"step": 3090
},
{
"epoch": 3.042198233562316,
"grad_norm": 0.004277428146451712,
"learning_rate": 3.4793915603532876e-05,
"loss": 0.0104,
"step": 3100
},
{
"epoch": 3.0520117762512267,
"grad_norm": 0.007642016280442476,
"learning_rate": 3.4744847890088325e-05,
"loss": 0.0006,
"step": 3110
},
{
"epoch": 3.061825318940137,
"grad_norm": 0.004083346109837294,
"learning_rate": 3.4695780176643774e-05,
"loss": 0.128,
"step": 3120
},
{
"epoch": 3.071638861629048,
"grad_norm": 0.01271857414394617,
"learning_rate": 3.4646712463199216e-05,
"loss": 0.0659,
"step": 3130
},
{
"epoch": 3.0814524043179587,
"grad_norm": 0.009639259427785873,
"learning_rate": 3.4597644749754665e-05,
"loss": 0.0424,
"step": 3140
},
{
"epoch": 3.0912659470068693,
"grad_norm": 0.023669827729463577,
"learning_rate": 3.454857703631011e-05,
"loss": 0.0309,
"step": 3150
},
{
"epoch": 3.1010794896957803,
"grad_norm": 0.004919820465147495,
"learning_rate": 3.4499509322865556e-05,
"loss": 0.0146,
"step": 3160
},
{
"epoch": 3.110893032384691,
"grad_norm": 0.003851011861115694,
"learning_rate": 3.4450441609421005e-05,
"loss": 0.0006,
"step": 3170
},
{
"epoch": 3.1207065750736014,
"grad_norm": 0.005380318965762854,
"learning_rate": 3.440137389597645e-05,
"loss": 0.0177,
"step": 3180
},
{
"epoch": 3.1305201177625124,
"grad_norm": 0.00603041285648942,
"learning_rate": 3.4352306182531896e-05,
"loss": 0.0006,
"step": 3190
},
{
"epoch": 3.140333660451423,
"grad_norm": 0.003694745246320963,
"learning_rate": 3.430323846908734e-05,
"loss": 0.0007,
"step": 3200
},
{
"epoch": 3.1501472031403335,
"grad_norm": 0.009091987274587154,
"learning_rate": 3.425417075564279e-05,
"loss": 0.0046,
"step": 3210
},
{
"epoch": 3.1599607458292445,
"grad_norm": 44.486915588378906,
"learning_rate": 3.4205103042198237e-05,
"loss": 0.0156,
"step": 3220
},
{
"epoch": 3.169774288518155,
"grad_norm": 0.003716795239597559,
"learning_rate": 3.4156035328753686e-05,
"loss": 0.0006,
"step": 3230
},
{
"epoch": 3.1795878312070656,
"grad_norm": 0.010979007929563522,
"learning_rate": 3.410696761530913e-05,
"loss": 0.0006,
"step": 3240
},
{
"epoch": 3.1894013738959766,
"grad_norm": 0.0035946909338235855,
"learning_rate": 3.405789990186458e-05,
"loss": 0.0526,
"step": 3250
},
{
"epoch": 3.199214916584887,
"grad_norm": 0.0067933835089206696,
"learning_rate": 3.400883218842002e-05,
"loss": 0.0004,
"step": 3260
},
{
"epoch": 3.2090284592737977,
"grad_norm": 0.0035232524387538433,
"learning_rate": 3.395976447497547e-05,
"loss": 0.0009,
"step": 3270
},
{
"epoch": 3.2188420019627086,
"grad_norm": 0.02211836725473404,
"learning_rate": 3.391069676153091e-05,
"loss": 0.0004,
"step": 3280
},
{
"epoch": 3.228655544651619,
"grad_norm": 0.0037303888238966465,
"learning_rate": 3.3861629048086366e-05,
"loss": 0.0006,
"step": 3290
},
{
"epoch": 3.2384690873405297,
"grad_norm": 0.007376148831099272,
"learning_rate": 3.381256133464181e-05,
"loss": 0.0004,
"step": 3300
},
{
"epoch": 3.2482826300294407,
"grad_norm": 0.003410831792280078,
"learning_rate": 3.376349362119726e-05,
"loss": 0.0004,
"step": 3310
},
{
"epoch": 3.2580961727183513,
"grad_norm": 0.0033686254173517227,
"learning_rate": 3.37144259077527e-05,
"loss": 0.0004,
"step": 3320
},
{
"epoch": 3.267909715407262,
"grad_norm": 0.0036628427915275097,
"learning_rate": 3.366535819430814e-05,
"loss": 0.0004,
"step": 3330
},
{
"epoch": 3.277723258096173,
"grad_norm": 0.0034903271589428186,
"learning_rate": 3.361629048086359e-05,
"loss": 0.0676,
"step": 3340
},
{
"epoch": 3.2875368007850834,
"grad_norm": 0.007418110966682434,
"learning_rate": 3.356722276741904e-05,
"loss": 0.0998,
"step": 3350
},
{
"epoch": 3.297350343473994,
"grad_norm": 0.003807367756962776,
"learning_rate": 3.351815505397449e-05,
"loss": 0.0004,
"step": 3360
},
{
"epoch": 3.307163886162905,
"grad_norm": 0.006307406350970268,
"learning_rate": 3.346908734052993e-05,
"loss": 0.0546,
"step": 3370
},
{
"epoch": 3.3169774288518155,
"grad_norm": 0.004091127309948206,
"learning_rate": 3.342001962708538e-05,
"loss": 0.0688,
"step": 3380
},
{
"epoch": 3.326790971540726,
"grad_norm": 0.008122970350086689,
"learning_rate": 3.337095191364082e-05,
"loss": 0.0535,
"step": 3390
},
{
"epoch": 3.336604514229637,
"grad_norm": 0.2856459617614746,
"learning_rate": 3.332188420019627e-05,
"loss": 0.0479,
"step": 3400
},
{
"epoch": 3.3464180569185475,
"grad_norm": 0.011355056427419186,
"learning_rate": 3.327281648675172e-05,
"loss": 0.062,
"step": 3410
},
{
"epoch": 3.356231599607458,
"grad_norm": 0.010982933454215527,
"learning_rate": 3.322374877330717e-05,
"loss": 0.119,
"step": 3420
},
{
"epoch": 3.366045142296369,
"grad_norm": 0.14039351046085358,
"learning_rate": 3.317468105986261e-05,
"loss": 0.0129,
"step": 3430
},
{
"epoch": 3.3758586849852796,
"grad_norm": 0.005223874468356371,
"learning_rate": 3.312561334641806e-05,
"loss": 0.0417,
"step": 3440
},
{
"epoch": 3.38567222767419,
"grad_norm": 0.0041849189437925816,
"learning_rate": 3.30765456329735e-05,
"loss": 0.036,
"step": 3450
},
{
"epoch": 3.395485770363101,
"grad_norm": 0.004221642389893532,
"learning_rate": 3.302747791952895e-05,
"loss": 0.0021,
"step": 3460
},
{
"epoch": 3.4052993130520117,
"grad_norm": 57.141910552978516,
"learning_rate": 3.29784102060844e-05,
"loss": 0.0982,
"step": 3470
},
{
"epoch": 3.4151128557409223,
"grad_norm": 0.009060889482498169,
"learning_rate": 3.292934249263984e-05,
"loss": 0.0006,
"step": 3480
},
{
"epoch": 3.4249263984298333,
"grad_norm": 0.003756599733605981,
"learning_rate": 3.288027477919529e-05,
"loss": 0.0033,
"step": 3490
},
{
"epoch": 3.434739941118744,
"grad_norm": 0.0041136653162539005,
"learning_rate": 3.2831207065750733e-05,
"loss": 0.0015,
"step": 3500
},
{
"epoch": 3.4445534838076544,
"grad_norm": 0.003665735013782978,
"learning_rate": 3.278213935230618e-05,
"loss": 0.0614,
"step": 3510
},
{
"epoch": 3.4543670264965654,
"grad_norm": 0.003554809372872114,
"learning_rate": 3.273307163886163e-05,
"loss": 0.001,
"step": 3520
},
{
"epoch": 3.464180569185476,
"grad_norm": 0.0034583976957947016,
"learning_rate": 3.268400392541708e-05,
"loss": 0.0008,
"step": 3530
},
{
"epoch": 3.4739941118743864,
"grad_norm": 0.003728943644091487,
"learning_rate": 3.263493621197252e-05,
"loss": 0.0004,
"step": 3540
},
{
"epoch": 3.4838076545632974,
"grad_norm": 0.003582128556445241,
"learning_rate": 3.258586849852797e-05,
"loss": 0.0004,
"step": 3550
},
{
"epoch": 3.493621197252208,
"grad_norm": 0.0033694806043058634,
"learning_rate": 3.2536800785083414e-05,
"loss": 0.0004,
"step": 3560
},
{
"epoch": 3.5034347399411185,
"grad_norm": 0.10974390059709549,
"learning_rate": 3.248773307163886e-05,
"loss": 0.0004,
"step": 3570
},
{
"epoch": 3.5132482826300295,
"grad_norm": 0.003454001620411873,
"learning_rate": 3.243866535819431e-05,
"loss": 0.0004,
"step": 3580
},
{
"epoch": 3.52306182531894,
"grad_norm": 0.0036948120687156916,
"learning_rate": 3.238959764474976e-05,
"loss": 0.0657,
"step": 3590
},
{
"epoch": 3.5328753680078506,
"grad_norm": 0.0033204422798007727,
"learning_rate": 3.23405299313052e-05,
"loss": 0.0604,
"step": 3600
},
{
"epoch": 3.5426889106967616,
"grad_norm": 0.00640474446117878,
"learning_rate": 3.229146221786065e-05,
"loss": 0.0013,
"step": 3610
},
{
"epoch": 3.552502453385672,
"grad_norm": 0.0037864702753722668,
"learning_rate": 3.2242394504416094e-05,
"loss": 0.0388,
"step": 3620
},
{
"epoch": 3.5623159960745827,
"grad_norm": 0.0036447476595640182,
"learning_rate": 3.2193326790971536e-05,
"loss": 0.0015,
"step": 3630
},
{
"epoch": 3.5721295387634937,
"grad_norm": 0.004586994647979736,
"learning_rate": 3.214425907752699e-05,
"loss": 0.0008,
"step": 3640
},
{
"epoch": 3.5819430814524043,
"grad_norm": 0.006420999765396118,
"learning_rate": 3.2095191364082434e-05,
"loss": 0.081,
"step": 3650
},
{
"epoch": 3.591756624141315,
"grad_norm": 0.037869326770305634,
"learning_rate": 3.204612365063788e-05,
"loss": 0.0027,
"step": 3660
},
{
"epoch": 3.601570166830226,
"grad_norm": 0.0033209428656846285,
"learning_rate": 3.1997055937193325e-05,
"loss": 0.0004,
"step": 3670
},
{
"epoch": 3.6113837095191363,
"grad_norm": 0.0032525446731597185,
"learning_rate": 3.1947988223748774e-05,
"loss": 0.0004,
"step": 3680
},
{
"epoch": 3.621197252208047,
"grad_norm": 0.0034604640677571297,
"learning_rate": 3.189892051030422e-05,
"loss": 0.0004,
"step": 3690
},
{
"epoch": 3.631010794896958,
"grad_norm": 0.0048661488108336926,
"learning_rate": 3.184985279685967e-05,
"loss": 0.0006,
"step": 3700
},
{
"epoch": 3.6408243375858684,
"grad_norm": 0.003736069891601801,
"learning_rate": 3.1800785083415115e-05,
"loss": 0.0004,
"step": 3710
},
{
"epoch": 3.650637880274779,
"grad_norm": 0.0031651423778384924,
"learning_rate": 3.1751717369970564e-05,
"loss": 0.0004,
"step": 3720
},
{
"epoch": 3.66045142296369,
"grad_norm": 0.0032348737586289644,
"learning_rate": 3.1702649656526006e-05,
"loss": 0.0004,
"step": 3730
},
{
"epoch": 3.6702649656526005,
"grad_norm": 0.003265490522608161,
"learning_rate": 3.1653581943081455e-05,
"loss": 0.0004,
"step": 3740
},
{
"epoch": 3.680078508341511,
"grad_norm": 0.18621397018432617,
"learning_rate": 3.16045142296369e-05,
"loss": 0.0432,
"step": 3750
},
{
"epoch": 3.689892051030422,
"grad_norm": 49.72319793701172,
"learning_rate": 3.155544651619235e-05,
"loss": 0.0419,
"step": 3760
},
{
"epoch": 3.6997055937193326,
"grad_norm": 0.003202399704605341,
"learning_rate": 3.1506378802747795e-05,
"loss": 0.0005,
"step": 3770
},
{
"epoch": 3.709519136408243,
"grad_norm": 0.003484070301055908,
"learning_rate": 3.145731108930324e-05,
"loss": 0.0004,
"step": 3780
},
{
"epoch": 3.719332679097154,
"grad_norm": 0.003013091627508402,
"learning_rate": 3.1408243375858686e-05,
"loss": 0.0004,
"step": 3790
},
{
"epoch": 3.7291462217860647,
"grad_norm": 0.0030194155406206846,
"learning_rate": 3.135917566241413e-05,
"loss": 0.0008,
"step": 3800
},
{
"epoch": 3.7389597644749752,
"grad_norm": 0.0030365772545337677,
"learning_rate": 3.131010794896958e-05,
"loss": 0.0004,
"step": 3810
},
{
"epoch": 3.7487733071638862,
"grad_norm": 0.002989945001900196,
"learning_rate": 3.1261040235525026e-05,
"loss": 0.0006,
"step": 3820
},
{
"epoch": 3.758586849852797,
"grad_norm": 0.0031468605156987906,
"learning_rate": 3.1211972522080475e-05,
"loss": 0.0005,
"step": 3830
},
{
"epoch": 3.7684003925417073,
"grad_norm": 0.004800264723598957,
"learning_rate": 3.116290480863592e-05,
"loss": 0.0108,
"step": 3840
},
{
"epoch": 3.7782139352306183,
"grad_norm": 0.0044929636642336845,
"learning_rate": 3.1113837095191366e-05,
"loss": 0.0009,
"step": 3850
},
{
"epoch": 3.788027477919529,
"grad_norm": 0.0028576962649822235,
"learning_rate": 3.106476938174681e-05,
"loss": 0.0003,
"step": 3860
},
{
"epoch": 3.7978410206084394,
"grad_norm": 0.0031541618518531322,
"learning_rate": 3.101570166830226e-05,
"loss": 0.0003,
"step": 3870
},
{
"epoch": 3.8076545632973504,
"grad_norm": 0.0027680331841111183,
"learning_rate": 3.096663395485771e-05,
"loss": 0.0004,
"step": 3880
},
{
"epoch": 3.817468105986261,
"grad_norm": 0.0027752595487982035,
"learning_rate": 3.0917566241413156e-05,
"loss": 0.0003,
"step": 3890
},
{
"epoch": 3.8272816486751715,
"grad_norm": 0.0027524100150913,
"learning_rate": 3.08684985279686e-05,
"loss": 0.0003,
"step": 3900
},
{
"epoch": 3.8370951913640825,
"grad_norm": 0.0028699261602014303,
"learning_rate": 3.081943081452405e-05,
"loss": 0.0005,
"step": 3910
},
{
"epoch": 3.846908734052993,
"grad_norm": 0.002797529799863696,
"learning_rate": 3.077036310107949e-05,
"loss": 0.0003,
"step": 3920
},
{
"epoch": 3.8567222767419036,
"grad_norm": 0.002745892619714141,
"learning_rate": 3.072129538763494e-05,
"loss": 0.0003,
"step": 3930
},
{
"epoch": 3.8665358194308146,
"grad_norm": 0.0030794497579336166,
"learning_rate": 3.067222767419039e-05,
"loss": 0.1156,
"step": 3940
},
{
"epoch": 3.876349362119725,
"grad_norm": 0.007240855600684881,
"learning_rate": 3.062315996074583e-05,
"loss": 0.0018,
"step": 3950
},
{
"epoch": 3.8861629048086357,
"grad_norm": 0.0031701885163784027,
"learning_rate": 3.057409224730128e-05,
"loss": 0.0008,
"step": 3960
},
{
"epoch": 3.8959764474975467,
"grad_norm": 0.007171397563070059,
"learning_rate": 3.052502453385672e-05,
"loss": 0.0545,
"step": 3970
},
{
"epoch": 3.9057899901864572,
"grad_norm": 0.0026376626919955015,
"learning_rate": 3.0475956820412173e-05,
"loss": 0.1456,
"step": 3980
},
{
"epoch": 3.9156035328753678,
"grad_norm": 0.004865538328886032,
"learning_rate": 3.0426889106967615e-05,
"loss": 0.054,
"step": 3990
},
{
"epoch": 3.9254170755642788,
"grad_norm": 0.06169740855693817,
"learning_rate": 3.0377821393523064e-05,
"loss": 0.1127,
"step": 4000
},
{
"epoch": 3.9352306182531893,
"grad_norm": 0.01491067185997963,
"learning_rate": 3.032875368007851e-05,
"loss": 0.0013,
"step": 4010
},
{
"epoch": 3.9450441609421,
"grad_norm": 0.003037821501493454,
"learning_rate": 3.027968596663396e-05,
"loss": 0.1189,
"step": 4020
},
{
"epoch": 3.954857703631011,
"grad_norm": 0.05605999380350113,
"learning_rate": 3.02306182531894e-05,
"loss": 0.0008,
"step": 4030
},
{
"epoch": 3.9646712463199214,
"grad_norm": 0.0034519529435783625,
"learning_rate": 3.0181550539744853e-05,
"loss": 0.0003,
"step": 4040
},
{
"epoch": 3.974484789008832,
"grad_norm": 0.0033396417275071144,
"learning_rate": 3.0132482826300295e-05,
"loss": 0.0431,
"step": 4050
},
{
"epoch": 3.984298331697743,
"grad_norm": 0.002848528092727065,
"learning_rate": 3.0083415112855744e-05,
"loss": 0.0003,
"step": 4060
},
{
"epoch": 3.9941118743866535,
"grad_norm": 0.09806457161903381,
"learning_rate": 3.003434739941119e-05,
"loss": 0.0576,
"step": 4070
},
{
"epoch": 4.003925417075564,
"grad_norm": 0.009162220172584057,
"learning_rate": 2.9985279685966632e-05,
"loss": 0.0003,
"step": 4080
},
{
"epoch": 4.013738959764475,
"grad_norm": 0.039267465472221375,
"learning_rate": 2.993621197252208e-05,
"loss": 0.0004,
"step": 4090
},
{
"epoch": 4.023552502453386,
"grad_norm": 0.002605535788461566,
"learning_rate": 2.9887144259077527e-05,
"loss": 0.0004,
"step": 4100
},
{
"epoch": 4.033366045142296,
"grad_norm": 0.003241546219214797,
"learning_rate": 2.9838076545632976e-05,
"loss": 0.0004,
"step": 4110
},
{
"epoch": 4.043179587831207,
"grad_norm": 11.987616539001465,
"learning_rate": 2.978900883218842e-05,
"loss": 0.0393,
"step": 4120
},
{
"epoch": 4.052993130520118,
"grad_norm": 0.002549890661612153,
"learning_rate": 2.973994111874387e-05,
"loss": 0.0011,
"step": 4130
},
{
"epoch": 4.062806673209028,
"grad_norm": 0.002623894950374961,
"learning_rate": 2.9690873405299312e-05,
"loss": 0.0008,
"step": 4140
},
{
"epoch": 4.072620215897939,
"grad_norm": 0.0023546249140053988,
"learning_rate": 2.964180569185476e-05,
"loss": 0.0026,
"step": 4150
},
{
"epoch": 4.08243375858685,
"grad_norm": 0.0023659905418753624,
"learning_rate": 2.9592737978410207e-05,
"loss": 0.078,
"step": 4160
},
{
"epoch": 4.09224730127576,
"grad_norm": 0.002533614169806242,
"learning_rate": 2.9543670264965656e-05,
"loss": 0.0745,
"step": 4170
},
{
"epoch": 4.102060843964671,
"grad_norm": 0.012038661167025566,
"learning_rate": 2.94946025515211e-05,
"loss": 0.0551,
"step": 4180
},
{
"epoch": 4.111874386653582,
"grad_norm": 26.8253173828125,
"learning_rate": 2.944553483807655e-05,
"loss": 0.0928,
"step": 4190
},
{
"epoch": 4.121687929342492,
"grad_norm": 0.03977564349770546,
"learning_rate": 2.9396467124631993e-05,
"loss": 0.0528,
"step": 4200
},
{
"epoch": 4.131501472031403,
"grad_norm": 0.0031746893655508757,
"learning_rate": 2.934739941118744e-05,
"loss": 0.0106,
"step": 4210
},
{
"epoch": 4.141315014720314,
"grad_norm": 0.0031474102288484573,
"learning_rate": 2.9298331697742887e-05,
"loss": 0.0008,
"step": 4220
},
{
"epoch": 4.1511285574092245,
"grad_norm": 0.04280337691307068,
"learning_rate": 2.924926398429833e-05,
"loss": 0.0004,
"step": 4230
},
{
"epoch": 4.1609421000981355,
"grad_norm": 0.002831744961440563,
"learning_rate": 2.920019627085378e-05,
"loss": 0.0004,
"step": 4240
},
{
"epoch": 4.1707556427870465,
"grad_norm": 0.002495395252481103,
"learning_rate": 2.9151128557409224e-05,
"loss": 0.0009,
"step": 4250
},
{
"epoch": 4.180569185475957,
"grad_norm": 0.0024046385660767555,
"learning_rate": 2.9102060843964673e-05,
"loss": 0.0007,
"step": 4260
},
{
"epoch": 4.190382728164868,
"grad_norm": 0.0030680035706609488,
"learning_rate": 2.905299313052012e-05,
"loss": 0.0005,
"step": 4270
},
{
"epoch": 4.200196270853779,
"grad_norm": 0.0061622122302651405,
"learning_rate": 2.9003925417075568e-05,
"loss": 0.0018,
"step": 4280
},
{
"epoch": 4.210009813542689,
"grad_norm": 0.0022845251951366663,
"learning_rate": 2.895485770363101e-05,
"loss": 0.0366,
"step": 4290
},
{
"epoch": 4.2198233562316,
"grad_norm": 0.011359172873198986,
"learning_rate": 2.890578999018646e-05,
"loss": 0.0005,
"step": 4300
},
{
"epoch": 4.229636898920511,
"grad_norm": 0.002846726682037115,
"learning_rate": 2.8856722276741904e-05,
"loss": 0.0004,
"step": 4310
},
{
"epoch": 4.239450441609421,
"grad_norm": 0.002284892601892352,
"learning_rate": 2.8807654563297353e-05,
"loss": 0.0984,
"step": 4320
},
{
"epoch": 4.249263984298332,
"grad_norm": 0.002528236713260412,
"learning_rate": 2.87585868498528e-05,
"loss": 0.0007,
"step": 4330
},
{
"epoch": 4.259077526987243,
"grad_norm": 0.003352473024278879,
"learning_rate": 2.8709519136408248e-05,
"loss": 0.0004,
"step": 4340
},
{
"epoch": 4.268891069676153,
"grad_norm": 0.004708737134933472,
"learning_rate": 2.866045142296369e-05,
"loss": 0.0013,
"step": 4350
},
{
"epoch": 4.278704612365064,
"grad_norm": 0.358195036649704,
"learning_rate": 2.8611383709519136e-05,
"loss": 0.0282,
"step": 4360
},
{
"epoch": 4.288518155053975,
"grad_norm": 0.002740907482802868,
"learning_rate": 2.8562315996074585e-05,
"loss": 0.1356,
"step": 4370
},
{
"epoch": 4.298331697742885,
"grad_norm": 0.002787757897749543,
"learning_rate": 2.8513248282630027e-05,
"loss": 0.0028,
"step": 4380
},
{
"epoch": 4.308145240431796,
"grad_norm": 8.950927734375,
"learning_rate": 2.846418056918548e-05,
"loss": 0.0024,
"step": 4390
},
{
"epoch": 4.317958783120707,
"grad_norm": 0.0048212092369794846,
"learning_rate": 2.841511285574092e-05,
"loss": 0.0006,
"step": 4400
},
{
"epoch": 4.327772325809617,
"grad_norm": 0.0025500452611595392,
"learning_rate": 2.836604514229637e-05,
"loss": 0.0006,
"step": 4410
},
{
"epoch": 4.337585868498528,
"grad_norm": 0.0027642964851111174,
"learning_rate": 2.8316977428851816e-05,
"loss": 0.0436,
"step": 4420
},
{
"epoch": 4.347399411187439,
"grad_norm": 0.0026419861242175102,
"learning_rate": 2.8267909715407265e-05,
"loss": 0.0004,
"step": 4430
},
{
"epoch": 4.357212953876349,
"grad_norm": 0.004611722193658352,
"learning_rate": 2.8218842001962707e-05,
"loss": 0.0823,
"step": 4440
},
{
"epoch": 4.36702649656526,
"grad_norm": 0.0055962237529456615,
"learning_rate": 2.816977428851816e-05,
"loss": 0.0005,
"step": 4450
},
{
"epoch": 4.376840039254171,
"grad_norm": 0.004676250275224447,
"learning_rate": 2.8120706575073602e-05,
"loss": 0.0004,
"step": 4460
},
{
"epoch": 4.386653581943081,
"grad_norm": 0.0034532281570136547,
"learning_rate": 2.807163886162905e-05,
"loss": 0.0006,
"step": 4470
},
{
"epoch": 4.396467124631992,
"grad_norm": 0.016467634588479996,
"learning_rate": 2.8022571148184496e-05,
"loss": 0.0005,
"step": 4480
},
{
"epoch": 4.406280667320903,
"grad_norm": 0.011575533077120781,
"learning_rate": 2.7973503434739945e-05,
"loss": 0.0004,
"step": 4490
},
{
"epoch": 4.416094210009813,
"grad_norm": 0.002654923591762781,
"learning_rate": 2.7924435721295388e-05,
"loss": 0.0003,
"step": 4500
},
{
"epoch": 4.425907752698724,
"grad_norm": 0.00244724890217185,
"learning_rate": 2.7875368007850833e-05,
"loss": 0.0003,
"step": 4510
},
{
"epoch": 4.435721295387635,
"grad_norm": 0.00237859645858407,
"learning_rate": 2.7826300294406282e-05,
"loss": 0.0004,
"step": 4520
},
{
"epoch": 4.445534838076545,
"grad_norm": 0.002822286682203412,
"learning_rate": 2.7777232580961728e-05,
"loss": 0.0003,
"step": 4530
},
{
"epoch": 4.455348380765456,
"grad_norm": 0.0033684764057397842,
"learning_rate": 2.7728164867517177e-05,
"loss": 0.0568,
"step": 4540
},
{
"epoch": 4.465161923454367,
"grad_norm": 0.003250374225899577,
"learning_rate": 2.767909715407262e-05,
"loss": 0.0004,
"step": 4550
},
{
"epoch": 4.4749754661432775,
"grad_norm": 0.0025315198581665754,
"learning_rate": 2.7630029440628068e-05,
"loss": 0.0008,
"step": 4560
},
{
"epoch": 4.4847890088321885,
"grad_norm": 0.010016725398600101,
"learning_rate": 2.7580961727183514e-05,
"loss": 0.0572,
"step": 4570
},
{
"epoch": 4.494602551521099,
"grad_norm": 0.0029501202516257763,
"learning_rate": 2.7531894013738963e-05,
"loss": 0.0003,
"step": 4580
},
{
"epoch": 4.5044160942100095,
"grad_norm": 0.002199607901275158,
"learning_rate": 2.7482826300294405e-05,
"loss": 0.0003,
"step": 4590
},
{
"epoch": 4.5142296368989205,
"grad_norm": 0.002721391385421157,
"learning_rate": 2.7433758586849857e-05,
"loss": 0.0319,
"step": 4600
},
{
"epoch": 4.5240431795878315,
"grad_norm": 0.0022027925588190556,
"learning_rate": 2.73846908734053e-05,
"loss": 0.001,
"step": 4610
},
{
"epoch": 4.533856722276742,
"grad_norm": 0.002053765347227454,
"learning_rate": 2.7335623159960748e-05,
"loss": 0.0539,
"step": 4620
},
{
"epoch": 4.543670264965653,
"grad_norm": 0.019470343366265297,
"learning_rate": 2.7286555446516194e-05,
"loss": 0.07,
"step": 4630
},
{
"epoch": 4.553483807654564,
"grad_norm": 0.0020952706690877676,
"learning_rate": 2.7237487733071643e-05,
"loss": 0.0896,
"step": 4640
},
{
"epoch": 4.563297350343474,
"grad_norm": 0.0032566250301897526,
"learning_rate": 2.7188420019627085e-05,
"loss": 0.0004,
"step": 4650
},
{
"epoch": 4.573110893032385,
"grad_norm": 0.01735255867242813,
"learning_rate": 2.713935230618253e-05,
"loss": 0.0763,
"step": 4660
},
{
"epoch": 4.582924435721296,
"grad_norm": 0.005380355753004551,
"learning_rate": 2.709028459273798e-05,
"loss": 0.0004,
"step": 4670
},
{
"epoch": 4.592737978410206,
"grad_norm": 0.021537847816944122,
"learning_rate": 2.7041216879293425e-05,
"loss": 0.008,
"step": 4680
},
{
"epoch": 4.602551521099117,
"grad_norm": 0.004185757599771023,
"learning_rate": 2.6992149165848874e-05,
"loss": 0.0004,
"step": 4690
},
{
"epoch": 4.612365063788028,
"grad_norm": 0.012351655401289463,
"learning_rate": 2.6943081452404316e-05,
"loss": 0.0016,
"step": 4700
},
{
"epoch": 4.622178606476938,
"grad_norm": 0.002798211993649602,
"learning_rate": 2.6894013738959765e-05,
"loss": 0.0403,
"step": 4710
},
{
"epoch": 4.631992149165849,
"grad_norm": 0.002490241779014468,
"learning_rate": 2.684494602551521e-05,
"loss": 0.0003,
"step": 4720
},
{
"epoch": 4.64180569185476,
"grad_norm": 0.020930418744683266,
"learning_rate": 2.679587831207066e-05,
"loss": 0.0029,
"step": 4730
},
{
"epoch": 4.65161923454367,
"grad_norm": 37.52565383911133,
"learning_rate": 2.6746810598626106e-05,
"loss": 0.0143,
"step": 4740
},
{
"epoch": 4.661432777232581,
"grad_norm": 0.002027578419074416,
"learning_rate": 2.6697742885181555e-05,
"loss": 0.0002,
"step": 4750
},
{
"epoch": 4.671246319921492,
"grad_norm": 0.0019966133404523134,
"learning_rate": 2.6648675171736997e-05,
"loss": 0.0002,
"step": 4760
},
{
"epoch": 4.681059862610402,
"grad_norm": 0.001950482139363885,
"learning_rate": 2.6599607458292446e-05,
"loss": 0.0002,
"step": 4770
},
{
"epoch": 4.690873405299313,
"grad_norm": 0.0020267153158783913,
"learning_rate": 2.655053974484789e-05,
"loss": 0.0826,
"step": 4780
},
{
"epoch": 4.700686947988224,
"grad_norm": 17.92084503173828,
"learning_rate": 2.650147203140334e-05,
"loss": 0.0432,
"step": 4790
},
{
"epoch": 4.710500490677134,
"grad_norm": 0.002029955852776766,
"learning_rate": 2.6452404317958786e-05,
"loss": 0.0004,
"step": 4800
},
{
"epoch": 4.720314033366045,
"grad_norm": 0.002043253742158413,
"learning_rate": 2.6403336604514228e-05,
"loss": 0.0004,
"step": 4810
},
{
"epoch": 4.730127576054956,
"grad_norm": 8.624307632446289,
"learning_rate": 2.6354268891069677e-05,
"loss": 0.0568,
"step": 4820
},
{
"epoch": 4.739941118743866,
"grad_norm": 0.0022134315222501755,
"learning_rate": 2.6305201177625123e-05,
"loss": 0.0003,
"step": 4830
},
{
"epoch": 4.749754661432777,
"grad_norm": 0.003354401560500264,
"learning_rate": 2.625613346418057e-05,
"loss": 0.0004,
"step": 4840
},
{
"epoch": 4.759568204121688,
"grad_norm": 0.025983460247516632,
"learning_rate": 2.6207065750736014e-05,
"loss": 0.0021,
"step": 4850
},
{
"epoch": 4.769381746810598,
"grad_norm": 0.0028674264904111624,
"learning_rate": 2.6157998037291466e-05,
"loss": 0.0003,
"step": 4860
},
{
"epoch": 4.779195289499509,
"grad_norm": 0.0024552124086767435,
"learning_rate": 2.610893032384691e-05,
"loss": 0.0161,
"step": 4870
},
{
"epoch": 4.78900883218842,
"grad_norm": 0.01599975675344467,
"learning_rate": 2.6059862610402357e-05,
"loss": 0.0003,
"step": 4880
},
{
"epoch": 4.79882237487733,
"grad_norm": 0.05640334263443947,
"learning_rate": 2.6010794896957803e-05,
"loss": 0.0003,
"step": 4890
},
{
"epoch": 4.808635917566241,
"grad_norm": 0.1503908485174179,
"learning_rate": 2.5961727183513252e-05,
"loss": 0.0426,
"step": 4900
},
{
"epoch": 4.818449460255152,
"grad_norm": 0.0021854902151972055,
"learning_rate": 2.5912659470068694e-05,
"loss": 0.0003,
"step": 4910
},
{
"epoch": 4.8282630029440625,
"grad_norm": 0.0022083704825490713,
"learning_rate": 2.5863591756624143e-05,
"loss": 0.0002,
"step": 4920
},
{
"epoch": 4.8380765456329735,
"grad_norm": 0.0018174449214711785,
"learning_rate": 2.581452404317959e-05,
"loss": 0.0441,
"step": 4930
},
{
"epoch": 4.8478900883218845,
"grad_norm": 0.0019975032191723585,
"learning_rate": 2.5765456329735038e-05,
"loss": 0.0008,
"step": 4940
},
{
"epoch": 4.857703631010795,
"grad_norm": 0.0022116098552942276,
"learning_rate": 2.5716388616290483e-05,
"loss": 0.0002,
"step": 4950
},
{
"epoch": 4.867517173699706,
"grad_norm": 0.0019927374087274075,
"learning_rate": 2.5667320902845926e-05,
"loss": 0.0005,
"step": 4960
},
{
"epoch": 4.877330716388617,
"grad_norm": 0.003586186794564128,
"learning_rate": 2.5618253189401375e-05,
"loss": 0.0357,
"step": 4970
},
{
"epoch": 4.887144259077527,
"grad_norm": 0.006166779901832342,
"learning_rate": 2.556918547595682e-05,
"loss": 0.0003,
"step": 4980
},
{
"epoch": 4.896957801766438,
"grad_norm": 0.03852635622024536,
"learning_rate": 2.552011776251227e-05,
"loss": 0.0005,
"step": 4990
},
{
"epoch": 4.906771344455349,
"grad_norm": 0.001826342660933733,
"learning_rate": 2.547105004906771e-05,
"loss": 0.0007,
"step": 5000
},
{
"epoch": 4.916584887144259,
"grad_norm": 0.0018040341092273593,
"learning_rate": 2.5421982335623164e-05,
"loss": 0.0002,
"step": 5010
},
{
"epoch": 4.92639842983317,
"grad_norm": 0.0018869714112952352,
"learning_rate": 2.5372914622178606e-05,
"loss": 0.0002,
"step": 5020
},
{
"epoch": 4.936211972522081,
"grad_norm": 0.0017143889563158154,
"learning_rate": 2.5323846908734055e-05,
"loss": 0.0002,
"step": 5030
},
{
"epoch": 4.946025515210991,
"grad_norm": 0.0018076589331030846,
"learning_rate": 2.52747791952895e-05,
"loss": 0.0002,
"step": 5040
},
{
"epoch": 4.955839057899902,
"grad_norm": 0.002003490924835205,
"learning_rate": 2.522571148184495e-05,
"loss": 0.0002,
"step": 5050
},
{
"epoch": 4.965652600588813,
"grad_norm": 0.001990032149478793,
"learning_rate": 2.517664376840039e-05,
"loss": 0.0002,
"step": 5060
},
{
"epoch": 4.975466143277723,
"grad_norm": 0.0017091418849304318,
"learning_rate": 2.5127576054955844e-05,
"loss": 0.0002,
"step": 5070
},
{
"epoch": 4.985279685966634,
"grad_norm": 0.0019396455027163029,
"learning_rate": 2.5078508341511286e-05,
"loss": 0.0002,
"step": 5080
},
{
"epoch": 4.995093228655545,
"grad_norm": 0.0016776375705376267,
"learning_rate": 2.5029440628066735e-05,
"loss": 0.0002,
"step": 5090
},
{
"epoch": 5.004906771344455,
"grad_norm": 0.0017573200166225433,
"learning_rate": 2.498037291462218e-05,
"loss": 0.0442,
"step": 5100
},
{
"epoch": 5.014720314033366,
"grad_norm": 0.0017642441671341658,
"learning_rate": 2.4931305201177626e-05,
"loss": 0.0002,
"step": 5110
},
{
"epoch": 5.024533856722277,
"grad_norm": 0.0016604288248345256,
"learning_rate": 2.4882237487733072e-05,
"loss": 0.0016,
"step": 5120
},
{
"epoch": 5.034347399411187,
"grad_norm": 0.001731898752041161,
"learning_rate": 2.483316977428852e-05,
"loss": 0.0002,
"step": 5130
},
{
"epoch": 5.044160942100098,
"grad_norm": 0.0016634787898510695,
"learning_rate": 2.4784102060843967e-05,
"loss": 0.0002,
"step": 5140
},
{
"epoch": 5.053974484789009,
"grad_norm": 0.0016294183442369103,
"learning_rate": 2.4735034347399412e-05,
"loss": 0.0002,
"step": 5150
},
{
"epoch": 5.063788027477919,
"grad_norm": 0.0017350780544802547,
"learning_rate": 2.468596663395486e-05,
"loss": 0.0006,
"step": 5160
},
{
"epoch": 5.07360157016683,
"grad_norm": 0.0015964311314746737,
"learning_rate": 2.4636898920510303e-05,
"loss": 0.0033,
"step": 5170
},
{
"epoch": 5.083415112855741,
"grad_norm": 0.0018725765403360128,
"learning_rate": 2.4587831207065752e-05,
"loss": 0.0002,
"step": 5180
},
{
"epoch": 5.093228655544651,
"grad_norm": 0.001561222830787301,
"learning_rate": 2.4538763493621198e-05,
"loss": 0.0002,
"step": 5190
},
{
"epoch": 5.103042198233562,
"grad_norm": 0.0913846343755722,
"learning_rate": 2.4489695780176643e-05,
"loss": 0.0003,
"step": 5200
},
{
"epoch": 5.112855740922473,
"grad_norm": 0.001611059415154159,
"learning_rate": 2.4440628066732092e-05,
"loss": 0.0002,
"step": 5210
},
{
"epoch": 5.122669283611383,
"grad_norm": 0.0015166820958256721,
"learning_rate": 2.4391560353287538e-05,
"loss": 0.0002,
"step": 5220
},
{
"epoch": 5.132482826300294,
"grad_norm": 0.0015743138501420617,
"learning_rate": 2.4342492639842984e-05,
"loss": 0.0002,
"step": 5230
},
{
"epoch": 5.142296368989205,
"grad_norm": 0.0015384262660518289,
"learning_rate": 2.429342492639843e-05,
"loss": 0.0002,
"step": 5240
},
{
"epoch": 5.1521099116781155,
"grad_norm": 0.004025735892355442,
"learning_rate": 2.4244357212953878e-05,
"loss": 0.1963,
"step": 5250
},
{
"epoch": 5.1619234543670265,
"grad_norm": 0.052797187119722366,
"learning_rate": 2.4195289499509324e-05,
"loss": 0.001,
"step": 5260
},
{
"epoch": 5.1717369970559375,
"grad_norm": 0.0028745972085744143,
"learning_rate": 2.414622178606477e-05,
"loss": 0.0236,
"step": 5270
},
{
"epoch": 5.181550539744848,
"grad_norm": 0.005021668970584869,
"learning_rate": 2.409715407262022e-05,
"loss": 0.0002,
"step": 5280
},
{
"epoch": 5.191364082433759,
"grad_norm": 0.08027222752571106,
"learning_rate": 2.4048086359175664e-05,
"loss": 0.0006,
"step": 5290
},
{
"epoch": 5.20117762512267,
"grad_norm": 0.001573985326103866,
"learning_rate": 2.399901864573111e-05,
"loss": 0.0002,
"step": 5300
},
{
"epoch": 5.21099116781158,
"grad_norm": 0.0015114896232262254,
"learning_rate": 2.394995093228656e-05,
"loss": 0.0002,
"step": 5310
},
{
"epoch": 5.220804710500491,
"grad_norm": 0.0018572395201772451,
"learning_rate": 2.3900883218842e-05,
"loss": 0.0003,
"step": 5320
},
{
"epoch": 5.230618253189402,
"grad_norm": 0.0015180202899500728,
"learning_rate": 2.385181550539745e-05,
"loss": 0.0003,
"step": 5330
},
{
"epoch": 5.240431795878312,
"grad_norm": 0.0016390462405979633,
"learning_rate": 2.3802747791952895e-05,
"loss": 0.0002,
"step": 5340
},
{
"epoch": 5.250245338567223,
"grad_norm": 0.0015287548303604126,
"learning_rate": 2.375368007850834e-05,
"loss": 0.0002,
"step": 5350
},
{
"epoch": 5.260058881256134,
"grad_norm": 0.0014907275326550007,
"learning_rate": 2.370461236506379e-05,
"loss": 0.0002,
"step": 5360
},
{
"epoch": 5.269872423945044,
"grad_norm": 0.001557844690978527,
"learning_rate": 2.3655544651619236e-05,
"loss": 0.0002,
"step": 5370
},
{
"epoch": 5.279685966633955,
"grad_norm": 0.0018678128253668547,
"learning_rate": 2.360647693817468e-05,
"loss": 0.0002,
"step": 5380
},
{
"epoch": 5.289499509322866,
"grad_norm": 0.0015175668522715569,
"learning_rate": 2.355740922473013e-05,
"loss": 0.0002,
"step": 5390
},
{
"epoch": 5.299313052011776,
"grad_norm": 0.0014625848270952702,
"learning_rate": 2.3508341511285576e-05,
"loss": 0.0002,
"step": 5400
},
{
"epoch": 5.309126594700687,
"grad_norm": 0.00429932726547122,
"learning_rate": 2.345927379784102e-05,
"loss": 0.0607,
"step": 5410
},
{
"epoch": 5.318940137389598,
"grad_norm": 0.0014821887016296387,
"learning_rate": 2.341020608439647e-05,
"loss": 0.0033,
"step": 5420
},
{
"epoch": 5.328753680078508,
"grad_norm": 0.001476548844948411,
"learning_rate": 2.3361138370951916e-05,
"loss": 0.0002,
"step": 5430
},
{
"epoch": 5.338567222767419,
"grad_norm": 0.0014416587073355913,
"learning_rate": 2.331207065750736e-05,
"loss": 0.0002,
"step": 5440
},
{
"epoch": 5.34838076545633,
"grad_norm": 0.001489553484134376,
"learning_rate": 2.326300294406281e-05,
"loss": 0.0698,
"step": 5450
},
{
"epoch": 5.35819430814524,
"grad_norm": 0.004528726451098919,
"learning_rate": 2.3213935230618256e-05,
"loss": 0.0628,
"step": 5460
},
{
"epoch": 5.368007850834151,
"grad_norm": 0.0017940645338967443,
"learning_rate": 2.3164867517173698e-05,
"loss": 0.0002,
"step": 5470
},
{
"epoch": 5.377821393523062,
"grad_norm": 0.0015537918079644442,
"learning_rate": 2.3115799803729147e-05,
"loss": 0.0002,
"step": 5480
},
{
"epoch": 5.387634936211972,
"grad_norm": 0.0015587827656418085,
"learning_rate": 2.3066732090284593e-05,
"loss": 0.0002,
"step": 5490
},
{
"epoch": 5.397448478900883,
"grad_norm": 0.0015445965109393,
"learning_rate": 2.301766437684004e-05,
"loss": 0.0002,
"step": 5500
},
{
"epoch": 5.407262021589794,
"grad_norm": 0.00230443780310452,
"learning_rate": 2.2968596663395487e-05,
"loss": 0.0002,
"step": 5510
},
{
"epoch": 5.417075564278704,
"grad_norm": 0.001530683832243085,
"learning_rate": 2.2919528949950933e-05,
"loss": 0.0002,
"step": 5520
},
{
"epoch": 5.426889106967615,
"grad_norm": 0.006643714848905802,
"learning_rate": 2.287046123650638e-05,
"loss": 0.0003,
"step": 5530
},
{
"epoch": 5.436702649656526,
"grad_norm": 0.0021695613395422697,
"learning_rate": 2.2821393523061828e-05,
"loss": 0.0002,
"step": 5540
},
{
"epoch": 5.446516192345436,
"grad_norm": 0.0014126679161563516,
"learning_rate": 2.2772325809617273e-05,
"loss": 0.0655,
"step": 5550
},
{
"epoch": 5.456329735034347,
"grad_norm": 0.01729333959519863,
"learning_rate": 2.272325809617272e-05,
"loss": 0.0002,
"step": 5560
},
{
"epoch": 5.466143277723258,
"grad_norm": 0.0014916701475158334,
"learning_rate": 2.2674190382728168e-05,
"loss": 0.0002,
"step": 5570
},
{
"epoch": 5.4759568204121685,
"grad_norm": 0.001467019901610911,
"learning_rate": 2.2625122669283613e-05,
"loss": 0.0002,
"step": 5580
},
{
"epoch": 5.4857703631010795,
"grad_norm": 0.0014575383393093944,
"learning_rate": 2.257605495583906e-05,
"loss": 0.0002,
"step": 5590
},
{
"epoch": 5.4955839057899905,
"grad_norm": 0.0014117214595898986,
"learning_rate": 2.2526987242394508e-05,
"loss": 0.0002,
"step": 5600
},
{
"epoch": 5.505397448478901,
"grad_norm": 0.0014430248411372304,
"learning_rate": 2.2477919528949953e-05,
"loss": 0.0002,
"step": 5610
},
{
"epoch": 5.5152109911678115,
"grad_norm": 0.001443715300410986,
"learning_rate": 2.2428851815505396e-05,
"loss": 0.0002,
"step": 5620
},
{
"epoch": 5.5250245338567225,
"grad_norm": 0.0013962725643068552,
"learning_rate": 2.2379784102060845e-05,
"loss": 0.0767,
"step": 5630
},
{
"epoch": 5.534838076545633,
"grad_norm": 0.0016859096940606833,
"learning_rate": 2.233071638861629e-05,
"loss": 0.0314,
"step": 5640
},
{
"epoch": 5.544651619234544,
"grad_norm": 0.0021301463712006807,
"learning_rate": 2.2281648675171736e-05,
"loss": 0.0968,
"step": 5650
},
{
"epoch": 5.554465161923455,
"grad_norm": 0.003654947504401207,
"learning_rate": 2.2232580961727185e-05,
"loss": 0.0051,
"step": 5660
},
{
"epoch": 5.564278704612365,
"grad_norm": 0.003530005691573024,
"learning_rate": 2.218351324828263e-05,
"loss": 0.0053,
"step": 5670
},
{
"epoch": 5.574092247301276,
"grad_norm": 0.004440093878656626,
"learning_rate": 2.2134445534838076e-05,
"loss": 0.0012,
"step": 5680
},
{
"epoch": 5.583905789990187,
"grad_norm": 0.0015052916714921594,
"learning_rate": 2.2085377821393525e-05,
"loss": 0.0013,
"step": 5690
},
{
"epoch": 5.593719332679097,
"grad_norm": 0.0014009432634338737,
"learning_rate": 2.203631010794897e-05,
"loss": 0.0492,
"step": 5700
},
{
"epoch": 5.603532875368008,
"grad_norm": 0.0015393829671666026,
"learning_rate": 2.1987242394504416e-05,
"loss": 0.0005,
"step": 5710
},
{
"epoch": 5.613346418056919,
"grad_norm": 0.0039021980483084917,
"learning_rate": 2.1938174681059865e-05,
"loss": 0.0002,
"step": 5720
},
{
"epoch": 5.623159960745829,
"grad_norm": 0.0014669959200546145,
"learning_rate": 2.188910696761531e-05,
"loss": 0.0004,
"step": 5730
},
{
"epoch": 5.63297350343474,
"grad_norm": 0.0015139420283958316,
"learning_rate": 2.1840039254170756e-05,
"loss": 0.0303,
"step": 5740
},
{
"epoch": 5.642787046123651,
"grad_norm": 0.001543746329843998,
"learning_rate": 2.1790971540726205e-05,
"loss": 0.0009,
"step": 5750
},
{
"epoch": 5.652600588812561,
"grad_norm": 0.2851181924343109,
"learning_rate": 2.174190382728165e-05,
"loss": 0.0021,
"step": 5760
},
{
"epoch": 5.662414131501472,
"grad_norm": 0.001427607610821724,
"learning_rate": 2.1692836113837096e-05,
"loss": 0.0002,
"step": 5770
},
{
"epoch": 5.672227674190383,
"grad_norm": 0.0017000263324007392,
"learning_rate": 2.1643768400392542e-05,
"loss": 0.0101,
"step": 5780
},
{
"epoch": 5.682041216879293,
"grad_norm": 0.02387947216629982,
"learning_rate": 2.1594700686947988e-05,
"loss": 0.0002,
"step": 5790
},
{
"epoch": 5.691854759568204,
"grad_norm": 0.0013223286950960755,
"learning_rate": 2.1545632973503437e-05,
"loss": 0.0433,
"step": 5800
},
{
"epoch": 5.701668302257115,
"grad_norm": 0.0013629156164824963,
"learning_rate": 2.1496565260058882e-05,
"loss": 0.0002,
"step": 5810
},
{
"epoch": 5.711481844946025,
"grad_norm": 0.0015034314710646868,
"learning_rate": 2.1447497546614328e-05,
"loss": 0.0009,
"step": 5820
},
{
"epoch": 5.721295387634936,
"grad_norm": 0.001305502257309854,
"learning_rate": 2.1398429833169777e-05,
"loss": 0.0002,
"step": 5830
},
{
"epoch": 5.731108930323847,
"grad_norm": 0.0013675469672307372,
"learning_rate": 2.1349362119725222e-05,
"loss": 0.0002,
"step": 5840
},
{
"epoch": 5.740922473012757,
"grad_norm": 0.0012498252326622605,
"learning_rate": 2.1300294406280668e-05,
"loss": 0.0285,
"step": 5850
},
{
"epoch": 5.750736015701668,
"grad_norm": 0.001314906869083643,
"learning_rate": 2.1251226692836117e-05,
"loss": 0.0002,
"step": 5860
},
{
"epoch": 5.760549558390579,
"grad_norm": 0.004441590514034033,
"learning_rate": 2.1202158979391563e-05,
"loss": 0.1261,
"step": 5870
},
{
"epoch": 5.770363101079489,
"grad_norm": 0.018667880445718765,
"learning_rate": 2.1153091265947008e-05,
"loss": 0.0004,
"step": 5880
},
{
"epoch": 5.7801766437684,
"grad_norm": 21.129253387451172,
"learning_rate": 2.1104023552502454e-05,
"loss": 0.0915,
"step": 5890
},
{
"epoch": 5.789990186457311,
"grad_norm": 0.0012923305621370673,
"learning_rate": 2.1054955839057903e-05,
"loss": 0.0008,
"step": 5900
},
{
"epoch": 5.799803729146221,
"grad_norm": 0.1054319515824318,
"learning_rate": 2.1005888125613345e-05,
"loss": 0.0003,
"step": 5910
},
{
"epoch": 5.809617271835132,
"grad_norm": 0.002641110448166728,
"learning_rate": 2.0956820412168794e-05,
"loss": 0.0021,
"step": 5920
},
{
"epoch": 5.819430814524043,
"grad_norm": 0.0012492777314037085,
"learning_rate": 2.090775269872424e-05,
"loss": 0.0002,
"step": 5930
},
{
"epoch": 5.8292443572129535,
"grad_norm": 0.0012710640439763665,
"learning_rate": 2.0858684985279685e-05,
"loss": 0.0005,
"step": 5940
},
{
"epoch": 5.8390578999018645,
"grad_norm": 0.0014566375175490975,
"learning_rate": 2.0809617271835134e-05,
"loss": 0.0004,
"step": 5950
},
{
"epoch": 5.8488714425907755,
"grad_norm": 0.0022309215273708105,
"learning_rate": 2.076054955839058e-05,
"loss": 0.0755,
"step": 5960
},
{
"epoch": 5.858684985279686,
"grad_norm": 0.00341408746317029,
"learning_rate": 2.0711481844946025e-05,
"loss": 0.014,
"step": 5970
},
{
"epoch": 5.868498527968597,
"grad_norm": 0.001304444158449769,
"learning_rate": 2.0662414131501474e-05,
"loss": 0.0015,
"step": 5980
},
{
"epoch": 5.878312070657508,
"grad_norm": 0.0012671782169491053,
"learning_rate": 2.061334641805692e-05,
"loss": 0.0002,
"step": 5990
},
{
"epoch": 5.888125613346418,
"grad_norm": 0.0035885085817426443,
"learning_rate": 2.0564278704612365e-05,
"loss": 0.0002,
"step": 6000
},
{
"epoch": 5.897939156035329,
"grad_norm": 0.0014621804002672434,
"learning_rate": 2.0515210991167814e-05,
"loss": 0.0003,
"step": 6010
},
{
"epoch": 5.90775269872424,
"grad_norm": 0.001226249267347157,
"learning_rate": 2.046614327772326e-05,
"loss": 0.0002,
"step": 6020
},
{
"epoch": 5.91756624141315,
"grad_norm": 0.0012753872433677316,
"learning_rate": 2.0417075564278706e-05,
"loss": 0.0002,
"step": 6030
},
{
"epoch": 5.927379784102061,
"grad_norm": 0.0011854572221636772,
"learning_rate": 2.0368007850834155e-05,
"loss": 0.0002,
"step": 6040
},
{
"epoch": 5.937193326790972,
"grad_norm": 0.0012309462763369083,
"learning_rate": 2.03189401373896e-05,
"loss": 0.0001,
"step": 6050
},
{
"epoch": 5.947006869479882,
"grad_norm": 0.001222968683578074,
"learning_rate": 2.0269872423945042e-05,
"loss": 0.0001,
"step": 6060
},
{
"epoch": 5.956820412168793,
"grad_norm": 0.004279905930161476,
"learning_rate": 2.022080471050049e-05,
"loss": 0.0002,
"step": 6070
},
{
"epoch": 5.966633954857704,
"grad_norm": 0.0012088885996490717,
"learning_rate": 2.0171736997055937e-05,
"loss": 0.0001,
"step": 6080
},
{
"epoch": 5.976447497546614,
"grad_norm": 0.0017938670935109258,
"learning_rate": 2.0122669283611383e-05,
"loss": 0.0001,
"step": 6090
},
{
"epoch": 5.986261040235525,
"grad_norm": 0.033533725887537,
"learning_rate": 2.007360157016683e-05,
"loss": 0.0002,
"step": 6100
},
{
"epoch": 5.996074582924436,
"grad_norm": 0.0012293298495933414,
"learning_rate": 2.0024533856722277e-05,
"loss": 0.0001,
"step": 6110
},
{
"epoch": 6.005888125613346,
"grad_norm": 0.001815044553950429,
"learning_rate": 1.9975466143277723e-05,
"loss": 0.0002,
"step": 6120
},
{
"epoch": 6.015701668302257,
"grad_norm": 0.001358096138574183,
"learning_rate": 1.9926398429833172e-05,
"loss": 0.0002,
"step": 6130
},
{
"epoch": 6.025515210991168,
"grad_norm": 0.015642492100596428,
"learning_rate": 1.9877330716388617e-05,
"loss": 0.0001,
"step": 6140
},
{
"epoch": 6.035328753680078,
"grad_norm": 0.001149074058048427,
"learning_rate": 1.9828263002944063e-05,
"loss": 0.0001,
"step": 6150
},
{
"epoch": 6.045142296368989,
"grad_norm": 0.0011097900569438934,
"learning_rate": 1.9779195289499512e-05,
"loss": 0.0001,
"step": 6160
},
{
"epoch": 6.0549558390579,
"grad_norm": 0.0014940439723432064,
"learning_rate": 1.9730127576054957e-05,
"loss": 0.0001,
"step": 6170
},
{
"epoch": 6.06476938174681,
"grad_norm": 0.03362993523478508,
"learning_rate": 1.9681059862610403e-05,
"loss": 0.0004,
"step": 6180
},
{
"epoch": 6.074582924435721,
"grad_norm": 0.16991779208183289,
"learning_rate": 1.9631992149165852e-05,
"loss": 0.0003,
"step": 6190
},
{
"epoch": 6.084396467124632,
"grad_norm": 0.0011343214428052306,
"learning_rate": 1.9582924435721298e-05,
"loss": 0.0021,
"step": 6200
},
{
"epoch": 6.094210009813542,
"grad_norm": 0.0011410163715481758,
"learning_rate": 1.9533856722276743e-05,
"loss": 0.0001,
"step": 6210
},
{
"epoch": 6.104023552502453,
"grad_norm": 0.0011109898332506418,
"learning_rate": 1.948478900883219e-05,
"loss": 0.0001,
"step": 6220
},
{
"epoch": 6.113837095191364,
"grad_norm": 0.001067674602381885,
"learning_rate": 1.9435721295387634e-05,
"loss": 0.0001,
"step": 6230
},
{
"epoch": 6.123650637880274,
"grad_norm": 0.0010826255893334746,
"learning_rate": 1.938665358194308e-05,
"loss": 0.0001,
"step": 6240
},
{
"epoch": 6.133464180569185,
"grad_norm": 0.0010834899730980396,
"learning_rate": 1.933758586849853e-05,
"loss": 0.0541,
"step": 6250
},
{
"epoch": 6.143277723258096,
"grad_norm": 0.007662464864552021,
"learning_rate": 1.9288518155053975e-05,
"loss": 0.0002,
"step": 6260
},
{
"epoch": 6.1530912659470065,
"grad_norm": 0.0014158189296722412,
"learning_rate": 1.923945044160942e-05,
"loss": 0.1787,
"step": 6270
},
{
"epoch": 6.1629048086359175,
"grad_norm": 0.0036792519968003035,
"learning_rate": 1.919038272816487e-05,
"loss": 0.0462,
"step": 6280
},
{
"epoch": 6.1727183513248285,
"grad_norm": 0.1517615020275116,
"learning_rate": 1.9141315014720315e-05,
"loss": 0.0004,
"step": 6290
},
{
"epoch": 6.182531894013739,
"grad_norm": 0.002872257027775049,
"learning_rate": 1.909224730127576e-05,
"loss": 0.0003,
"step": 6300
},
{
"epoch": 6.19234543670265,
"grad_norm": 0.0014831377193331718,
"learning_rate": 1.904317958783121e-05,
"loss": 0.0002,
"step": 6310
},
{
"epoch": 6.202158979391561,
"grad_norm": 0.0015966458013281226,
"learning_rate": 1.8994111874386655e-05,
"loss": 0.0006,
"step": 6320
},
{
"epoch": 6.211972522080471,
"grad_norm": 0.001315574860200286,
"learning_rate": 1.89450441609421e-05,
"loss": 0.0002,
"step": 6330
},
{
"epoch": 6.221786064769382,
"grad_norm": 0.003673387225717306,
"learning_rate": 1.889597644749755e-05,
"loss": 0.0002,
"step": 6340
},
{
"epoch": 6.231599607458293,
"grad_norm": 0.0022277962416410446,
"learning_rate": 1.8846908734052995e-05,
"loss": 0.0002,
"step": 6350
},
{
"epoch": 6.241413150147203,
"grad_norm": 0.0013255071826279163,
"learning_rate": 1.879784102060844e-05,
"loss": 0.0002,
"step": 6360
},
{
"epoch": 6.251226692836114,
"grad_norm": 0.0024367074947804213,
"learning_rate": 1.8748773307163886e-05,
"loss": 0.0002,
"step": 6370
},
{
"epoch": 6.261040235525025,
"grad_norm": 0.0018190988339483738,
"learning_rate": 1.8699705593719332e-05,
"loss": 0.0002,
"step": 6380
},
{
"epoch": 6.270853778213935,
"grad_norm": 0.002138520823791623,
"learning_rate": 1.865063788027478e-05,
"loss": 0.0002,
"step": 6390
},
{
"epoch": 6.280667320902846,
"grad_norm": 0.0013069864362478256,
"learning_rate": 1.8601570166830226e-05,
"loss": 0.0002,
"step": 6400
},
{
"epoch": 6.290480863591757,
"grad_norm": 0.0013102535158395767,
"learning_rate": 1.8552502453385672e-05,
"loss": 0.0003,
"step": 6410
},
{
"epoch": 6.300294406280667,
"grad_norm": 0.004578573163598776,
"learning_rate": 1.850343473994112e-05,
"loss": 0.0002,
"step": 6420
},
{
"epoch": 6.310107948969578,
"grad_norm": 0.00831854809075594,
"learning_rate": 1.8454367026496567e-05,
"loss": 0.0002,
"step": 6430
},
{
"epoch": 6.319921491658489,
"grad_norm": 0.0014605351025238633,
"learning_rate": 1.8405299313052012e-05,
"loss": 0.0002,
"step": 6440
},
{
"epoch": 6.329735034347399,
"grad_norm": 0.0013795517152175307,
"learning_rate": 1.835623159960746e-05,
"loss": 0.0002,
"step": 6450
},
{
"epoch": 6.33954857703631,
"grad_norm": 0.0015935307601466775,
"learning_rate": 1.8307163886162907e-05,
"loss": 0.0002,
"step": 6460
},
{
"epoch": 6.349362119725221,
"grad_norm": 0.0013198903761804104,
"learning_rate": 1.8258096172718352e-05,
"loss": 0.0003,
"step": 6470
},
{
"epoch": 6.359175662414131,
"grad_norm": 0.002860839944332838,
"learning_rate": 1.82090284592738e-05,
"loss": 0.0002,
"step": 6480
},
{
"epoch": 6.368989205103042,
"grad_norm": 0.0013555525802075863,
"learning_rate": 1.8159960745829247e-05,
"loss": 0.0002,
"step": 6490
},
{
"epoch": 6.378802747791953,
"grad_norm": 0.0020145312882959843,
"learning_rate": 1.811089303238469e-05,
"loss": 0.0002,
"step": 6500
},
{
"epoch": 6.388616290480863,
"grad_norm": 0.00473778136074543,
"learning_rate": 1.8061825318940138e-05,
"loss": 0.0001,
"step": 6510
},
{
"epoch": 6.398429833169774,
"grad_norm": 0.0017492013284936547,
"learning_rate": 1.8012757605495584e-05,
"loss": 0.0002,
"step": 6520
},
{
"epoch": 6.408243375858685,
"grad_norm": 0.0012156120501458645,
"learning_rate": 1.796368989205103e-05,
"loss": 0.0001,
"step": 6530
},
{
"epoch": 6.418056918547595,
"grad_norm": 0.001362017123028636,
"learning_rate": 1.7914622178606478e-05,
"loss": 0.0005,
"step": 6540
},
{
"epoch": 6.427870461236506,
"grad_norm": 0.0011874830815941095,
"learning_rate": 1.7865554465161924e-05,
"loss": 0.0002,
"step": 6550
},
{
"epoch": 6.437684003925417,
"grad_norm": 0.0020989649929106236,
"learning_rate": 1.781648675171737e-05,
"loss": 0.0002,
"step": 6560
},
{
"epoch": 6.447497546614327,
"grad_norm": 0.001271673827432096,
"learning_rate": 1.776741903827282e-05,
"loss": 0.0858,
"step": 6570
},
{
"epoch": 6.457311089303238,
"grad_norm": 0.001192873460240662,
"learning_rate": 1.7718351324828264e-05,
"loss": 0.0001,
"step": 6580
},
{
"epoch": 6.467124631992149,
"grad_norm": 0.011513526551425457,
"learning_rate": 1.766928361138371e-05,
"loss": 0.1562,
"step": 6590
},
{
"epoch": 6.4769381746810595,
"grad_norm": 0.001225059386342764,
"learning_rate": 1.762021589793916e-05,
"loss": 0.0002,
"step": 6600
},
{
"epoch": 6.4867517173699705,
"grad_norm": 0.0013161891838535666,
"learning_rate": 1.7571148184494604e-05,
"loss": 0.0074,
"step": 6610
},
{
"epoch": 6.4965652600588815,
"grad_norm": 0.001231314497999847,
"learning_rate": 1.752208047105005e-05,
"loss": 0.0001,
"step": 6620
},
{
"epoch": 6.506378802747792,
"grad_norm": 0.0012088754447177052,
"learning_rate": 1.74730127576055e-05,
"loss": 0.0285,
"step": 6630
},
{
"epoch": 6.516192345436703,
"grad_norm": 0.0013558064820244908,
"learning_rate": 1.7423945044160944e-05,
"loss": 0.0004,
"step": 6640
},
{
"epoch": 6.5260058881256136,
"grad_norm": 0.0016369909280911088,
"learning_rate": 1.7374877330716387e-05,
"loss": 0.0016,
"step": 6650
},
{
"epoch": 6.535819430814524,
"grad_norm": 0.035988856106996536,
"learning_rate": 1.7325809617271836e-05,
"loss": 0.0002,
"step": 6660
},
{
"epoch": 6.545632973503435,
"grad_norm": 0.0011288542300462723,
"learning_rate": 1.727674190382728e-05,
"loss": 0.0213,
"step": 6670
},
{
"epoch": 6.555446516192346,
"grad_norm": 0.0014625934418290854,
"learning_rate": 1.7227674190382727e-05,
"loss": 0.0002,
"step": 6680
},
{
"epoch": 6.565260058881256,
"grad_norm": 0.0011535960948094726,
"learning_rate": 1.7178606476938176e-05,
"loss": 0.0001,
"step": 6690
},
{
"epoch": 6.575073601570167,
"grad_norm": 0.0011100315023213625,
"learning_rate": 1.712953876349362e-05,
"loss": 0.0002,
"step": 6700
},
{
"epoch": 6.584887144259078,
"grad_norm": 0.0011173097882419825,
"learning_rate": 1.7080471050049067e-05,
"loss": 0.0001,
"step": 6710
},
{
"epoch": 6.594700686947988,
"grad_norm": 0.0011760563356801867,
"learning_rate": 1.7031403336604516e-05,
"loss": 0.0002,
"step": 6720
},
{
"epoch": 6.604514229636899,
"grad_norm": 0.0012068103533238173,
"learning_rate": 1.698233562315996e-05,
"loss": 0.0001,
"step": 6730
},
{
"epoch": 6.61432777232581,
"grad_norm": 0.0010894141159951687,
"learning_rate": 1.6933267909715407e-05,
"loss": 0.0001,
"step": 6740
},
{
"epoch": 6.62414131501472,
"grad_norm": 0.0014370041899383068,
"learning_rate": 1.6884200196270856e-05,
"loss": 0.0001,
"step": 6750
},
{
"epoch": 6.633954857703631,
"grad_norm": 0.002420579083263874,
"learning_rate": 1.68351324828263e-05,
"loss": 0.0001,
"step": 6760
},
{
"epoch": 6.643768400392542,
"grad_norm": 0.001223103143274784,
"learning_rate": 1.6786064769381747e-05,
"loss": 0.0001,
"step": 6770
},
{
"epoch": 6.653581943081452,
"grad_norm": 0.0010998743819072843,
"learning_rate": 1.6736997055937196e-05,
"loss": 0.0314,
"step": 6780
},
{
"epoch": 6.663395485770363,
"grad_norm": 0.00108517415355891,
"learning_rate": 1.6687929342492642e-05,
"loss": 0.0001,
"step": 6790
},
{
"epoch": 6.673209028459274,
"grad_norm": 0.0011395640904083848,
"learning_rate": 1.6638861629048087e-05,
"loss": 0.0001,
"step": 6800
},
{
"epoch": 6.683022571148184,
"grad_norm": 0.001564236357808113,
"learning_rate": 1.6589793915603533e-05,
"loss": 0.0962,
"step": 6810
},
{
"epoch": 6.692836113837095,
"grad_norm": 0.0016074421582743526,
"learning_rate": 1.654072620215898e-05,
"loss": 0.0002,
"step": 6820
},
{
"epoch": 6.702649656526006,
"grad_norm": 0.0012334993807598948,
"learning_rate": 1.6491658488714428e-05,
"loss": 0.0023,
"step": 6830
},
{
"epoch": 6.712463199214916,
"grad_norm": 0.0011435603955760598,
"learning_rate": 1.6442590775269873e-05,
"loss": 0.0001,
"step": 6840
},
{
"epoch": 6.722276741903827,
"grad_norm": 0.0016410372918471694,
"learning_rate": 1.639352306182532e-05,
"loss": 0.0298,
"step": 6850
},
{
"epoch": 6.732090284592738,
"grad_norm": 0.0012846958125010133,
"learning_rate": 1.6344455348380768e-05,
"loss": 0.0001,
"step": 6860
},
{
"epoch": 6.741903827281648,
"grad_norm": 0.0011800202773883939,
"learning_rate": 1.6295387634936213e-05,
"loss": 0.0002,
"step": 6870
},
{
"epoch": 6.751717369970559,
"grad_norm": 0.0015586729859933257,
"learning_rate": 1.624631992149166e-05,
"loss": 0.0003,
"step": 6880
},
{
"epoch": 6.76153091265947,
"grad_norm": 0.001090590376406908,
"learning_rate": 1.6197252208047105e-05,
"loss": 0.0002,
"step": 6890
},
{
"epoch": 6.77134445534838,
"grad_norm": 0.0011874845949932933,
"learning_rate": 1.6148184494602554e-05,
"loss": 0.0001,
"step": 6900
},
{
"epoch": 6.781157998037291,
"grad_norm": 0.0011030100286006927,
"learning_rate": 1.6099116781158e-05,
"loss": 0.0001,
"step": 6910
},
{
"epoch": 6.790971540726202,
"grad_norm": 0.0012315625790506601,
"learning_rate": 1.6050049067713445e-05,
"loss": 0.0001,
"step": 6920
},
{
"epoch": 6.8007850834151125,
"grad_norm": 0.0011062839766964316,
"learning_rate": 1.6000981354268894e-05,
"loss": 0.0001,
"step": 6930
},
{
"epoch": 6.8105986261040234,
"grad_norm": 0.0011281865881755948,
"learning_rate": 1.595191364082434e-05,
"loss": 0.0001,
"step": 6940
},
{
"epoch": 6.820412168792934,
"grad_norm": 0.001074342057108879,
"learning_rate": 1.5902845927379785e-05,
"loss": 0.0001,
"step": 6950
},
{
"epoch": 6.8302257114818445,
"grad_norm": 0.0011061643017455935,
"learning_rate": 1.585377821393523e-05,
"loss": 0.0001,
"step": 6960
},
{
"epoch": 6.8400392541707555,
"grad_norm": 0.002780759707093239,
"learning_rate": 1.5804710500490676e-05,
"loss": 0.0001,
"step": 6970
},
{
"epoch": 6.8498527968596665,
"grad_norm": 0.0010947277769446373,
"learning_rate": 1.5755642787046125e-05,
"loss": 0.0001,
"step": 6980
},
{
"epoch": 6.859666339548577,
"grad_norm": 0.001039006281644106,
"learning_rate": 1.570657507360157e-05,
"loss": 0.0001,
"step": 6990
},
{
"epoch": 6.869479882237488,
"grad_norm": 0.0011975034140050411,
"learning_rate": 1.5657507360157016e-05,
"loss": 0.0001,
"step": 7000
},
{
"epoch": 6.879293424926399,
"grad_norm": 0.0010505706304684281,
"learning_rate": 1.5608439646712465e-05,
"loss": 0.0001,
"step": 7010
},
{
"epoch": 6.889106967615309,
"grad_norm": 0.001015416462905705,
"learning_rate": 1.555937193326791e-05,
"loss": 0.0001,
"step": 7020
},
{
"epoch": 6.89892051030422,
"grad_norm": 0.001166634145192802,
"learning_rate": 1.5510304219823356e-05,
"loss": 0.0001,
"step": 7030
},
{
"epoch": 6.908734052993131,
"grad_norm": 0.005132897291332483,
"learning_rate": 1.5461236506378805e-05,
"loss": 0.0001,
"step": 7040
},
{
"epoch": 6.918547595682041,
"grad_norm": 0.001034508110024035,
"learning_rate": 1.541216879293425e-05,
"loss": 0.0001,
"step": 7050
},
{
"epoch": 6.928361138370952,
"grad_norm": 0.0013660124968737364,
"learning_rate": 1.5363101079489697e-05,
"loss": 0.0001,
"step": 7060
},
{
"epoch": 6.938174681059863,
"grad_norm": 0.001023141318000853,
"learning_rate": 1.5314033366045146e-05,
"loss": 0.0001,
"step": 7070
},
{
"epoch": 6.947988223748773,
"grad_norm": 0.0009852561634033918,
"learning_rate": 1.526496565260059e-05,
"loss": 0.0006,
"step": 7080
},
{
"epoch": 6.957801766437684,
"grad_norm": 0.0028536063618957996,
"learning_rate": 1.5215897939156035e-05,
"loss": 0.0001,
"step": 7090
},
{
"epoch": 6.967615309126595,
"grad_norm": 0.0010083414381369948,
"learning_rate": 1.516683022571148e-05,
"loss": 0.0001,
"step": 7100
},
{
"epoch": 6.977428851815505,
"grad_norm": 0.0009895939147099853,
"learning_rate": 1.5117762512266928e-05,
"loss": 0.0003,
"step": 7110
},
{
"epoch": 6.987242394504416,
"grad_norm": 0.0009826653404161334,
"learning_rate": 1.5068694798822375e-05,
"loss": 0.0002,
"step": 7120
},
{
"epoch": 6.997055937193327,
"grad_norm": 0.0010616799117997289,
"learning_rate": 1.501962708537782e-05,
"loss": 0.0001,
"step": 7130
},
{
"epoch": 7.006869479882237,
"grad_norm": 0.000987286097370088,
"learning_rate": 1.4970559371933268e-05,
"loss": 0.0001,
"step": 7140
},
{
"epoch": 7.016683022571148,
"grad_norm": 0.0009880246361717582,
"learning_rate": 1.4921491658488715e-05,
"loss": 0.0001,
"step": 7150
},
{
"epoch": 7.026496565260059,
"grad_norm": 0.0010042705107480288,
"learning_rate": 1.4872423945044161e-05,
"loss": 0.0001,
"step": 7160
},
{
"epoch": 7.036310107948969,
"grad_norm": 0.0010432158596813679,
"learning_rate": 1.4823356231599608e-05,
"loss": 0.0001,
"step": 7170
},
{
"epoch": 7.04612365063788,
"grad_norm": 0.0010717209661379457,
"learning_rate": 1.4774288518155056e-05,
"loss": 0.0047,
"step": 7180
},
{
"epoch": 7.055937193326791,
"grad_norm": 0.001039078924804926,
"learning_rate": 1.4725220804710501e-05,
"loss": 0.0001,
"step": 7190
},
{
"epoch": 7.065750736015701,
"grad_norm": 0.0010033833095803857,
"learning_rate": 1.4676153091265948e-05,
"loss": 0.0002,
"step": 7200
},
{
"epoch": 7.075564278704612,
"grad_norm": 0.003389047458767891,
"learning_rate": 1.4627085377821396e-05,
"loss": 0.0001,
"step": 7210
},
{
"epoch": 7.085377821393523,
"grad_norm": 0.0009605743689462543,
"learning_rate": 1.4578017664376841e-05,
"loss": 0.0002,
"step": 7220
},
{
"epoch": 7.095191364082433,
"grad_norm": 0.0016811139648780227,
"learning_rate": 1.4528949950932289e-05,
"loss": 0.0001,
"step": 7230
},
{
"epoch": 7.105004906771344,
"grad_norm": 0.0009887183550745249,
"learning_rate": 1.4479882237487732e-05,
"loss": 0.0146,
"step": 7240
},
{
"epoch": 7.114818449460255,
"grad_norm": 0.0009474638500250876,
"learning_rate": 1.443081452404318e-05,
"loss": 0.0001,
"step": 7250
},
{
"epoch": 7.124631992149165,
"grad_norm": 0.0017863448010757565,
"learning_rate": 1.4381746810598625e-05,
"loss": 0.0001,
"step": 7260
},
{
"epoch": 7.134445534838076,
"grad_norm": 0.0009406275930814445,
"learning_rate": 1.4332679097154073e-05,
"loss": 0.0069,
"step": 7270
},
{
"epoch": 7.144259077526987,
"grad_norm": 0.0010239857947453856,
"learning_rate": 1.428361138370952e-05,
"loss": 0.1679,
"step": 7280
},
{
"epoch": 7.1540726202158975,
"grad_norm": 0.0018188258400186896,
"learning_rate": 1.4234543670264966e-05,
"loss": 0.0003,
"step": 7290
},
{
"epoch": 7.1638861629048085,
"grad_norm": 0.0012613933067768812,
"learning_rate": 1.4185475956820413e-05,
"loss": 0.0001,
"step": 7300
},
{
"epoch": 7.1736997055937195,
"grad_norm": 0.019094325602054596,
"learning_rate": 1.413640824337586e-05,
"loss": 0.1222,
"step": 7310
},
{
"epoch": 7.18351324828263,
"grad_norm": 0.013140466995537281,
"learning_rate": 1.4087340529931306e-05,
"loss": 0.0002,
"step": 7320
},
{
"epoch": 7.193326790971541,
"grad_norm": 0.001887351623736322,
"learning_rate": 1.4038272816486753e-05,
"loss": 0.0028,
"step": 7330
},
{
"epoch": 7.203140333660452,
"grad_norm": 0.008172539062798023,
"learning_rate": 1.39892051030422e-05,
"loss": 0.0002,
"step": 7340
},
{
"epoch": 7.212953876349362,
"grad_norm": 0.017021648585796356,
"learning_rate": 1.3940137389597646e-05,
"loss": 0.0002,
"step": 7350
},
{
"epoch": 7.222767419038273,
"grad_norm": 0.0010052472352981567,
"learning_rate": 1.3891069676153093e-05,
"loss": 0.0015,
"step": 7360
},
{
"epoch": 7.232580961727184,
"grad_norm": 0.001076782587915659,
"learning_rate": 1.3842001962708539e-05,
"loss": 0.0048,
"step": 7370
},
{
"epoch": 7.242394504416094,
"grad_norm": 0.05915454775094986,
"learning_rate": 1.3792934249263986e-05,
"loss": 0.0002,
"step": 7380
},
{
"epoch": 7.252208047105005,
"grad_norm": 0.0009720100206322968,
"learning_rate": 1.374386653581943e-05,
"loss": 0.0002,
"step": 7390
},
{
"epoch": 7.262021589793916,
"grad_norm": 0.019856898114085197,
"learning_rate": 1.3694798822374877e-05,
"loss": 0.0002,
"step": 7400
},
{
"epoch": 7.271835132482826,
"grad_norm": 0.0022591969463974237,
"learning_rate": 1.3645731108930323e-05,
"loss": 0.0004,
"step": 7410
},
{
"epoch": 7.281648675171737,
"grad_norm": 0.0010053004371002316,
"learning_rate": 1.359666339548577e-05,
"loss": 0.0001,
"step": 7420
},
{
"epoch": 7.291462217860648,
"grad_norm": 0.0015725187258794904,
"learning_rate": 1.3547595682041217e-05,
"loss": 0.0002,
"step": 7430
},
{
"epoch": 7.301275760549558,
"grad_norm": 0.0009938733419403434,
"learning_rate": 1.3498527968596663e-05,
"loss": 0.0001,
"step": 7440
},
{
"epoch": 7.311089303238469,
"grad_norm": 0.0009750658646225929,
"learning_rate": 1.344946025515211e-05,
"loss": 0.0001,
"step": 7450
},
{
"epoch": 7.32090284592738,
"grad_norm": 0.0026528111193329096,
"learning_rate": 1.3400392541707558e-05,
"loss": 0.0001,
"step": 7460
},
{
"epoch": 7.33071638861629,
"grad_norm": 0.0010182139230892062,
"learning_rate": 1.3351324828263003e-05,
"loss": 0.0001,
"step": 7470
},
{
"epoch": 7.340529931305201,
"grad_norm": 0.0009615565068088472,
"learning_rate": 1.330225711481845e-05,
"loss": 0.0001,
"step": 7480
},
{
"epoch": 7.350343473994112,
"grad_norm": 0.000971368863247335,
"learning_rate": 1.3253189401373898e-05,
"loss": 0.0004,
"step": 7490
},
{
"epoch": 7.360157016683022,
"grad_norm": 0.027576476335525513,
"learning_rate": 1.3204121687929343e-05,
"loss": 0.0002,
"step": 7500
},
{
"epoch": 7.369970559371933,
"grad_norm": 0.0009151269332505763,
"learning_rate": 1.315505397448479e-05,
"loss": 0.0003,
"step": 7510
},
{
"epoch": 7.379784102060844,
"grad_norm": 0.0013021818595007062,
"learning_rate": 1.3105986261040238e-05,
"loss": 0.0001,
"step": 7520
},
{
"epoch": 7.389597644749754,
"grad_norm": 0.001062211929820478,
"learning_rate": 1.3056918547595683e-05,
"loss": 0.0001,
"step": 7530
},
{
"epoch": 7.399411187438665,
"grad_norm": 114.82591247558594,
"learning_rate": 1.3007850834151127e-05,
"loss": 0.029,
"step": 7540
},
{
"epoch": 7.409224730127576,
"grad_norm": 0.0009047266212292016,
"learning_rate": 1.2958783120706575e-05,
"loss": 0.0001,
"step": 7550
},
{
"epoch": 7.419038272816486,
"grad_norm": 0.0017496274085715413,
"learning_rate": 1.2909715407262022e-05,
"loss": 0.0001,
"step": 7560
},
{
"epoch": 7.428851815505397,
"grad_norm": 0.0009102231124415994,
"learning_rate": 1.2860647693817468e-05,
"loss": 0.0001,
"step": 7570
},
{
"epoch": 7.438665358194308,
"grad_norm": 0.0017243401380255818,
"learning_rate": 1.2811579980372915e-05,
"loss": 0.0003,
"step": 7580
},
{
"epoch": 7.448478900883218,
"grad_norm": 0.05692388117313385,
"learning_rate": 1.2762512266928362e-05,
"loss": 0.0001,
"step": 7590
},
{
"epoch": 7.458292443572129,
"grad_norm": 0.0009312523761764169,
"learning_rate": 1.2713444553483808e-05,
"loss": 0.0915,
"step": 7600
},
{
"epoch": 7.46810598626104,
"grad_norm": 0.0009480075677856803,
"learning_rate": 1.2664376840039255e-05,
"loss": 0.0001,
"step": 7610
},
{
"epoch": 7.4779195289499505,
"grad_norm": 0.0009222645312547684,
"learning_rate": 1.2615309126594702e-05,
"loss": 0.0002,
"step": 7620
},
{
"epoch": 7.4877330716388615,
"grad_norm": 0.0009319439996033907,
"learning_rate": 1.2566241413150148e-05,
"loss": 0.0001,
"step": 7630
},
{
"epoch": 7.4975466143277725,
"grad_norm": 0.0008977550896815956,
"learning_rate": 1.2517173699705595e-05,
"loss": 0.0001,
"step": 7640
},
{
"epoch": 7.507360157016683,
"grad_norm": 0.0010047757532447577,
"learning_rate": 1.246810598626104e-05,
"loss": 0.0001,
"step": 7650
},
{
"epoch": 7.517173699705594,
"grad_norm": 0.0038417112082242966,
"learning_rate": 1.2419038272816486e-05,
"loss": 0.0001,
"step": 7660
},
{
"epoch": 7.526987242394505,
"grad_norm": 0.0010750400833785534,
"learning_rate": 1.2369970559371934e-05,
"loss": 0.0001,
"step": 7670
},
{
"epoch": 7.536800785083415,
"grad_norm": 0.0008918473613448441,
"learning_rate": 1.2320902845927381e-05,
"loss": 0.0001,
"step": 7680
},
{
"epoch": 7.546614327772326,
"grad_norm": 0.0010516536422073841,
"learning_rate": 1.2271835132482827e-05,
"loss": 0.0001,
"step": 7690
},
{
"epoch": 7.556427870461237,
"grad_norm": 0.0009324781713075936,
"learning_rate": 1.2222767419038274e-05,
"loss": 0.0001,
"step": 7700
},
{
"epoch": 7.566241413150147,
"grad_norm": 0.0009400816052220762,
"learning_rate": 1.2173699705593721e-05,
"loss": 0.0001,
"step": 7710
},
{
"epoch": 7.576054955839058,
"grad_norm": 0.0008609534706920385,
"learning_rate": 1.2124631992149165e-05,
"loss": 0.0001,
"step": 7720
},
{
"epoch": 7.585868498527969,
"grad_norm": 0.0009011939982883632,
"learning_rate": 1.2075564278704612e-05,
"loss": 0.0001,
"step": 7730
},
{
"epoch": 7.595682041216879,
"grad_norm": 0.0008951441268436611,
"learning_rate": 1.202649656526006e-05,
"loss": 0.0001,
"step": 7740
},
{
"epoch": 7.60549558390579,
"grad_norm": 0.0008742365753278136,
"learning_rate": 1.1977428851815505e-05,
"loss": 0.0001,
"step": 7750
},
{
"epoch": 7.615309126594701,
"grad_norm": 0.17169933021068573,
"learning_rate": 1.1928361138370952e-05,
"loss": 0.0002,
"step": 7760
},
{
"epoch": 7.625122669283611,
"grad_norm": 0.0008738868637010455,
"learning_rate": 1.18792934249264e-05,
"loss": 0.0001,
"step": 7770
},
{
"epoch": 7.634936211972522,
"grad_norm": 0.0011609562207013369,
"learning_rate": 1.1830225711481845e-05,
"loss": 0.0001,
"step": 7780
},
{
"epoch": 7.644749754661433,
"grad_norm": 0.0009239889914169908,
"learning_rate": 1.1781157998037293e-05,
"loss": 0.0001,
"step": 7790
},
{
"epoch": 7.654563297350343,
"grad_norm": 0.0009010569774545729,
"learning_rate": 1.1732090284592738e-05,
"loss": 0.0001,
"step": 7800
},
{
"epoch": 7.664376840039254,
"grad_norm": 0.000879693659953773,
"learning_rate": 1.1683022571148185e-05,
"loss": 0.0001,
"step": 7810
},
{
"epoch": 7.674190382728165,
"grad_norm": 0.0008639395819045603,
"learning_rate": 1.1633954857703631e-05,
"loss": 0.0001,
"step": 7820
},
{
"epoch": 7.684003925417075,
"grad_norm": 0.0008466942235827446,
"learning_rate": 1.1584887144259078e-05,
"loss": 0.0001,
"step": 7830
},
{
"epoch": 7.693817468105986,
"grad_norm": 0.0008819219656288624,
"learning_rate": 1.1535819430814526e-05,
"loss": 0.0001,
"step": 7840
},
{
"epoch": 7.703631010794897,
"grad_norm": 0.009510258212685585,
"learning_rate": 1.1486751717369971e-05,
"loss": 0.0001,
"step": 7850
},
{
"epoch": 7.713444553483807,
"grad_norm": 0.0008892007754184306,
"learning_rate": 1.1437684003925419e-05,
"loss": 0.0001,
"step": 7860
},
{
"epoch": 7.723258096172718,
"grad_norm": 0.0009460031287744641,
"learning_rate": 1.1388616290480864e-05,
"loss": 0.0001,
"step": 7870
},
{
"epoch": 7.733071638861629,
"grad_norm": 0.0008965510060079396,
"learning_rate": 1.133954857703631e-05,
"loss": 0.0001,
"step": 7880
},
{
"epoch": 7.742885181550539,
"grad_norm": 0.05526250973343849,
"learning_rate": 1.1290480863591757e-05,
"loss": 0.0001,
"step": 7890
},
{
"epoch": 7.75269872423945,
"grad_norm": 0.000836291816085577,
"learning_rate": 1.1241413150147204e-05,
"loss": 0.0001,
"step": 7900
},
{
"epoch": 7.762512266928361,
"grad_norm": 0.0008228803635574877,
"learning_rate": 1.119234543670265e-05,
"loss": 0.0001,
"step": 7910
},
{
"epoch": 7.772325809617271,
"grad_norm": 0.0009072457323782146,
"learning_rate": 1.1143277723258097e-05,
"loss": 0.0001,
"step": 7920
},
{
"epoch": 7.782139352306182,
"grad_norm": 0.0010595405474305153,
"learning_rate": 1.1094210009813544e-05,
"loss": 0.0001,
"step": 7930
},
{
"epoch": 7.791952894995093,
"grad_norm": 0.0008154577808454633,
"learning_rate": 1.1045142296368988e-05,
"loss": 0.0001,
"step": 7940
},
{
"epoch": 7.8017664376840035,
"grad_norm": 0.0009557644953019917,
"learning_rate": 1.0996074582924436e-05,
"loss": 0.0001,
"step": 7950
},
{
"epoch": 7.8115799803729145,
"grad_norm": 0.0008630304364487529,
"learning_rate": 1.0947006869479883e-05,
"loss": 0.0001,
"step": 7960
},
{
"epoch": 7.8213935230618254,
"grad_norm": 0.004290347453206778,
"learning_rate": 1.0897939156035329e-05,
"loss": 0.0001,
"step": 7970
},
{
"epoch": 7.8312070657507356,
"grad_norm": 0.0008026896975934505,
"learning_rate": 1.0848871442590776e-05,
"loss": 0.0001,
"step": 7980
},
{
"epoch": 7.8410206084396465,
"grad_norm": 0.0008485147845931351,
"learning_rate": 1.0799803729146223e-05,
"loss": 0.0001,
"step": 7990
},
{
"epoch": 7.8508341511285575,
"grad_norm": 0.0009684371179901063,
"learning_rate": 1.0750736015701669e-05,
"loss": 0.0001,
"step": 8000
},
{
"epoch": 7.860647693817468,
"grad_norm": 0.00081270607188344,
"learning_rate": 1.0701668302257116e-05,
"loss": 0.0001,
"step": 8010
},
{
"epoch": 7.870461236506379,
"grad_norm": 0.0008527148747816682,
"learning_rate": 1.0652600588812562e-05,
"loss": 0.0001,
"step": 8020
},
{
"epoch": 7.88027477919529,
"grad_norm": 0.0011228329967707396,
"learning_rate": 1.0603532875368007e-05,
"loss": 0.0001,
"step": 8030
},
{
"epoch": 7.8900883218842,
"grad_norm": 0.0011605530744418502,
"learning_rate": 1.0554465161923454e-05,
"loss": 0.0001,
"step": 8040
},
{
"epoch": 7.899901864573111,
"grad_norm": 0.0008033498888835311,
"learning_rate": 1.0505397448478902e-05,
"loss": 0.0001,
"step": 8050
},
{
"epoch": 7.909715407262022,
"grad_norm": 0.0008764792000874877,
"learning_rate": 1.0456329735034347e-05,
"loss": 0.0955,
"step": 8060
},
{
"epoch": 7.919528949950932,
"grad_norm": 0.04982365667819977,
"learning_rate": 1.0407262021589795e-05,
"loss": 0.0002,
"step": 8070
},
{
"epoch": 7.929342492639843,
"grad_norm": 0.0008406474371440709,
"learning_rate": 1.0358194308145242e-05,
"loss": 0.0002,
"step": 8080
},
{
"epoch": 7.939156035328754,
"grad_norm": 0.000985965714789927,
"learning_rate": 1.0309126594700687e-05,
"loss": 0.0002,
"step": 8090
},
{
"epoch": 7.948969578017664,
"grad_norm": 0.0008393987664021552,
"learning_rate": 1.0260058881256133e-05,
"loss": 0.0002,
"step": 8100
},
{
"epoch": 7.958783120706575,
"grad_norm": 0.0008538268739357591,
"learning_rate": 1.021099116781158e-05,
"loss": 0.0001,
"step": 8110
},
{
"epoch": 7.968596663395486,
"grad_norm": 0.0054728141985833645,
"learning_rate": 1.0161923454367028e-05,
"loss": 0.0001,
"step": 8120
},
{
"epoch": 7.978410206084396,
"grad_norm": 0.0009096296853385866,
"learning_rate": 1.0112855740922473e-05,
"loss": 0.0001,
"step": 8130
},
{
"epoch": 7.988223748773307,
"grad_norm": 0.0008633875986561179,
"learning_rate": 1.006378802747792e-05,
"loss": 0.0001,
"step": 8140
},
{
"epoch": 7.998037291462218,
"grad_norm": 0.0009331282926723361,
"learning_rate": 1.0014720314033368e-05,
"loss": 0.0001,
"step": 8150
},
{
"epoch": 8.007850834151128,
"grad_norm": 0.0007973794708959758,
"learning_rate": 9.965652600588813e-06,
"loss": 0.0001,
"step": 8160
},
{
"epoch": 8.01766437684004,
"grad_norm": 0.0007764511392451823,
"learning_rate": 9.916584887144259e-06,
"loss": 0.0001,
"step": 8170
},
{
"epoch": 8.02747791952895,
"grad_norm": 0.0008435621275566518,
"learning_rate": 9.867517173699706e-06,
"loss": 0.0001,
"step": 8180
},
{
"epoch": 8.03729146221786,
"grad_norm": 0.0008471392211504281,
"learning_rate": 9.818449460255152e-06,
"loss": 0.0001,
"step": 8190
},
{
"epoch": 8.047105004906772,
"grad_norm": 0.0015691117150709033,
"learning_rate": 9.7693817468106e-06,
"loss": 0.0001,
"step": 8200
},
{
"epoch": 8.056918547595682,
"grad_norm": 0.0007785743218846619,
"learning_rate": 9.720314033366046e-06,
"loss": 0.0002,
"step": 8210
},
{
"epoch": 8.066732090284592,
"grad_norm": 0.001200017984956503,
"learning_rate": 9.671246319921492e-06,
"loss": 0.0001,
"step": 8220
},
{
"epoch": 8.076545632973504,
"grad_norm": 0.0007911358843557537,
"learning_rate": 9.62217860647694e-06,
"loss": 0.0001,
"step": 8230
},
{
"epoch": 8.086359175662414,
"grad_norm": 0.0007746540359221399,
"learning_rate": 9.573110893032385e-06,
"loss": 0.0002,
"step": 8240
},
{
"epoch": 8.096172718351324,
"grad_norm": 0.0007667599711567163,
"learning_rate": 9.52404317958783e-06,
"loss": 0.0001,
"step": 8250
},
{
"epoch": 8.105986261040236,
"grad_norm": 0.0008504064753651619,
"learning_rate": 9.474975466143278e-06,
"loss": 0.0001,
"step": 8260
},
{
"epoch": 8.115799803729146,
"grad_norm": 0.0007812583935447037,
"learning_rate": 9.425907752698725e-06,
"loss": 0.0001,
"step": 8270
},
{
"epoch": 8.125613346418056,
"grad_norm": 0.0013848438393324614,
"learning_rate": 9.37684003925417e-06,
"loss": 0.0001,
"step": 8280
},
{
"epoch": 8.135426889106968,
"grad_norm": 0.0008914385107345879,
"learning_rate": 9.327772325809618e-06,
"loss": 0.0001,
"step": 8290
},
{
"epoch": 8.145240431795878,
"grad_norm": 0.0007613406050950289,
"learning_rate": 9.278704612365065e-06,
"loss": 0.0004,
"step": 8300
},
{
"epoch": 8.155053974484789,
"grad_norm": 0.0007612567278556526,
"learning_rate": 9.229636898920511e-06,
"loss": 0.0786,
"step": 8310
},
{
"epoch": 8.1648675171737,
"grad_norm": 0.0008015862549655139,
"learning_rate": 9.180569185475956e-06,
"loss": 0.0001,
"step": 8320
},
{
"epoch": 8.17468105986261,
"grad_norm": 0.0007999239605851471,
"learning_rate": 9.131501472031404e-06,
"loss": 0.0001,
"step": 8330
},
{
"epoch": 8.18449460255152,
"grad_norm": 0.0008124898886308074,
"learning_rate": 9.082433758586851e-06,
"loss": 0.0001,
"step": 8340
},
{
"epoch": 8.194308145240432,
"grad_norm": 0.0007925584213808179,
"learning_rate": 9.033366045142297e-06,
"loss": 0.0001,
"step": 8350
},
{
"epoch": 8.204121687929343,
"grad_norm": 0.0007474345620721579,
"learning_rate": 8.984298331697744e-06,
"loss": 0.0001,
"step": 8360
},
{
"epoch": 8.213935230618253,
"grad_norm": 0.00099629582837224,
"learning_rate": 8.93523061825319e-06,
"loss": 0.0001,
"step": 8370
},
{
"epoch": 8.223748773307165,
"grad_norm": 0.0007709822966717184,
"learning_rate": 8.886162904808637e-06,
"loss": 0.0001,
"step": 8380
},
{
"epoch": 8.233562315996075,
"grad_norm": 0.0008605083567090333,
"learning_rate": 8.837095191364082e-06,
"loss": 0.0001,
"step": 8390
},
{
"epoch": 8.243375858684985,
"grad_norm": 0.0007616875227540731,
"learning_rate": 8.78802747791953e-06,
"loss": 0.0001,
"step": 8400
},
{
"epoch": 8.253189401373897,
"grad_norm": 0.0008648928487673402,
"learning_rate": 8.738959764474975e-06,
"loss": 0.0001,
"step": 8410
},
{
"epoch": 8.263002944062807,
"grad_norm": 0.0007865010411478579,
"learning_rate": 8.689892051030423e-06,
"loss": 0.0001,
"step": 8420
},
{
"epoch": 8.272816486751717,
"grad_norm": 0.0007759992149658501,
"learning_rate": 8.64082433758587e-06,
"loss": 0.0001,
"step": 8430
},
{
"epoch": 8.282630029440629,
"grad_norm": 0.0007434001890942454,
"learning_rate": 8.591756624141315e-06,
"loss": 0.0001,
"step": 8440
},
{
"epoch": 8.292443572129539,
"grad_norm": 0.0007561213569715619,
"learning_rate": 8.542688910696763e-06,
"loss": 0.0001,
"step": 8450
},
{
"epoch": 8.302257114818449,
"grad_norm": 0.0008792446460574865,
"learning_rate": 8.493621197252208e-06,
"loss": 0.0001,
"step": 8460
},
{
"epoch": 8.31207065750736,
"grad_norm": 0.0008201678283512592,
"learning_rate": 8.444553483807654e-06,
"loss": 0.0001,
"step": 8470
},
{
"epoch": 8.321884200196271,
"grad_norm": 0.0007656855159439147,
"learning_rate": 8.395485770363101e-06,
"loss": 0.0001,
"step": 8480
},
{
"epoch": 8.331697742885181,
"grad_norm": 0.0009525881614536047,
"learning_rate": 8.346418056918548e-06,
"loss": 0.0001,
"step": 8490
},
{
"epoch": 8.341511285574093,
"grad_norm": 0.0007570137386210263,
"learning_rate": 8.297350343473994e-06,
"loss": 0.0001,
"step": 8500
},
{
"epoch": 8.351324828263003,
"grad_norm": 0.002343561267480254,
"learning_rate": 8.248282630029441e-06,
"loss": 0.0001,
"step": 8510
},
{
"epoch": 8.361138370951913,
"grad_norm": 0.000724265119060874,
"learning_rate": 8.199214916584889e-06,
"loss": 0.0001,
"step": 8520
},
{
"epoch": 8.370951913640825,
"grad_norm": 0.0007559550576843321,
"learning_rate": 8.150147203140333e-06,
"loss": 0.0001,
"step": 8530
},
{
"epoch": 8.380765456329735,
"grad_norm": 0.0007767178467474878,
"learning_rate": 8.10107948969578e-06,
"loss": 0.0001,
"step": 8540
},
{
"epoch": 8.390578999018645,
"grad_norm": 0.0014819581992924213,
"learning_rate": 8.052011776251227e-06,
"loss": 0.095,
"step": 8550
},
{
"epoch": 8.400392541707557,
"grad_norm": 0.0008053283672779799,
"learning_rate": 8.002944062806673e-06,
"loss": 0.0001,
"step": 8560
},
{
"epoch": 8.410206084396467,
"grad_norm": 0.000741046154871583,
"learning_rate": 7.95387634936212e-06,
"loss": 0.0001,
"step": 8570
},
{
"epoch": 8.420019627085377,
"grad_norm": 0.0009256862103939056,
"learning_rate": 7.904808635917567e-06,
"loss": 0.0001,
"step": 8580
},
{
"epoch": 8.42983316977429,
"grad_norm": 0.0007935376488603652,
"learning_rate": 7.855740922473013e-06,
"loss": 0.0001,
"step": 8590
},
{
"epoch": 8.4396467124632,
"grad_norm": 0.010961124673485756,
"learning_rate": 7.80667320902846e-06,
"loss": 0.0002,
"step": 8600
},
{
"epoch": 8.44946025515211,
"grad_norm": 0.006000032182782888,
"learning_rate": 7.757605495583906e-06,
"loss": 0.0001,
"step": 8610
},
{
"epoch": 8.459273797841021,
"grad_norm": 0.008318673819303513,
"learning_rate": 7.708537782139353e-06,
"loss": 0.0001,
"step": 8620
},
{
"epoch": 8.469087340529931,
"grad_norm": 0.0007991963066160679,
"learning_rate": 7.659470068694799e-06,
"loss": 0.0616,
"step": 8630
},
{
"epoch": 8.478900883218841,
"grad_norm": 0.0011054244823753834,
"learning_rate": 7.610402355250246e-06,
"loss": 0.0002,
"step": 8640
},
{
"epoch": 8.488714425907753,
"grad_norm": 0.0007553680334240198,
"learning_rate": 7.561334641805692e-06,
"loss": 0.0001,
"step": 8650
},
{
"epoch": 8.498527968596663,
"grad_norm": 0.0007292833179235458,
"learning_rate": 7.512266928361139e-06,
"loss": 0.0001,
"step": 8660
},
{
"epoch": 8.508341511285574,
"grad_norm": 0.0007289135828614235,
"learning_rate": 7.463199214916586e-06,
"loss": 0.0001,
"step": 8670
},
{
"epoch": 8.518155053974485,
"grad_norm": 0.0007795288693159819,
"learning_rate": 7.414131501472031e-06,
"loss": 0.0001,
"step": 8680
},
{
"epoch": 8.527968596663396,
"grad_norm": 0.0007076899637468159,
"learning_rate": 7.365063788027478e-06,
"loss": 0.0001,
"step": 8690
},
{
"epoch": 8.537782139352306,
"grad_norm": 0.0007375687710009515,
"learning_rate": 7.3159960745829246e-06,
"loss": 0.0001,
"step": 8700
},
{
"epoch": 8.547595682041218,
"grad_norm": 0.0007277546101249754,
"learning_rate": 7.266928361138371e-06,
"loss": 0.0001,
"step": 8710
},
{
"epoch": 8.557409224730128,
"grad_norm": 0.0007561793318018317,
"learning_rate": 7.217860647693818e-06,
"loss": 0.0001,
"step": 8720
},
{
"epoch": 8.567222767419038,
"grad_norm": 0.000729912135284394,
"learning_rate": 7.168792934249265e-06,
"loss": 0.0001,
"step": 8730
},
{
"epoch": 8.57703631010795,
"grad_norm": 0.0007444035727530718,
"learning_rate": 7.119725220804711e-06,
"loss": 0.0004,
"step": 8740
},
{
"epoch": 8.58684985279686,
"grad_norm": 0.002724673831835389,
"learning_rate": 7.0706575073601584e-06,
"loss": 0.0001,
"step": 8750
},
{
"epoch": 8.59666339548577,
"grad_norm": 0.0009111511171795428,
"learning_rate": 7.021589793915603e-06,
"loss": 0.0001,
"step": 8760
},
{
"epoch": 8.606476938174682,
"grad_norm": 0.0007101638475432992,
"learning_rate": 6.97252208047105e-06,
"loss": 0.0638,
"step": 8770
},
{
"epoch": 8.616290480863592,
"grad_norm": 0.0007315074326470494,
"learning_rate": 6.923454367026497e-06,
"loss": 0.0001,
"step": 8780
},
{
"epoch": 8.626104023552502,
"grad_norm": 0.0007471499848179519,
"learning_rate": 6.874386653581943e-06,
"loss": 0.0001,
"step": 8790
},
{
"epoch": 8.635917566241414,
"grad_norm": 0.001743357628583908,
"learning_rate": 6.82531894013739e-06,
"loss": 0.0001,
"step": 8800
},
{
"epoch": 8.645731108930324,
"grad_norm": 0.0007213126518763602,
"learning_rate": 6.776251226692837e-06,
"loss": 0.0001,
"step": 8810
},
{
"epoch": 8.655544651619234,
"grad_norm": 0.006596927065402269,
"learning_rate": 6.7271835132482835e-06,
"loss": 0.0001,
"step": 8820
},
{
"epoch": 8.665358194308146,
"grad_norm": 0.0007276834803633392,
"learning_rate": 6.678115799803729e-06,
"loss": 0.0001,
"step": 8830
},
{
"epoch": 8.675171736997056,
"grad_norm": 0.0007477464969269931,
"learning_rate": 6.6290480863591756e-06,
"loss": 0.0001,
"step": 8840
},
{
"epoch": 8.684985279685966,
"grad_norm": 0.0008145422907546163,
"learning_rate": 6.579980372914622e-06,
"loss": 0.0758,
"step": 8850
},
{
"epoch": 8.694798822374878,
"grad_norm": 0.0007996530621312559,
"learning_rate": 6.530912659470069e-06,
"loss": 0.0001,
"step": 8860
},
{
"epoch": 8.704612365063788,
"grad_norm": 0.0007228550384752452,
"learning_rate": 6.481844946025516e-06,
"loss": 0.0003,
"step": 8870
},
{
"epoch": 8.714425907752698,
"grad_norm": 0.0007497305050492287,
"learning_rate": 6.432777232580962e-06,
"loss": 0.0001,
"step": 8880
},
{
"epoch": 8.72423945044161,
"grad_norm": 0.0018314624903723598,
"learning_rate": 6.3837095191364094e-06,
"loss": 0.0001,
"step": 8890
},
{
"epoch": 8.73405299313052,
"grad_norm": 0.005474488250911236,
"learning_rate": 6.334641805691854e-06,
"loss": 0.0001,
"step": 8900
},
{
"epoch": 8.74386653581943,
"grad_norm": 0.0007096781046129763,
"learning_rate": 6.2855740922473015e-06,
"loss": 0.0004,
"step": 8910
},
{
"epoch": 8.753680078508342,
"grad_norm": 0.0007362039759755135,
"learning_rate": 6.236506378802748e-06,
"loss": 0.0001,
"step": 8920
},
{
"epoch": 8.763493621197252,
"grad_norm": 0.0007442686473950744,
"learning_rate": 6.187438665358194e-06,
"loss": 0.0698,
"step": 8930
},
{
"epoch": 8.773307163886162,
"grad_norm": 0.002618088386952877,
"learning_rate": 6.138370951913641e-06,
"loss": 0.0001,
"step": 8940
},
{
"epoch": 8.783120706575074,
"grad_norm": 0.0007100084330886602,
"learning_rate": 6.089303238469088e-06,
"loss": 0.0001,
"step": 8950
},
{
"epoch": 8.792934249263984,
"grad_norm": 0.0009615476010367274,
"learning_rate": 6.040235525024534e-06,
"loss": 0.0001,
"step": 8960
},
{
"epoch": 8.802747791952894,
"grad_norm": 0.0011131309438496828,
"learning_rate": 5.991167811579981e-06,
"loss": 0.0001,
"step": 8970
},
{
"epoch": 8.812561334641806,
"grad_norm": 0.000781961134634912,
"learning_rate": 5.942100098135427e-06,
"loss": 0.0001,
"step": 8980
},
{
"epoch": 8.822374877330716,
"grad_norm": 0.0007456222083419561,
"learning_rate": 5.893032384690874e-06,
"loss": 0.0001,
"step": 8990
},
{
"epoch": 8.832188420019627,
"grad_norm": 0.0007512273732572794,
"learning_rate": 5.84396467124632e-06,
"loss": 0.0001,
"step": 9000
},
{
"epoch": 8.842001962708538,
"grad_norm": 0.0007723625167272985,
"learning_rate": 5.794896957801767e-06,
"loss": 0.0001,
"step": 9010
},
{
"epoch": 8.851815505397449,
"grad_norm": 0.0006950558163225651,
"learning_rate": 5.745829244357213e-06,
"loss": 0.0001,
"step": 9020
},
{
"epoch": 8.861629048086359,
"grad_norm": 0.0006956023280508816,
"learning_rate": 5.69676153091266e-06,
"loss": 0.0001,
"step": 9030
},
{
"epoch": 8.87144259077527,
"grad_norm": 0.0006997225573286414,
"learning_rate": 5.647693817468106e-06,
"loss": 0.0001,
"step": 9040
},
{
"epoch": 8.88125613346418,
"grad_norm": 0.0007857059827074409,
"learning_rate": 5.5986261040235525e-06,
"loss": 0.0001,
"step": 9050
},
{
"epoch": 8.89106967615309,
"grad_norm": 0.0020336457528173923,
"learning_rate": 5.549558390579e-06,
"loss": 0.0001,
"step": 9060
},
{
"epoch": 8.900883218842003,
"grad_norm": 0.0007115107146091759,
"learning_rate": 5.500490677134445e-06,
"loss": 0.0001,
"step": 9070
},
{
"epoch": 8.910696761530913,
"grad_norm": 0.0007492152508348227,
"learning_rate": 5.451422963689893e-06,
"loss": 0.0001,
"step": 9080
},
{
"epoch": 8.920510304219823,
"grad_norm": 0.0029001296497881413,
"learning_rate": 5.402355250245339e-06,
"loss": 0.0001,
"step": 9090
},
{
"epoch": 8.930323846908735,
"grad_norm": 0.0006878664717078209,
"learning_rate": 5.3532875368007855e-06,
"loss": 0.0002,
"step": 9100
},
{
"epoch": 8.940137389597645,
"grad_norm": 0.0007307238993234932,
"learning_rate": 5.304219823356232e-06,
"loss": 0.0001,
"step": 9110
},
{
"epoch": 8.949950932286555,
"grad_norm": 0.0007018332253210247,
"learning_rate": 5.255152109911678e-06,
"loss": 0.0001,
"step": 9120
},
{
"epoch": 8.959764474975467,
"grad_norm": 0.01293295156210661,
"learning_rate": 5.206084396467125e-06,
"loss": 0.0001,
"step": 9130
},
{
"epoch": 8.969578017664377,
"grad_norm": 0.0006952588446438313,
"learning_rate": 5.157016683022571e-06,
"loss": 0.0001,
"step": 9140
},
{
"epoch": 8.979391560353287,
"grad_norm": 0.0006813241052441299,
"learning_rate": 5.107948969578018e-06,
"loss": 0.0002,
"step": 9150
},
{
"epoch": 8.989205103042199,
"grad_norm": 0.0007726841140538454,
"learning_rate": 5.058881256133464e-06,
"loss": 0.0001,
"step": 9160
},
{
"epoch": 8.999018645731109,
"grad_norm": 0.0008112427312880754,
"learning_rate": 5.0098135426889115e-06,
"loss": 0.0001,
"step": 9170
},
{
"epoch": 9.008832188420019,
"grad_norm": 0.0007123980321921408,
"learning_rate": 4.960745829244357e-06,
"loss": 0.0001,
"step": 9180
},
{
"epoch": 9.018645731108931,
"grad_norm": 0.0006635936442762613,
"learning_rate": 4.9116781157998035e-06,
"loss": 0.0001,
"step": 9190
},
{
"epoch": 9.028459273797841,
"grad_norm": 0.0006985082291066647,
"learning_rate": 4.862610402355251e-06,
"loss": 0.0001,
"step": 9200
},
{
"epoch": 9.038272816486751,
"grad_norm": 0.0009679241920821369,
"learning_rate": 4.813542688910697e-06,
"loss": 0.0001,
"step": 9210
},
{
"epoch": 9.048086359175663,
"grad_norm": 0.0007227755268104374,
"learning_rate": 4.764474975466144e-06,
"loss": 0.0001,
"step": 9220
},
{
"epoch": 9.057899901864573,
"grad_norm": 0.0006677210330963135,
"learning_rate": 4.71540726202159e-06,
"loss": 0.0001,
"step": 9230
},
{
"epoch": 9.067713444553483,
"grad_norm": 0.0007603775011375546,
"learning_rate": 4.6663395485770365e-06,
"loss": 0.0001,
"step": 9240
},
{
"epoch": 9.077526987242395,
"grad_norm": 0.0007459863554686308,
"learning_rate": 4.617271835132483e-06,
"loss": 0.0001,
"step": 9250
},
{
"epoch": 9.087340529931305,
"grad_norm": 0.001274227281101048,
"learning_rate": 4.568204121687929e-06,
"loss": 0.0001,
"step": 9260
},
{
"epoch": 9.097154072620215,
"grad_norm": 0.0008047525770962238,
"learning_rate": 4.519136408243376e-06,
"loss": 0.0001,
"step": 9270
},
{
"epoch": 9.106967615309127,
"grad_norm": 0.0009299516095779836,
"learning_rate": 4.470068694798823e-06,
"loss": 0.0001,
"step": 9280
},
{
"epoch": 9.116781157998037,
"grad_norm": 0.0007023366051726043,
"learning_rate": 4.421000981354269e-06,
"loss": 0.0001,
"step": 9290
},
{
"epoch": 9.126594700686947,
"grad_norm": 0.0008517011883668602,
"learning_rate": 4.371933267909715e-06,
"loss": 0.0001,
"step": 9300
},
{
"epoch": 9.13640824337586,
"grad_norm": 0.0007383475895039737,
"learning_rate": 4.3228655544651625e-06,
"loss": 0.0001,
"step": 9310
},
{
"epoch": 9.14622178606477,
"grad_norm": 0.0006836687098257244,
"learning_rate": 4.273797841020609e-06,
"loss": 0.0011,
"step": 9320
},
{
"epoch": 9.15603532875368,
"grad_norm": 0.0007958838832564652,
"learning_rate": 4.224730127576055e-06,
"loss": 0.0001,
"step": 9330
},
{
"epoch": 9.165848871442591,
"grad_norm": 0.0051173255778849125,
"learning_rate": 4.175662414131502e-06,
"loss": 0.0736,
"step": 9340
},
{
"epoch": 9.175662414131502,
"grad_norm": 0.0006751357577741146,
"learning_rate": 4.126594700686948e-06,
"loss": 0.0001,
"step": 9350
},
{
"epoch": 9.185475956820412,
"grad_norm": 0.0008489376050420105,
"learning_rate": 4.077526987242395e-06,
"loss": 0.0001,
"step": 9360
},
{
"epoch": 9.195289499509324,
"grad_norm": 0.0006518946029245853,
"learning_rate": 4.028459273797841e-06,
"loss": 0.0001,
"step": 9370
},
{
"epoch": 9.205103042198234,
"grad_norm": 0.0006742589175701141,
"learning_rate": 3.9793915603532875e-06,
"loss": 0.0002,
"step": 9380
},
{
"epoch": 9.214916584887144,
"grad_norm": 0.0006998268072493374,
"learning_rate": 3.930323846908735e-06,
"loss": 0.0001,
"step": 9390
},
{
"epoch": 9.224730127576056,
"grad_norm": 0.0006446267361752689,
"learning_rate": 3.8812561334641804e-06,
"loss": 0.0001,
"step": 9400
},
{
"epoch": 9.234543670264966,
"grad_norm": 0.0006532249972224236,
"learning_rate": 3.832188420019627e-06,
"loss": 0.0001,
"step": 9410
},
{
"epoch": 9.244357212953876,
"grad_norm": 0.0023807811085134745,
"learning_rate": 3.7831207065750737e-06,
"loss": 0.0279,
"step": 9420
},
{
"epoch": 9.254170755642788,
"grad_norm": 39.921600341796875,
"learning_rate": 3.7340529931305206e-06,
"loss": 0.0554,
"step": 9430
},
{
"epoch": 9.263984298331698,
"grad_norm": 0.08718841522932053,
"learning_rate": 3.6849852796859666e-06,
"loss": 0.0003,
"step": 9440
},
{
"epoch": 9.273797841020608,
"grad_norm": 0.0006705551641061902,
"learning_rate": 3.6359175662414135e-06,
"loss": 0.0001,
"step": 9450
},
{
"epoch": 9.28361138370952,
"grad_norm": 0.0007212602067738771,
"learning_rate": 3.58684985279686e-06,
"loss": 0.0001,
"step": 9460
},
{
"epoch": 9.29342492639843,
"grad_norm": 0.0006956434808671474,
"learning_rate": 3.5377821393523068e-06,
"loss": 0.0001,
"step": 9470
},
{
"epoch": 9.30323846908734,
"grad_norm": 0.0007320587756112218,
"learning_rate": 3.488714425907753e-06,
"loss": 0.0001,
"step": 9480
},
{
"epoch": 9.313052011776252,
"grad_norm": 0.0006620934000238776,
"learning_rate": 3.4396467124631992e-06,
"loss": 0.0001,
"step": 9490
},
{
"epoch": 9.322865554465162,
"grad_norm": 0.0008825812255963683,
"learning_rate": 3.390578999018646e-06,
"loss": 0.0001,
"step": 9500
},
{
"epoch": 9.332679097154072,
"grad_norm": 0.000674651877488941,
"learning_rate": 3.341511285574092e-06,
"loss": 0.0001,
"step": 9510
},
{
"epoch": 9.342492639842984,
"grad_norm": 0.0006929274532012641,
"learning_rate": 3.292443572129539e-06,
"loss": 0.0001,
"step": 9520
},
{
"epoch": 9.352306182531894,
"grad_norm": 0.0007789958617649972,
"learning_rate": 3.2433758586849854e-06,
"loss": 0.0001,
"step": 9530
},
{
"epoch": 9.362119725220804,
"grad_norm": 0.0006808873731642962,
"learning_rate": 3.1943081452404323e-06,
"loss": 0.0001,
"step": 9540
},
{
"epoch": 9.371933267909716,
"grad_norm": 0.0006374814547598362,
"learning_rate": 3.1452404317958783e-06,
"loss": 0.0001,
"step": 9550
},
{
"epoch": 9.381746810598626,
"grad_norm": 0.0006496473215520382,
"learning_rate": 3.0961727183513247e-06,
"loss": 0.0921,
"step": 9560
},
{
"epoch": 9.391560353287536,
"grad_norm": 0.0006751060136593878,
"learning_rate": 3.0471050049067716e-06,
"loss": 0.0001,
"step": 9570
},
{
"epoch": 9.401373895976448,
"grad_norm": 0.0006818071124143898,
"learning_rate": 2.998037291462218e-06,
"loss": 0.0001,
"step": 9580
},
{
"epoch": 9.411187438665358,
"grad_norm": 0.008655051700770855,
"learning_rate": 2.9489695780176645e-06,
"loss": 0.0182,
"step": 9590
},
{
"epoch": 9.421000981354268,
"grad_norm": 0.0007353053661063313,
"learning_rate": 2.899901864573111e-06,
"loss": 0.0001,
"step": 9600
},
{
"epoch": 9.43081452404318,
"grad_norm": 0.0007057326729409397,
"learning_rate": 2.8508341511285574e-06,
"loss": 0.0001,
"step": 9610
},
{
"epoch": 9.44062806673209,
"grad_norm": 0.0007384234922938049,
"learning_rate": 2.8017664376840042e-06,
"loss": 0.0004,
"step": 9620
},
{
"epoch": 9.450441609421,
"grad_norm": 0.0007162457914091647,
"learning_rate": 2.7526987242394502e-06,
"loss": 0.0001,
"step": 9630
},
{
"epoch": 9.460255152109912,
"grad_norm": 0.002138860058039427,
"learning_rate": 2.703631010794897e-06,
"loss": 0.0001,
"step": 9640
},
{
"epoch": 9.470068694798822,
"grad_norm": 0.0006910859956406057,
"learning_rate": 2.6545632973503435e-06,
"loss": 0.0704,
"step": 9650
},
{
"epoch": 9.479882237487733,
"grad_norm": 0.0006951667019166052,
"learning_rate": 2.6054955839057904e-06,
"loss": 0.0001,
"step": 9660
},
{
"epoch": 9.489695780176644,
"grad_norm": 0.0007034169393591583,
"learning_rate": 2.5564278704612364e-06,
"loss": 0.0001,
"step": 9670
},
{
"epoch": 9.499509322865554,
"grad_norm": 0.0009360564290545881,
"learning_rate": 2.5073601570166833e-06,
"loss": 0.0001,
"step": 9680
},
{
"epoch": 9.509322865554465,
"grad_norm": 0.0009853884112089872,
"learning_rate": 2.4582924435721297e-06,
"loss": 0.0001,
"step": 9690
},
{
"epoch": 9.519136408243376,
"grad_norm": 0.0009145635995082557,
"learning_rate": 2.409224730127576e-06,
"loss": 0.0001,
"step": 9700
},
{
"epoch": 9.528949950932287,
"grad_norm": 0.000657937373034656,
"learning_rate": 2.3601570166830226e-06,
"loss": 0.0001,
"step": 9710
},
{
"epoch": 9.538763493621197,
"grad_norm": 0.0009716741042211652,
"learning_rate": 2.3110893032384695e-06,
"loss": 0.0001,
"step": 9720
},
{
"epoch": 9.548577036310109,
"grad_norm": 0.0008224455523304641,
"learning_rate": 2.262021589793916e-06,
"loss": 0.0001,
"step": 9730
},
{
"epoch": 9.558390578999019,
"grad_norm": 0.0007166486466303468,
"learning_rate": 2.212953876349362e-06,
"loss": 0.0001,
"step": 9740
},
{
"epoch": 9.568204121687929,
"grad_norm": 0.0006854226812720299,
"learning_rate": 2.1638861629048088e-06,
"loss": 0.0001,
"step": 9750
},
{
"epoch": 9.57801766437684,
"grad_norm": 0.0013223073910921812,
"learning_rate": 2.1148184494602552e-06,
"loss": 0.0001,
"step": 9760
},
{
"epoch": 9.58783120706575,
"grad_norm": 0.0006512215477414429,
"learning_rate": 2.0657507360157017e-06,
"loss": 0.0001,
"step": 9770
},
{
"epoch": 9.59764474975466,
"grad_norm": 0.0006538184825330973,
"learning_rate": 2.016683022571148e-06,
"loss": 0.0001,
"step": 9780
},
{
"epoch": 9.607458292443573,
"grad_norm": 0.0006954250857234001,
"learning_rate": 1.967615309126595e-06,
"loss": 0.0001,
"step": 9790
},
{
"epoch": 9.617271835132483,
"grad_norm": 0.0006559567409567535,
"learning_rate": 1.9185475956820414e-06,
"loss": 0.0001,
"step": 9800
},
{
"epoch": 9.627085377821393,
"grad_norm": 0.0012906268239021301,
"learning_rate": 1.8694798822374878e-06,
"loss": 0.0001,
"step": 9810
},
{
"epoch": 9.636898920510305,
"grad_norm": 0.0006794478395022452,
"learning_rate": 1.8204121687929343e-06,
"loss": 0.0001,
"step": 9820
},
{
"epoch": 9.646712463199215,
"grad_norm": 0.0007485067471861839,
"learning_rate": 1.771344455348381e-06,
"loss": 0.0001,
"step": 9830
},
{
"epoch": 9.656526005888125,
"grad_norm": 0.0007018350879661739,
"learning_rate": 1.7222767419038274e-06,
"loss": 0.0001,
"step": 9840
},
{
"epoch": 9.666339548577037,
"grad_norm": 0.000663910701405257,
"learning_rate": 1.6732090284592738e-06,
"loss": 0.0001,
"step": 9850
},
{
"epoch": 9.676153091265947,
"grad_norm": 0.000718809780664742,
"learning_rate": 1.6241413150147205e-06,
"loss": 0.0001,
"step": 9860
},
{
"epoch": 9.685966633954857,
"grad_norm": 0.0008578874403610826,
"learning_rate": 1.5750736015701667e-06,
"loss": 0.0001,
"step": 9870
},
{
"epoch": 9.695780176643769,
"grad_norm": 0.0007033746223896742,
"learning_rate": 1.5260058881256136e-06,
"loss": 0.0001,
"step": 9880
},
{
"epoch": 9.70559371933268,
"grad_norm": 0.00067708152346313,
"learning_rate": 1.47693817468106e-06,
"loss": 0.0001,
"step": 9890
},
{
"epoch": 9.71540726202159,
"grad_norm": 0.0006639899802394211,
"learning_rate": 1.4278704612365064e-06,
"loss": 0.0001,
"step": 9900
},
{
"epoch": 9.725220804710501,
"grad_norm": 0.0006598685868084431,
"learning_rate": 1.3788027477919529e-06,
"loss": 0.0001,
"step": 9910
},
{
"epoch": 9.735034347399411,
"grad_norm": 0.01395090576261282,
"learning_rate": 1.3297350343473993e-06,
"loss": 0.0001,
"step": 9920
},
{
"epoch": 9.744847890088321,
"grad_norm": 0.0008143746526911855,
"learning_rate": 1.280667320902846e-06,
"loss": 0.0001,
"step": 9930
},
{
"epoch": 9.754661432777233,
"grad_norm": 0.0010220261756330729,
"learning_rate": 1.2315996074582924e-06,
"loss": 0.0001,
"step": 9940
},
{
"epoch": 9.764474975466143,
"grad_norm": 0.003531807102262974,
"learning_rate": 1.182531894013739e-06,
"loss": 0.0001,
"step": 9950
},
{
"epoch": 9.774288518155053,
"grad_norm": 0.0006864424794912338,
"learning_rate": 1.1334641805691855e-06,
"loss": 0.0001,
"step": 9960
},
{
"epoch": 9.784102060843965,
"grad_norm": 0.0008860233356244862,
"learning_rate": 1.0843964671246322e-06,
"loss": 0.0001,
"step": 9970
},
{
"epoch": 9.793915603532875,
"grad_norm": 0.001267165644094348,
"learning_rate": 1.0353287536800786e-06,
"loss": 0.0001,
"step": 9980
},
{
"epoch": 9.803729146221785,
"grad_norm": 0.0006668745772913098,
"learning_rate": 9.86261040235525e-07,
"loss": 0.0001,
"step": 9990
},
{
"epoch": 9.813542688910697,
"grad_norm": 0.000662625883705914,
"learning_rate": 9.371933267909717e-07,
"loss": 0.0001,
"step": 10000
},
{
"epoch": 9.823356231599607,
"grad_norm": 0.0006619680789299309,
"learning_rate": 8.881256133464181e-07,
"loss": 0.0001,
"step": 10010
},
{
"epoch": 9.833169774288518,
"grad_norm": 0.000696695176884532,
"learning_rate": 8.390578999018647e-07,
"loss": 0.0001,
"step": 10020
},
{
"epoch": 9.84298331697743,
"grad_norm": 0.0006725791026838124,
"learning_rate": 7.89990186457311e-07,
"loss": 0.0001,
"step": 10030
},
{
"epoch": 9.85279685966634,
"grad_norm": 0.0006575717707164586,
"learning_rate": 7.409224730127577e-07,
"loss": 0.0001,
"step": 10040
},
{
"epoch": 9.86261040235525,
"grad_norm": 0.0006885197362862527,
"learning_rate": 6.918547595682042e-07,
"loss": 0.0001,
"step": 10050
},
{
"epoch": 9.872423945044162,
"grad_norm": 0.0018992492696270347,
"learning_rate": 6.427870461236506e-07,
"loss": 0.0185,
"step": 10060
},
{
"epoch": 9.882237487733072,
"grad_norm": 0.000654397183097899,
"learning_rate": 5.937193326790972e-07,
"loss": 0.0001,
"step": 10070
},
{
"epoch": 9.892051030421982,
"grad_norm": 0.0006650349241681397,
"learning_rate": 5.446516192345437e-07,
"loss": 0.0001,
"step": 10080
},
{
"epoch": 9.901864573110894,
"grad_norm": 0.0006798275862820446,
"learning_rate": 4.955839057899902e-07,
"loss": 0.0001,
"step": 10090
},
{
"epoch": 9.911678115799804,
"grad_norm": 0.0006500816671177745,
"learning_rate": 4.4651619234543677e-07,
"loss": 0.0001,
"step": 10100
},
{
"epoch": 9.921491658488714,
"grad_norm": 0.0008393925963900983,
"learning_rate": 3.9744847890088327e-07,
"loss": 0.0001,
"step": 10110
},
{
"epoch": 9.931305201177626,
"grad_norm": 0.0007067256956361234,
"learning_rate": 3.4838076545632976e-07,
"loss": 0.0001,
"step": 10120
},
{
"epoch": 9.941118743866536,
"grad_norm": 0.0007005089428275824,
"learning_rate": 2.9931305201177625e-07,
"loss": 0.0002,
"step": 10130
},
{
"epoch": 9.950932286555446,
"grad_norm": 0.0006531529943458736,
"learning_rate": 2.502453385672228e-07,
"loss": 0.0001,
"step": 10140
},
{
"epoch": 9.960745829244358,
"grad_norm": 0.0009129344252869487,
"learning_rate": 2.0117762512266932e-07,
"loss": 0.0001,
"step": 10150
},
{
"epoch": 9.970559371933268,
"grad_norm": 0.0006892773672007024,
"learning_rate": 1.521099116781158e-07,
"loss": 0.0001,
"step": 10160
},
{
"epoch": 9.980372914622178,
"grad_norm": 0.0006506032077595592,
"learning_rate": 1.0304219823356231e-07,
"loss": 0.0001,
"step": 10170
},
{
"epoch": 9.99018645731109,
"grad_norm": 0.0006497541908174753,
"learning_rate": 5.3974484789008834e-08,
"loss": 0.0001,
"step": 10180
},
{
"epoch": 10.0,
"grad_norm": 0.0008801518124528229,
"learning_rate": 4.906771344455348e-09,
"loss": 0.0001,
"step": 10190
},
{
"epoch": 10.0,
"step": 10190,
"total_flos": 6.31327239390081e+18,
"train_loss": 0.024946213553118556,
"train_runtime": 4093.664,
"train_samples_per_second": 19.901,
"train_steps_per_second": 2.489
}
],
"logging_steps": 10,
"max_steps": 10190,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 6.31327239390081e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}