MambalgaP-drivenpaper / trainer_state.json
David Roy Nelson
Upload 11 files
863b050 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9984532159196,
"eval_steps": 500,
"global_step": 122000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.3454886674880981,
"learning_rate": 9.991815957246561e-06,
"loss": 5.7653,
"step": 100
},
{
"epoch": 0.0,
"grad_norm": 0.9953828454017639,
"learning_rate": 9.983631914493123e-06,
"loss": 5.6192,
"step": 200
},
{
"epoch": 0.0,
"grad_norm": 0.7844876646995544,
"learning_rate": 9.975447871739683e-06,
"loss": 5.4452,
"step": 300
},
{
"epoch": 0.0,
"grad_norm": 0.567190945148468,
"learning_rate": 9.967263828986244e-06,
"loss": 5.3183,
"step": 400
},
{
"epoch": 0.0,
"grad_norm": 0.5304737687110901,
"learning_rate": 9.959079786232804e-06,
"loss": 5.2156,
"step": 500
},
{
"epoch": 0.0,
"grad_norm": 0.3840586841106415,
"learning_rate": 9.950895743479366e-06,
"loss": 5.1203,
"step": 600
},
{
"epoch": 0.01,
"grad_norm": 0.34571415185928345,
"learning_rate": 9.942711700725926e-06,
"loss": 5.0732,
"step": 700
},
{
"epoch": 0.01,
"grad_norm": 0.32429179549217224,
"learning_rate": 9.934527657972486e-06,
"loss": 5.0458,
"step": 800
},
{
"epoch": 0.01,
"grad_norm": 0.4024583101272583,
"learning_rate": 9.926343615219047e-06,
"loss": 5.0157,
"step": 900
},
{
"epoch": 0.01,
"grad_norm": 0.3507966101169586,
"learning_rate": 9.918159572465607e-06,
"loss": 4.9921,
"step": 1000
},
{
"epoch": 0.01,
"grad_norm": 0.37788429856300354,
"learning_rate": 9.909975529712169e-06,
"loss": 4.9803,
"step": 1100
},
{
"epoch": 0.01,
"grad_norm": 0.3670382797718048,
"learning_rate": 9.901791486958729e-06,
"loss": 4.9651,
"step": 1200
},
{
"epoch": 0.01,
"grad_norm": 0.2876070737838745,
"learning_rate": 9.89360744420529e-06,
"loss": 4.9585,
"step": 1300
},
{
"epoch": 0.01,
"grad_norm": 0.3649226725101471,
"learning_rate": 9.88542340145185e-06,
"loss": 4.9491,
"step": 1400
},
{
"epoch": 0.01,
"grad_norm": 0.39525964856147766,
"learning_rate": 9.87723935869841e-06,
"loss": 4.9416,
"step": 1500
},
{
"epoch": 0.01,
"grad_norm": 0.30799126625061035,
"learning_rate": 9.86905531594497e-06,
"loss": 4.9318,
"step": 1600
},
{
"epoch": 0.01,
"grad_norm": 0.2949569523334503,
"learning_rate": 9.860871273191532e-06,
"loss": 4.9201,
"step": 1700
},
{
"epoch": 0.01,
"grad_norm": 0.3036958575248718,
"learning_rate": 9.852687230438092e-06,
"loss": 4.9212,
"step": 1800
},
{
"epoch": 0.02,
"grad_norm": 0.7450295686721802,
"learning_rate": 9.844503187684653e-06,
"loss": 4.9166,
"step": 1900
},
{
"epoch": 0.02,
"grad_norm": 0.439261794090271,
"learning_rate": 9.836319144931213e-06,
"loss": 4.9028,
"step": 2000
},
{
"epoch": 0.02,
"grad_norm": 0.28415539860725403,
"learning_rate": 9.828135102177775e-06,
"loss": 4.8989,
"step": 2100
},
{
"epoch": 0.02,
"grad_norm": 0.3490685522556305,
"learning_rate": 9.819951059424335e-06,
"loss": 4.9008,
"step": 2200
},
{
"epoch": 0.02,
"grad_norm": 0.4549264907836914,
"learning_rate": 9.811767016670895e-06,
"loss": 4.8968,
"step": 2300
},
{
"epoch": 0.02,
"grad_norm": 0.3350650668144226,
"learning_rate": 9.803582973917457e-06,
"loss": 4.8886,
"step": 2400
},
{
"epoch": 0.02,
"grad_norm": 0.41439351439476013,
"learning_rate": 9.795398931164018e-06,
"loss": 4.8869,
"step": 2500
},
{
"epoch": 0.02,
"grad_norm": 0.47745582461357117,
"learning_rate": 9.787214888410578e-06,
"loss": 4.8864,
"step": 2600
},
{
"epoch": 0.02,
"grad_norm": 0.4192076027393341,
"learning_rate": 9.779030845657138e-06,
"loss": 4.886,
"step": 2700
},
{
"epoch": 0.02,
"grad_norm": 0.38134631514549255,
"learning_rate": 9.7708468029037e-06,
"loss": 4.8803,
"step": 2800
},
{
"epoch": 0.02,
"grad_norm": 1.0664669275283813,
"learning_rate": 9.76266276015026e-06,
"loss": 4.8764,
"step": 2900
},
{
"epoch": 0.02,
"grad_norm": 0.6459155678749084,
"learning_rate": 9.75447871739682e-06,
"loss": 4.8721,
"step": 3000
},
{
"epoch": 0.03,
"grad_norm": 0.4217374622821808,
"learning_rate": 9.746294674643381e-06,
"loss": 4.8758,
"step": 3100
},
{
"epoch": 0.03,
"grad_norm": 1.4077221155166626,
"learning_rate": 9.738110631889943e-06,
"loss": 4.8694,
"step": 3200
},
{
"epoch": 0.03,
"grad_norm": 0.793872058391571,
"learning_rate": 9.729926589136503e-06,
"loss": 4.872,
"step": 3300
},
{
"epoch": 0.03,
"grad_norm": 0.6410694718360901,
"learning_rate": 9.721742546383063e-06,
"loss": 4.8697,
"step": 3400
},
{
"epoch": 0.03,
"grad_norm": 0.9800311923027039,
"learning_rate": 9.713558503629624e-06,
"loss": 4.86,
"step": 3500
},
{
"epoch": 0.03,
"grad_norm": 0.999591052532196,
"learning_rate": 9.705374460876184e-06,
"loss": 4.8601,
"step": 3600
},
{
"epoch": 0.03,
"grad_norm": 1.2334632873535156,
"learning_rate": 9.697190418122744e-06,
"loss": 4.8603,
"step": 3700
},
{
"epoch": 0.03,
"grad_norm": 0.6848945617675781,
"learning_rate": 9.689006375369305e-06,
"loss": 4.8537,
"step": 3800
},
{
"epoch": 0.03,
"grad_norm": 0.4569337069988251,
"learning_rate": 9.680822332615866e-06,
"loss": 4.859,
"step": 3900
},
{
"epoch": 0.03,
"grad_norm": 1.2655045986175537,
"learning_rate": 9.672638289862427e-06,
"loss": 4.8556,
"step": 4000
},
{
"epoch": 0.03,
"grad_norm": 1.2269976139068604,
"learning_rate": 9.664454247108987e-06,
"loss": 4.8569,
"step": 4100
},
{
"epoch": 0.03,
"grad_norm": 0.5749172568321228,
"learning_rate": 9.656270204355547e-06,
"loss": 4.8549,
"step": 4200
},
{
"epoch": 0.04,
"grad_norm": 0.5450171232223511,
"learning_rate": 9.64808616160211e-06,
"loss": 4.853,
"step": 4300
},
{
"epoch": 0.04,
"grad_norm": 0.6002165675163269,
"learning_rate": 9.63990211884867e-06,
"loss": 4.8495,
"step": 4400
},
{
"epoch": 0.04,
"grad_norm": 0.4352850914001465,
"learning_rate": 9.63171807609523e-06,
"loss": 4.8471,
"step": 4500
},
{
"epoch": 0.04,
"grad_norm": 0.8037896752357483,
"learning_rate": 9.623534033341792e-06,
"loss": 4.841,
"step": 4600
},
{
"epoch": 0.04,
"grad_norm": 0.7141408324241638,
"learning_rate": 9.615349990588352e-06,
"loss": 4.8433,
"step": 4700
},
{
"epoch": 0.04,
"grad_norm": 1.3830125331878662,
"learning_rate": 9.607165947834912e-06,
"loss": 4.84,
"step": 4800
},
{
"epoch": 0.04,
"grad_norm": 1.2149487733840942,
"learning_rate": 9.598981905081473e-06,
"loss": 4.8414,
"step": 4900
},
{
"epoch": 0.04,
"grad_norm": 1.5489853620529175,
"learning_rate": 9.590797862328034e-06,
"loss": 4.8425,
"step": 5000
},
{
"epoch": 0.04,
"grad_norm": 1.0431714057922363,
"learning_rate": 9.582613819574595e-06,
"loss": 4.8369,
"step": 5100
},
{
"epoch": 0.04,
"grad_norm": 1.2853955030441284,
"learning_rate": 9.574429776821155e-06,
"loss": 4.8411,
"step": 5200
},
{
"epoch": 0.04,
"grad_norm": 0.6190789937973022,
"learning_rate": 9.566245734067715e-06,
"loss": 4.8381,
"step": 5300
},
{
"epoch": 0.04,
"grad_norm": 0.9078882336616516,
"learning_rate": 9.558061691314277e-06,
"loss": 4.8355,
"step": 5400
},
{
"epoch": 0.05,
"grad_norm": 0.6225169897079468,
"learning_rate": 9.549877648560838e-06,
"loss": 4.8382,
"step": 5500
},
{
"epoch": 0.05,
"grad_norm": 0.5269715189933777,
"learning_rate": 9.541693605807398e-06,
"loss": 4.8368,
"step": 5600
},
{
"epoch": 0.05,
"grad_norm": 0.5707417130470276,
"learning_rate": 9.533509563053958e-06,
"loss": 4.8331,
"step": 5700
},
{
"epoch": 0.05,
"grad_norm": 1.0135433673858643,
"learning_rate": 9.525325520300518e-06,
"loss": 4.8372,
"step": 5800
},
{
"epoch": 0.05,
"grad_norm": 0.9796397686004639,
"learning_rate": 9.517141477547079e-06,
"loss": 4.8301,
"step": 5900
},
{
"epoch": 0.05,
"grad_norm": 0.6885799765586853,
"learning_rate": 9.50895743479364e-06,
"loss": 4.8309,
"step": 6000
},
{
"epoch": 0.05,
"grad_norm": 0.8593583703041077,
"learning_rate": 9.5007733920402e-06,
"loss": 4.829,
"step": 6100
},
{
"epoch": 0.05,
"grad_norm": 1.0259130001068115,
"learning_rate": 9.492589349286761e-06,
"loss": 4.8298,
"step": 6200
},
{
"epoch": 0.05,
"grad_norm": 0.5682145357131958,
"learning_rate": 9.484405306533321e-06,
"loss": 4.8283,
"step": 6300
},
{
"epoch": 0.05,
"grad_norm": 1.9293478727340698,
"learning_rate": 9.476221263779882e-06,
"loss": 4.8321,
"step": 6400
},
{
"epoch": 0.05,
"grad_norm": 1.0222758054733276,
"learning_rate": 9.468037221026444e-06,
"loss": 4.8286,
"step": 6500
},
{
"epoch": 0.05,
"grad_norm": 0.748030960559845,
"learning_rate": 9.459853178273004e-06,
"loss": 4.8277,
"step": 6600
},
{
"epoch": 0.05,
"grad_norm": 1.7026931047439575,
"learning_rate": 9.451669135519564e-06,
"loss": 4.8227,
"step": 6700
},
{
"epoch": 0.06,
"grad_norm": 0.8596793413162231,
"learning_rate": 9.443485092766126e-06,
"loss": 4.8296,
"step": 6800
},
{
"epoch": 0.06,
"grad_norm": 1.9839978218078613,
"learning_rate": 9.435301050012686e-06,
"loss": 4.8298,
"step": 6900
},
{
"epoch": 0.06,
"grad_norm": 2.7572035789489746,
"learning_rate": 9.427117007259247e-06,
"loss": 4.8285,
"step": 7000
},
{
"epoch": 0.06,
"grad_norm": 1.3615273237228394,
"learning_rate": 9.418932964505807e-06,
"loss": 4.8263,
"step": 7100
},
{
"epoch": 0.06,
"grad_norm": 0.8525121808052063,
"learning_rate": 9.410748921752369e-06,
"loss": 4.8233,
"step": 7200
},
{
"epoch": 0.06,
"grad_norm": 0.6726747751235962,
"learning_rate": 9.402564878998929e-06,
"loss": 4.8232,
"step": 7300
},
{
"epoch": 0.06,
"grad_norm": 2.404517412185669,
"learning_rate": 9.39438083624549e-06,
"loss": 4.8222,
"step": 7400
},
{
"epoch": 0.06,
"grad_norm": 2.3438527584075928,
"learning_rate": 9.38619679349205e-06,
"loss": 4.8275,
"step": 7500
},
{
"epoch": 0.06,
"grad_norm": 0.9558647871017456,
"learning_rate": 9.378012750738612e-06,
"loss": 4.8188,
"step": 7600
},
{
"epoch": 0.06,
"grad_norm": 1.5687648057937622,
"learning_rate": 9.369828707985172e-06,
"loss": 4.8203,
"step": 7700
},
{
"epoch": 0.06,
"grad_norm": 4.528586387634277,
"learning_rate": 9.361644665231732e-06,
"loss": 4.8237,
"step": 7800
},
{
"epoch": 0.06,
"grad_norm": 1.5821335315704346,
"learning_rate": 9.353460622478292e-06,
"loss": 4.8213,
"step": 7900
},
{
"epoch": 0.07,
"grad_norm": 1.3139538764953613,
"learning_rate": 9.345276579724853e-06,
"loss": 4.8241,
"step": 8000
},
{
"epoch": 0.07,
"grad_norm": 0.6411435008049011,
"learning_rate": 9.337092536971415e-06,
"loss": 4.8176,
"step": 8100
},
{
"epoch": 0.07,
"grad_norm": 2.011660575866699,
"learning_rate": 9.328908494217975e-06,
"loss": 4.8142,
"step": 8200
},
{
"epoch": 0.07,
"grad_norm": 0.9855501055717468,
"learning_rate": 9.320724451464535e-06,
"loss": 4.816,
"step": 8300
},
{
"epoch": 0.07,
"grad_norm": 0.8996455073356628,
"learning_rate": 9.312540408711095e-06,
"loss": 4.8162,
"step": 8400
},
{
"epoch": 0.07,
"grad_norm": 1.75440514087677,
"learning_rate": 9.304356365957656e-06,
"loss": 4.8174,
"step": 8500
},
{
"epoch": 0.07,
"grad_norm": 0.5616024136543274,
"learning_rate": 9.296172323204216e-06,
"loss": 4.8137,
"step": 8600
},
{
"epoch": 0.07,
"grad_norm": 1.5236667394638062,
"learning_rate": 9.287988280450778e-06,
"loss": 4.8156,
"step": 8700
},
{
"epoch": 0.07,
"grad_norm": 0.869793176651001,
"learning_rate": 9.279804237697338e-06,
"loss": 4.8201,
"step": 8800
},
{
"epoch": 0.07,
"grad_norm": 2.2847354412078857,
"learning_rate": 9.271620194943898e-06,
"loss": 4.816,
"step": 8900
},
{
"epoch": 0.07,
"grad_norm": 1.1544415950775146,
"learning_rate": 9.26343615219046e-06,
"loss": 4.8174,
"step": 9000
},
{
"epoch": 0.07,
"grad_norm": 0.6653382182121277,
"learning_rate": 9.25525210943702e-06,
"loss": 4.8136,
"step": 9100
},
{
"epoch": 0.08,
"grad_norm": 1.0501242876052856,
"learning_rate": 9.247068066683581e-06,
"loss": 4.8104,
"step": 9200
},
{
"epoch": 0.08,
"grad_norm": 0.6880828142166138,
"learning_rate": 9.238884023930141e-06,
"loss": 4.8093,
"step": 9300
},
{
"epoch": 0.08,
"grad_norm": 0.9488196969032288,
"learning_rate": 9.230699981176703e-06,
"loss": 4.8121,
"step": 9400
},
{
"epoch": 0.08,
"grad_norm": 4.403717041015625,
"learning_rate": 9.222515938423263e-06,
"loss": 4.8113,
"step": 9500
},
{
"epoch": 0.08,
"grad_norm": 2.2665512561798096,
"learning_rate": 9.214331895669824e-06,
"loss": 4.8092,
"step": 9600
},
{
"epoch": 0.08,
"grad_norm": 0.7057957053184509,
"learning_rate": 9.206147852916384e-06,
"loss": 4.8108,
"step": 9700
},
{
"epoch": 0.08,
"grad_norm": 3.9034390449523926,
"learning_rate": 9.197963810162946e-06,
"loss": 4.808,
"step": 9800
},
{
"epoch": 0.08,
"grad_norm": 0.9307584762573242,
"learning_rate": 9.189779767409506e-06,
"loss": 4.809,
"step": 9900
},
{
"epoch": 0.08,
"grad_norm": 3.995647668838501,
"learning_rate": 9.181595724656066e-06,
"loss": 4.8079,
"step": 10000
},
{
"epoch": 0.08,
"grad_norm": 2.2378621101379395,
"learning_rate": 9.173411681902627e-06,
"loss": 4.81,
"step": 10100
},
{
"epoch": 0.08,
"grad_norm": 1.424851417541504,
"learning_rate": 9.165227639149189e-06,
"loss": 4.8045,
"step": 10200
},
{
"epoch": 0.08,
"grad_norm": 1.8927842378616333,
"learning_rate": 9.157043596395749e-06,
"loss": 4.8085,
"step": 10300
},
{
"epoch": 0.09,
"grad_norm": 3.9594876766204834,
"learning_rate": 9.14885955364231e-06,
"loss": 4.808,
"step": 10400
},
{
"epoch": 0.09,
"grad_norm": 2.6992530822753906,
"learning_rate": 9.14067551088887e-06,
"loss": 4.8099,
"step": 10500
},
{
"epoch": 0.09,
"grad_norm": 1.1311434507369995,
"learning_rate": 9.13249146813543e-06,
"loss": 4.8071,
"step": 10600
},
{
"epoch": 0.09,
"grad_norm": 0.8243028521537781,
"learning_rate": 9.12430742538199e-06,
"loss": 4.8056,
"step": 10700
},
{
"epoch": 0.09,
"grad_norm": 1.7261260747909546,
"learning_rate": 9.11612338262855e-06,
"loss": 4.8076,
"step": 10800
},
{
"epoch": 0.09,
"grad_norm": 4.485065460205078,
"learning_rate": 9.107939339875112e-06,
"loss": 4.8072,
"step": 10900
},
{
"epoch": 0.09,
"grad_norm": 1.044236660003662,
"learning_rate": 9.099755297121673e-06,
"loss": 4.8076,
"step": 11000
},
{
"epoch": 0.09,
"grad_norm": 2.155515670776367,
"learning_rate": 9.091571254368233e-06,
"loss": 4.8061,
"step": 11100
},
{
"epoch": 0.09,
"grad_norm": 1.4560600519180298,
"learning_rate": 9.083387211614795e-06,
"loss": 4.8079,
"step": 11200
},
{
"epoch": 0.09,
"grad_norm": 2.577777624130249,
"learning_rate": 9.075203168861355e-06,
"loss": 4.8058,
"step": 11300
},
{
"epoch": 0.09,
"grad_norm": 2.1256022453308105,
"learning_rate": 9.067019126107915e-06,
"loss": 4.8047,
"step": 11400
},
{
"epoch": 0.09,
"grad_norm": 0.9217951893806458,
"learning_rate": 9.058835083354476e-06,
"loss": 4.8047,
"step": 11500
},
{
"epoch": 0.09,
"grad_norm": 1.4003558158874512,
"learning_rate": 9.050651040601038e-06,
"loss": 4.8006,
"step": 11600
},
{
"epoch": 0.1,
"grad_norm": 1.1220279932022095,
"learning_rate": 9.042466997847598e-06,
"loss": 4.8061,
"step": 11700
},
{
"epoch": 0.1,
"grad_norm": 3.371317148208618,
"learning_rate": 9.034282955094158e-06,
"loss": 4.8034,
"step": 11800
},
{
"epoch": 0.1,
"grad_norm": 1.5556405782699585,
"learning_rate": 9.026098912340718e-06,
"loss": 4.8022,
"step": 11900
},
{
"epoch": 0.1,
"grad_norm": 0.9803473949432373,
"learning_rate": 9.01791486958728e-06,
"loss": 4.8029,
"step": 12000
},
{
"epoch": 0.1,
"grad_norm": 0.7116793990135193,
"learning_rate": 9.00973082683384e-06,
"loss": 4.8047,
"step": 12100
},
{
"epoch": 0.1,
"grad_norm": 0.8312737941741943,
"learning_rate": 9.0015467840804e-06,
"loss": 4.8004,
"step": 12200
},
{
"epoch": 0.1,
"grad_norm": 2.492680072784424,
"learning_rate": 8.993362741326963e-06,
"loss": 4.8031,
"step": 12300
},
{
"epoch": 0.1,
"grad_norm": 1.0807607173919678,
"learning_rate": 8.985178698573523e-06,
"loss": 4.7997,
"step": 12400
},
{
"epoch": 0.1,
"grad_norm": 4.5318403244018555,
"learning_rate": 8.976994655820083e-06,
"loss": 4.8052,
"step": 12500
},
{
"epoch": 0.1,
"grad_norm": 6.294131278991699,
"learning_rate": 8.968810613066644e-06,
"loss": 4.8025,
"step": 12600
},
{
"epoch": 0.1,
"grad_norm": 1.2838228940963745,
"learning_rate": 8.960626570313204e-06,
"loss": 4.8014,
"step": 12700
},
{
"epoch": 0.1,
"grad_norm": 3.4684486389160156,
"learning_rate": 8.952442527559764e-06,
"loss": 4.7991,
"step": 12800
},
{
"epoch": 0.11,
"grad_norm": 0.9761227965354919,
"learning_rate": 8.944258484806324e-06,
"loss": 4.8013,
"step": 12900
},
{
"epoch": 0.11,
"grad_norm": 5.355940341949463,
"learning_rate": 8.936074442052886e-06,
"loss": 4.8014,
"step": 13000
},
{
"epoch": 0.11,
"grad_norm": 3.334829568862915,
"learning_rate": 8.927890399299447e-06,
"loss": 4.7994,
"step": 13100
},
{
"epoch": 0.11,
"grad_norm": 4.577433109283447,
"learning_rate": 8.919706356546007e-06,
"loss": 4.7931,
"step": 13200
},
{
"epoch": 0.11,
"grad_norm": 4.9149322509765625,
"learning_rate": 8.911522313792567e-06,
"loss": 4.8025,
"step": 13300
},
{
"epoch": 0.11,
"grad_norm": 1.5174356698989868,
"learning_rate": 8.903338271039129e-06,
"loss": 4.7982,
"step": 13400
},
{
"epoch": 0.11,
"grad_norm": 4.420125961303711,
"learning_rate": 8.89515422828569e-06,
"loss": 4.7976,
"step": 13500
},
{
"epoch": 0.11,
"grad_norm": 4.246557712554932,
"learning_rate": 8.88697018553225e-06,
"loss": 4.7954,
"step": 13600
},
{
"epoch": 0.11,
"grad_norm": 1.887624979019165,
"learning_rate": 8.87878614277881e-06,
"loss": 4.7956,
"step": 13700
},
{
"epoch": 0.11,
"grad_norm": 1.2699388265609741,
"learning_rate": 8.870602100025372e-06,
"loss": 4.7993,
"step": 13800
},
{
"epoch": 0.11,
"grad_norm": 2.5432891845703125,
"learning_rate": 8.862418057271932e-06,
"loss": 4.8017,
"step": 13900
},
{
"epoch": 0.11,
"grad_norm": 3.0773308277130127,
"learning_rate": 8.854234014518492e-06,
"loss": 4.8002,
"step": 14000
},
{
"epoch": 0.12,
"grad_norm": 2.0699973106384277,
"learning_rate": 8.846049971765053e-06,
"loss": 4.7958,
"step": 14100
},
{
"epoch": 0.12,
"grad_norm": 0.9238589406013489,
"learning_rate": 8.837865929011615e-06,
"loss": 4.7907,
"step": 14200
},
{
"epoch": 0.12,
"grad_norm": 1.0784467458724976,
"learning_rate": 8.829681886258175e-06,
"loss": 4.797,
"step": 14300
},
{
"epoch": 0.12,
"grad_norm": 1.1584768295288086,
"learning_rate": 8.821497843504735e-06,
"loss": 4.7947,
"step": 14400
},
{
"epoch": 0.12,
"grad_norm": 1.0812736749649048,
"learning_rate": 8.813313800751297e-06,
"loss": 4.7894,
"step": 14500
},
{
"epoch": 0.12,
"grad_norm": 1.7061692476272583,
"learning_rate": 8.805129757997857e-06,
"loss": 4.7946,
"step": 14600
},
{
"epoch": 0.12,
"grad_norm": 2.771735429763794,
"learning_rate": 8.796945715244418e-06,
"loss": 4.7973,
"step": 14700
},
{
"epoch": 0.12,
"grad_norm": 2.1708245277404785,
"learning_rate": 8.788761672490978e-06,
"loss": 4.7953,
"step": 14800
},
{
"epoch": 0.12,
"grad_norm": 2.4777073860168457,
"learning_rate": 8.780577629737538e-06,
"loss": 4.7917,
"step": 14900
},
{
"epoch": 0.12,
"grad_norm": 1.5953549146652222,
"learning_rate": 8.772393586984098e-06,
"loss": 4.7917,
"step": 15000
},
{
"epoch": 0.12,
"grad_norm": 0.7896368503570557,
"learning_rate": 8.76420954423066e-06,
"loss": 4.7923,
"step": 15100
},
{
"epoch": 0.12,
"grad_norm": 0.7050272822380066,
"learning_rate": 8.75602550147722e-06,
"loss": 4.7977,
"step": 15200
},
{
"epoch": 0.13,
"grad_norm": 4.6306023597717285,
"learning_rate": 8.747841458723781e-06,
"loss": 4.7911,
"step": 15300
},
{
"epoch": 0.13,
"grad_norm": 2.3503997325897217,
"learning_rate": 8.739657415970341e-06,
"loss": 4.7895,
"step": 15400
},
{
"epoch": 0.13,
"grad_norm": 4.242427825927734,
"learning_rate": 8.731473373216901e-06,
"loss": 4.7902,
"step": 15500
},
{
"epoch": 0.13,
"grad_norm": 0.8283806443214417,
"learning_rate": 8.723289330463463e-06,
"loss": 4.7938,
"step": 15600
},
{
"epoch": 0.13,
"grad_norm": 3.901630401611328,
"learning_rate": 8.715105287710024e-06,
"loss": 4.7936,
"step": 15700
},
{
"epoch": 0.13,
"grad_norm": 6.563976764678955,
"learning_rate": 8.706921244956584e-06,
"loss": 4.7907,
"step": 15800
},
{
"epoch": 0.13,
"grad_norm": 2.2243263721466064,
"learning_rate": 8.698737202203144e-06,
"loss": 4.7852,
"step": 15900
},
{
"epoch": 0.13,
"grad_norm": 2.0529608726501465,
"learning_rate": 8.690553159449706e-06,
"loss": 4.7916,
"step": 16000
},
{
"epoch": 0.13,
"grad_norm": 3.0728704929351807,
"learning_rate": 8.682369116696266e-06,
"loss": 4.7947,
"step": 16100
},
{
"epoch": 0.13,
"grad_norm": 0.8949060440063477,
"learning_rate": 8.674185073942827e-06,
"loss": 4.7935,
"step": 16200
},
{
"epoch": 0.13,
"grad_norm": 0.9687060713768005,
"learning_rate": 8.666001031189387e-06,
"loss": 4.7939,
"step": 16300
},
{
"epoch": 0.13,
"grad_norm": 1.091792345046997,
"learning_rate": 8.657816988435949e-06,
"loss": 4.7941,
"step": 16400
},
{
"epoch": 0.14,
"grad_norm": 0.7492835521697998,
"learning_rate": 8.64963294568251e-06,
"loss": 4.7897,
"step": 16500
},
{
"epoch": 0.14,
"grad_norm": 1.856666088104248,
"learning_rate": 8.64144890292907e-06,
"loss": 4.7917,
"step": 16600
},
{
"epoch": 0.14,
"grad_norm": 2.1373181343078613,
"learning_rate": 8.633264860175631e-06,
"loss": 4.7875,
"step": 16700
},
{
"epoch": 0.14,
"grad_norm": 3.3695366382598877,
"learning_rate": 8.625080817422192e-06,
"loss": 4.7935,
"step": 16800
},
{
"epoch": 0.14,
"grad_norm": 3.0197083950042725,
"learning_rate": 8.616896774668752e-06,
"loss": 4.7896,
"step": 16900
},
{
"epoch": 0.14,
"grad_norm": 2.717109441757202,
"learning_rate": 8.608712731915312e-06,
"loss": 4.7898,
"step": 17000
},
{
"epoch": 0.14,
"grad_norm": 2.4040961265563965,
"learning_rate": 8.600528689161873e-06,
"loss": 4.7919,
"step": 17100
},
{
"epoch": 0.14,
"grad_norm": 7.917181968688965,
"learning_rate": 8.592344646408434e-06,
"loss": 4.7904,
"step": 17200
},
{
"epoch": 0.14,
"grad_norm": 1.2932788133621216,
"learning_rate": 8.584160603654995e-06,
"loss": 4.7856,
"step": 17300
},
{
"epoch": 0.14,
"grad_norm": 4.480923175811768,
"learning_rate": 8.575976560901555e-06,
"loss": 4.7952,
"step": 17400
},
{
"epoch": 0.14,
"grad_norm": 1.211406946182251,
"learning_rate": 8.567792518148115e-06,
"loss": 4.7903,
"step": 17500
},
{
"epoch": 0.14,
"grad_norm": 1.0489588975906372,
"learning_rate": 8.559608475394676e-06,
"loss": 4.7941,
"step": 17600
},
{
"epoch": 0.14,
"grad_norm": 3.5961625576019287,
"learning_rate": 8.551424432641236e-06,
"loss": 4.7845,
"step": 17700
},
{
"epoch": 0.15,
"grad_norm": 1.3053170442581177,
"learning_rate": 8.543240389887798e-06,
"loss": 4.7919,
"step": 17800
},
{
"epoch": 0.15,
"grad_norm": 0.7930333614349365,
"learning_rate": 8.535056347134358e-06,
"loss": 4.7893,
"step": 17900
},
{
"epoch": 0.15,
"grad_norm": 0.9989149570465088,
"learning_rate": 8.526872304380918e-06,
"loss": 4.7879,
"step": 18000
},
{
"epoch": 0.15,
"grad_norm": 7.412938594818115,
"learning_rate": 8.518688261627479e-06,
"loss": 4.7876,
"step": 18100
},
{
"epoch": 0.15,
"grad_norm": 0.9795100688934326,
"learning_rate": 8.51050421887404e-06,
"loss": 4.7895,
"step": 18200
},
{
"epoch": 0.15,
"grad_norm": 1.8868602514266968,
"learning_rate": 8.5023201761206e-06,
"loss": 4.7895,
"step": 18300
},
{
"epoch": 0.15,
"grad_norm": 0.9831721186637878,
"learning_rate": 8.494136133367161e-06,
"loss": 4.7874,
"step": 18400
},
{
"epoch": 0.15,
"grad_norm": 3.5411503314971924,
"learning_rate": 8.485952090613721e-06,
"loss": 4.786,
"step": 18500
},
{
"epoch": 0.15,
"grad_norm": 2.617466926574707,
"learning_rate": 8.477768047860283e-06,
"loss": 4.7832,
"step": 18600
},
{
"epoch": 0.15,
"grad_norm": 2.1326940059661865,
"learning_rate": 8.469584005106844e-06,
"loss": 4.789,
"step": 18700
},
{
"epoch": 0.15,
"grad_norm": 2.020888328552246,
"learning_rate": 8.461399962353404e-06,
"loss": 4.7867,
"step": 18800
},
{
"epoch": 0.15,
"grad_norm": 2.366783618927002,
"learning_rate": 8.453215919599966e-06,
"loss": 4.7871,
"step": 18900
},
{
"epoch": 0.16,
"grad_norm": 2.739832878112793,
"learning_rate": 8.445031876846526e-06,
"loss": 4.7829,
"step": 19000
},
{
"epoch": 0.16,
"grad_norm": 1.9564040899276733,
"learning_rate": 8.436847834093086e-06,
"loss": 4.7901,
"step": 19100
},
{
"epoch": 0.16,
"grad_norm": 4.063528537750244,
"learning_rate": 8.428663791339647e-06,
"loss": 4.7839,
"step": 19200
},
{
"epoch": 0.16,
"grad_norm": 1.9199309349060059,
"learning_rate": 8.420479748586209e-06,
"loss": 4.7919,
"step": 19300
},
{
"epoch": 0.16,
"grad_norm": 0.8330548405647278,
"learning_rate": 8.412295705832769e-06,
"loss": 4.7877,
"step": 19400
},
{
"epoch": 0.16,
"grad_norm": 2.459280490875244,
"learning_rate": 8.404111663079329e-06,
"loss": 4.7854,
"step": 19500
},
{
"epoch": 0.16,
"grad_norm": 5.8329572677612305,
"learning_rate": 8.39592762032589e-06,
"loss": 4.7864,
"step": 19600
},
{
"epoch": 0.16,
"grad_norm": 1.6338814496994019,
"learning_rate": 8.38774357757245e-06,
"loss": 4.7855,
"step": 19700
},
{
"epoch": 0.16,
"grad_norm": 0.8343069553375244,
"learning_rate": 8.37955953481901e-06,
"loss": 4.786,
"step": 19800
},
{
"epoch": 0.16,
"grad_norm": 1.5812463760375977,
"learning_rate": 8.37137549206557e-06,
"loss": 4.784,
"step": 19900
},
{
"epoch": 0.16,
"grad_norm": 1.3143149614334106,
"learning_rate": 8.363191449312132e-06,
"loss": 4.7761,
"step": 20000
},
{
"epoch": 0.16,
"grad_norm": 2.27919340133667,
"learning_rate": 8.355007406558692e-06,
"loss": 4.7841,
"step": 20100
},
{
"epoch": 0.17,
"grad_norm": 2.111786365509033,
"learning_rate": 8.346823363805253e-06,
"loss": 4.7902,
"step": 20200
},
{
"epoch": 0.17,
"grad_norm": 1.4459553956985474,
"learning_rate": 8.338639321051813e-06,
"loss": 4.7827,
"step": 20300
},
{
"epoch": 0.17,
"grad_norm": 5.489434242248535,
"learning_rate": 8.330455278298375e-06,
"loss": 4.7804,
"step": 20400
},
{
"epoch": 0.17,
"grad_norm": 0.8438489437103271,
"learning_rate": 8.322271235544935e-06,
"loss": 4.7826,
"step": 20500
},
{
"epoch": 0.17,
"grad_norm": 0.8289409875869751,
"learning_rate": 8.314087192791495e-06,
"loss": 4.788,
"step": 20600
},
{
"epoch": 0.17,
"grad_norm": 2.2698094844818115,
"learning_rate": 8.305903150038056e-06,
"loss": 4.785,
"step": 20700
},
{
"epoch": 0.17,
"grad_norm": 6.737819194793701,
"learning_rate": 8.297719107284618e-06,
"loss": 4.7879,
"step": 20800
},
{
"epoch": 0.17,
"grad_norm": 1.1006556749343872,
"learning_rate": 8.289535064531178e-06,
"loss": 4.7861,
"step": 20900
},
{
"epoch": 0.17,
"grad_norm": 0.757918119430542,
"learning_rate": 8.281351021777738e-06,
"loss": 4.7838,
"step": 21000
},
{
"epoch": 0.17,
"grad_norm": 1.443129539489746,
"learning_rate": 8.273166979024298e-06,
"loss": 4.7815,
"step": 21100
},
{
"epoch": 0.17,
"grad_norm": 1.2434520721435547,
"learning_rate": 8.26498293627086e-06,
"loss": 4.7837,
"step": 21200
},
{
"epoch": 0.17,
"grad_norm": 0.9488204717636108,
"learning_rate": 8.25679889351742e-06,
"loss": 4.7826,
"step": 21300
},
{
"epoch": 0.18,
"grad_norm": 1.4644496440887451,
"learning_rate": 8.248614850763981e-06,
"loss": 4.7805,
"step": 21400
},
{
"epoch": 0.18,
"grad_norm": 1.1131641864776611,
"learning_rate": 8.240430808010543e-06,
"loss": 4.7848,
"step": 21500
},
{
"epoch": 0.18,
"grad_norm": 0.9792927503585815,
"learning_rate": 8.232246765257103e-06,
"loss": 4.7844,
"step": 21600
},
{
"epoch": 0.18,
"grad_norm": 6.909112453460693,
"learning_rate": 8.224062722503663e-06,
"loss": 4.7791,
"step": 21700
},
{
"epoch": 0.18,
"grad_norm": 1.1648467779159546,
"learning_rate": 8.215878679750224e-06,
"loss": 4.7834,
"step": 21800
},
{
"epoch": 0.18,
"grad_norm": 1.4722325801849365,
"learning_rate": 8.207694636996784e-06,
"loss": 4.7775,
"step": 21900
},
{
"epoch": 0.18,
"grad_norm": 0.8322229385375977,
"learning_rate": 8.199510594243344e-06,
"loss": 4.7848,
"step": 22000
},
{
"epoch": 0.18,
"grad_norm": 1.4244656562805176,
"learning_rate": 8.191326551489906e-06,
"loss": 4.7784,
"step": 22100
},
{
"epoch": 0.18,
"grad_norm": 3.1128671169281006,
"learning_rate": 8.183142508736466e-06,
"loss": 4.7819,
"step": 22200
},
{
"epoch": 0.18,
"grad_norm": 2.6106808185577393,
"learning_rate": 8.174958465983027e-06,
"loss": 4.7791,
"step": 22300
},
{
"epoch": 0.18,
"grad_norm": 5.204963207244873,
"learning_rate": 8.166774423229587e-06,
"loss": 4.7841,
"step": 22400
},
{
"epoch": 0.18,
"grad_norm": 0.9447894096374512,
"learning_rate": 8.158590380476147e-06,
"loss": 4.7776,
"step": 22500
},
{
"epoch": 0.18,
"grad_norm": 1.2476049661636353,
"learning_rate": 8.15040633772271e-06,
"loss": 4.7759,
"step": 22600
},
{
"epoch": 0.19,
"grad_norm": 0.7892690896987915,
"learning_rate": 8.14222229496927e-06,
"loss": 4.7797,
"step": 22700
},
{
"epoch": 0.19,
"grad_norm": 2.874403476715088,
"learning_rate": 8.13403825221583e-06,
"loss": 4.7807,
"step": 22800
},
{
"epoch": 0.19,
"grad_norm": 2.8661632537841797,
"learning_rate": 8.12585420946239e-06,
"loss": 4.7772,
"step": 22900
},
{
"epoch": 0.19,
"grad_norm": 0.8099445104598999,
"learning_rate": 8.117670166708952e-06,
"loss": 4.7814,
"step": 23000
},
{
"epoch": 0.19,
"grad_norm": 1.0212429761886597,
"learning_rate": 8.109486123955512e-06,
"loss": 4.7802,
"step": 23100
},
{
"epoch": 0.19,
"grad_norm": 1.0928311347961426,
"learning_rate": 8.101302081202072e-06,
"loss": 4.777,
"step": 23200
},
{
"epoch": 0.19,
"grad_norm": 1.8933788537979126,
"learning_rate": 8.093118038448633e-06,
"loss": 4.7832,
"step": 23300
},
{
"epoch": 0.19,
"grad_norm": 0.9154900908470154,
"learning_rate": 8.084933995695195e-06,
"loss": 4.7854,
"step": 23400
},
{
"epoch": 0.19,
"grad_norm": 4.510463714599609,
"learning_rate": 8.076749952941755e-06,
"loss": 4.7869,
"step": 23500
},
{
"epoch": 0.19,
"grad_norm": 2.9989612102508545,
"learning_rate": 8.068565910188315e-06,
"loss": 4.785,
"step": 23600
},
{
"epoch": 0.19,
"grad_norm": 1.1799613237380981,
"learning_rate": 8.060381867434877e-06,
"loss": 4.7845,
"step": 23700
},
{
"epoch": 0.19,
"grad_norm": 0.8453476428985596,
"learning_rate": 8.052197824681437e-06,
"loss": 4.7849,
"step": 23800
},
{
"epoch": 0.2,
"grad_norm": 1.3734474182128906,
"learning_rate": 8.044013781927998e-06,
"loss": 4.7799,
"step": 23900
},
{
"epoch": 0.2,
"grad_norm": 7.692543983459473,
"learning_rate": 8.035829739174558e-06,
"loss": 4.7787,
"step": 24000
},
{
"epoch": 0.2,
"grad_norm": 4.369629859924316,
"learning_rate": 8.027645696421118e-06,
"loss": 4.779,
"step": 24100
},
{
"epoch": 0.2,
"grad_norm": 1.2958143949508667,
"learning_rate": 8.01946165366768e-06,
"loss": 4.7801,
"step": 24200
},
{
"epoch": 0.2,
"grad_norm": 0.838066041469574,
"learning_rate": 8.01127761091424e-06,
"loss": 4.78,
"step": 24300
},
{
"epoch": 0.2,
"grad_norm": 2.3854634761810303,
"learning_rate": 8.0030935681608e-06,
"loss": 4.7797,
"step": 24400
},
{
"epoch": 0.2,
"grad_norm": 1.1053544282913208,
"learning_rate": 7.994909525407361e-06,
"loss": 4.7816,
"step": 24500
},
{
"epoch": 0.2,
"grad_norm": 1.4308538436889648,
"learning_rate": 7.986725482653921e-06,
"loss": 4.7819,
"step": 24600
},
{
"epoch": 0.2,
"grad_norm": 3.392876148223877,
"learning_rate": 7.978541439900482e-06,
"loss": 4.7782,
"step": 24700
},
{
"epoch": 0.2,
"grad_norm": 1.5023071765899658,
"learning_rate": 7.970357397147044e-06,
"loss": 4.7828,
"step": 24800
},
{
"epoch": 0.2,
"grad_norm": 0.7376088500022888,
"learning_rate": 7.962173354393604e-06,
"loss": 4.7816,
"step": 24900
},
{
"epoch": 0.2,
"grad_norm": 3.311122417449951,
"learning_rate": 7.953989311640164e-06,
"loss": 4.7819,
"step": 25000
},
{
"epoch": 0.21,
"grad_norm": 1.5772916078567505,
"learning_rate": 7.945805268886724e-06,
"loss": 4.7779,
"step": 25100
},
{
"epoch": 0.21,
"grad_norm": 0.9813963174819946,
"learning_rate": 7.937621226133286e-06,
"loss": 4.781,
"step": 25200
},
{
"epoch": 0.21,
"grad_norm": 0.7451574802398682,
"learning_rate": 7.929437183379847e-06,
"loss": 4.7817,
"step": 25300
},
{
"epoch": 0.21,
"grad_norm": 1.064371943473816,
"learning_rate": 7.921253140626407e-06,
"loss": 4.7794,
"step": 25400
},
{
"epoch": 0.21,
"grad_norm": 1.4085423946380615,
"learning_rate": 7.913069097872967e-06,
"loss": 4.7838,
"step": 25500
},
{
"epoch": 0.21,
"grad_norm": 0.9586715698242188,
"learning_rate": 7.904885055119529e-06,
"loss": 4.7799,
"step": 25600
},
{
"epoch": 0.21,
"grad_norm": 2.1608307361602783,
"learning_rate": 7.89670101236609e-06,
"loss": 4.7803,
"step": 25700
},
{
"epoch": 0.21,
"grad_norm": 0.8091859221458435,
"learning_rate": 7.88851696961265e-06,
"loss": 4.784,
"step": 25800
},
{
"epoch": 0.21,
"grad_norm": 0.7846326231956482,
"learning_rate": 7.880332926859212e-06,
"loss": 4.778,
"step": 25900
},
{
"epoch": 0.21,
"grad_norm": 2.0327844619750977,
"learning_rate": 7.872148884105772e-06,
"loss": 4.7825,
"step": 26000
},
{
"epoch": 0.21,
"grad_norm": 1.0779097080230713,
"learning_rate": 7.863964841352332e-06,
"loss": 4.7765,
"step": 26100
},
{
"epoch": 0.21,
"grad_norm": 1.2201029062271118,
"learning_rate": 7.855780798598892e-06,
"loss": 4.776,
"step": 26200
},
{
"epoch": 0.22,
"grad_norm": 1.4344494342803955,
"learning_rate": 7.847596755845454e-06,
"loss": 4.7781,
"step": 26300
},
{
"epoch": 0.22,
"grad_norm": 1.114273190498352,
"learning_rate": 7.839412713092015e-06,
"loss": 4.7764,
"step": 26400
},
{
"epoch": 0.22,
"grad_norm": 0.8567458391189575,
"learning_rate": 7.831228670338575e-06,
"loss": 4.7831,
"step": 26500
},
{
"epoch": 0.22,
"grad_norm": 4.325285911560059,
"learning_rate": 7.823044627585135e-06,
"loss": 4.7811,
"step": 26600
},
{
"epoch": 0.22,
"grad_norm": 1.2828031778335571,
"learning_rate": 7.814860584831695e-06,
"loss": 4.7795,
"step": 26700
},
{
"epoch": 0.22,
"grad_norm": 2.048180341720581,
"learning_rate": 7.806676542078256e-06,
"loss": 4.7782,
"step": 26800
},
{
"epoch": 0.22,
"grad_norm": 0.9398120045661926,
"learning_rate": 7.798492499324818e-06,
"loss": 4.778,
"step": 26900
},
{
"epoch": 0.22,
"grad_norm": 0.8492949604988098,
"learning_rate": 7.790308456571378e-06,
"loss": 4.776,
"step": 27000
},
{
"epoch": 0.22,
"grad_norm": 1.9857730865478516,
"learning_rate": 7.782124413817938e-06,
"loss": 4.7741,
"step": 27100
},
{
"epoch": 0.22,
"grad_norm": 1.0787758827209473,
"learning_rate": 7.773940371064498e-06,
"loss": 4.7738,
"step": 27200
},
{
"epoch": 0.22,
"grad_norm": 1.2202094793319702,
"learning_rate": 7.765756328311059e-06,
"loss": 4.777,
"step": 27300
},
{
"epoch": 0.22,
"grad_norm": 6.543772220611572,
"learning_rate": 7.75757228555762e-06,
"loss": 4.778,
"step": 27400
},
{
"epoch": 0.23,
"grad_norm": 0.9749574065208435,
"learning_rate": 7.749388242804181e-06,
"loss": 4.7761,
"step": 27500
},
{
"epoch": 0.23,
"grad_norm": 1.2425750494003296,
"learning_rate": 7.741204200050741e-06,
"loss": 4.7799,
"step": 27600
},
{
"epoch": 0.23,
"grad_norm": 2.1919734477996826,
"learning_rate": 7.733020157297301e-06,
"loss": 4.7781,
"step": 27700
},
{
"epoch": 0.23,
"grad_norm": 1.6858105659484863,
"learning_rate": 7.724836114543863e-06,
"loss": 4.7761,
"step": 27800
},
{
"epoch": 0.23,
"grad_norm": 1.082306146621704,
"learning_rate": 7.716652071790424e-06,
"loss": 4.7727,
"step": 27900
},
{
"epoch": 0.23,
"grad_norm": 1.4394657611846924,
"learning_rate": 7.708468029036984e-06,
"loss": 4.7773,
"step": 28000
},
{
"epoch": 0.23,
"grad_norm": 3.90745210647583,
"learning_rate": 7.700283986283546e-06,
"loss": 4.7771,
"step": 28100
},
{
"epoch": 0.23,
"grad_norm": 1.1074409484863281,
"learning_rate": 7.692099943530106e-06,
"loss": 4.7813,
"step": 28200
},
{
"epoch": 0.23,
"grad_norm": 0.775972306728363,
"learning_rate": 7.683915900776666e-06,
"loss": 4.7793,
"step": 28300
},
{
"epoch": 0.23,
"grad_norm": 0.8545662760734558,
"learning_rate": 7.675731858023227e-06,
"loss": 4.7745,
"step": 28400
},
{
"epoch": 0.23,
"grad_norm": 0.9553655385971069,
"learning_rate": 7.667547815269789e-06,
"loss": 4.7741,
"step": 28500
},
{
"epoch": 0.23,
"grad_norm": 0.9458415508270264,
"learning_rate": 7.659363772516349e-06,
"loss": 4.7757,
"step": 28600
},
{
"epoch": 0.23,
"grad_norm": 1.3447215557098389,
"learning_rate": 7.65117972976291e-06,
"loss": 4.7721,
"step": 28700
},
{
"epoch": 0.24,
"grad_norm": 0.8614388108253479,
"learning_rate": 7.64299568700947e-06,
"loss": 4.7728,
"step": 28800
},
{
"epoch": 0.24,
"grad_norm": 1.083814263343811,
"learning_rate": 7.63481164425603e-06,
"loss": 4.7714,
"step": 28900
},
{
"epoch": 0.24,
"grad_norm": 0.9633229374885559,
"learning_rate": 7.626627601502591e-06,
"loss": 4.7736,
"step": 29000
},
{
"epoch": 0.24,
"grad_norm": 1.1467323303222656,
"learning_rate": 7.618443558749151e-06,
"loss": 4.7771,
"step": 29100
},
{
"epoch": 0.24,
"grad_norm": 2.2095165252685547,
"learning_rate": 7.610259515995713e-06,
"loss": 4.7712,
"step": 29200
},
{
"epoch": 0.24,
"grad_norm": 0.9931008815765381,
"learning_rate": 7.602075473242273e-06,
"loss": 4.7776,
"step": 29300
},
{
"epoch": 0.24,
"grad_norm": 12.287310600280762,
"learning_rate": 7.593891430488834e-06,
"loss": 4.7797,
"step": 29400
},
{
"epoch": 0.24,
"grad_norm": 3.100107192993164,
"learning_rate": 7.585707387735394e-06,
"loss": 4.7759,
"step": 29500
},
{
"epoch": 0.24,
"grad_norm": 2.249577045440674,
"learning_rate": 7.577523344981955e-06,
"loss": 4.7708,
"step": 29600
},
{
"epoch": 0.24,
"grad_norm": 3.1430749893188477,
"learning_rate": 7.569339302228515e-06,
"loss": 4.7742,
"step": 29700
},
{
"epoch": 0.24,
"grad_norm": 0.9823663830757141,
"learning_rate": 7.5611552594750755e-06,
"loss": 4.7739,
"step": 29800
},
{
"epoch": 0.24,
"grad_norm": 4.1920166015625,
"learning_rate": 7.552971216721637e-06,
"loss": 4.778,
"step": 29900
},
{
"epoch": 0.25,
"grad_norm": 2.579256534576416,
"learning_rate": 7.544787173968198e-06,
"loss": 4.7784,
"step": 30000
},
{
"epoch": 0.25,
"grad_norm": 1.4403995275497437,
"learning_rate": 7.536603131214758e-06,
"loss": 4.7701,
"step": 30100
},
{
"epoch": 0.25,
"grad_norm": 3.48757266998291,
"learning_rate": 7.528419088461318e-06,
"loss": 4.7705,
"step": 30200
},
{
"epoch": 0.25,
"grad_norm": 4.121363162994385,
"learning_rate": 7.52023504570788e-06,
"loss": 4.7704,
"step": 30300
},
{
"epoch": 0.25,
"grad_norm": 1.439936876296997,
"learning_rate": 7.5120510029544405e-06,
"loss": 4.7711,
"step": 30400
},
{
"epoch": 0.25,
"grad_norm": 1.546036720275879,
"learning_rate": 7.503866960201001e-06,
"loss": 4.7691,
"step": 30500
},
{
"epoch": 0.25,
"grad_norm": 5.736770153045654,
"learning_rate": 7.495682917447561e-06,
"loss": 4.7802,
"step": 30600
},
{
"epoch": 0.25,
"grad_norm": 1.7358049154281616,
"learning_rate": 7.487498874694122e-06,
"loss": 4.771,
"step": 30700
},
{
"epoch": 0.25,
"grad_norm": 0.9707129001617432,
"learning_rate": 7.479314831940682e-06,
"loss": 4.7691,
"step": 30800
},
{
"epoch": 0.25,
"grad_norm": 2.6308770179748535,
"learning_rate": 7.471130789187243e-06,
"loss": 4.7706,
"step": 30900
},
{
"epoch": 0.25,
"grad_norm": 2.7046618461608887,
"learning_rate": 7.462946746433804e-06,
"loss": 4.7767,
"step": 31000
},
{
"epoch": 0.25,
"grad_norm": 1.627031922340393,
"learning_rate": 7.454762703680365e-06,
"loss": 4.7743,
"step": 31100
},
{
"epoch": 0.26,
"grad_norm": 1.203337550163269,
"learning_rate": 7.446578660926925e-06,
"loss": 4.7764,
"step": 31200
},
{
"epoch": 0.26,
"grad_norm": 1.2097506523132324,
"learning_rate": 7.4383946181734854e-06,
"loss": 4.7711,
"step": 31300
},
{
"epoch": 0.26,
"grad_norm": 1.1853944063186646,
"learning_rate": 7.430210575420047e-06,
"loss": 4.7699,
"step": 31400
},
{
"epoch": 0.26,
"grad_norm": 0.8438019752502441,
"learning_rate": 7.422026532666608e-06,
"loss": 4.7732,
"step": 31500
},
{
"epoch": 0.26,
"grad_norm": 1.5300862789154053,
"learning_rate": 7.413842489913168e-06,
"loss": 4.7747,
"step": 31600
},
{
"epoch": 0.26,
"grad_norm": 1.4141496419906616,
"learning_rate": 7.405658447159728e-06,
"loss": 4.7711,
"step": 31700
},
{
"epoch": 0.26,
"grad_norm": 1.673567533493042,
"learning_rate": 7.397474404406289e-06,
"loss": 4.777,
"step": 31800
},
{
"epoch": 0.26,
"grad_norm": 2.1357908248901367,
"learning_rate": 7.38929036165285e-06,
"loss": 4.7723,
"step": 31900
},
{
"epoch": 0.26,
"grad_norm": 4.316195964813232,
"learning_rate": 7.381106318899411e-06,
"loss": 4.7678,
"step": 32000
},
{
"epoch": 0.26,
"grad_norm": 1.7443231344223022,
"learning_rate": 7.372922276145971e-06,
"loss": 4.7707,
"step": 32100
},
{
"epoch": 0.26,
"grad_norm": 2.7221803665161133,
"learning_rate": 7.364738233392532e-06,
"loss": 4.7708,
"step": 32200
},
{
"epoch": 0.26,
"grad_norm": 0.8770394921302795,
"learning_rate": 7.356554190639092e-06,
"loss": 4.7721,
"step": 32300
},
{
"epoch": 0.27,
"grad_norm": 0.9764662384986877,
"learning_rate": 7.348370147885653e-06,
"loss": 4.7756,
"step": 32400
},
{
"epoch": 0.27,
"grad_norm": 3.6692917346954346,
"learning_rate": 7.3401861051322146e-06,
"loss": 4.7773,
"step": 32500
},
{
"epoch": 0.27,
"grad_norm": 1.8451597690582275,
"learning_rate": 7.332002062378775e-06,
"loss": 4.7742,
"step": 32600
},
{
"epoch": 0.27,
"grad_norm": 0.9092561602592468,
"learning_rate": 7.323818019625335e-06,
"loss": 4.7666,
"step": 32700
},
{
"epoch": 0.27,
"grad_norm": 0.9218481779098511,
"learning_rate": 7.315633976871895e-06,
"loss": 4.7717,
"step": 32800
},
{
"epoch": 0.27,
"grad_norm": 0.8461436033248901,
"learning_rate": 7.3074499341184565e-06,
"loss": 4.7752,
"step": 32900
},
{
"epoch": 0.27,
"grad_norm": 0.879395067691803,
"learning_rate": 7.299265891365017e-06,
"loss": 4.7736,
"step": 33000
},
{
"epoch": 0.27,
"grad_norm": 0.9056565165519714,
"learning_rate": 7.291081848611578e-06,
"loss": 4.7671,
"step": 33100
},
{
"epoch": 0.27,
"grad_norm": 2.0318729877471924,
"learning_rate": 7.282897805858138e-06,
"loss": 4.7672,
"step": 33200
},
{
"epoch": 0.27,
"grad_norm": 1.0115817785263062,
"learning_rate": 7.274713763104699e-06,
"loss": 4.7782,
"step": 33300
},
{
"epoch": 0.27,
"grad_norm": 1.3754358291625977,
"learning_rate": 7.2665297203512595e-06,
"loss": 4.7711,
"step": 33400
},
{
"epoch": 0.27,
"grad_norm": 1.6245019435882568,
"learning_rate": 7.25834567759782e-06,
"loss": 4.7716,
"step": 33500
},
{
"epoch": 0.27,
"grad_norm": 1.5871042013168335,
"learning_rate": 7.250161634844382e-06,
"loss": 4.7689,
"step": 33600
},
{
"epoch": 0.28,
"grad_norm": 1.6681900024414062,
"learning_rate": 7.241977592090942e-06,
"loss": 4.7734,
"step": 33700
},
{
"epoch": 0.28,
"grad_norm": 1.2739856243133545,
"learning_rate": 7.233793549337502e-06,
"loss": 4.7746,
"step": 33800
},
{
"epoch": 0.28,
"grad_norm": 1.8256975412368774,
"learning_rate": 7.2256095065840625e-06,
"loss": 4.7683,
"step": 33900
},
{
"epoch": 0.28,
"grad_norm": 2.5301151275634766,
"learning_rate": 7.217425463830624e-06,
"loss": 4.7715,
"step": 34000
},
{
"epoch": 0.28,
"grad_norm": 1.9835234880447388,
"learning_rate": 7.209241421077185e-06,
"loss": 4.7675,
"step": 34100
},
{
"epoch": 0.28,
"grad_norm": 1.1152873039245605,
"learning_rate": 7.201057378323745e-06,
"loss": 4.767,
"step": 34200
},
{
"epoch": 0.28,
"grad_norm": 4.025820732116699,
"learning_rate": 7.192873335570305e-06,
"loss": 4.7739,
"step": 34300
},
{
"epoch": 0.28,
"grad_norm": 4.386086463928223,
"learning_rate": 7.184689292816866e-06,
"loss": 4.7716,
"step": 34400
},
{
"epoch": 0.28,
"grad_norm": 3.9003477096557617,
"learning_rate": 7.176505250063427e-06,
"loss": 4.7691,
"step": 34500
},
{
"epoch": 0.28,
"grad_norm": 1.3681395053863525,
"learning_rate": 7.168321207309987e-06,
"loss": 4.7747,
"step": 34600
},
{
"epoch": 0.28,
"grad_norm": 12.061843872070312,
"learning_rate": 7.160137164556549e-06,
"loss": 4.7714,
"step": 34700
},
{
"epoch": 0.28,
"grad_norm": 1.0302884578704834,
"learning_rate": 7.151953121803109e-06,
"loss": 4.7711,
"step": 34800
},
{
"epoch": 0.29,
"grad_norm": 1.5776879787445068,
"learning_rate": 7.1437690790496694e-06,
"loss": 4.7682,
"step": 34900
},
{
"epoch": 0.29,
"grad_norm": 1.52390718460083,
"learning_rate": 7.13558503629623e-06,
"loss": 4.7712,
"step": 35000
},
{
"epoch": 0.29,
"grad_norm": 2.443223237991333,
"learning_rate": 7.127400993542791e-06,
"loss": 4.7696,
"step": 35100
},
{
"epoch": 0.29,
"grad_norm": 1.4894499778747559,
"learning_rate": 7.119216950789352e-06,
"loss": 4.7739,
"step": 35200
},
{
"epoch": 0.29,
"grad_norm": 1.0729478597640991,
"learning_rate": 7.111032908035912e-06,
"loss": 4.7728,
"step": 35300
},
{
"epoch": 0.29,
"grad_norm": 1.0306514501571655,
"learning_rate": 7.1028488652824725e-06,
"loss": 4.7739,
"step": 35400
},
{
"epoch": 0.29,
"grad_norm": 2.0036280155181885,
"learning_rate": 7.094664822529034e-06,
"loss": 4.7714,
"step": 35500
},
{
"epoch": 0.29,
"grad_norm": 0.9095188975334167,
"learning_rate": 7.086480779775594e-06,
"loss": 4.7647,
"step": 35600
},
{
"epoch": 0.29,
"grad_norm": 1.0899231433868408,
"learning_rate": 7.078296737022154e-06,
"loss": 4.7637,
"step": 35700
},
{
"epoch": 0.29,
"grad_norm": 5.961045742034912,
"learning_rate": 7.070112694268716e-06,
"loss": 4.7722,
"step": 35800
},
{
"epoch": 0.29,
"grad_norm": 0.9270179867744446,
"learning_rate": 7.061928651515276e-06,
"loss": 4.7682,
"step": 35900
},
{
"epoch": 0.29,
"grad_norm": 1.7026699781417847,
"learning_rate": 7.053744608761837e-06,
"loss": 4.7697,
"step": 36000
},
{
"epoch": 0.3,
"grad_norm": 1.5568283796310425,
"learning_rate": 7.045560566008397e-06,
"loss": 4.7695,
"step": 36100
},
{
"epoch": 0.3,
"grad_norm": 4.148453712463379,
"learning_rate": 7.037376523254959e-06,
"loss": 4.7667,
"step": 36200
},
{
"epoch": 0.3,
"grad_norm": 2.0565524101257324,
"learning_rate": 7.029192480501519e-06,
"loss": 4.7728,
"step": 36300
},
{
"epoch": 0.3,
"grad_norm": 1.3457672595977783,
"learning_rate": 7.021008437748079e-06,
"loss": 4.7719,
"step": 36400
},
{
"epoch": 0.3,
"grad_norm": 1.2144137620925903,
"learning_rate": 7.01282439499464e-06,
"loss": 4.7714,
"step": 36500
},
{
"epoch": 0.3,
"grad_norm": 0.7697350978851318,
"learning_rate": 7.004640352241201e-06,
"loss": 4.7712,
"step": 36600
},
{
"epoch": 0.3,
"grad_norm": 0.9844958782196045,
"learning_rate": 6.996456309487761e-06,
"loss": 4.7723,
"step": 36700
},
{
"epoch": 0.3,
"grad_norm": 3.750422716140747,
"learning_rate": 6.988272266734321e-06,
"loss": 4.7717,
"step": 36800
},
{
"epoch": 0.3,
"grad_norm": 2.6416289806365967,
"learning_rate": 6.980088223980883e-06,
"loss": 4.7696,
"step": 36900
},
{
"epoch": 0.3,
"grad_norm": 1.745283842086792,
"learning_rate": 6.9719041812274435e-06,
"loss": 4.7681,
"step": 37000
},
{
"epoch": 0.3,
"grad_norm": 1.1269015073776245,
"learning_rate": 6.963720138474004e-06,
"loss": 4.7659,
"step": 37100
},
{
"epoch": 0.3,
"grad_norm": 4.63193941116333,
"learning_rate": 6.955536095720564e-06,
"loss": 4.7676,
"step": 37200
},
{
"epoch": 0.31,
"grad_norm": 1.586788296699524,
"learning_rate": 6.947352052967126e-06,
"loss": 4.7729,
"step": 37300
},
{
"epoch": 0.31,
"grad_norm": 1.2502408027648926,
"learning_rate": 6.939168010213686e-06,
"loss": 4.7718,
"step": 37400
},
{
"epoch": 0.31,
"grad_norm": 1.620895504951477,
"learning_rate": 6.9309839674602465e-06,
"loss": 4.7703,
"step": 37500
},
{
"epoch": 0.31,
"grad_norm": 0.9904155731201172,
"learning_rate": 6.922799924706807e-06,
"loss": 4.77,
"step": 37600
},
{
"epoch": 0.31,
"grad_norm": 1.9440988302230835,
"learning_rate": 6.914615881953368e-06,
"loss": 4.7688,
"step": 37700
},
{
"epoch": 0.31,
"grad_norm": 2.9490699768066406,
"learning_rate": 6.906431839199928e-06,
"loss": 4.7718,
"step": 37800
},
{
"epoch": 0.31,
"grad_norm": 1.7235488891601562,
"learning_rate": 6.898247796446489e-06,
"loss": 4.7685,
"step": 37900
},
{
"epoch": 0.31,
"grad_norm": 1.076762318611145,
"learning_rate": 6.89006375369305e-06,
"loss": 4.7708,
"step": 38000
},
{
"epoch": 0.31,
"grad_norm": 0.8281159400939941,
"learning_rate": 6.881879710939611e-06,
"loss": 4.7697,
"step": 38100
},
{
"epoch": 0.31,
"grad_norm": 1.1257106065750122,
"learning_rate": 6.873695668186171e-06,
"loss": 4.7727,
"step": 38200
},
{
"epoch": 0.31,
"grad_norm": 4.7161760330200195,
"learning_rate": 6.865511625432731e-06,
"loss": 4.7695,
"step": 38300
},
{
"epoch": 0.31,
"grad_norm": 2.1901695728302,
"learning_rate": 6.857327582679293e-06,
"loss": 4.7712,
"step": 38400
},
{
"epoch": 0.32,
"grad_norm": 1.988418459892273,
"learning_rate": 6.8491435399258535e-06,
"loss": 4.7691,
"step": 38500
},
{
"epoch": 0.32,
"grad_norm": 1.4924893379211426,
"learning_rate": 6.840959497172414e-06,
"loss": 4.7705,
"step": 38600
},
{
"epoch": 0.32,
"grad_norm": 2.154937744140625,
"learning_rate": 6.832775454418974e-06,
"loss": 4.7652,
"step": 38700
},
{
"epoch": 0.32,
"grad_norm": 1.1525272130966187,
"learning_rate": 6.824591411665535e-06,
"loss": 4.7705,
"step": 38800
},
{
"epoch": 0.32,
"grad_norm": 0.9818894863128662,
"learning_rate": 6.816407368912095e-06,
"loss": 4.767,
"step": 38900
},
{
"epoch": 0.32,
"grad_norm": 4.012678146362305,
"learning_rate": 6.8082233261586565e-06,
"loss": 4.7657,
"step": 39000
},
{
"epoch": 0.32,
"grad_norm": 3.9307897090911865,
"learning_rate": 6.800039283405218e-06,
"loss": 4.7629,
"step": 39100
},
{
"epoch": 0.32,
"grad_norm": 2.6684534549713135,
"learning_rate": 6.791855240651778e-06,
"loss": 4.7695,
"step": 39200
},
{
"epoch": 0.32,
"grad_norm": 3.2453012466430664,
"learning_rate": 6.783671197898338e-06,
"loss": 4.7733,
"step": 39300
},
{
"epoch": 0.32,
"grad_norm": 1.1828510761260986,
"learning_rate": 6.775487155144898e-06,
"loss": 4.7696,
"step": 39400
},
{
"epoch": 0.32,
"grad_norm": 1.2635380029678345,
"learning_rate": 6.76730311239146e-06,
"loss": 4.7726,
"step": 39500
},
{
"epoch": 0.32,
"grad_norm": 0.8589321970939636,
"learning_rate": 6.759119069638021e-06,
"loss": 4.768,
"step": 39600
},
{
"epoch": 0.32,
"grad_norm": 1.4459155797958374,
"learning_rate": 6.750935026884581e-06,
"loss": 4.7664,
"step": 39700
},
{
"epoch": 0.33,
"grad_norm": 3.3713083267211914,
"learning_rate": 6.742750984131141e-06,
"loss": 4.7684,
"step": 39800
},
{
"epoch": 0.33,
"grad_norm": 3.4334757328033447,
"learning_rate": 6.734566941377702e-06,
"loss": 4.767,
"step": 39900
},
{
"epoch": 0.33,
"grad_norm": 1.072906255722046,
"learning_rate": 6.726382898624263e-06,
"loss": 4.7623,
"step": 40000
},
{
"epoch": 0.33,
"grad_norm": 1.8414703607559204,
"learning_rate": 6.718198855870824e-06,
"loss": 4.7657,
"step": 40100
},
{
"epoch": 0.33,
"grad_norm": 1.423050045967102,
"learning_rate": 6.710014813117385e-06,
"loss": 4.7684,
"step": 40200
},
{
"epoch": 0.33,
"grad_norm": 1.1521214246749878,
"learning_rate": 6.701830770363945e-06,
"loss": 4.7682,
"step": 40300
},
{
"epoch": 0.33,
"grad_norm": 1.2841377258300781,
"learning_rate": 6.693646727610505e-06,
"loss": 4.7661,
"step": 40400
},
{
"epoch": 0.33,
"grad_norm": 2.815016508102417,
"learning_rate": 6.6854626848570656e-06,
"loss": 4.7724,
"step": 40500
},
{
"epoch": 0.33,
"grad_norm": 0.9150479435920715,
"learning_rate": 6.6772786421036275e-06,
"loss": 4.7667,
"step": 40600
},
{
"epoch": 0.33,
"grad_norm": 1.2387914657592773,
"learning_rate": 6.669094599350188e-06,
"loss": 4.7695,
"step": 40700
},
{
"epoch": 0.33,
"grad_norm": 3.5044503211975098,
"learning_rate": 6.660910556596748e-06,
"loss": 4.7671,
"step": 40800
},
{
"epoch": 0.33,
"grad_norm": 1.1371254920959473,
"learning_rate": 6.652726513843308e-06,
"loss": 4.7652,
"step": 40900
},
{
"epoch": 0.34,
"grad_norm": 1.3063691854476929,
"learning_rate": 6.6445424710898694e-06,
"loss": 4.7674,
"step": 41000
},
{
"epoch": 0.34,
"grad_norm": 1.7822604179382324,
"learning_rate": 6.6363584283364306e-06,
"loss": 4.7692,
"step": 41100
},
{
"epoch": 0.34,
"grad_norm": 0.9057841300964355,
"learning_rate": 6.628174385582991e-06,
"loss": 4.7676,
"step": 41200
},
{
"epoch": 0.34,
"grad_norm": 1.9072014093399048,
"learning_rate": 6.619990342829552e-06,
"loss": 4.7625,
"step": 41300
},
{
"epoch": 0.34,
"grad_norm": 0.9912985563278198,
"learning_rate": 6.611806300076112e-06,
"loss": 4.7709,
"step": 41400
},
{
"epoch": 0.34,
"grad_norm": 2.278571605682373,
"learning_rate": 6.6036222573226725e-06,
"loss": 4.7725,
"step": 41500
},
{
"epoch": 0.34,
"grad_norm": 2.2209765911102295,
"learning_rate": 6.595438214569233e-06,
"loss": 4.7693,
"step": 41600
},
{
"epoch": 0.34,
"grad_norm": 1.4683972597122192,
"learning_rate": 6.587254171815795e-06,
"loss": 4.7715,
"step": 41700
},
{
"epoch": 0.34,
"grad_norm": 2.1982457637786865,
"learning_rate": 6.579070129062355e-06,
"loss": 4.7629,
"step": 41800
},
{
"epoch": 0.34,
"grad_norm": 3.916114568710327,
"learning_rate": 6.570886086308915e-06,
"loss": 4.7628,
"step": 41900
},
{
"epoch": 0.34,
"grad_norm": 4.219468116760254,
"learning_rate": 6.5627020435554755e-06,
"loss": 4.766,
"step": 42000
},
{
"epoch": 0.34,
"grad_norm": 1.3876959085464478,
"learning_rate": 6.5545180008020375e-06,
"loss": 4.7689,
"step": 42100
},
{
"epoch": 0.35,
"grad_norm": 4.21665620803833,
"learning_rate": 6.546333958048598e-06,
"loss": 4.767,
"step": 42200
},
{
"epoch": 0.35,
"grad_norm": 0.8205769658088684,
"learning_rate": 6.538149915295158e-06,
"loss": 4.7673,
"step": 42300
},
{
"epoch": 0.35,
"grad_norm": 2.474163293838501,
"learning_rate": 6.529965872541719e-06,
"loss": 4.7662,
"step": 42400
},
{
"epoch": 0.35,
"grad_norm": 2.4474871158599854,
"learning_rate": 6.521781829788279e-06,
"loss": 4.7672,
"step": 42500
},
{
"epoch": 0.35,
"grad_norm": 1.422839879989624,
"learning_rate": 6.51359778703484e-06,
"loss": 4.7639,
"step": 42600
},
{
"epoch": 0.35,
"grad_norm": 1.002898097038269,
"learning_rate": 6.5054137442814e-06,
"loss": 4.7707,
"step": 42700
},
{
"epoch": 0.35,
"grad_norm": 1.21475350856781,
"learning_rate": 6.497229701527962e-06,
"loss": 4.7643,
"step": 42800
},
{
"epoch": 0.35,
"grad_norm": 1.273319959640503,
"learning_rate": 6.489045658774522e-06,
"loss": 4.764,
"step": 42900
},
{
"epoch": 0.35,
"grad_norm": 1.963934063911438,
"learning_rate": 6.480861616021082e-06,
"loss": 4.7675,
"step": 43000
},
{
"epoch": 0.35,
"grad_norm": 2.1018524169921875,
"learning_rate": 6.472677573267643e-06,
"loss": 4.7722,
"step": 43100
},
{
"epoch": 0.35,
"grad_norm": 1.1742087602615356,
"learning_rate": 6.464493530514205e-06,
"loss": 4.7697,
"step": 43200
},
{
"epoch": 0.35,
"grad_norm": 1.3663380146026611,
"learning_rate": 6.456309487760765e-06,
"loss": 4.7654,
"step": 43300
},
{
"epoch": 0.36,
"grad_norm": 1.2402204275131226,
"learning_rate": 6.448125445007325e-06,
"loss": 4.7657,
"step": 43400
},
{
"epoch": 0.36,
"grad_norm": 1.8193938732147217,
"learning_rate": 6.439941402253886e-06,
"loss": 4.7631,
"step": 43500
},
{
"epoch": 0.36,
"grad_norm": 0.9568284749984741,
"learning_rate": 6.4317573595004465e-06,
"loss": 4.7687,
"step": 43600
},
{
"epoch": 0.36,
"grad_norm": 0.9550923109054565,
"learning_rate": 6.423573316747007e-06,
"loss": 4.766,
"step": 43700
},
{
"epoch": 0.36,
"grad_norm": 1.496541976928711,
"learning_rate": 6.415389273993567e-06,
"loss": 4.7674,
"step": 43800
},
{
"epoch": 0.36,
"grad_norm": 7.160891532897949,
"learning_rate": 6.407205231240129e-06,
"loss": 4.7667,
"step": 43900
},
{
"epoch": 0.36,
"grad_norm": 1.1409087181091309,
"learning_rate": 6.399021188486689e-06,
"loss": 4.7657,
"step": 44000
},
{
"epoch": 0.36,
"grad_norm": 1.3777464628219604,
"learning_rate": 6.3908371457332496e-06,
"loss": 4.7657,
"step": 44100
},
{
"epoch": 0.36,
"grad_norm": 1.171425223350525,
"learning_rate": 6.38265310297981e-06,
"loss": 4.7645,
"step": 44200
},
{
"epoch": 0.36,
"grad_norm": 1.4748115539550781,
"learning_rate": 6.374469060226372e-06,
"loss": 4.7622,
"step": 44300
},
{
"epoch": 0.36,
"grad_norm": 4.880084991455078,
"learning_rate": 6.366285017472932e-06,
"loss": 4.766,
"step": 44400
},
{
"epoch": 0.36,
"grad_norm": 1.557969331741333,
"learning_rate": 6.358100974719492e-06,
"loss": 4.7688,
"step": 44500
},
{
"epoch": 0.37,
"grad_norm": 2.078839063644409,
"learning_rate": 6.3499169319660534e-06,
"loss": 4.7652,
"step": 44600
},
{
"epoch": 0.37,
"grad_norm": 5.024000644683838,
"learning_rate": 6.341732889212614e-06,
"loss": 4.767,
"step": 44700
},
{
"epoch": 0.37,
"grad_norm": 1.7246519327163696,
"learning_rate": 6.333548846459174e-06,
"loss": 4.7631,
"step": 44800
},
{
"epoch": 0.37,
"grad_norm": 0.9058144688606262,
"learning_rate": 6.325364803705735e-06,
"loss": 4.7641,
"step": 44900
},
{
"epoch": 0.37,
"grad_norm": 1.1066261529922485,
"learning_rate": 6.317180760952296e-06,
"loss": 4.7657,
"step": 45000
},
{
"epoch": 0.37,
"grad_norm": 1.048341989517212,
"learning_rate": 6.3089967181988565e-06,
"loss": 4.7573,
"step": 45100
},
{
"epoch": 0.37,
"grad_norm": 2.8883166313171387,
"learning_rate": 6.300812675445417e-06,
"loss": 4.7681,
"step": 45200
},
{
"epoch": 0.37,
"grad_norm": 3.5398635864257812,
"learning_rate": 6.292628632691977e-06,
"loss": 4.7622,
"step": 45300
},
{
"epoch": 0.37,
"grad_norm": 4.646157741546631,
"learning_rate": 6.284444589938539e-06,
"loss": 4.7675,
"step": 45400
},
{
"epoch": 0.37,
"grad_norm": 1.9889692068099976,
"learning_rate": 6.276260547185099e-06,
"loss": 4.7616,
"step": 45500
},
{
"epoch": 0.37,
"grad_norm": 1.8759746551513672,
"learning_rate": 6.2680765044316595e-06,
"loss": 4.7674,
"step": 45600
},
{
"epoch": 0.37,
"grad_norm": 1.6846829652786255,
"learning_rate": 6.259892461678221e-06,
"loss": 4.762,
"step": 45700
},
{
"epoch": 0.37,
"grad_norm": 0.9014168381690979,
"learning_rate": 6.251708418924781e-06,
"loss": 4.7671,
"step": 45800
},
{
"epoch": 0.38,
"grad_norm": 2.2552223205566406,
"learning_rate": 6.243524376171341e-06,
"loss": 4.7665,
"step": 45900
},
{
"epoch": 0.38,
"grad_norm": 7.994378566741943,
"learning_rate": 6.235340333417902e-06,
"loss": 4.7679,
"step": 46000
},
{
"epoch": 0.38,
"grad_norm": 1.071326732635498,
"learning_rate": 6.227156290664463e-06,
"loss": 4.7621,
"step": 46100
},
{
"epoch": 0.38,
"grad_norm": 1.366959810256958,
"learning_rate": 6.218972247911024e-06,
"loss": 4.7714,
"step": 46200
},
{
"epoch": 0.38,
"grad_norm": 1.2230727672576904,
"learning_rate": 6.210788205157584e-06,
"loss": 4.7671,
"step": 46300
},
{
"epoch": 0.38,
"grad_norm": 3.3939905166625977,
"learning_rate": 6.202604162404144e-06,
"loss": 4.7652,
"step": 46400
},
{
"epoch": 0.38,
"grad_norm": 1.4360277652740479,
"learning_rate": 6.194420119650706e-06,
"loss": 4.7613,
"step": 46500
},
{
"epoch": 0.38,
"grad_norm": 2.0407369136810303,
"learning_rate": 6.186236076897266e-06,
"loss": 4.7704,
"step": 46600
},
{
"epoch": 0.38,
"grad_norm": 0.9247026443481445,
"learning_rate": 6.178052034143827e-06,
"loss": 4.7611,
"step": 46700
},
{
"epoch": 0.38,
"grad_norm": 1.4636540412902832,
"learning_rate": 6.169867991390388e-06,
"loss": 4.7658,
"step": 46800
},
{
"epoch": 0.38,
"grad_norm": 1.443965196609497,
"learning_rate": 6.161683948636948e-06,
"loss": 4.7674,
"step": 46900
},
{
"epoch": 0.38,
"grad_norm": 1.4305676221847534,
"learning_rate": 6.153499905883509e-06,
"loss": 4.7653,
"step": 47000
},
{
"epoch": 0.39,
"grad_norm": 0.9233379364013672,
"learning_rate": 6.1453158631300694e-06,
"loss": 4.7685,
"step": 47100
},
{
"epoch": 0.39,
"grad_norm": 0.9005165100097656,
"learning_rate": 6.1371318203766305e-06,
"loss": 4.7605,
"step": 47200
},
{
"epoch": 0.39,
"grad_norm": 2.1827316284179688,
"learning_rate": 6.128947777623191e-06,
"loss": 4.7639,
"step": 47300
},
{
"epoch": 0.39,
"grad_norm": 1.330802083015442,
"learning_rate": 6.120763734869751e-06,
"loss": 4.765,
"step": 47400
},
{
"epoch": 0.39,
"grad_norm": 1.0128445625305176,
"learning_rate": 6.112579692116311e-06,
"loss": 4.7704,
"step": 47500
},
{
"epoch": 0.39,
"grad_norm": 3.6746978759765625,
"learning_rate": 6.104395649362873e-06,
"loss": 4.7645,
"step": 47600
},
{
"epoch": 0.39,
"grad_norm": 1.546904444694519,
"learning_rate": 6.0962116066094336e-06,
"loss": 4.764,
"step": 47700
},
{
"epoch": 0.39,
"grad_norm": 3.3686161041259766,
"learning_rate": 6.088027563855994e-06,
"loss": 4.7679,
"step": 47800
},
{
"epoch": 0.39,
"grad_norm": 1.7452073097229004,
"learning_rate": 6.079843521102555e-06,
"loss": 4.7597,
"step": 47900
},
{
"epoch": 0.39,
"grad_norm": 1.0575987100601196,
"learning_rate": 6.071659478349115e-06,
"loss": 4.7656,
"step": 48000
},
{
"epoch": 0.39,
"grad_norm": 5.366837501525879,
"learning_rate": 6.063475435595676e-06,
"loss": 4.761,
"step": 48100
},
{
"epoch": 0.39,
"grad_norm": 0.977851927280426,
"learning_rate": 6.055291392842237e-06,
"loss": 4.7644,
"step": 48200
},
{
"epoch": 0.4,
"grad_norm": 4.891844749450684,
"learning_rate": 6.047107350088798e-06,
"loss": 4.7681,
"step": 48300
},
{
"epoch": 0.4,
"grad_norm": 1.2509273290634155,
"learning_rate": 6.038923307335358e-06,
"loss": 4.7587,
"step": 48400
},
{
"epoch": 0.4,
"grad_norm": 1.095798373222351,
"learning_rate": 6.030739264581918e-06,
"loss": 4.7617,
"step": 48500
},
{
"epoch": 0.4,
"grad_norm": 1.093984842300415,
"learning_rate": 6.0225552218284785e-06,
"loss": 4.7629,
"step": 48600
},
{
"epoch": 0.4,
"grad_norm": 0.8967887759208679,
"learning_rate": 6.0143711790750405e-06,
"loss": 4.7623,
"step": 48700
},
{
"epoch": 0.4,
"grad_norm": 0.8838722705841064,
"learning_rate": 6.006187136321601e-06,
"loss": 4.7658,
"step": 48800
},
{
"epoch": 0.4,
"grad_norm": 1.5173430442810059,
"learning_rate": 5.998003093568161e-06,
"loss": 4.7631,
"step": 48900
},
{
"epoch": 0.4,
"grad_norm": 2.8584578037261963,
"learning_rate": 5.989819050814722e-06,
"loss": 4.7597,
"step": 49000
},
{
"epoch": 0.4,
"grad_norm": 2.672813892364502,
"learning_rate": 5.981635008061283e-06,
"loss": 4.7614,
"step": 49100
},
{
"epoch": 0.4,
"grad_norm": 1.2858943939208984,
"learning_rate": 5.9734509653078435e-06,
"loss": 4.7585,
"step": 49200
},
{
"epoch": 0.4,
"grad_norm": 2.0808370113372803,
"learning_rate": 5.965266922554404e-06,
"loss": 4.7652,
"step": 49300
},
{
"epoch": 0.4,
"grad_norm": 1.6658849716186523,
"learning_rate": 5.957082879800965e-06,
"loss": 4.7622,
"step": 49400
},
{
"epoch": 0.41,
"grad_norm": 1.312129259109497,
"learning_rate": 5.948898837047525e-06,
"loss": 4.7643,
"step": 49500
},
{
"epoch": 0.41,
"grad_norm": 1.109014868736267,
"learning_rate": 5.940714794294085e-06,
"loss": 4.7635,
"step": 49600
},
{
"epoch": 0.41,
"grad_norm": 1.059754729270935,
"learning_rate": 5.932530751540646e-06,
"loss": 4.7635,
"step": 49700
},
{
"epoch": 0.41,
"grad_norm": 1.4219303131103516,
"learning_rate": 5.924346708787208e-06,
"loss": 4.7644,
"step": 49800
},
{
"epoch": 0.41,
"grad_norm": 4.515329360961914,
"learning_rate": 5.916162666033768e-06,
"loss": 4.764,
"step": 49900
},
{
"epoch": 0.41,
"grad_norm": 2.7879600524902344,
"learning_rate": 5.907978623280328e-06,
"loss": 4.7629,
"step": 50000
},
{
"epoch": 0.41,
"grad_norm": 1.4872534275054932,
"learning_rate": 5.899794580526889e-06,
"loss": 4.7627,
"step": 50100
},
{
"epoch": 0.41,
"grad_norm": 1.7036062479019165,
"learning_rate": 5.89161053777345e-06,
"loss": 4.7602,
"step": 50200
},
{
"epoch": 0.41,
"grad_norm": 1.1334501504898071,
"learning_rate": 5.883426495020011e-06,
"loss": 4.7649,
"step": 50300
},
{
"epoch": 0.41,
"grad_norm": 1.8255513906478882,
"learning_rate": 5.875242452266571e-06,
"loss": 4.7662,
"step": 50400
},
{
"epoch": 0.41,
"grad_norm": 0.9870157241821289,
"learning_rate": 5.867058409513132e-06,
"loss": 4.7613,
"step": 50500
},
{
"epoch": 0.41,
"grad_norm": 1.7012158632278442,
"learning_rate": 5.858874366759692e-06,
"loss": 4.7609,
"step": 50600
},
{
"epoch": 0.41,
"grad_norm": 0.9148879051208496,
"learning_rate": 5.850690324006253e-06,
"loss": 4.7601,
"step": 50700
},
{
"epoch": 0.42,
"grad_norm": 1.1383265256881714,
"learning_rate": 5.842506281252813e-06,
"loss": 4.7583,
"step": 50800
},
{
"epoch": 0.42,
"grad_norm": 0.8608391284942627,
"learning_rate": 5.834322238499375e-06,
"loss": 4.7639,
"step": 50900
},
{
"epoch": 0.42,
"grad_norm": 2.019380807876587,
"learning_rate": 5.826138195745935e-06,
"loss": 4.7638,
"step": 51000
},
{
"epoch": 0.42,
"grad_norm": 2.069756031036377,
"learning_rate": 5.817954152992495e-06,
"loss": 4.7691,
"step": 51100
},
{
"epoch": 0.42,
"grad_norm": 1.2474431991577148,
"learning_rate": 5.809770110239057e-06,
"loss": 4.7668,
"step": 51200
},
{
"epoch": 0.42,
"grad_norm": 1.1330208778381348,
"learning_rate": 5.8015860674856176e-06,
"loss": 4.7647,
"step": 51300
},
{
"epoch": 0.42,
"grad_norm": 1.4908925294876099,
"learning_rate": 5.793402024732178e-06,
"loss": 4.759,
"step": 51400
},
{
"epoch": 0.42,
"grad_norm": 1.1022288799285889,
"learning_rate": 5.785217981978738e-06,
"loss": 4.7583,
"step": 51500
},
{
"epoch": 0.42,
"grad_norm": 1.2714922428131104,
"learning_rate": 5.777033939225299e-06,
"loss": 4.7619,
"step": 51600
},
{
"epoch": 0.42,
"grad_norm": 1.3751145601272583,
"learning_rate": 5.7688498964718595e-06,
"loss": 4.7607,
"step": 51700
},
{
"epoch": 0.42,
"grad_norm": 0.9955260753631592,
"learning_rate": 5.76066585371842e-06,
"loss": 4.7612,
"step": 51800
},
{
"epoch": 0.42,
"grad_norm": 1.48429274559021,
"learning_rate": 5.752481810964981e-06,
"loss": 4.7631,
"step": 51900
},
{
"epoch": 0.43,
"grad_norm": 3.2120578289031982,
"learning_rate": 5.744297768211542e-06,
"loss": 4.7652,
"step": 52000
},
{
"epoch": 0.43,
"grad_norm": 1.1913766860961914,
"learning_rate": 5.736113725458102e-06,
"loss": 4.7651,
"step": 52100
},
{
"epoch": 0.43,
"grad_norm": 3.3173420429229736,
"learning_rate": 5.7279296827046625e-06,
"loss": 4.7583,
"step": 52200
},
{
"epoch": 0.43,
"grad_norm": 2.184640407562256,
"learning_rate": 5.7197456399512245e-06,
"loss": 4.7674,
"step": 52300
},
{
"epoch": 0.43,
"grad_norm": 1.406989574432373,
"learning_rate": 5.711561597197785e-06,
"loss": 4.7624,
"step": 52400
},
{
"epoch": 0.43,
"grad_norm": 4.420721530914307,
"learning_rate": 5.703377554444345e-06,
"loss": 4.7584,
"step": 52500
},
{
"epoch": 0.43,
"grad_norm": 1.6438698768615723,
"learning_rate": 5.695193511690905e-06,
"loss": 4.759,
"step": 52600
},
{
"epoch": 0.43,
"grad_norm": 1.0008363723754883,
"learning_rate": 5.687009468937466e-06,
"loss": 4.7622,
"step": 52700
},
{
"epoch": 0.43,
"grad_norm": 0.9954501986503601,
"learning_rate": 5.678825426184027e-06,
"loss": 4.7599,
"step": 52800
},
{
"epoch": 0.43,
"grad_norm": 2.2024335861206055,
"learning_rate": 5.670641383430587e-06,
"loss": 4.757,
"step": 52900
},
{
"epoch": 0.43,
"grad_norm": 0.8690518140792847,
"learning_rate": 5.662457340677148e-06,
"loss": 4.7638,
"step": 53000
},
{
"epoch": 0.43,
"grad_norm": 4.789288520812988,
"learning_rate": 5.654273297923709e-06,
"loss": 4.7651,
"step": 53100
},
{
"epoch": 0.44,
"grad_norm": 1.9161509275436401,
"learning_rate": 5.646089255170269e-06,
"loss": 4.762,
"step": 53200
},
{
"epoch": 0.44,
"grad_norm": 1.1791253089904785,
"learning_rate": 5.63790521241683e-06,
"loss": 4.7629,
"step": 53300
},
{
"epoch": 0.44,
"grad_norm": 3.780832529067993,
"learning_rate": 5.629721169663392e-06,
"loss": 4.7553,
"step": 53400
},
{
"epoch": 0.44,
"grad_norm": 1.1403292417526245,
"learning_rate": 5.621537126909952e-06,
"loss": 4.7615,
"step": 53500
},
{
"epoch": 0.44,
"grad_norm": 1.2787580490112305,
"learning_rate": 5.613353084156512e-06,
"loss": 4.7623,
"step": 53600
},
{
"epoch": 0.44,
"grad_norm": 2.774376153945923,
"learning_rate": 5.6051690414030724e-06,
"loss": 4.7626,
"step": 53700
},
{
"epoch": 0.44,
"grad_norm": 3.4365766048431396,
"learning_rate": 5.5969849986496336e-06,
"loss": 4.7573,
"step": 53800
},
{
"epoch": 0.44,
"grad_norm": 1.0744109153747559,
"learning_rate": 5.588800955896194e-06,
"loss": 4.7621,
"step": 53900
},
{
"epoch": 0.44,
"grad_norm": 0.9712745547294617,
"learning_rate": 5.580616913142755e-06,
"loss": 4.7605,
"step": 54000
},
{
"epoch": 0.44,
"grad_norm": 1.24153470993042,
"learning_rate": 5.572432870389315e-06,
"loss": 4.7654,
"step": 54100
},
{
"epoch": 0.44,
"grad_norm": 1.9112986326217651,
"learning_rate": 5.564248827635876e-06,
"loss": 4.7666,
"step": 54200
},
{
"epoch": 0.44,
"grad_norm": 2.2796552181243896,
"learning_rate": 5.556064784882437e-06,
"loss": 4.7594,
"step": 54300
},
{
"epoch": 0.45,
"grad_norm": 1.3988897800445557,
"learning_rate": 5.547880742128997e-06,
"loss": 4.7596,
"step": 54400
},
{
"epoch": 0.45,
"grad_norm": 1.2597512006759644,
"learning_rate": 5.539696699375559e-06,
"loss": 4.7648,
"step": 54500
},
{
"epoch": 0.45,
"grad_norm": 2.735841989517212,
"learning_rate": 5.531512656622119e-06,
"loss": 4.766,
"step": 54600
},
{
"epoch": 0.45,
"grad_norm": 1.2517529726028442,
"learning_rate": 5.523328613868679e-06,
"loss": 4.7585,
"step": 54700
},
{
"epoch": 0.45,
"grad_norm": 1.7145378589630127,
"learning_rate": 5.51514457111524e-06,
"loss": 4.7575,
"step": 54800
},
{
"epoch": 0.45,
"grad_norm": 1.1680995225906372,
"learning_rate": 5.506960528361801e-06,
"loss": 4.7611,
"step": 54900
},
{
"epoch": 0.45,
"grad_norm": 3.5684611797332764,
"learning_rate": 5.498776485608361e-06,
"loss": 4.7579,
"step": 55000
},
{
"epoch": 0.45,
"grad_norm": 1.0898152589797974,
"learning_rate": 5.490592442854922e-06,
"loss": 4.7587,
"step": 55100
},
{
"epoch": 0.45,
"grad_norm": 1.0716261863708496,
"learning_rate": 5.482408400101482e-06,
"loss": 4.7605,
"step": 55200
},
{
"epoch": 0.45,
"grad_norm": 2.373514413833618,
"learning_rate": 5.4742243573480435e-06,
"loss": 4.7626,
"step": 55300
},
{
"epoch": 0.45,
"grad_norm": 3.0533201694488525,
"learning_rate": 5.466040314594604e-06,
"loss": 4.7601,
"step": 55400
},
{
"epoch": 0.45,
"grad_norm": 0.9586790800094604,
"learning_rate": 5.457856271841164e-06,
"loss": 4.7607,
"step": 55500
},
{
"epoch": 0.46,
"grad_norm": 1.8159968852996826,
"learning_rate": 5.449672229087726e-06,
"loss": 4.7613,
"step": 55600
},
{
"epoch": 0.46,
"grad_norm": 1.0811960697174072,
"learning_rate": 5.441488186334286e-06,
"loss": 4.7613,
"step": 55700
},
{
"epoch": 0.46,
"grad_norm": 1.0583518743515015,
"learning_rate": 5.4333041435808465e-06,
"loss": 4.7558,
"step": 55800
},
{
"epoch": 0.46,
"grad_norm": 0.993607223033905,
"learning_rate": 5.425120100827407e-06,
"loss": 4.7577,
"step": 55900
},
{
"epoch": 0.46,
"grad_norm": 1.3715596199035645,
"learning_rate": 5.416936058073968e-06,
"loss": 4.7565,
"step": 56000
},
{
"epoch": 0.46,
"grad_norm": 1.3586755990982056,
"learning_rate": 5.408752015320529e-06,
"loss": 4.7627,
"step": 56100
},
{
"epoch": 0.46,
"grad_norm": 1.5538294315338135,
"learning_rate": 5.400567972567089e-06,
"loss": 4.7616,
"step": 56200
},
{
"epoch": 0.46,
"grad_norm": 1.3117858171463013,
"learning_rate": 5.3923839298136495e-06,
"loss": 4.7615,
"step": 56300
},
{
"epoch": 0.46,
"grad_norm": 2.4825961589813232,
"learning_rate": 5.384199887060211e-06,
"loss": 4.7673,
"step": 56400
},
{
"epoch": 0.46,
"grad_norm": 3.6540427207946777,
"learning_rate": 5.376015844306771e-06,
"loss": 4.7583,
"step": 56500
},
{
"epoch": 0.46,
"grad_norm": 1.217731237411499,
"learning_rate": 5.367831801553331e-06,
"loss": 4.7647,
"step": 56600
},
{
"epoch": 0.46,
"grad_norm": 1.3690531253814697,
"learning_rate": 5.359647758799893e-06,
"loss": 4.7607,
"step": 56700
},
{
"epoch": 0.46,
"grad_norm": 2.986236572265625,
"learning_rate": 5.351463716046453e-06,
"loss": 4.7649,
"step": 56800
},
{
"epoch": 0.47,
"grad_norm": 4.169172763824463,
"learning_rate": 5.343279673293014e-06,
"loss": 4.7609,
"step": 56900
},
{
"epoch": 0.47,
"grad_norm": 3.941506862640381,
"learning_rate": 5.335095630539574e-06,
"loss": 4.7588,
"step": 57000
},
{
"epoch": 0.47,
"grad_norm": 1.151626467704773,
"learning_rate": 5.326911587786135e-06,
"loss": 4.7626,
"step": 57100
},
{
"epoch": 0.47,
"grad_norm": 1.1332154273986816,
"learning_rate": 5.318727545032696e-06,
"loss": 4.7638,
"step": 57200
},
{
"epoch": 0.47,
"grad_norm": 1.0343974828720093,
"learning_rate": 5.3105435022792564e-06,
"loss": 4.7591,
"step": 57300
},
{
"epoch": 0.47,
"grad_norm": 1.0772465467453003,
"learning_rate": 5.302359459525817e-06,
"loss": 4.7594,
"step": 57400
},
{
"epoch": 0.47,
"grad_norm": 5.96359920501709,
"learning_rate": 5.294175416772378e-06,
"loss": 4.7597,
"step": 57500
},
{
"epoch": 0.47,
"grad_norm": 1.451434850692749,
"learning_rate": 5.285991374018938e-06,
"loss": 4.7622,
"step": 57600
},
{
"epoch": 0.47,
"grad_norm": 3.3634424209594727,
"learning_rate": 5.277807331265498e-06,
"loss": 4.7639,
"step": 57700
},
{
"epoch": 0.47,
"grad_norm": 1.3753291368484497,
"learning_rate": 5.2696232885120595e-06,
"loss": 4.762,
"step": 57800
},
{
"epoch": 0.47,
"grad_norm": 1.0764108896255493,
"learning_rate": 5.261439245758621e-06,
"loss": 4.7565,
"step": 57900
},
{
"epoch": 0.47,
"grad_norm": 1.7822853326797485,
"learning_rate": 5.253255203005181e-06,
"loss": 4.76,
"step": 58000
},
{
"epoch": 0.48,
"grad_norm": 1.0505635738372803,
"learning_rate": 5.245071160251741e-06,
"loss": 4.7563,
"step": 58100
},
{
"epoch": 0.48,
"grad_norm": 2.2111964225769043,
"learning_rate": 5.236887117498303e-06,
"loss": 4.7613,
"step": 58200
},
{
"epoch": 0.48,
"grad_norm": 1.0773766040802002,
"learning_rate": 5.228703074744863e-06,
"loss": 4.7645,
"step": 58300
},
{
"epoch": 0.48,
"grad_norm": 1.387395977973938,
"learning_rate": 5.220519031991424e-06,
"loss": 4.7581,
"step": 58400
},
{
"epoch": 0.48,
"grad_norm": 1.5407155752182007,
"learning_rate": 5.212334989237984e-06,
"loss": 4.7599,
"step": 58500
},
{
"epoch": 0.48,
"grad_norm": 1.9045255184173584,
"learning_rate": 5.204150946484545e-06,
"loss": 4.7632,
"step": 58600
},
{
"epoch": 0.48,
"grad_norm": 1.1140589714050293,
"learning_rate": 5.195966903731105e-06,
"loss": 4.758,
"step": 58700
},
{
"epoch": 0.48,
"grad_norm": 3.0775272846221924,
"learning_rate": 5.1877828609776655e-06,
"loss": 4.7591,
"step": 58800
},
{
"epoch": 0.48,
"grad_norm": 1.5358976125717163,
"learning_rate": 5.179598818224227e-06,
"loss": 4.7613,
"step": 58900
},
{
"epoch": 0.48,
"grad_norm": 2.506425380706787,
"learning_rate": 5.171414775470788e-06,
"loss": 4.7584,
"step": 59000
},
{
"epoch": 0.48,
"grad_norm": 1.562016248703003,
"learning_rate": 5.163230732717348e-06,
"loss": 4.7603,
"step": 59100
},
{
"epoch": 0.48,
"grad_norm": 3.894599437713623,
"learning_rate": 5.155046689963908e-06,
"loss": 4.7582,
"step": 59200
},
{
"epoch": 0.49,
"grad_norm": 1.7475073337554932,
"learning_rate": 5.14686264721047e-06,
"loss": 4.7552,
"step": 59300
},
{
"epoch": 0.49,
"grad_norm": 2.168311357498169,
"learning_rate": 5.1386786044570305e-06,
"loss": 4.7546,
"step": 59400
},
{
"epoch": 0.49,
"grad_norm": 1.3866901397705078,
"learning_rate": 5.130494561703591e-06,
"loss": 4.7595,
"step": 59500
},
{
"epoch": 0.49,
"grad_norm": 1.5141569375991821,
"learning_rate": 5.122310518950151e-06,
"loss": 4.7594,
"step": 59600
},
{
"epoch": 0.49,
"grad_norm": 1.22174072265625,
"learning_rate": 5.114126476196712e-06,
"loss": 4.7591,
"step": 59700
},
{
"epoch": 0.49,
"grad_norm": 1.8501654863357544,
"learning_rate": 5.1059424334432724e-06,
"loss": 4.7566,
"step": 59800
},
{
"epoch": 0.49,
"grad_norm": 0.8938846588134766,
"learning_rate": 5.0977583906898336e-06,
"loss": 4.7601,
"step": 59900
},
{
"epoch": 0.49,
"grad_norm": 3.8000481128692627,
"learning_rate": 5.089574347936394e-06,
"loss": 4.7645,
"step": 60000
},
{
"epoch": 0.49,
"grad_norm": 2.3883254528045654,
"learning_rate": 5.081390305182955e-06,
"loss": 4.759,
"step": 60100
},
{
"epoch": 0.49,
"grad_norm": 1.5231989622116089,
"learning_rate": 5.073206262429515e-06,
"loss": 4.7591,
"step": 60200
},
{
"epoch": 0.49,
"grad_norm": 2.584988832473755,
"learning_rate": 5.0650222196760755e-06,
"loss": 4.7614,
"step": 60300
},
{
"epoch": 0.49,
"grad_norm": 1.4782294034957886,
"learning_rate": 5.0568381769226374e-06,
"loss": 4.7577,
"step": 60400
},
{
"epoch": 0.5,
"grad_norm": 2.2520744800567627,
"learning_rate": 5.048654134169198e-06,
"loss": 4.7576,
"step": 60500
},
{
"epoch": 0.5,
"grad_norm": 1.1555761098861694,
"learning_rate": 5.040470091415758e-06,
"loss": 4.7613,
"step": 60600
},
{
"epoch": 0.5,
"grad_norm": 1.0871673822402954,
"learning_rate": 5.032286048662318e-06,
"loss": 4.7534,
"step": 60700
},
{
"epoch": 0.5,
"grad_norm": 0.9198378324508667,
"learning_rate": 5.024102005908879e-06,
"loss": 4.7599,
"step": 60800
},
{
"epoch": 0.5,
"grad_norm": 1.8254669904708862,
"learning_rate": 5.01591796315544e-06,
"loss": 4.7577,
"step": 60900
},
{
"epoch": 0.5,
"grad_norm": 2.592374563217163,
"learning_rate": 5.007733920402001e-06,
"loss": 4.7596,
"step": 61000
},
{
"epoch": 0.5,
"grad_norm": 1.4294451475143433,
"learning_rate": 4.999549877648561e-06,
"loss": 4.7587,
"step": 61100
},
{
"epoch": 0.5,
"grad_norm": 1.3959410190582275,
"learning_rate": 4.991365834895122e-06,
"loss": 4.7587,
"step": 61200
},
{
"epoch": 0.5,
"grad_norm": 1.02007257938385,
"learning_rate": 4.983181792141682e-06,
"loss": 4.7591,
"step": 61300
},
{
"epoch": 0.5,
"grad_norm": 1.9636856317520142,
"learning_rate": 4.9749977493882435e-06,
"loss": 4.7586,
"step": 61400
},
{
"epoch": 0.5,
"grad_norm": 1.2003675699234009,
"learning_rate": 4.966813706634804e-06,
"loss": 4.7578,
"step": 61500
},
{
"epoch": 0.5,
"grad_norm": 1.2069259881973267,
"learning_rate": 4.958629663881365e-06,
"loss": 4.7594,
"step": 61600
},
{
"epoch": 0.5,
"grad_norm": 1.160438895225525,
"learning_rate": 4.950445621127925e-06,
"loss": 4.7537,
"step": 61700
},
{
"epoch": 0.51,
"grad_norm": 1.4509081840515137,
"learning_rate": 4.942261578374486e-06,
"loss": 4.7566,
"step": 61800
},
{
"epoch": 0.51,
"grad_norm": 1.4008055925369263,
"learning_rate": 4.9340775356210465e-06,
"loss": 4.7596,
"step": 61900
},
{
"epoch": 0.51,
"grad_norm": 2.445732593536377,
"learning_rate": 4.925893492867608e-06,
"loss": 4.7585,
"step": 62000
},
{
"epoch": 0.51,
"grad_norm": 2.155773162841797,
"learning_rate": 4.917709450114168e-06,
"loss": 4.7594,
"step": 62100
},
{
"epoch": 0.51,
"grad_norm": 1.3300435543060303,
"learning_rate": 4.909525407360728e-06,
"loss": 4.7599,
"step": 62200
},
{
"epoch": 0.51,
"grad_norm": 1.1474816799163818,
"learning_rate": 4.901341364607289e-06,
"loss": 4.7575,
"step": 62300
},
{
"epoch": 0.51,
"grad_norm": 1.1090396642684937,
"learning_rate": 4.8931573218538495e-06,
"loss": 4.7556,
"step": 62400
},
{
"epoch": 0.51,
"grad_norm": 1.4400147199630737,
"learning_rate": 4.884973279100411e-06,
"loss": 4.7543,
"step": 62500
},
{
"epoch": 0.51,
"grad_norm": 1.534568190574646,
"learning_rate": 4.876789236346971e-06,
"loss": 4.7542,
"step": 62600
},
{
"epoch": 0.51,
"grad_norm": 1.1177995204925537,
"learning_rate": 4.868605193593532e-06,
"loss": 4.7557,
"step": 62700
},
{
"epoch": 0.51,
"grad_norm": 1.2250655889511108,
"learning_rate": 4.860421150840092e-06,
"loss": 4.7588,
"step": 62800
},
{
"epoch": 0.51,
"grad_norm": 1.2332854270935059,
"learning_rate": 4.852237108086653e-06,
"loss": 4.7623,
"step": 62900
},
{
"epoch": 0.52,
"grad_norm": 1.0365347862243652,
"learning_rate": 4.844053065333214e-06,
"loss": 4.7591,
"step": 63000
},
{
"epoch": 0.52,
"grad_norm": 3.2265894412994385,
"learning_rate": 4.835869022579775e-06,
"loss": 4.7589,
"step": 63100
},
{
"epoch": 0.52,
"grad_norm": 1.374605417251587,
"learning_rate": 4.827684979826335e-06,
"loss": 4.7574,
"step": 63200
},
{
"epoch": 0.52,
"grad_norm": 1.239890694618225,
"learning_rate": 4.819500937072895e-06,
"loss": 4.7622,
"step": 63300
},
{
"epoch": 0.52,
"grad_norm": 4.042061805725098,
"learning_rate": 4.8113168943194564e-06,
"loss": 4.7591,
"step": 63400
},
{
"epoch": 0.52,
"grad_norm": 2.166978597640991,
"learning_rate": 4.803132851566017e-06,
"loss": 4.7594,
"step": 63500
},
{
"epoch": 0.52,
"grad_norm": 1.0814965963363647,
"learning_rate": 4.794948808812578e-06,
"loss": 4.759,
"step": 63600
},
{
"epoch": 0.52,
"grad_norm": 1.7993803024291992,
"learning_rate": 4.786764766059138e-06,
"loss": 4.7574,
"step": 63700
},
{
"epoch": 0.52,
"grad_norm": 1.1397624015808105,
"learning_rate": 4.778580723305699e-06,
"loss": 4.7568,
"step": 63800
},
{
"epoch": 0.52,
"grad_norm": 1.801677942276001,
"learning_rate": 4.7703966805522595e-06,
"loss": 4.7561,
"step": 63900
},
{
"epoch": 0.52,
"grad_norm": 1.5364161729812622,
"learning_rate": 4.762212637798821e-06,
"loss": 4.7584,
"step": 64000
},
{
"epoch": 0.52,
"grad_norm": 1.0341291427612305,
"learning_rate": 4.754028595045381e-06,
"loss": 4.7581,
"step": 64100
},
{
"epoch": 0.53,
"grad_norm": 1.0642578601837158,
"learning_rate": 4.745844552291942e-06,
"loss": 4.7566,
"step": 64200
},
{
"epoch": 0.53,
"grad_norm": 1.6422146558761597,
"learning_rate": 4.737660509538502e-06,
"loss": 4.7583,
"step": 64300
},
{
"epoch": 0.53,
"grad_norm": 1.8048427104949951,
"learning_rate": 4.7294764667850625e-06,
"loss": 4.7575,
"step": 64400
},
{
"epoch": 0.53,
"grad_norm": 1.5397706031799316,
"learning_rate": 4.721292424031624e-06,
"loss": 4.7603,
"step": 64500
},
{
"epoch": 0.53,
"grad_norm": 1.1673585176467896,
"learning_rate": 4.713108381278184e-06,
"loss": 4.7603,
"step": 64600
},
{
"epoch": 0.53,
"grad_norm": 1.130509376525879,
"learning_rate": 4.704924338524745e-06,
"loss": 4.7582,
"step": 64700
},
{
"epoch": 0.53,
"grad_norm": 1.8829139471054077,
"learning_rate": 4.696740295771305e-06,
"loss": 4.7559,
"step": 64800
},
{
"epoch": 0.53,
"grad_norm": 1.014657974243164,
"learning_rate": 4.688556253017866e-06,
"loss": 4.7585,
"step": 64900
},
{
"epoch": 0.53,
"grad_norm": 1.3047457933425903,
"learning_rate": 4.680372210264427e-06,
"loss": 4.7582,
"step": 65000
},
{
"epoch": 0.53,
"grad_norm": 5.515758514404297,
"learning_rate": 4.672188167510988e-06,
"loss": 4.7581,
"step": 65100
},
{
"epoch": 0.53,
"grad_norm": 1.061341643333435,
"learning_rate": 4.664004124757548e-06,
"loss": 4.7537,
"step": 65200
},
{
"epoch": 0.53,
"grad_norm": 1.1620299816131592,
"learning_rate": 4.655820082004109e-06,
"loss": 4.7589,
"step": 65300
},
{
"epoch": 0.54,
"grad_norm": 0.9839982390403748,
"learning_rate": 4.647636039250669e-06,
"loss": 4.7602,
"step": 65400
},
{
"epoch": 0.54,
"grad_norm": 1.1345555782318115,
"learning_rate": 4.6394519964972305e-06,
"loss": 4.7595,
"step": 65500
},
{
"epoch": 0.54,
"grad_norm": 1.2723256349563599,
"learning_rate": 4.631267953743791e-06,
"loss": 4.7631,
"step": 65600
},
{
"epoch": 0.54,
"grad_norm": 1.2903733253479004,
"learning_rate": 4.623083910990351e-06,
"loss": 4.7572,
"step": 65700
},
{
"epoch": 0.54,
"grad_norm": 1.849879503250122,
"learning_rate": 4.614899868236912e-06,
"loss": 4.7536,
"step": 65800
},
{
"epoch": 0.54,
"grad_norm": 0.9856821298599243,
"learning_rate": 4.6067158254834724e-06,
"loss": 4.7566,
"step": 65900
},
{
"epoch": 0.54,
"grad_norm": 1.0516011714935303,
"learning_rate": 4.5985317827300335e-06,
"loss": 4.7577,
"step": 66000
},
{
"epoch": 0.54,
"grad_norm": 1.0833971500396729,
"learning_rate": 4.590347739976594e-06,
"loss": 4.7553,
"step": 66100
},
{
"epoch": 0.54,
"grad_norm": 3.236478805541992,
"learning_rate": 4.582163697223155e-06,
"loss": 4.7524,
"step": 66200
},
{
"epoch": 0.54,
"grad_norm": 4.561278820037842,
"learning_rate": 4.573979654469715e-06,
"loss": 4.7598,
"step": 66300
},
{
"epoch": 0.54,
"grad_norm": 1.0862793922424316,
"learning_rate": 4.565795611716276e-06,
"loss": 4.756,
"step": 66400
},
{
"epoch": 0.54,
"grad_norm": 1.248744249343872,
"learning_rate": 4.5576115689628366e-06,
"loss": 4.7579,
"step": 66500
},
{
"epoch": 0.55,
"grad_norm": 1.2721776962280273,
"learning_rate": 4.549427526209398e-06,
"loss": 4.7549,
"step": 66600
},
{
"epoch": 0.55,
"grad_norm": 1.6902192831039429,
"learning_rate": 4.541243483455958e-06,
"loss": 4.7605,
"step": 66700
},
{
"epoch": 0.55,
"grad_norm": 2.721341609954834,
"learning_rate": 4.533059440702518e-06,
"loss": 4.7619,
"step": 66800
},
{
"epoch": 0.55,
"grad_norm": 1.5549167394638062,
"learning_rate": 4.524875397949079e-06,
"loss": 4.7581,
"step": 66900
},
{
"epoch": 0.55,
"grad_norm": 4.686578273773193,
"learning_rate": 4.51669135519564e-06,
"loss": 4.757,
"step": 67000
},
{
"epoch": 0.55,
"grad_norm": 1.2544666528701782,
"learning_rate": 4.508507312442201e-06,
"loss": 4.7614,
"step": 67100
},
{
"epoch": 0.55,
"grad_norm": 1.3165531158447266,
"learning_rate": 4.500323269688761e-06,
"loss": 4.7549,
"step": 67200
},
{
"epoch": 0.55,
"grad_norm": 1.4608168601989746,
"learning_rate": 4.492139226935322e-06,
"loss": 4.7553,
"step": 67300
},
{
"epoch": 0.55,
"grad_norm": 1.003299593925476,
"learning_rate": 4.483955184181882e-06,
"loss": 4.7638,
"step": 67400
},
{
"epoch": 0.55,
"grad_norm": 1.496551752090454,
"learning_rate": 4.4757711414284435e-06,
"loss": 4.759,
"step": 67500
},
{
"epoch": 0.55,
"grad_norm": 1.3934059143066406,
"learning_rate": 4.467587098675004e-06,
"loss": 4.7566,
"step": 67600
},
{
"epoch": 0.55,
"grad_norm": 2.459867238998413,
"learning_rate": 4.459403055921565e-06,
"loss": 4.7567,
"step": 67700
},
{
"epoch": 0.55,
"grad_norm": 1.2848294973373413,
"learning_rate": 4.451219013168125e-06,
"loss": 4.7562,
"step": 67800
},
{
"epoch": 0.56,
"grad_norm": 1.5182512998580933,
"learning_rate": 4.443034970414685e-06,
"loss": 4.7595,
"step": 67900
},
{
"epoch": 0.56,
"grad_norm": 1.2391725778579712,
"learning_rate": 4.4348509276612465e-06,
"loss": 4.7538,
"step": 68000
},
{
"epoch": 0.56,
"grad_norm": 3.008521318435669,
"learning_rate": 4.426666884907807e-06,
"loss": 4.7568,
"step": 68100
},
{
"epoch": 0.56,
"grad_norm": 1.6599717140197754,
"learning_rate": 4.418482842154368e-06,
"loss": 4.7598,
"step": 68200
},
{
"epoch": 0.56,
"grad_norm": 2.2164254188537598,
"learning_rate": 4.410298799400928e-06,
"loss": 4.7545,
"step": 68300
},
{
"epoch": 0.56,
"grad_norm": 3.473665237426758,
"learning_rate": 4.402114756647489e-06,
"loss": 4.7601,
"step": 68400
},
{
"epoch": 0.56,
"grad_norm": 1.9182640314102173,
"learning_rate": 4.3939307138940495e-06,
"loss": 4.7559,
"step": 68500
},
{
"epoch": 0.56,
"grad_norm": 2.2187399864196777,
"learning_rate": 4.385746671140611e-06,
"loss": 4.7611,
"step": 68600
},
{
"epoch": 0.56,
"grad_norm": 2.2415308952331543,
"learning_rate": 4.377562628387171e-06,
"loss": 4.7572,
"step": 68700
},
{
"epoch": 0.56,
"grad_norm": 1.0853921175003052,
"learning_rate": 4.369378585633732e-06,
"loss": 4.7522,
"step": 68800
},
{
"epoch": 0.56,
"grad_norm": 2.0470669269561768,
"learning_rate": 4.361194542880292e-06,
"loss": 4.7567,
"step": 68900
},
{
"epoch": 0.56,
"grad_norm": 1.5501480102539062,
"learning_rate": 4.353010500126853e-06,
"loss": 4.7548,
"step": 69000
},
{
"epoch": 0.57,
"grad_norm": 1.0756503343582153,
"learning_rate": 4.344826457373414e-06,
"loss": 4.7556,
"step": 69100
},
{
"epoch": 0.57,
"grad_norm": 1.0396485328674316,
"learning_rate": 4.336642414619974e-06,
"loss": 4.756,
"step": 69200
},
{
"epoch": 0.57,
"grad_norm": 1.5130740404129028,
"learning_rate": 4.328458371866535e-06,
"loss": 4.7538,
"step": 69300
},
{
"epoch": 0.57,
"grad_norm": 1.2191152572631836,
"learning_rate": 4.320274329113095e-06,
"loss": 4.7594,
"step": 69400
},
{
"epoch": 0.57,
"grad_norm": 1.1031177043914795,
"learning_rate": 4.3120902863596564e-06,
"loss": 4.7585,
"step": 69500
},
{
"epoch": 0.57,
"grad_norm": 3.345165967941284,
"learning_rate": 4.303906243606217e-06,
"loss": 4.7601,
"step": 69600
},
{
"epoch": 0.57,
"grad_norm": 1.058370590209961,
"learning_rate": 4.295722200852778e-06,
"loss": 4.7575,
"step": 69700
},
{
"epoch": 0.57,
"grad_norm": 1.364247441291809,
"learning_rate": 4.287538158099338e-06,
"loss": 4.7592,
"step": 69800
},
{
"epoch": 0.57,
"grad_norm": 2.520071029663086,
"learning_rate": 4.279354115345899e-06,
"loss": 4.7597,
"step": 69900
},
{
"epoch": 0.57,
"grad_norm": 1.4218943119049072,
"learning_rate": 4.2711700725924595e-06,
"loss": 4.7525,
"step": 70000
},
{
"epoch": 0.57,
"grad_norm": 2.9582276344299316,
"learning_rate": 4.2629860298390206e-06,
"loss": 4.7611,
"step": 70100
},
{
"epoch": 0.57,
"grad_norm": 2.1016061305999756,
"learning_rate": 4.254801987085581e-06,
"loss": 4.7557,
"step": 70200
},
{
"epoch": 0.58,
"grad_norm": 1.1501710414886475,
"learning_rate": 4.246617944332141e-06,
"loss": 4.7582,
"step": 70300
},
{
"epoch": 0.58,
"grad_norm": 2.3157947063446045,
"learning_rate": 4.238433901578702e-06,
"loss": 4.7574,
"step": 70400
},
{
"epoch": 0.58,
"grad_norm": 1.0421010255813599,
"learning_rate": 4.2302498588252625e-06,
"loss": 4.7583,
"step": 70500
},
{
"epoch": 0.58,
"grad_norm": 1.3601773977279663,
"learning_rate": 4.222065816071824e-06,
"loss": 4.7567,
"step": 70600
},
{
"epoch": 0.58,
"grad_norm": 1.1386362314224243,
"learning_rate": 4.213881773318384e-06,
"loss": 4.7535,
"step": 70700
},
{
"epoch": 0.58,
"grad_norm": 1.3439152240753174,
"learning_rate": 4.205697730564945e-06,
"loss": 4.7595,
"step": 70800
},
{
"epoch": 0.58,
"grad_norm": 1.9923715591430664,
"learning_rate": 4.197513687811505e-06,
"loss": 4.7561,
"step": 70900
},
{
"epoch": 0.58,
"grad_norm": 1.0728856325149536,
"learning_rate": 4.189329645058066e-06,
"loss": 4.7528,
"step": 71000
},
{
"epoch": 0.58,
"grad_norm": 1.5504320859909058,
"learning_rate": 4.181145602304627e-06,
"loss": 4.7555,
"step": 71100
},
{
"epoch": 0.58,
"grad_norm": 3.476879358291626,
"learning_rate": 4.172961559551188e-06,
"loss": 4.7571,
"step": 71200
},
{
"epoch": 0.58,
"grad_norm": 1.4305684566497803,
"learning_rate": 4.164777516797748e-06,
"loss": 4.7501,
"step": 71300
},
{
"epoch": 0.58,
"grad_norm": 1.2255500555038452,
"learning_rate": 4.156593474044308e-06,
"loss": 4.7591,
"step": 71400
},
{
"epoch": 0.59,
"grad_norm": 2.292752742767334,
"learning_rate": 4.148409431290869e-06,
"loss": 4.7576,
"step": 71500
},
{
"epoch": 0.59,
"grad_norm": 1.9670140743255615,
"learning_rate": 4.14022538853743e-06,
"loss": 4.7605,
"step": 71600
},
{
"epoch": 0.59,
"grad_norm": 2.035198450088501,
"learning_rate": 4.132041345783991e-06,
"loss": 4.7564,
"step": 71700
},
{
"epoch": 0.59,
"grad_norm": 1.918428659439087,
"learning_rate": 4.123857303030551e-06,
"loss": 4.7538,
"step": 71800
},
{
"epoch": 0.59,
"grad_norm": 4.245315074920654,
"learning_rate": 4.115673260277112e-06,
"loss": 4.7585,
"step": 71900
},
{
"epoch": 0.59,
"grad_norm": 3.4246652126312256,
"learning_rate": 4.107489217523672e-06,
"loss": 4.7507,
"step": 72000
},
{
"epoch": 0.59,
"grad_norm": 1.2266836166381836,
"learning_rate": 4.0993051747702335e-06,
"loss": 4.7602,
"step": 72100
},
{
"epoch": 0.59,
"grad_norm": 1.9559603929519653,
"learning_rate": 4.091121132016794e-06,
"loss": 4.7555,
"step": 72200
},
{
"epoch": 0.59,
"grad_norm": 2.2520241737365723,
"learning_rate": 4.082937089263355e-06,
"loss": 4.7551,
"step": 72300
},
{
"epoch": 0.59,
"grad_norm": 2.5236260890960693,
"learning_rate": 4.074753046509915e-06,
"loss": 4.7552,
"step": 72400
},
{
"epoch": 0.59,
"grad_norm": 1.0424671173095703,
"learning_rate": 4.066569003756476e-06,
"loss": 4.7548,
"step": 72500
},
{
"epoch": 0.59,
"grad_norm": 1.1566205024719238,
"learning_rate": 4.0583849610030366e-06,
"loss": 4.7522,
"step": 72600
},
{
"epoch": 0.59,
"grad_norm": 1.9585150480270386,
"learning_rate": 4.050200918249597e-06,
"loss": 4.7609,
"step": 72700
},
{
"epoch": 0.6,
"grad_norm": 6.500349044799805,
"learning_rate": 4.042016875496158e-06,
"loss": 4.7562,
"step": 72800
},
{
"epoch": 0.6,
"grad_norm": 1.1571673154830933,
"learning_rate": 4.033832832742718e-06,
"loss": 4.7571,
"step": 72900
},
{
"epoch": 0.6,
"grad_norm": 1.7180365324020386,
"learning_rate": 4.025648789989279e-06,
"loss": 4.7614,
"step": 73000
},
{
"epoch": 0.6,
"grad_norm": 1.5343599319458008,
"learning_rate": 4.01746474723584e-06,
"loss": 4.7588,
"step": 73100
},
{
"epoch": 0.6,
"grad_norm": 1.5855658054351807,
"learning_rate": 4.009280704482401e-06,
"loss": 4.7603,
"step": 73200
},
{
"epoch": 0.6,
"grad_norm": 1.0107240676879883,
"learning_rate": 4.001096661728961e-06,
"loss": 4.7576,
"step": 73300
},
{
"epoch": 0.6,
"grad_norm": 1.6505345106124878,
"learning_rate": 3.992912618975522e-06,
"loss": 4.7562,
"step": 73400
},
{
"epoch": 0.6,
"grad_norm": 2.6212563514709473,
"learning_rate": 3.984728576222082e-06,
"loss": 4.7516,
"step": 73500
},
{
"epoch": 0.6,
"grad_norm": 1.5305055379867554,
"learning_rate": 3.9765445334686435e-06,
"loss": 4.7533,
"step": 73600
},
{
"epoch": 0.6,
"grad_norm": 3.195974826812744,
"learning_rate": 3.968360490715204e-06,
"loss": 4.7509,
"step": 73700
},
{
"epoch": 0.6,
"grad_norm": 1.088273286819458,
"learning_rate": 3.960176447961764e-06,
"loss": 4.7559,
"step": 73800
},
{
"epoch": 0.6,
"grad_norm": 2.045375108718872,
"learning_rate": 3.951992405208325e-06,
"loss": 4.7516,
"step": 73900
},
{
"epoch": 0.61,
"grad_norm": 1.5279570817947388,
"learning_rate": 3.943808362454885e-06,
"loss": 4.7566,
"step": 74000
},
{
"epoch": 0.61,
"grad_norm": 1.987199306488037,
"learning_rate": 3.9356243197014465e-06,
"loss": 4.7531,
"step": 74100
},
{
"epoch": 0.61,
"grad_norm": 1.7594095468521118,
"learning_rate": 3.927440276948007e-06,
"loss": 4.7568,
"step": 74200
},
{
"epoch": 0.61,
"grad_norm": 1.9795931577682495,
"learning_rate": 3.919256234194568e-06,
"loss": 4.7606,
"step": 74300
},
{
"epoch": 0.61,
"grad_norm": 1.383016586303711,
"learning_rate": 3.911072191441128e-06,
"loss": 4.7559,
"step": 74400
},
{
"epoch": 0.61,
"grad_norm": 1.4739179611206055,
"learning_rate": 3.902888148687689e-06,
"loss": 4.7553,
"step": 74500
},
{
"epoch": 0.61,
"grad_norm": 1.1515618562698364,
"learning_rate": 3.89470410593425e-06,
"loss": 4.7608,
"step": 74600
},
{
"epoch": 0.61,
"grad_norm": 1.4380580186843872,
"learning_rate": 3.886520063180811e-06,
"loss": 4.7578,
"step": 74700
},
{
"epoch": 0.61,
"grad_norm": 1.187768578529358,
"learning_rate": 3.878336020427371e-06,
"loss": 4.7533,
"step": 74800
},
{
"epoch": 0.61,
"grad_norm": 4.768670082092285,
"learning_rate": 3.870151977673931e-06,
"loss": 4.7573,
"step": 74900
},
{
"epoch": 0.61,
"grad_norm": 1.2797937393188477,
"learning_rate": 3.861967934920492e-06,
"loss": 4.7546,
"step": 75000
},
{
"epoch": 0.61,
"grad_norm": 2.6686596870422363,
"learning_rate": 3.8537838921670525e-06,
"loss": 4.7562,
"step": 75100
},
{
"epoch": 0.62,
"grad_norm": 2.777021646499634,
"learning_rate": 3.845599849413614e-06,
"loss": 4.7578,
"step": 75200
},
{
"epoch": 0.62,
"grad_norm": 1.4774482250213623,
"learning_rate": 3.837415806660174e-06,
"loss": 4.7553,
"step": 75300
},
{
"epoch": 0.62,
"grad_norm": 1.154783010482788,
"learning_rate": 3.829231763906735e-06,
"loss": 4.7536,
"step": 75400
},
{
"epoch": 0.62,
"grad_norm": 1.1363816261291504,
"learning_rate": 3.821047721153295e-06,
"loss": 4.7553,
"step": 75500
},
{
"epoch": 0.62,
"grad_norm": 1.314833402633667,
"learning_rate": 3.8128636783998564e-06,
"loss": 4.754,
"step": 75600
},
{
"epoch": 0.62,
"grad_norm": 2.0026655197143555,
"learning_rate": 3.804679635646417e-06,
"loss": 4.7564,
"step": 75700
},
{
"epoch": 0.62,
"grad_norm": 2.530662775039673,
"learning_rate": 3.7964955928929774e-06,
"loss": 4.7621,
"step": 75800
},
{
"epoch": 0.62,
"grad_norm": 1.7200578451156616,
"learning_rate": 3.7883115501395385e-06,
"loss": 4.7534,
"step": 75900
},
{
"epoch": 0.62,
"grad_norm": 1.1230708360671997,
"learning_rate": 3.7801275073860988e-06,
"loss": 4.7573,
"step": 76000
},
{
"epoch": 0.62,
"grad_norm": 1.1518919467926025,
"learning_rate": 3.77194346463266e-06,
"loss": 4.756,
"step": 76100
},
{
"epoch": 0.62,
"grad_norm": 4.6440582275390625,
"learning_rate": 3.76375942187922e-06,
"loss": 4.7538,
"step": 76200
},
{
"epoch": 0.62,
"grad_norm": 1.2866283655166626,
"learning_rate": 3.755575379125781e-06,
"loss": 4.7572,
"step": 76300
},
{
"epoch": 0.63,
"grad_norm": 1.0763148069381714,
"learning_rate": 3.7473913363723415e-06,
"loss": 4.7568,
"step": 76400
},
{
"epoch": 0.63,
"grad_norm": 1.0883269309997559,
"learning_rate": 3.739207293618902e-06,
"loss": 4.7562,
"step": 76500
},
{
"epoch": 0.63,
"grad_norm": 1.50298011302948,
"learning_rate": 3.7310232508654625e-06,
"loss": 4.7512,
"step": 76600
},
{
"epoch": 0.63,
"grad_norm": 1.144468069076538,
"learning_rate": 3.7228392081120236e-06,
"loss": 4.7571,
"step": 76700
},
{
"epoch": 0.63,
"grad_norm": 1.2953712940216064,
"learning_rate": 3.7146551653585843e-06,
"loss": 4.7502,
"step": 76800
},
{
"epoch": 0.63,
"grad_norm": 3.047788143157959,
"learning_rate": 3.7064711226051445e-06,
"loss": 4.7561,
"step": 76900
},
{
"epoch": 0.63,
"grad_norm": 1.7507997751235962,
"learning_rate": 3.6982870798517057e-06,
"loss": 4.7547,
"step": 77000
},
{
"epoch": 0.63,
"grad_norm": 1.189469814300537,
"learning_rate": 3.690103037098266e-06,
"loss": 4.7554,
"step": 77100
},
{
"epoch": 0.63,
"grad_norm": 1.2107117176055908,
"learning_rate": 3.681918994344827e-06,
"loss": 4.7544,
"step": 77200
},
{
"epoch": 0.63,
"grad_norm": 1.0349069833755493,
"learning_rate": 3.6737349515913873e-06,
"loss": 4.7603,
"step": 77300
},
{
"epoch": 0.63,
"grad_norm": 1.1641030311584473,
"learning_rate": 3.665550908837948e-06,
"loss": 4.7566,
"step": 77400
},
{
"epoch": 0.63,
"grad_norm": 1.1426669359207153,
"learning_rate": 3.6573668660845087e-06,
"loss": 4.7535,
"step": 77500
},
{
"epoch": 0.64,
"grad_norm": 2.1960771083831787,
"learning_rate": 3.6491828233310694e-06,
"loss": 4.7559,
"step": 77600
},
{
"epoch": 0.64,
"grad_norm": 1.1451727151870728,
"learning_rate": 3.6409987805776296e-06,
"loss": 4.7591,
"step": 77700
},
{
"epoch": 0.64,
"grad_norm": 1.3886533975601196,
"learning_rate": 3.6328147378241908e-06,
"loss": 4.7553,
"step": 77800
},
{
"epoch": 0.64,
"grad_norm": 1.1393433809280396,
"learning_rate": 3.6246306950707514e-06,
"loss": 4.7581,
"step": 77900
},
{
"epoch": 0.64,
"grad_norm": 1.2284306287765503,
"learning_rate": 3.616446652317312e-06,
"loss": 4.7566,
"step": 78000
},
{
"epoch": 0.64,
"grad_norm": 1.0621302127838135,
"learning_rate": 3.608262609563873e-06,
"loss": 4.7532,
"step": 78100
},
{
"epoch": 0.64,
"grad_norm": 1.1183658838272095,
"learning_rate": 3.600078566810433e-06,
"loss": 4.7583,
"step": 78200
},
{
"epoch": 0.64,
"grad_norm": 1.1688071489334106,
"learning_rate": 3.591894524056994e-06,
"loss": 4.7542,
"step": 78300
},
{
"epoch": 0.64,
"grad_norm": 3.1448984146118164,
"learning_rate": 3.5837104813035545e-06,
"loss": 4.7592,
"step": 78400
},
{
"epoch": 0.64,
"grad_norm": 1.1687928438186646,
"learning_rate": 3.5755264385501156e-06,
"loss": 4.7553,
"step": 78500
},
{
"epoch": 0.64,
"grad_norm": 1.1805286407470703,
"learning_rate": 3.567342395796676e-06,
"loss": 4.7544,
"step": 78600
},
{
"epoch": 0.64,
"grad_norm": 2.4032955169677734,
"learning_rate": 3.5591583530432365e-06,
"loss": 4.7569,
"step": 78700
},
{
"epoch": 0.64,
"grad_norm": 3.784090757369995,
"learning_rate": 3.550974310289797e-06,
"loss": 4.7565,
"step": 78800
},
{
"epoch": 0.65,
"grad_norm": 1.2469580173492432,
"learning_rate": 3.542790267536358e-06,
"loss": 4.7485,
"step": 78900
},
{
"epoch": 0.65,
"grad_norm": 2.6100597381591797,
"learning_rate": 3.5346062247829186e-06,
"loss": 4.7561,
"step": 79000
},
{
"epoch": 0.65,
"grad_norm": 1.4072147607803345,
"learning_rate": 3.5264221820294793e-06,
"loss": 4.7556,
"step": 79100
},
{
"epoch": 0.65,
"grad_norm": 1.2158293724060059,
"learning_rate": 3.51823813927604e-06,
"loss": 4.7503,
"step": 79200
},
{
"epoch": 0.65,
"grad_norm": 1.4874674081802368,
"learning_rate": 3.5100540965226003e-06,
"loss": 4.7547,
"step": 79300
},
{
"epoch": 0.65,
"grad_norm": 1.221482515335083,
"learning_rate": 3.5018700537691614e-06,
"loss": 4.7559,
"step": 79400
},
{
"epoch": 0.65,
"grad_norm": 1.1589709520339966,
"learning_rate": 3.4936860110157216e-06,
"loss": 4.7545,
"step": 79500
},
{
"epoch": 0.65,
"grad_norm": 1.2871575355529785,
"learning_rate": 3.4855019682622828e-06,
"loss": 4.7522,
"step": 79600
},
{
"epoch": 0.65,
"grad_norm": 1.7240387201309204,
"learning_rate": 3.477317925508843e-06,
"loss": 4.7566,
"step": 79700
},
{
"epoch": 0.65,
"grad_norm": 6.059484004974365,
"learning_rate": 3.4691338827554037e-06,
"loss": 4.7566,
"step": 79800
},
{
"epoch": 0.65,
"grad_norm": 1.1639505624771118,
"learning_rate": 3.4609498400019644e-06,
"loss": 4.7521,
"step": 79900
},
{
"epoch": 0.65,
"grad_norm": 1.1786649227142334,
"learning_rate": 3.452765797248525e-06,
"loss": 4.7475,
"step": 80000
},
{
"epoch": 0.66,
"grad_norm": 1.275763988494873,
"learning_rate": 3.4445817544950862e-06,
"loss": 4.7512,
"step": 80100
},
{
"epoch": 0.66,
"grad_norm": 2.3286848068237305,
"learning_rate": 3.4363977117416465e-06,
"loss": 4.7573,
"step": 80200
},
{
"epoch": 0.66,
"grad_norm": 1.2990089654922485,
"learning_rate": 3.428213668988207e-06,
"loss": 4.752,
"step": 80300
},
{
"epoch": 0.66,
"grad_norm": 2.1798534393310547,
"learning_rate": 3.4200296262347674e-06,
"loss": 4.7566,
"step": 80400
},
{
"epoch": 0.66,
"grad_norm": 2.029482841491699,
"learning_rate": 3.4118455834813286e-06,
"loss": 4.7528,
"step": 80500
},
{
"epoch": 0.66,
"grad_norm": 1.3646825551986694,
"learning_rate": 3.403661540727889e-06,
"loss": 4.7567,
"step": 80600
},
{
"epoch": 0.66,
"grad_norm": 1.1107501983642578,
"learning_rate": 3.39547749797445e-06,
"loss": 4.7518,
"step": 80700
},
{
"epoch": 0.66,
"grad_norm": 1.7238624095916748,
"learning_rate": 3.38729345522101e-06,
"loss": 4.7535,
"step": 80800
},
{
"epoch": 0.66,
"grad_norm": 1.2147496938705444,
"learning_rate": 3.379109412467571e-06,
"loss": 4.7529,
"step": 80900
},
{
"epoch": 0.66,
"grad_norm": 1.3453848361968994,
"learning_rate": 3.3709253697141316e-06,
"loss": 4.7549,
"step": 81000
},
{
"epoch": 0.66,
"grad_norm": 1.3312301635742188,
"learning_rate": 3.3627413269606923e-06,
"loss": 4.7533,
"step": 81100
},
{
"epoch": 0.66,
"grad_norm": 1.0629857778549194,
"learning_rate": 3.3545572842072534e-06,
"loss": 4.754,
"step": 81200
},
{
"epoch": 0.67,
"grad_norm": 1.145863652229309,
"learning_rate": 3.3463732414538136e-06,
"loss": 4.7561,
"step": 81300
},
{
"epoch": 0.67,
"grad_norm": 1.1477543115615845,
"learning_rate": 3.3381891987003743e-06,
"loss": 4.7546,
"step": 81400
},
{
"epoch": 0.67,
"grad_norm": 2.111903190612793,
"learning_rate": 3.330005155946935e-06,
"loss": 4.7523,
"step": 81500
},
{
"epoch": 0.67,
"grad_norm": 2.4378135204315186,
"learning_rate": 3.3218211131934957e-06,
"loss": 4.7524,
"step": 81600
},
{
"epoch": 0.67,
"grad_norm": 1.0920718908309937,
"learning_rate": 3.313637070440056e-06,
"loss": 4.7579,
"step": 81700
},
{
"epoch": 0.67,
"grad_norm": 1.142166018486023,
"learning_rate": 3.305453027686617e-06,
"loss": 4.751,
"step": 81800
},
{
"epoch": 0.67,
"grad_norm": 2.497532844543457,
"learning_rate": 3.2972689849331774e-06,
"loss": 4.7587,
"step": 81900
},
{
"epoch": 0.67,
"grad_norm": 1.1811760663986206,
"learning_rate": 3.2890849421797385e-06,
"loss": 4.7565,
"step": 82000
},
{
"epoch": 0.67,
"grad_norm": 1.4381294250488281,
"learning_rate": 3.2809008994262987e-06,
"loss": 4.7523,
"step": 82100
},
{
"epoch": 0.67,
"grad_norm": 1.1105141639709473,
"learning_rate": 3.2727168566728594e-06,
"loss": 4.755,
"step": 82200
},
{
"epoch": 0.67,
"grad_norm": 1.0709772109985352,
"learning_rate": 3.2645328139194206e-06,
"loss": 4.7518,
"step": 82300
},
{
"epoch": 0.67,
"grad_norm": 1.3575836420059204,
"learning_rate": 3.256348771165981e-06,
"loss": 4.7538,
"step": 82400
},
{
"epoch": 0.68,
"grad_norm": 1.1453856229782104,
"learning_rate": 3.2481647284125415e-06,
"loss": 4.7567,
"step": 82500
},
{
"epoch": 0.68,
"grad_norm": 1.2590690851211548,
"learning_rate": 3.239980685659102e-06,
"loss": 4.7556,
"step": 82600
},
{
"epoch": 0.68,
"grad_norm": 1.3445689678192139,
"learning_rate": 3.231796642905663e-06,
"loss": 4.7529,
"step": 82700
},
{
"epoch": 0.68,
"grad_norm": 1.1240034103393555,
"learning_rate": 3.223612600152223e-06,
"loss": 4.7595,
"step": 82800
},
{
"epoch": 0.68,
"grad_norm": 1.2913769483566284,
"learning_rate": 3.2154285573987843e-06,
"loss": 4.7523,
"step": 82900
},
{
"epoch": 0.68,
"grad_norm": 1.2136105298995972,
"learning_rate": 3.2072445146453445e-06,
"loss": 4.7548,
"step": 83000
},
{
"epoch": 0.68,
"grad_norm": 1.0630725622177124,
"learning_rate": 3.1990604718919057e-06,
"loss": 4.7551,
"step": 83100
},
{
"epoch": 0.68,
"grad_norm": 1.495082139968872,
"learning_rate": 3.190876429138466e-06,
"loss": 4.7553,
"step": 83200
},
{
"epoch": 0.68,
"grad_norm": 0.9895689487457275,
"learning_rate": 3.1826923863850266e-06,
"loss": 4.759,
"step": 83300
},
{
"epoch": 0.68,
"grad_norm": 1.4668093919754028,
"learning_rate": 3.1745083436315877e-06,
"loss": 4.7561,
"step": 83400
},
{
"epoch": 0.68,
"grad_norm": 1.5256825685501099,
"learning_rate": 3.166324300878148e-06,
"loss": 4.7573,
"step": 83500
},
{
"epoch": 0.68,
"grad_norm": 3.0631277561187744,
"learning_rate": 3.158140258124709e-06,
"loss": 4.7543,
"step": 83600
},
{
"epoch": 0.69,
"grad_norm": 1.171787977218628,
"learning_rate": 3.1499562153712694e-06,
"loss": 4.7508,
"step": 83700
},
{
"epoch": 0.69,
"grad_norm": 7.035879611968994,
"learning_rate": 3.14177217261783e-06,
"loss": 4.7519,
"step": 83800
},
{
"epoch": 0.69,
"grad_norm": 2.3109359741210938,
"learning_rate": 3.1335881298643903e-06,
"loss": 4.7573,
"step": 83900
},
{
"epoch": 0.69,
"grad_norm": 2.3658266067504883,
"learning_rate": 3.1254040871109514e-06,
"loss": 4.754,
"step": 84000
},
{
"epoch": 0.69,
"grad_norm": 1.6524670124053955,
"learning_rate": 3.1172200443575117e-06,
"loss": 4.7549,
"step": 84100
},
{
"epoch": 0.69,
"grad_norm": 1.797340989112854,
"learning_rate": 3.109036001604073e-06,
"loss": 4.7499,
"step": 84200
},
{
"epoch": 0.69,
"grad_norm": 3.3878042697906494,
"learning_rate": 3.100851958850633e-06,
"loss": 4.7518,
"step": 84300
},
{
"epoch": 0.69,
"grad_norm": 1.5656503438949585,
"learning_rate": 3.0926679160971938e-06,
"loss": 4.7588,
"step": 84400
},
{
"epoch": 0.69,
"grad_norm": 1.4081205129623413,
"learning_rate": 3.084483873343755e-06,
"loss": 4.7587,
"step": 84500
},
{
"epoch": 0.69,
"grad_norm": 2.011707305908203,
"learning_rate": 3.076299830590315e-06,
"loss": 4.7525,
"step": 84600
},
{
"epoch": 0.69,
"grad_norm": 1.1103359460830688,
"learning_rate": 3.0681157878368763e-06,
"loss": 4.7547,
"step": 84700
},
{
"epoch": 0.69,
"grad_norm": 1.331764578819275,
"learning_rate": 3.0599317450834365e-06,
"loss": 4.7544,
"step": 84800
},
{
"epoch": 0.69,
"grad_norm": 1.1731749773025513,
"learning_rate": 3.0517477023299972e-06,
"loss": 4.7527,
"step": 84900
},
{
"epoch": 0.7,
"grad_norm": 2.4476029872894287,
"learning_rate": 3.043563659576558e-06,
"loss": 4.7529,
"step": 85000
},
{
"epoch": 0.7,
"grad_norm": 2.6501026153564453,
"learning_rate": 3.0353796168231186e-06,
"loss": 4.7495,
"step": 85100
},
{
"epoch": 0.7,
"grad_norm": 1.4330114126205444,
"learning_rate": 3.027195574069679e-06,
"loss": 4.7551,
"step": 85200
},
{
"epoch": 0.7,
"grad_norm": 1.794797420501709,
"learning_rate": 3.01901153131624e-06,
"loss": 4.7554,
"step": 85300
},
{
"epoch": 0.7,
"grad_norm": 1.1396609544754028,
"learning_rate": 3.0108274885628003e-06,
"loss": 4.7521,
"step": 85400
},
{
"epoch": 0.7,
"grad_norm": 1.5291541814804077,
"learning_rate": 3.0026434458093614e-06,
"loss": 4.7538,
"step": 85500
},
{
"epoch": 0.7,
"grad_norm": 0.9390245676040649,
"learning_rate": 2.9944594030559216e-06,
"loss": 4.7499,
"step": 85600
},
{
"epoch": 0.7,
"grad_norm": 4.141879558563232,
"learning_rate": 2.9862753603024823e-06,
"loss": 4.7587,
"step": 85700
},
{
"epoch": 0.7,
"grad_norm": 2.151954412460327,
"learning_rate": 2.9780913175490434e-06,
"loss": 4.7525,
"step": 85800
},
{
"epoch": 0.7,
"grad_norm": 1.340173363685608,
"learning_rate": 2.9699072747956037e-06,
"loss": 4.7519,
"step": 85900
},
{
"epoch": 0.7,
"grad_norm": 1.9204574823379517,
"learning_rate": 2.9617232320421644e-06,
"loss": 4.7583,
"step": 86000
},
{
"epoch": 0.7,
"grad_norm": 1.391839623451233,
"learning_rate": 2.953539189288725e-06,
"loss": 4.751,
"step": 86100
},
{
"epoch": 0.71,
"grad_norm": 1.4064441919326782,
"learning_rate": 2.9453551465352858e-06,
"loss": 4.7506,
"step": 86200
},
{
"epoch": 0.71,
"grad_norm": 1.2319107055664062,
"learning_rate": 2.937171103781846e-06,
"loss": 4.7551,
"step": 86300
},
{
"epoch": 0.71,
"grad_norm": 2.515320301055908,
"learning_rate": 2.928987061028407e-06,
"loss": 4.7517,
"step": 86400
},
{
"epoch": 0.71,
"grad_norm": 2.4007177352905273,
"learning_rate": 2.9208030182749674e-06,
"loss": 4.7513,
"step": 86500
},
{
"epoch": 0.71,
"grad_norm": 1.4867286682128906,
"learning_rate": 2.9126189755215285e-06,
"loss": 4.7549,
"step": 86600
},
{
"epoch": 0.71,
"grad_norm": 1.2570160627365112,
"learning_rate": 2.904434932768089e-06,
"loss": 4.753,
"step": 86700
},
{
"epoch": 0.71,
"grad_norm": 2.847069025039673,
"learning_rate": 2.8962508900146495e-06,
"loss": 4.7555,
"step": 86800
},
{
"epoch": 0.71,
"grad_norm": 1.0997235774993896,
"learning_rate": 2.8880668472612106e-06,
"loss": 4.7532,
"step": 86900
},
{
"epoch": 0.71,
"grad_norm": 1.8394368886947632,
"learning_rate": 2.879882804507771e-06,
"loss": 4.7504,
"step": 87000
},
{
"epoch": 0.71,
"grad_norm": 3.6865549087524414,
"learning_rate": 2.871698761754332e-06,
"loss": 4.7567,
"step": 87100
},
{
"epoch": 0.71,
"grad_norm": 3.022850275039673,
"learning_rate": 2.8635147190008923e-06,
"loss": 4.7509,
"step": 87200
},
{
"epoch": 0.71,
"grad_norm": 1.7531808614730835,
"learning_rate": 2.855330676247453e-06,
"loss": 4.7527,
"step": 87300
},
{
"epoch": 0.72,
"grad_norm": 2.0469372272491455,
"learning_rate": 2.8471466334940136e-06,
"loss": 4.7564,
"step": 87400
},
{
"epoch": 0.72,
"grad_norm": 1.4322601556777954,
"learning_rate": 2.8389625907405743e-06,
"loss": 4.7552,
"step": 87500
},
{
"epoch": 0.72,
"grad_norm": 1.2034333944320679,
"learning_rate": 2.8307785479871346e-06,
"loss": 4.7555,
"step": 87600
},
{
"epoch": 0.72,
"grad_norm": 1.0759299993515015,
"learning_rate": 2.8225945052336957e-06,
"loss": 4.7508,
"step": 87700
},
{
"epoch": 0.72,
"grad_norm": 1.1701573133468628,
"learning_rate": 2.814410462480256e-06,
"loss": 4.7535,
"step": 87800
},
{
"epoch": 0.72,
"grad_norm": 1.4818124771118164,
"learning_rate": 2.8062264197268167e-06,
"loss": 4.7528,
"step": 87900
},
{
"epoch": 0.72,
"grad_norm": 1.362298846244812,
"learning_rate": 2.7980423769733778e-06,
"loss": 4.7488,
"step": 88000
},
{
"epoch": 0.72,
"grad_norm": 0.9951609969139099,
"learning_rate": 2.789858334219938e-06,
"loss": 4.7509,
"step": 88100
},
{
"epoch": 0.72,
"grad_norm": 1.2555766105651855,
"learning_rate": 2.781674291466499e-06,
"loss": 4.7559,
"step": 88200
},
{
"epoch": 0.72,
"grad_norm": 1.8623309135437012,
"learning_rate": 2.7734902487130594e-06,
"loss": 4.7489,
"step": 88300
},
{
"epoch": 0.72,
"grad_norm": 1.2883721590042114,
"learning_rate": 2.76530620595962e-06,
"loss": 4.751,
"step": 88400
},
{
"epoch": 0.72,
"grad_norm": 1.1867636442184448,
"learning_rate": 2.757122163206181e-06,
"loss": 4.7524,
"step": 88500
},
{
"epoch": 0.73,
"grad_norm": 1.4036273956298828,
"learning_rate": 2.7489381204527415e-06,
"loss": 4.755,
"step": 88600
},
{
"epoch": 0.73,
"grad_norm": 1.2148162126541138,
"learning_rate": 2.7407540776993018e-06,
"loss": 4.7582,
"step": 88700
},
{
"epoch": 0.73,
"grad_norm": 2.2214956283569336,
"learning_rate": 2.732570034945863e-06,
"loss": 4.7543,
"step": 88800
},
{
"epoch": 0.73,
"grad_norm": 1.103264331817627,
"learning_rate": 2.724385992192423e-06,
"loss": 4.7468,
"step": 88900
},
{
"epoch": 0.73,
"grad_norm": 1.3318493366241455,
"learning_rate": 2.7162019494389843e-06,
"loss": 4.7547,
"step": 89000
},
{
"epoch": 0.73,
"grad_norm": 1.7869521379470825,
"learning_rate": 2.708017906685545e-06,
"loss": 4.7528,
"step": 89100
},
{
"epoch": 0.73,
"grad_norm": 1.0730737447738647,
"learning_rate": 2.6998338639321052e-06,
"loss": 4.7554,
"step": 89200
},
{
"epoch": 0.73,
"grad_norm": 1.677322268486023,
"learning_rate": 2.6916498211786663e-06,
"loss": 4.7574,
"step": 89300
},
{
"epoch": 0.73,
"grad_norm": 1.7166889905929565,
"learning_rate": 2.6834657784252266e-06,
"loss": 4.7563,
"step": 89400
},
{
"epoch": 0.73,
"grad_norm": 1.3023245334625244,
"learning_rate": 2.6752817356717877e-06,
"loss": 4.7569,
"step": 89500
},
{
"epoch": 0.73,
"grad_norm": 1.2815351486206055,
"learning_rate": 2.667097692918348e-06,
"loss": 4.7568,
"step": 89600
},
{
"epoch": 0.73,
"grad_norm": 1.1161749362945557,
"learning_rate": 2.6589136501649087e-06,
"loss": 4.7536,
"step": 89700
},
{
"epoch": 0.73,
"grad_norm": 1.4548900127410889,
"learning_rate": 2.650729607411469e-06,
"loss": 4.7566,
"step": 89800
},
{
"epoch": 0.74,
"grad_norm": 8.324539184570312,
"learning_rate": 2.64254556465803e-06,
"loss": 4.7539,
"step": 89900
},
{
"epoch": 0.74,
"grad_norm": 2.0228288173675537,
"learning_rate": 2.6343615219045903e-06,
"loss": 4.7514,
"step": 90000
},
{
"epoch": 0.74,
"grad_norm": 1.1695142984390259,
"learning_rate": 2.6261774791511514e-06,
"loss": 4.7519,
"step": 90100
},
{
"epoch": 0.74,
"grad_norm": 1.6865144968032837,
"learning_rate": 2.617993436397712e-06,
"loss": 4.7557,
"step": 90200
},
{
"epoch": 0.74,
"grad_norm": 0.9601481556892395,
"learning_rate": 2.6098093936442724e-06,
"loss": 4.7518,
"step": 90300
},
{
"epoch": 0.74,
"grad_norm": 1.0379222631454468,
"learning_rate": 2.6016253508908335e-06,
"loss": 4.7521,
"step": 90400
},
{
"epoch": 0.74,
"grad_norm": 1.6704763174057007,
"learning_rate": 2.5934413081373938e-06,
"loss": 4.7526,
"step": 90500
},
{
"epoch": 0.74,
"grad_norm": 1.0544642210006714,
"learning_rate": 2.585257265383955e-06,
"loss": 4.7529,
"step": 90600
},
{
"epoch": 0.74,
"grad_norm": 1.2152049541473389,
"learning_rate": 2.577073222630515e-06,
"loss": 4.7557,
"step": 90700
},
{
"epoch": 0.74,
"grad_norm": 1.1299751996994019,
"learning_rate": 2.568889179877076e-06,
"loss": 4.7552,
"step": 90800
},
{
"epoch": 0.74,
"grad_norm": 1.3130440711975098,
"learning_rate": 2.5607051371236365e-06,
"loss": 4.7512,
"step": 90900
},
{
"epoch": 0.74,
"grad_norm": 1.1738765239715576,
"learning_rate": 2.5525210943701972e-06,
"loss": 4.7478,
"step": 91000
},
{
"epoch": 0.75,
"grad_norm": 1.3825798034667969,
"learning_rate": 2.5443370516167575e-06,
"loss": 4.7545,
"step": 91100
},
{
"epoch": 0.75,
"grad_norm": 1.2850853204727173,
"learning_rate": 2.5361530088633186e-06,
"loss": 4.7546,
"step": 91200
},
{
"epoch": 0.75,
"grad_norm": 1.215085506439209,
"learning_rate": 2.5279689661098793e-06,
"loss": 4.7488,
"step": 91300
},
{
"epoch": 0.75,
"grad_norm": 1.4124336242675781,
"learning_rate": 2.5197849233564396e-06,
"loss": 4.7441,
"step": 91400
},
{
"epoch": 0.75,
"grad_norm": 2.5708861351013184,
"learning_rate": 2.5116008806030007e-06,
"loss": 4.7553,
"step": 91500
},
{
"epoch": 0.75,
"grad_norm": 1.9249249696731567,
"learning_rate": 2.503416837849561e-06,
"loss": 4.7565,
"step": 91600
},
{
"epoch": 0.75,
"grad_norm": 1.1398611068725586,
"learning_rate": 2.4952327950961216e-06,
"loss": 4.7513,
"step": 91700
},
{
"epoch": 0.75,
"grad_norm": 2.037564516067505,
"learning_rate": 2.4870487523426827e-06,
"loss": 4.7517,
"step": 91800
},
{
"epoch": 0.75,
"grad_norm": 1.4297902584075928,
"learning_rate": 2.478864709589243e-06,
"loss": 4.7494,
"step": 91900
},
{
"epoch": 0.75,
"grad_norm": 1.369734764099121,
"learning_rate": 2.4706806668358037e-06,
"loss": 4.7511,
"step": 92000
},
{
"epoch": 0.75,
"grad_norm": 1.1665796041488647,
"learning_rate": 2.4624966240823644e-06,
"loss": 4.7576,
"step": 92100
},
{
"epoch": 0.75,
"grad_norm": 1.085404396057129,
"learning_rate": 2.454312581328925e-06,
"loss": 4.7535,
"step": 92200
},
{
"epoch": 0.76,
"grad_norm": 5.764316082000732,
"learning_rate": 2.4461285385754858e-06,
"loss": 4.7497,
"step": 92300
},
{
"epoch": 0.76,
"grad_norm": 1.3492110967636108,
"learning_rate": 2.4379444958220465e-06,
"loss": 4.7558,
"step": 92400
},
{
"epoch": 0.76,
"grad_norm": 1.0760524272918701,
"learning_rate": 2.429760453068607e-06,
"loss": 4.7525,
"step": 92500
},
{
"epoch": 0.76,
"grad_norm": 1.2596811056137085,
"learning_rate": 2.4215764103151674e-06,
"loss": 4.7532,
"step": 92600
},
{
"epoch": 0.76,
"grad_norm": 1.0836505889892578,
"learning_rate": 2.413392367561728e-06,
"loss": 4.7506,
"step": 92700
},
{
"epoch": 0.76,
"grad_norm": 2.759760618209839,
"learning_rate": 2.405208324808289e-06,
"loss": 4.7493,
"step": 92800
},
{
"epoch": 0.76,
"grad_norm": 1.3454488515853882,
"learning_rate": 2.39702428205485e-06,
"loss": 4.7539,
"step": 92900
},
{
"epoch": 0.76,
"grad_norm": 1.2812906503677368,
"learning_rate": 2.3888402393014106e-06,
"loss": 4.7509,
"step": 93000
},
{
"epoch": 0.76,
"grad_norm": 1.247383952140808,
"learning_rate": 2.380656196547971e-06,
"loss": 4.7493,
"step": 93100
},
{
"epoch": 0.76,
"grad_norm": 1.803625226020813,
"learning_rate": 2.3724721537945316e-06,
"loss": 4.7527,
"step": 93200
},
{
"epoch": 0.76,
"grad_norm": 2.4045066833496094,
"learning_rate": 2.3642881110410922e-06,
"loss": 4.7578,
"step": 93300
},
{
"epoch": 0.76,
"grad_norm": 2.0578811168670654,
"learning_rate": 2.356104068287653e-06,
"loss": 4.7539,
"step": 93400
},
{
"epoch": 0.77,
"grad_norm": 2.907444477081299,
"learning_rate": 2.3479200255342136e-06,
"loss": 4.7534,
"step": 93500
},
{
"epoch": 0.77,
"grad_norm": 1.3155359029769897,
"learning_rate": 2.3397359827807743e-06,
"loss": 4.7543,
"step": 93600
},
{
"epoch": 0.77,
"grad_norm": 1.2676302194595337,
"learning_rate": 2.331551940027335e-06,
"loss": 4.7501,
"step": 93700
},
{
"epoch": 0.77,
"grad_norm": 1.1166573762893677,
"learning_rate": 2.3233678972738953e-06,
"loss": 4.7518,
"step": 93800
},
{
"epoch": 0.77,
"grad_norm": 1.3180181980133057,
"learning_rate": 2.315183854520456e-06,
"loss": 4.7548,
"step": 93900
},
{
"epoch": 0.77,
"grad_norm": 2.867478132247925,
"learning_rate": 2.306999811767017e-06,
"loss": 4.7551,
"step": 94000
},
{
"epoch": 0.77,
"grad_norm": 1.3548469543457031,
"learning_rate": 2.2988157690135778e-06,
"loss": 4.7524,
"step": 94100
},
{
"epoch": 0.77,
"grad_norm": 3.94809627532959,
"learning_rate": 2.290631726260138e-06,
"loss": 4.7516,
"step": 94200
},
{
"epoch": 0.77,
"grad_norm": 1.0845712423324585,
"learning_rate": 2.2824476835066987e-06,
"loss": 4.7549,
"step": 94300
},
{
"epoch": 0.77,
"grad_norm": 0.9430265426635742,
"learning_rate": 2.2742636407532594e-06,
"loss": 4.7552,
"step": 94400
},
{
"epoch": 0.77,
"grad_norm": 1.1491626501083374,
"learning_rate": 2.26607959799982e-06,
"loss": 4.7533,
"step": 94500
},
{
"epoch": 0.77,
"grad_norm": 1.323564887046814,
"learning_rate": 2.257895555246381e-06,
"loss": 4.7502,
"step": 94600
},
{
"epoch": 0.78,
"grad_norm": 1.2415287494659424,
"learning_rate": 2.2497115124929415e-06,
"loss": 4.754,
"step": 94700
},
{
"epoch": 0.78,
"grad_norm": 1.1996134519577026,
"learning_rate": 2.241527469739502e-06,
"loss": 4.7475,
"step": 94800
},
{
"epoch": 0.78,
"grad_norm": 1.3265007734298706,
"learning_rate": 2.2333434269860624e-06,
"loss": 4.7504,
"step": 94900
},
{
"epoch": 0.78,
"grad_norm": 2.0656216144561768,
"learning_rate": 2.225159384232623e-06,
"loss": 4.7536,
"step": 95000
},
{
"epoch": 0.78,
"grad_norm": 1.7077275514602661,
"learning_rate": 2.2169753414791843e-06,
"loss": 4.7559,
"step": 95100
},
{
"epoch": 0.78,
"grad_norm": 0.9614852070808411,
"learning_rate": 2.208791298725745e-06,
"loss": 4.753,
"step": 95200
},
{
"epoch": 0.78,
"grad_norm": 1.010793685913086,
"learning_rate": 2.2006072559723056e-06,
"loss": 4.7531,
"step": 95300
},
{
"epoch": 0.78,
"grad_norm": 1.7269645929336548,
"learning_rate": 2.192423213218866e-06,
"loss": 4.7525,
"step": 95400
},
{
"epoch": 0.78,
"grad_norm": 1.1839239597320557,
"learning_rate": 2.1842391704654266e-06,
"loss": 4.7516,
"step": 95500
},
{
"epoch": 0.78,
"grad_norm": 1.0646226406097412,
"learning_rate": 2.1760551277119873e-06,
"loss": 4.7489,
"step": 95600
},
{
"epoch": 0.78,
"grad_norm": 1.2255668640136719,
"learning_rate": 2.167871084958548e-06,
"loss": 4.7525,
"step": 95700
},
{
"epoch": 0.78,
"grad_norm": 1.5146337747573853,
"learning_rate": 2.1596870422051087e-06,
"loss": 4.7524,
"step": 95800
},
{
"epoch": 0.78,
"grad_norm": 2.578728437423706,
"learning_rate": 2.1515029994516693e-06,
"loss": 4.7537,
"step": 95900
},
{
"epoch": 0.79,
"grad_norm": 1.3910084962844849,
"learning_rate": 2.14331895669823e-06,
"loss": 4.7557,
"step": 96000
},
{
"epoch": 0.79,
"grad_norm": 1.6304432153701782,
"learning_rate": 2.1351349139447903e-06,
"loss": 4.7509,
"step": 96100
},
{
"epoch": 0.79,
"grad_norm": 1.6290279626846313,
"learning_rate": 2.1269508711913514e-06,
"loss": 4.7499,
"step": 96200
},
{
"epoch": 0.79,
"grad_norm": 1.312935471534729,
"learning_rate": 2.118766828437912e-06,
"loss": 4.7512,
"step": 96300
},
{
"epoch": 0.79,
"grad_norm": 2.8677687644958496,
"learning_rate": 2.110582785684473e-06,
"loss": 4.7507,
"step": 96400
},
{
"epoch": 0.79,
"grad_norm": 2.544320583343506,
"learning_rate": 2.1023987429310335e-06,
"loss": 4.7503,
"step": 96500
},
{
"epoch": 0.79,
"grad_norm": 2.5052340030670166,
"learning_rate": 2.0942147001775938e-06,
"loss": 4.7543,
"step": 96600
},
{
"epoch": 0.79,
"grad_norm": 2.0886638164520264,
"learning_rate": 2.0860306574241544e-06,
"loss": 4.7513,
"step": 96700
},
{
"epoch": 0.79,
"grad_norm": 1.1290991306304932,
"learning_rate": 2.077846614670715e-06,
"loss": 4.7486,
"step": 96800
},
{
"epoch": 0.79,
"grad_norm": 0.968976616859436,
"learning_rate": 2.069662571917276e-06,
"loss": 4.7549,
"step": 96900
},
{
"epoch": 0.79,
"grad_norm": 1.1029621362686157,
"learning_rate": 2.0614785291638365e-06,
"loss": 4.7545,
"step": 97000
},
{
"epoch": 0.79,
"grad_norm": 1.5654712915420532,
"learning_rate": 2.053294486410397e-06,
"loss": 4.7516,
"step": 97100
},
{
"epoch": 0.8,
"grad_norm": 1.3423889875411987,
"learning_rate": 2.045110443656958e-06,
"loss": 4.7529,
"step": 97200
},
{
"epoch": 0.8,
"grad_norm": 1.2194217443466187,
"learning_rate": 2.0369264009035186e-06,
"loss": 4.7503,
"step": 97300
},
{
"epoch": 0.8,
"grad_norm": 1.0679503679275513,
"learning_rate": 2.0287423581500793e-06,
"loss": 4.7515,
"step": 97400
},
{
"epoch": 0.8,
"grad_norm": 1.2756659984588623,
"learning_rate": 2.02055831539664e-06,
"loss": 4.752,
"step": 97500
},
{
"epoch": 0.8,
"grad_norm": 1.315800428390503,
"learning_rate": 2.0123742726432007e-06,
"loss": 4.7554,
"step": 97600
},
{
"epoch": 0.8,
"grad_norm": 1.2954620122909546,
"learning_rate": 2.0041902298897614e-06,
"loss": 4.7525,
"step": 97700
},
{
"epoch": 0.8,
"grad_norm": 1.1520215272903442,
"learning_rate": 1.9960061871363216e-06,
"loss": 4.7548,
"step": 97800
},
{
"epoch": 0.8,
"grad_norm": 1.7471413612365723,
"learning_rate": 1.9878221443828823e-06,
"loss": 4.7549,
"step": 97900
},
{
"epoch": 0.8,
"grad_norm": 1.0936230421066284,
"learning_rate": 1.979638101629443e-06,
"loss": 4.753,
"step": 98000
},
{
"epoch": 0.8,
"grad_norm": 1.110677719116211,
"learning_rate": 1.9714540588760037e-06,
"loss": 4.7524,
"step": 98100
},
{
"epoch": 0.8,
"grad_norm": 1.094068169593811,
"learning_rate": 1.9632700161225644e-06,
"loss": 4.7508,
"step": 98200
},
{
"epoch": 0.8,
"grad_norm": 1.3435810804367065,
"learning_rate": 1.955085973369125e-06,
"loss": 4.7504,
"step": 98300
},
{
"epoch": 0.81,
"grad_norm": 1.7671642303466797,
"learning_rate": 1.9469019306156858e-06,
"loss": 4.7504,
"step": 98400
},
{
"epoch": 0.81,
"grad_norm": 2.0996792316436768,
"learning_rate": 1.9387178878622464e-06,
"loss": 4.7468,
"step": 98500
},
{
"epoch": 0.81,
"grad_norm": 1.256888508796692,
"learning_rate": 1.930533845108807e-06,
"loss": 4.7501,
"step": 98600
},
{
"epoch": 0.81,
"grad_norm": 1.4650033712387085,
"learning_rate": 1.922349802355368e-06,
"loss": 4.7493,
"step": 98700
},
{
"epoch": 0.81,
"grad_norm": 1.3852843046188354,
"learning_rate": 1.9141657596019285e-06,
"loss": 4.7553,
"step": 98800
},
{
"epoch": 0.81,
"grad_norm": 1.2050074338912964,
"learning_rate": 1.905981716848489e-06,
"loss": 4.7481,
"step": 98900
},
{
"epoch": 0.81,
"grad_norm": 2.955382823944092,
"learning_rate": 1.8977976740950497e-06,
"loss": 4.7498,
"step": 99000
},
{
"epoch": 0.81,
"grad_norm": 1.6441978216171265,
"learning_rate": 1.8896136313416102e-06,
"loss": 4.7498,
"step": 99100
},
{
"epoch": 0.81,
"grad_norm": 1.577948808670044,
"learning_rate": 1.8814295885881709e-06,
"loss": 4.7552,
"step": 99200
},
{
"epoch": 0.81,
"grad_norm": 1.3677524328231812,
"learning_rate": 1.8732455458347315e-06,
"loss": 4.7555,
"step": 99300
},
{
"epoch": 0.81,
"grad_norm": 1.4369767904281616,
"learning_rate": 1.8650615030812922e-06,
"loss": 4.7499,
"step": 99400
},
{
"epoch": 0.81,
"grad_norm": 1.5186824798583984,
"learning_rate": 1.8568774603278531e-06,
"loss": 4.7539,
"step": 99500
},
{
"epoch": 0.82,
"grad_norm": 1.2422914505004883,
"learning_rate": 1.8486934175744136e-06,
"loss": 4.7557,
"step": 99600
},
{
"epoch": 0.82,
"grad_norm": 1.3044426441192627,
"learning_rate": 1.8405093748209743e-06,
"loss": 4.7525,
"step": 99700
},
{
"epoch": 0.82,
"grad_norm": 3.3080742359161377,
"learning_rate": 1.832325332067535e-06,
"loss": 4.7509,
"step": 99800
},
{
"epoch": 0.82,
"grad_norm": 1.1785410642623901,
"learning_rate": 1.8241412893140955e-06,
"loss": 4.7519,
"step": 99900
},
{
"epoch": 0.82,
"grad_norm": 1.4587723016738892,
"learning_rate": 1.8159572465606562e-06,
"loss": 4.7484,
"step": 100000
},
{
"epoch": 0.82,
"grad_norm": 3.0926313400268555,
"learning_rate": 1.8077732038072169e-06,
"loss": 4.7562,
"step": 100100
},
{
"epoch": 0.82,
"grad_norm": 1.2200719118118286,
"learning_rate": 1.7995891610537775e-06,
"loss": 4.7584,
"step": 100200
},
{
"epoch": 0.82,
"grad_norm": 1.3386414051055908,
"learning_rate": 1.791405118300338e-06,
"loss": 4.7458,
"step": 100300
},
{
"epoch": 0.82,
"grad_norm": 1.484997034072876,
"learning_rate": 1.7832210755468987e-06,
"loss": 4.7449,
"step": 100400
},
{
"epoch": 0.82,
"grad_norm": 2.519181489944458,
"learning_rate": 1.7750370327934594e-06,
"loss": 4.7529,
"step": 100500
},
{
"epoch": 0.82,
"grad_norm": 2.090131998062134,
"learning_rate": 1.7668529900400199e-06,
"loss": 4.756,
"step": 100600
},
{
"epoch": 0.82,
"grad_norm": 1.105173110961914,
"learning_rate": 1.7586689472865808e-06,
"loss": 4.754,
"step": 100700
},
{
"epoch": 0.82,
"grad_norm": 1.1731809377670288,
"learning_rate": 1.7504849045331415e-06,
"loss": 4.7607,
"step": 100800
},
{
"epoch": 0.83,
"grad_norm": 1.1397889852523804,
"learning_rate": 1.7423008617797022e-06,
"loss": 4.7506,
"step": 100900
},
{
"epoch": 0.83,
"grad_norm": 1.1067900657653809,
"learning_rate": 1.7341168190262629e-06,
"loss": 4.7521,
"step": 101000
},
{
"epoch": 0.83,
"grad_norm": 1.0581586360931396,
"learning_rate": 1.7259327762728233e-06,
"loss": 4.7531,
"step": 101100
},
{
"epoch": 0.83,
"grad_norm": 1.9792087078094482,
"learning_rate": 1.717748733519384e-06,
"loss": 4.7541,
"step": 101200
},
{
"epoch": 0.83,
"grad_norm": 1.133318305015564,
"learning_rate": 1.7095646907659447e-06,
"loss": 4.7554,
"step": 101300
},
{
"epoch": 0.83,
"grad_norm": 1.241073489189148,
"learning_rate": 1.7013806480125052e-06,
"loss": 4.7493,
"step": 101400
},
{
"epoch": 0.83,
"grad_norm": 1.2004437446594238,
"learning_rate": 1.6931966052590659e-06,
"loss": 4.7515,
"step": 101500
},
{
"epoch": 0.83,
"grad_norm": 1.545440912246704,
"learning_rate": 1.6850125625056266e-06,
"loss": 4.7498,
"step": 101600
},
{
"epoch": 0.83,
"grad_norm": 1.2501575946807861,
"learning_rate": 1.6768285197521873e-06,
"loss": 4.7457,
"step": 101700
},
{
"epoch": 0.83,
"grad_norm": 1.2254008054733276,
"learning_rate": 1.6686444769987482e-06,
"loss": 4.754,
"step": 101800
},
{
"epoch": 0.83,
"grad_norm": 1.8597551584243774,
"learning_rate": 1.6604604342453086e-06,
"loss": 4.7545,
"step": 101900
},
{
"epoch": 0.83,
"grad_norm": 1.5887017250061035,
"learning_rate": 1.6522763914918693e-06,
"loss": 4.7491,
"step": 102000
},
{
"epoch": 0.84,
"grad_norm": 1.3773962259292603,
"learning_rate": 1.64409234873843e-06,
"loss": 4.753,
"step": 102100
},
{
"epoch": 0.84,
"grad_norm": 1.1974895000457764,
"learning_rate": 1.6359083059849907e-06,
"loss": 4.7563,
"step": 102200
},
{
"epoch": 0.84,
"grad_norm": 1.3141651153564453,
"learning_rate": 1.6277242632315512e-06,
"loss": 4.7483,
"step": 102300
},
{
"epoch": 0.84,
"grad_norm": 2.256546974182129,
"learning_rate": 1.6195402204781119e-06,
"loss": 4.747,
"step": 102400
},
{
"epoch": 0.84,
"grad_norm": 2.344313859939575,
"learning_rate": 1.6113561777246726e-06,
"loss": 4.7458,
"step": 102500
},
{
"epoch": 0.84,
"grad_norm": 1.533346176147461,
"learning_rate": 1.603172134971233e-06,
"loss": 4.7488,
"step": 102600
},
{
"epoch": 0.84,
"grad_norm": 1.1802254915237427,
"learning_rate": 1.5949880922177937e-06,
"loss": 4.7503,
"step": 102700
},
{
"epoch": 0.84,
"grad_norm": 1.1822803020477295,
"learning_rate": 1.5868040494643544e-06,
"loss": 4.7526,
"step": 102800
},
{
"epoch": 0.84,
"grad_norm": 1.3468492031097412,
"learning_rate": 1.5786200067109153e-06,
"loss": 4.7511,
"step": 102900
},
{
"epoch": 0.84,
"grad_norm": 4.000704765319824,
"learning_rate": 1.570435963957476e-06,
"loss": 4.754,
"step": 103000
},
{
"epoch": 0.84,
"grad_norm": 1.139367699623108,
"learning_rate": 1.5622519212040365e-06,
"loss": 4.7488,
"step": 103100
},
{
"epoch": 0.84,
"grad_norm": 4.506742000579834,
"learning_rate": 1.5540678784505972e-06,
"loss": 4.7518,
"step": 103200
},
{
"epoch": 0.85,
"grad_norm": 1.9105794429779053,
"learning_rate": 1.5458838356971579e-06,
"loss": 4.751,
"step": 103300
},
{
"epoch": 0.85,
"grad_norm": 1.203366994857788,
"learning_rate": 1.5376997929437184e-06,
"loss": 4.7505,
"step": 103400
},
{
"epoch": 0.85,
"grad_norm": 1.2069025039672852,
"learning_rate": 1.529515750190279e-06,
"loss": 4.7502,
"step": 103500
},
{
"epoch": 0.85,
"grad_norm": 1.0046311616897583,
"learning_rate": 1.5213317074368397e-06,
"loss": 4.7522,
"step": 103600
},
{
"epoch": 0.85,
"grad_norm": 2.022199869155884,
"learning_rate": 1.5131476646834004e-06,
"loss": 4.7543,
"step": 103700
},
{
"epoch": 0.85,
"grad_norm": 1.0690085887908936,
"learning_rate": 1.504963621929961e-06,
"loss": 4.7497,
"step": 103800
},
{
"epoch": 0.85,
"grad_norm": 1.2978872060775757,
"learning_rate": 1.4967795791765216e-06,
"loss": 4.7511,
"step": 103900
},
{
"epoch": 0.85,
"grad_norm": 1.110472321510315,
"learning_rate": 1.4885955364230825e-06,
"loss": 4.7504,
"step": 104000
},
{
"epoch": 0.85,
"grad_norm": 2.129612684249878,
"learning_rate": 1.4804114936696432e-06,
"loss": 4.7538,
"step": 104100
},
{
"epoch": 0.85,
"grad_norm": 1.1347908973693848,
"learning_rate": 1.4722274509162037e-06,
"loss": 4.7535,
"step": 104200
},
{
"epoch": 0.85,
"grad_norm": 1.1420745849609375,
"learning_rate": 1.4640434081627644e-06,
"loss": 4.7491,
"step": 104300
},
{
"epoch": 0.85,
"grad_norm": 2.80501127243042,
"learning_rate": 1.455859365409325e-06,
"loss": 4.7507,
"step": 104400
},
{
"epoch": 0.86,
"grad_norm": 1.399776816368103,
"learning_rate": 1.4476753226558857e-06,
"loss": 4.7523,
"step": 104500
},
{
"epoch": 0.86,
"grad_norm": 1.2114007472991943,
"learning_rate": 1.4394912799024462e-06,
"loss": 4.7522,
"step": 104600
},
{
"epoch": 0.86,
"grad_norm": 1.078600525856018,
"learning_rate": 1.431307237149007e-06,
"loss": 4.7535,
"step": 104700
},
{
"epoch": 0.86,
"grad_norm": 1.3322091102600098,
"learning_rate": 1.4231231943955676e-06,
"loss": 4.7511,
"step": 104800
},
{
"epoch": 0.86,
"grad_norm": 1.2436057329177856,
"learning_rate": 1.4149391516421283e-06,
"loss": 4.7501,
"step": 104900
},
{
"epoch": 0.86,
"grad_norm": 1.163930058479309,
"learning_rate": 1.4067551088886888e-06,
"loss": 4.7533,
"step": 105000
},
{
"epoch": 0.86,
"grad_norm": 1.1139936447143555,
"learning_rate": 1.3985710661352497e-06,
"loss": 4.7474,
"step": 105100
},
{
"epoch": 0.86,
"grad_norm": 1.704499363899231,
"learning_rate": 1.3903870233818104e-06,
"loss": 4.7524,
"step": 105200
},
{
"epoch": 0.86,
"grad_norm": 1.2708555459976196,
"learning_rate": 1.382202980628371e-06,
"loss": 4.7558,
"step": 105300
},
{
"epoch": 0.86,
"grad_norm": 2.6546807289123535,
"learning_rate": 1.3740189378749315e-06,
"loss": 4.7514,
"step": 105400
},
{
"epoch": 0.86,
"grad_norm": 1.196606159210205,
"learning_rate": 1.3658348951214922e-06,
"loss": 4.7479,
"step": 105500
},
{
"epoch": 0.86,
"grad_norm": 2.2983286380767822,
"learning_rate": 1.357650852368053e-06,
"loss": 4.7532,
"step": 105600
},
{
"epoch": 0.87,
"grad_norm": 1.0857946872711182,
"learning_rate": 1.3494668096146136e-06,
"loss": 4.7531,
"step": 105700
},
{
"epoch": 0.87,
"grad_norm": 1.606785535812378,
"learning_rate": 1.341282766861174e-06,
"loss": 4.7506,
"step": 105800
},
{
"epoch": 0.87,
"grad_norm": 1.9557284116744995,
"learning_rate": 1.3330987241077348e-06,
"loss": 4.7553,
"step": 105900
},
{
"epoch": 0.87,
"grad_norm": 2.19726824760437,
"learning_rate": 1.3249146813542955e-06,
"loss": 4.7524,
"step": 106000
},
{
"epoch": 0.87,
"grad_norm": 1.0980935096740723,
"learning_rate": 1.316730638600856e-06,
"loss": 4.7515,
"step": 106100
},
{
"epoch": 0.87,
"grad_norm": 1.3451943397521973,
"learning_rate": 1.3085465958474168e-06,
"loss": 4.7547,
"step": 106200
},
{
"epoch": 0.87,
"grad_norm": 1.2886918783187866,
"learning_rate": 1.3003625530939775e-06,
"loss": 4.7505,
"step": 106300
},
{
"epoch": 0.87,
"grad_norm": 1.1479195356369019,
"learning_rate": 1.2921785103405382e-06,
"loss": 4.7538,
"step": 106400
},
{
"epoch": 0.87,
"grad_norm": 1.7975718975067139,
"learning_rate": 1.283994467587099e-06,
"loss": 4.7472,
"step": 106500
},
{
"epoch": 0.87,
"grad_norm": 1.536818027496338,
"learning_rate": 1.2758104248336594e-06,
"loss": 4.7498,
"step": 106600
},
{
"epoch": 0.87,
"grad_norm": 1.1835073232650757,
"learning_rate": 1.26762638208022e-06,
"loss": 4.751,
"step": 106700
},
{
"epoch": 0.87,
"grad_norm": 1.418748378753662,
"learning_rate": 1.2594423393267808e-06,
"loss": 4.7506,
"step": 106800
},
{
"epoch": 0.87,
"grad_norm": 1.7083241939544678,
"learning_rate": 1.2512582965733413e-06,
"loss": 4.757,
"step": 106900
},
{
"epoch": 0.88,
"grad_norm": 1.9533259868621826,
"learning_rate": 1.243074253819902e-06,
"loss": 4.7491,
"step": 107000
},
{
"epoch": 0.88,
"grad_norm": 1.549060344696045,
"learning_rate": 1.2348902110664629e-06,
"loss": 4.7502,
"step": 107100
},
{
"epoch": 0.88,
"grad_norm": 1.22414231300354,
"learning_rate": 1.2267061683130233e-06,
"loss": 4.7512,
"step": 107200
},
{
"epoch": 0.88,
"grad_norm": 2.5019216537475586,
"learning_rate": 1.218522125559584e-06,
"loss": 4.7551,
"step": 107300
},
{
"epoch": 0.88,
"grad_norm": 1.4612125158309937,
"learning_rate": 1.2103380828061447e-06,
"loss": 4.7517,
"step": 107400
},
{
"epoch": 0.88,
"grad_norm": 1.41428542137146,
"learning_rate": 1.2021540400527052e-06,
"loss": 4.7522,
"step": 107500
},
{
"epoch": 0.88,
"grad_norm": 1.1158181428909302,
"learning_rate": 1.193969997299266e-06,
"loss": 4.7534,
"step": 107600
},
{
"epoch": 0.88,
"grad_norm": 1.4456719160079956,
"learning_rate": 1.1857859545458268e-06,
"loss": 4.7515,
"step": 107700
},
{
"epoch": 0.88,
"grad_norm": 2.0484957695007324,
"learning_rate": 1.1776019117923873e-06,
"loss": 4.7475,
"step": 107800
},
{
"epoch": 0.88,
"grad_norm": 1.0839197635650635,
"learning_rate": 1.169417869038948e-06,
"loss": 4.7474,
"step": 107900
},
{
"epoch": 0.88,
"grad_norm": 1.4242494106292725,
"learning_rate": 1.1612338262855086e-06,
"loss": 4.7516,
"step": 108000
},
{
"epoch": 0.88,
"grad_norm": 1.1142181158065796,
"learning_rate": 1.1530497835320691e-06,
"loss": 4.7513,
"step": 108100
},
{
"epoch": 0.89,
"grad_norm": 1.2992855310440063,
"learning_rate": 1.14486574077863e-06,
"loss": 4.7497,
"step": 108200
},
{
"epoch": 0.89,
"grad_norm": 1.1050403118133545,
"learning_rate": 1.1366816980251905e-06,
"loss": 4.7478,
"step": 108300
},
{
"epoch": 0.89,
"grad_norm": 1.6111624240875244,
"learning_rate": 1.1284976552717512e-06,
"loss": 4.7521,
"step": 108400
},
{
"epoch": 0.89,
"grad_norm": 1.6379482746124268,
"learning_rate": 1.1203136125183119e-06,
"loss": 4.7456,
"step": 108500
},
{
"epoch": 0.89,
"grad_norm": 1.8351396322250366,
"learning_rate": 1.1121295697648726e-06,
"loss": 4.7518,
"step": 108600
},
{
"epoch": 0.89,
"grad_norm": 1.1721076965332031,
"learning_rate": 1.1039455270114333e-06,
"loss": 4.7557,
"step": 108700
},
{
"epoch": 0.89,
"grad_norm": 1.4993492364883423,
"learning_rate": 1.095761484257994e-06,
"loss": 4.7519,
"step": 108800
},
{
"epoch": 0.89,
"grad_norm": 1.1917214393615723,
"learning_rate": 1.0875774415045544e-06,
"loss": 4.748,
"step": 108900
},
{
"epoch": 0.89,
"grad_norm": 1.0404828786849976,
"learning_rate": 1.0793933987511151e-06,
"loss": 4.7565,
"step": 109000
},
{
"epoch": 0.89,
"grad_norm": 1.5994240045547485,
"learning_rate": 1.0712093559976758e-06,
"loss": 4.7499,
"step": 109100
},
{
"epoch": 0.89,
"grad_norm": 1.197583556175232,
"learning_rate": 1.0630253132442365e-06,
"loss": 4.7537,
"step": 109200
},
{
"epoch": 0.89,
"grad_norm": 1.6032483577728271,
"learning_rate": 1.0548412704907972e-06,
"loss": 4.7542,
"step": 109300
},
{
"epoch": 0.9,
"grad_norm": 1.39584481716156,
"learning_rate": 1.0466572277373579e-06,
"loss": 4.7493,
"step": 109400
},
{
"epoch": 0.9,
"grad_norm": 1.410801649093628,
"learning_rate": 1.0384731849839184e-06,
"loss": 4.7559,
"step": 109500
},
{
"epoch": 0.9,
"grad_norm": 1.246910810470581,
"learning_rate": 1.030289142230479e-06,
"loss": 4.751,
"step": 109600
},
{
"epoch": 0.9,
"grad_norm": 4.328908920288086,
"learning_rate": 1.0221050994770397e-06,
"loss": 4.7496,
"step": 109700
},
{
"epoch": 0.9,
"grad_norm": 1.5280972719192505,
"learning_rate": 1.0139210567236004e-06,
"loss": 4.7539,
"step": 109800
},
{
"epoch": 0.9,
"grad_norm": 2.1216630935668945,
"learning_rate": 1.0057370139701611e-06,
"loss": 4.7523,
"step": 109900
},
{
"epoch": 0.9,
"grad_norm": 1.4128057956695557,
"learning_rate": 9.975529712167218e-07,
"loss": 4.7492,
"step": 110000
},
{
"epoch": 0.9,
"grad_norm": 1.2564375400543213,
"learning_rate": 9.893689284632823e-07,
"loss": 4.754,
"step": 110100
},
{
"epoch": 0.9,
"grad_norm": 2.3144404888153076,
"learning_rate": 9.81184885709843e-07,
"loss": 4.7473,
"step": 110200
},
{
"epoch": 0.9,
"grad_norm": 0.9776962399482727,
"learning_rate": 9.730008429564037e-07,
"loss": 4.7479,
"step": 110300
},
{
"epoch": 0.9,
"grad_norm": 2.479701519012451,
"learning_rate": 9.648168002029644e-07,
"loss": 4.7514,
"step": 110400
},
{
"epoch": 0.9,
"grad_norm": 3.217172145843506,
"learning_rate": 9.56632757449525e-07,
"loss": 4.7535,
"step": 110500
},
{
"epoch": 0.91,
"grad_norm": 1.103346824645996,
"learning_rate": 9.484487146960856e-07,
"loss": 4.7474,
"step": 110600
},
{
"epoch": 0.91,
"grad_norm": 1.1965771913528442,
"learning_rate": 9.402646719426463e-07,
"loss": 4.7475,
"step": 110700
},
{
"epoch": 0.91,
"grad_norm": 1.2940635681152344,
"learning_rate": 9.320806291892069e-07,
"loss": 4.7484,
"step": 110800
},
{
"epoch": 0.91,
"grad_norm": 1.06132972240448,
"learning_rate": 9.238965864357677e-07,
"loss": 4.7547,
"step": 110900
},
{
"epoch": 0.91,
"grad_norm": 1.8715641498565674,
"learning_rate": 9.157125436823283e-07,
"loss": 4.749,
"step": 111000
},
{
"epoch": 0.91,
"grad_norm": 1.1907116174697876,
"learning_rate": 9.07528500928889e-07,
"loss": 4.7539,
"step": 111100
},
{
"epoch": 0.91,
"grad_norm": 1.5867308378219604,
"learning_rate": 8.993444581754496e-07,
"loss": 4.7562,
"step": 111200
},
{
"epoch": 0.91,
"grad_norm": 1.2849870920181274,
"learning_rate": 8.911604154220103e-07,
"loss": 4.7512,
"step": 111300
},
{
"epoch": 0.91,
"grad_norm": 1.3407094478607178,
"learning_rate": 8.829763726685708e-07,
"loss": 4.7543,
"step": 111400
},
{
"epoch": 0.91,
"grad_norm": 1.0691750049591064,
"learning_rate": 8.747923299151316e-07,
"loss": 4.7451,
"step": 111500
},
{
"epoch": 0.91,
"grad_norm": 1.0635693073272705,
"learning_rate": 8.666082871616922e-07,
"loss": 4.7521,
"step": 111600
},
{
"epoch": 0.91,
"grad_norm": 1.5273666381835938,
"learning_rate": 8.584242444082529e-07,
"loss": 4.7551,
"step": 111700
},
{
"epoch": 0.91,
"grad_norm": 1.7429158687591553,
"learning_rate": 8.502402016548135e-07,
"loss": 4.7544,
"step": 111800
},
{
"epoch": 0.92,
"grad_norm": 1.0581636428833008,
"learning_rate": 8.420561589013741e-07,
"loss": 4.7532,
"step": 111900
},
{
"epoch": 0.92,
"grad_norm": 1.3187443017959595,
"learning_rate": 8.338721161479348e-07,
"loss": 4.7551,
"step": 112000
},
{
"epoch": 0.92,
"grad_norm": 1.2842453718185425,
"learning_rate": 8.256880733944956e-07,
"loss": 4.7482,
"step": 112100
},
{
"epoch": 0.92,
"grad_norm": 1.264115571975708,
"learning_rate": 8.175040306410561e-07,
"loss": 4.7475,
"step": 112200
},
{
"epoch": 0.92,
"grad_norm": 1.338619589805603,
"learning_rate": 8.093199878876168e-07,
"loss": 4.7465,
"step": 112300
},
{
"epoch": 0.92,
"grad_norm": 1.2081668376922607,
"learning_rate": 8.011359451341774e-07,
"loss": 4.75,
"step": 112400
},
{
"epoch": 0.92,
"grad_norm": 1.475716471672058,
"learning_rate": 7.92951902380738e-07,
"loss": 4.7475,
"step": 112500
},
{
"epoch": 0.92,
"grad_norm": 1.1391123533248901,
"learning_rate": 7.847678596272988e-07,
"loss": 4.7529,
"step": 112600
},
{
"epoch": 0.92,
"grad_norm": 1.5139949321746826,
"learning_rate": 7.765838168738595e-07,
"loss": 4.7457,
"step": 112700
},
{
"epoch": 0.92,
"grad_norm": 1.250877857208252,
"learning_rate": 7.683997741204201e-07,
"loss": 4.7512,
"step": 112800
},
{
"epoch": 0.92,
"grad_norm": 1.3660330772399902,
"learning_rate": 7.602157313669807e-07,
"loss": 4.7519,
"step": 112900
},
{
"epoch": 0.92,
"grad_norm": 1.3007818460464478,
"learning_rate": 7.520316886135414e-07,
"loss": 4.7539,
"step": 113000
},
{
"epoch": 0.93,
"grad_norm": 1.1533290147781372,
"learning_rate": 7.438476458601019e-07,
"loss": 4.7522,
"step": 113100
},
{
"epoch": 0.93,
"grad_norm": 1.1087945699691772,
"learning_rate": 7.356636031066627e-07,
"loss": 4.7549,
"step": 113200
},
{
"epoch": 0.93,
"grad_norm": 1.3991641998291016,
"learning_rate": 7.274795603532233e-07,
"loss": 4.7538,
"step": 113300
},
{
"epoch": 0.93,
"grad_norm": 1.820162296295166,
"learning_rate": 7.19295517599784e-07,
"loss": 4.7521,
"step": 113400
},
{
"epoch": 0.93,
"grad_norm": 1.1187483072280884,
"learning_rate": 7.111114748463446e-07,
"loss": 4.7577,
"step": 113500
},
{
"epoch": 0.93,
"grad_norm": 1.0411303043365479,
"learning_rate": 7.029274320929053e-07,
"loss": 4.7514,
"step": 113600
},
{
"epoch": 0.93,
"grad_norm": 1.2369052171707153,
"learning_rate": 6.947433893394661e-07,
"loss": 4.7497,
"step": 113700
},
{
"epoch": 0.93,
"grad_norm": 1.505008578300476,
"learning_rate": 6.865593465860267e-07,
"loss": 4.7505,
"step": 113800
},
{
"epoch": 0.93,
"grad_norm": 1.2643870115280151,
"learning_rate": 6.783753038325872e-07,
"loss": 4.7499,
"step": 113900
},
{
"epoch": 0.93,
"grad_norm": 1.095914363861084,
"learning_rate": 6.701912610791479e-07,
"loss": 4.7478,
"step": 114000
},
{
"epoch": 0.93,
"grad_norm": 1.4800920486450195,
"learning_rate": 6.620072183257085e-07,
"loss": 4.748,
"step": 114100
},
{
"epoch": 0.93,
"grad_norm": 1.6267393827438354,
"learning_rate": 6.538231755722692e-07,
"loss": 4.7523,
"step": 114200
},
{
"epoch": 0.94,
"grad_norm": 1.7788121700286865,
"learning_rate": 6.456391328188299e-07,
"loss": 4.7548,
"step": 114300
},
{
"epoch": 0.94,
"grad_norm": 1.2876346111297607,
"learning_rate": 6.374550900653906e-07,
"loss": 4.7555,
"step": 114400
},
{
"epoch": 0.94,
"grad_norm": 1.0011918544769287,
"learning_rate": 6.292710473119512e-07,
"loss": 4.7487,
"step": 114500
},
{
"epoch": 0.94,
"grad_norm": 2.096606731414795,
"learning_rate": 6.210870045585119e-07,
"loss": 4.7527,
"step": 114600
},
{
"epoch": 0.94,
"grad_norm": 1.2112175226211548,
"learning_rate": 6.129029618050726e-07,
"loss": 4.7482,
"step": 114700
},
{
"epoch": 0.94,
"grad_norm": 1.4160529375076294,
"learning_rate": 6.047189190516331e-07,
"loss": 4.7567,
"step": 114800
},
{
"epoch": 0.94,
"grad_norm": 1.36955988407135,
"learning_rate": 5.965348762981938e-07,
"loss": 4.7562,
"step": 114900
},
{
"epoch": 0.94,
"grad_norm": 1.577446460723877,
"learning_rate": 5.883508335447545e-07,
"loss": 4.7534,
"step": 115000
},
{
"epoch": 0.94,
"grad_norm": 1.2918033599853516,
"learning_rate": 5.801667907913151e-07,
"loss": 4.7472,
"step": 115100
},
{
"epoch": 0.94,
"grad_norm": 1.07747220993042,
"learning_rate": 5.719827480378758e-07,
"loss": 4.7495,
"step": 115200
},
{
"epoch": 0.94,
"grad_norm": 1.4656786918640137,
"learning_rate": 5.637987052844365e-07,
"loss": 4.7501,
"step": 115300
},
{
"epoch": 0.94,
"grad_norm": 1.2017009258270264,
"learning_rate": 5.556146625309971e-07,
"loss": 4.7482,
"step": 115400
},
{
"epoch": 0.95,
"grad_norm": 1.3578641414642334,
"learning_rate": 5.474306197775578e-07,
"loss": 4.7467,
"step": 115500
},
{
"epoch": 0.95,
"grad_norm": 1.246904969215393,
"learning_rate": 5.392465770241185e-07,
"loss": 4.7564,
"step": 115600
},
{
"epoch": 0.95,
"grad_norm": 1.289212942123413,
"learning_rate": 5.31062534270679e-07,
"loss": 4.7516,
"step": 115700
},
{
"epoch": 0.95,
"grad_norm": 2.1466667652130127,
"learning_rate": 5.228784915172397e-07,
"loss": 4.7485,
"step": 115800
},
{
"epoch": 0.95,
"grad_norm": 1.6222789287567139,
"learning_rate": 5.146944487638004e-07,
"loss": 4.7535,
"step": 115900
},
{
"epoch": 0.95,
"grad_norm": 1.2257126569747925,
"learning_rate": 5.06510406010361e-07,
"loss": 4.7537,
"step": 116000
},
{
"epoch": 0.95,
"grad_norm": 1.0743142366409302,
"learning_rate": 4.983263632569217e-07,
"loss": 4.7468,
"step": 116100
},
{
"epoch": 0.95,
"grad_norm": 1.4326200485229492,
"learning_rate": 4.901423205034824e-07,
"loss": 4.7537,
"step": 116200
},
{
"epoch": 0.95,
"grad_norm": 1.513900876045227,
"learning_rate": 4.81958277750043e-07,
"loss": 4.7492,
"step": 116300
},
{
"epoch": 0.95,
"grad_norm": 1.3525514602661133,
"learning_rate": 4.7377423499660366e-07,
"loss": 4.7511,
"step": 116400
},
{
"epoch": 0.95,
"grad_norm": 1.0643374919891357,
"learning_rate": 4.655901922431643e-07,
"loss": 4.756,
"step": 116500
},
{
"epoch": 0.95,
"grad_norm": 1.0973927974700928,
"learning_rate": 4.57406149489725e-07,
"loss": 4.7512,
"step": 116600
},
{
"epoch": 0.96,
"grad_norm": 1.498604416847229,
"learning_rate": 4.492221067362856e-07,
"loss": 4.7534,
"step": 116700
},
{
"epoch": 0.96,
"grad_norm": 2.6713321208953857,
"learning_rate": 4.4103806398284626e-07,
"loss": 4.7497,
"step": 116800
},
{
"epoch": 0.96,
"grad_norm": 1.2505509853363037,
"learning_rate": 4.3285402122940695e-07,
"loss": 4.7551,
"step": 116900
},
{
"epoch": 0.96,
"grad_norm": 1.1062592267990112,
"learning_rate": 4.246699784759676e-07,
"loss": 4.7536,
"step": 117000
},
{
"epoch": 0.96,
"grad_norm": 1.4121407270431519,
"learning_rate": 4.1648593572252823e-07,
"loss": 4.7548,
"step": 117100
},
{
"epoch": 0.96,
"grad_norm": 1.2598930597305298,
"learning_rate": 4.083018929690889e-07,
"loss": 4.7559,
"step": 117200
},
{
"epoch": 0.96,
"grad_norm": 1.1028268337249756,
"learning_rate": 4.0011785021564955e-07,
"loss": 4.7495,
"step": 117300
},
{
"epoch": 0.96,
"grad_norm": 1.5532176494598389,
"learning_rate": 3.919338074622102e-07,
"loss": 4.7543,
"step": 117400
},
{
"epoch": 0.96,
"grad_norm": 1.2546781301498413,
"learning_rate": 3.837497647087709e-07,
"loss": 4.7502,
"step": 117500
},
{
"epoch": 0.96,
"grad_norm": 1.0731525421142578,
"learning_rate": 3.755657219553315e-07,
"loss": 4.7519,
"step": 117600
},
{
"epoch": 0.96,
"grad_norm": 1.3289110660552979,
"learning_rate": 3.6738167920189216e-07,
"loss": 4.7495,
"step": 117700
},
{
"epoch": 0.96,
"grad_norm": 1.2810921669006348,
"learning_rate": 3.5919763644845285e-07,
"loss": 4.7522,
"step": 117800
},
{
"epoch": 0.96,
"grad_norm": 1.0300296545028687,
"learning_rate": 3.510135936950135e-07,
"loss": 4.7491,
"step": 117900
},
{
"epoch": 0.97,
"grad_norm": 1.9749176502227783,
"learning_rate": 3.428295509415742e-07,
"loss": 4.7558,
"step": 118000
},
{
"epoch": 0.97,
"grad_norm": 1.6398601531982422,
"learning_rate": 3.346455081881348e-07,
"loss": 4.7489,
"step": 118100
},
{
"epoch": 0.97,
"grad_norm": 1.154733657836914,
"learning_rate": 3.2646146543469545e-07,
"loss": 4.747,
"step": 118200
},
{
"epoch": 0.97,
"grad_norm": 1.1068767309188843,
"learning_rate": 3.1827742268125614e-07,
"loss": 4.7532,
"step": 118300
},
{
"epoch": 0.97,
"grad_norm": 1.5782713890075684,
"learning_rate": 3.100933799278168e-07,
"loss": 4.7525,
"step": 118400
},
{
"epoch": 0.97,
"grad_norm": 1.2185344696044922,
"learning_rate": 3.019093371743774e-07,
"loss": 4.7507,
"step": 118500
},
{
"epoch": 0.97,
"grad_norm": 1.168750286102295,
"learning_rate": 2.9372529442093805e-07,
"loss": 4.7503,
"step": 118600
},
{
"epoch": 0.97,
"grad_norm": 1.3794721364974976,
"learning_rate": 2.8554125166749874e-07,
"loss": 4.7539,
"step": 118700
},
{
"epoch": 0.97,
"grad_norm": 1.0481081008911133,
"learning_rate": 2.773572089140594e-07,
"loss": 4.7493,
"step": 118800
},
{
"epoch": 0.97,
"grad_norm": 1.1850849390029907,
"learning_rate": 2.6917316616062007e-07,
"loss": 4.752,
"step": 118900
},
{
"epoch": 0.97,
"grad_norm": 1.3937416076660156,
"learning_rate": 2.609891234071807e-07,
"loss": 4.7518,
"step": 119000
},
{
"epoch": 0.97,
"grad_norm": 1.0362590551376343,
"learning_rate": 2.5280508065374134e-07,
"loss": 4.7494,
"step": 119100
},
{
"epoch": 0.98,
"grad_norm": 1.226879358291626,
"learning_rate": 2.4462103790030203e-07,
"loss": 4.7541,
"step": 119200
},
{
"epoch": 0.98,
"grad_norm": 1.289158582687378,
"learning_rate": 2.3643699514686265e-07,
"loss": 4.7541,
"step": 119300
},
{
"epoch": 0.98,
"grad_norm": 1.2052675485610962,
"learning_rate": 2.282529523934233e-07,
"loss": 4.7494,
"step": 119400
},
{
"epoch": 0.98,
"grad_norm": 1.3836339712142944,
"learning_rate": 2.2006890963998397e-07,
"loss": 4.7501,
"step": 119500
},
{
"epoch": 0.98,
"grad_norm": 1.075812578201294,
"learning_rate": 2.1188486688654464e-07,
"loss": 4.7466,
"step": 119600
},
{
"epoch": 0.98,
"grad_norm": 1.0450024604797363,
"learning_rate": 2.0370082413310527e-07,
"loss": 4.7513,
"step": 119700
},
{
"epoch": 0.98,
"grad_norm": 1.6995435953140259,
"learning_rate": 1.9551678137966594e-07,
"loss": 4.7544,
"step": 119800
},
{
"epoch": 0.98,
"grad_norm": 2.0586297512054443,
"learning_rate": 1.873327386262266e-07,
"loss": 4.7512,
"step": 119900
},
{
"epoch": 0.98,
"grad_norm": 1.734843134880066,
"learning_rate": 1.7914869587278727e-07,
"loss": 4.752,
"step": 120000
},
{
"epoch": 0.98,
"grad_norm": 1.8307639360427856,
"learning_rate": 1.709646531193479e-07,
"loss": 4.7486,
"step": 120100
},
{
"epoch": 0.98,
"grad_norm": 1.0754849910736084,
"learning_rate": 1.6278061036590857e-07,
"loss": 4.7518,
"step": 120200
},
{
"epoch": 0.98,
"grad_norm": 1.8558402061462402,
"learning_rate": 1.545965676124692e-07,
"loss": 4.7496,
"step": 120300
},
{
"epoch": 0.99,
"grad_norm": 1.3178366422653198,
"learning_rate": 1.4641252485902987e-07,
"loss": 4.7474,
"step": 120400
},
{
"epoch": 0.99,
"grad_norm": 1.176018238067627,
"learning_rate": 1.3822848210559053e-07,
"loss": 4.7517,
"step": 120500
},
{
"epoch": 0.99,
"grad_norm": 1.0331361293792725,
"learning_rate": 1.300444393521512e-07,
"loss": 4.7445,
"step": 120600
},
{
"epoch": 0.99,
"grad_norm": 1.2724260091781616,
"learning_rate": 1.2186039659871183e-07,
"loss": 4.7519,
"step": 120700
},
{
"epoch": 0.99,
"grad_norm": 1.1402854919433594,
"learning_rate": 1.1367635384527251e-07,
"loss": 4.7547,
"step": 120800
},
{
"epoch": 0.99,
"grad_norm": 1.6155171394348145,
"learning_rate": 1.0549231109183315e-07,
"loss": 4.7553,
"step": 120900
},
{
"epoch": 0.99,
"grad_norm": 1.3488671779632568,
"learning_rate": 9.73082683383938e-08,
"loss": 4.7489,
"step": 121000
},
{
"epoch": 0.99,
"grad_norm": 1.4476072788238525,
"learning_rate": 8.912422558495446e-08,
"loss": 4.7531,
"step": 121100
},
{
"epoch": 0.99,
"grad_norm": 1.5881561040878296,
"learning_rate": 8.094018283151511e-08,
"loss": 4.7472,
"step": 121200
},
{
"epoch": 0.99,
"grad_norm": 1.3526599407196045,
"learning_rate": 7.275614007807578e-08,
"loss": 4.7507,
"step": 121300
},
{
"epoch": 0.99,
"grad_norm": 1.1709797382354736,
"learning_rate": 6.457209732463643e-08,
"loss": 4.7495,
"step": 121400
},
{
"epoch": 0.99,
"grad_norm": 1.1517142057418823,
"learning_rate": 5.6388054571197084e-08,
"loss": 4.7534,
"step": 121500
},
{
"epoch": 1.0,
"grad_norm": 1.6636133193969727,
"learning_rate": 4.820401181775774e-08,
"loss": 4.7524,
"step": 121600
},
{
"epoch": 1.0,
"grad_norm": 1.1265342235565186,
"learning_rate": 4.001996906431839e-08,
"loss": 4.749,
"step": 121700
},
{
"epoch": 1.0,
"grad_norm": 1.4785326719284058,
"learning_rate": 3.183592631087905e-08,
"loss": 4.7507,
"step": 121800
},
{
"epoch": 1.0,
"grad_norm": 3.0491678714752197,
"learning_rate": 2.3651883557439706e-08,
"loss": 4.7502,
"step": 121900
},
{
"epoch": 1.0,
"grad_norm": 1.162163257598877,
"learning_rate": 1.5467840804000363e-08,
"loss": 4.7493,
"step": 122000
}
],
"logging_steps": 100,
"max_steps": 122189,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"total_flos": 1.4891090143551898e+18,
"train_batch_size": 96,
"trial_name": null,
"trial_params": null
}