craa's picture
End of training
5e243c2 verified
Invalid JSON: Unexpected token 'N', ..."ad_norm": NaN, "... is not valid JSON
{
"best_global_step": 79000,
"best_metric": 3.5298874378204346,
"best_model_checkpoint": "/scratch/cl5625/exceptions/models/swap_require_to_drop_2128/checkpoint-40000",
"epoch": 32.01426408942711,
"eval_steps": 1000,
"global_step": 110000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014555193292966931,
"grad_norm": 1.664777398109436,
"learning_rate": 0.000294,
"loss": 8.4459,
"step": 50
},
{
"epoch": 0.029110386585933862,
"grad_norm": 0.8744126558303833,
"learning_rate": 0.0005939999999999999,
"loss": 6.7213,
"step": 100
},
{
"epoch": 0.04366557987890079,
"grad_norm": 0.514337956905365,
"learning_rate": 0.0005998287711124053,
"loss": 6.3296,
"step": 150
},
{
"epoch": 0.058220773171867725,
"grad_norm": 0.584080696105957,
"learning_rate": 0.000599654047757717,
"loss": 6.151,
"step": 200
},
{
"epoch": 0.07277596646483465,
"grad_norm": 0.45835795998573303,
"learning_rate": 0.0005994793244030285,
"loss": 5.9916,
"step": 250
},
{
"epoch": 0.08733115975780158,
"grad_norm": 0.4237825870513916,
"learning_rate": 0.00059930460104834,
"loss": 5.8717,
"step": 300
},
{
"epoch": 0.10188635305076851,
"grad_norm": 0.410149484872818,
"learning_rate": 0.0005991298776936517,
"loss": 5.7453,
"step": 350
},
{
"epoch": 0.11644154634373545,
"grad_norm": 0.5075629949569702,
"learning_rate": 0.0005989551543389632,
"loss": 5.6456,
"step": 400
},
{
"epoch": 0.1309967396367024,
"grad_norm": 0.45372700691223145,
"learning_rate": 0.0005987804309842748,
"loss": 5.5133,
"step": 450
},
{
"epoch": 0.1455519329296693,
"grad_norm": 0.45813828706741333,
"learning_rate": 0.0005986057076295864,
"loss": 5.4275,
"step": 500
},
{
"epoch": 0.16010712622263623,
"grad_norm": 0.42607325315475464,
"learning_rate": 0.0005984309842748981,
"loss": 5.3298,
"step": 550
},
{
"epoch": 0.17466231951560315,
"grad_norm": 0.5173180103302002,
"learning_rate": 0.0005982562609202096,
"loss": 5.2464,
"step": 600
},
{
"epoch": 0.1892175128085701,
"grad_norm": 0.43883052468299866,
"learning_rate": 0.0005980815375655212,
"loss": 5.1861,
"step": 650
},
{
"epoch": 0.20377270610153703,
"grad_norm": 0.4173907935619354,
"learning_rate": 0.0005979068142108328,
"loss": 5.1403,
"step": 700
},
{
"epoch": 0.21832789939450395,
"grad_norm": 0.4811343550682068,
"learning_rate": 0.0005977320908561445,
"loss": 5.0862,
"step": 750
},
{
"epoch": 0.2328830926874709,
"grad_norm": 0.4635143280029297,
"learning_rate": 0.000597557367501456,
"loss": 5.0425,
"step": 800
},
{
"epoch": 0.24743828598043782,
"grad_norm": 0.37773096561431885,
"learning_rate": 0.0005973826441467675,
"loss": 4.9656,
"step": 850
},
{
"epoch": 0.2619934792734048,
"grad_norm": 0.4224317669868469,
"learning_rate": 0.0005972079207920792,
"loss": 4.9229,
"step": 900
},
{
"epoch": 0.27654867256637167,
"grad_norm": 0.4564674496650696,
"learning_rate": 0.0005970331974373907,
"loss": 4.8711,
"step": 950
},
{
"epoch": 0.2911038658593386,
"grad_norm": 0.44638097286224365,
"learning_rate": 0.0005968584740827023,
"loss": 4.8313,
"step": 1000
},
{
"epoch": 0.2911038658593386,
"eval_accuracy": 0.25401106002460316,
"eval_loss": 4.755082130432129,
"eval_runtime": 180.1011,
"eval_samples_per_second": 92.454,
"eval_steps_per_second": 5.78,
"step": 1000
},
{
"epoch": 0.30565905915230557,
"grad_norm": 0.5874536037445068,
"learning_rate": 0.0005966837507280139,
"loss": 4.7718,
"step": 1050
},
{
"epoch": 0.32021425244527246,
"grad_norm": 0.4394214451313019,
"learning_rate": 0.0005965090273733256,
"loss": 4.7342,
"step": 1100
},
{
"epoch": 0.3347694457382394,
"grad_norm": 0.44372034072875977,
"learning_rate": 0.0005963343040186371,
"loss": 4.7015,
"step": 1150
},
{
"epoch": 0.3493246390312063,
"grad_norm": 0.4285391867160797,
"learning_rate": 0.0005961595806639486,
"loss": 4.6587,
"step": 1200
},
{
"epoch": 0.36387983232417326,
"grad_norm": 0.4155198037624359,
"learning_rate": 0.0005959848573092603,
"loss": 4.6256,
"step": 1250
},
{
"epoch": 0.3784350256171402,
"grad_norm": 0.4572656452655792,
"learning_rate": 0.0005958101339545718,
"loss": 4.6061,
"step": 1300
},
{
"epoch": 0.3929902189101071,
"grad_norm": 0.4733801484107971,
"learning_rate": 0.0005956354105998835,
"loss": 4.5809,
"step": 1350
},
{
"epoch": 0.40754541220307405,
"grad_norm": 0.41797181963920593,
"learning_rate": 0.000595460687245195,
"loss": 4.5521,
"step": 1400
},
{
"epoch": 0.422100605496041,
"grad_norm": 0.461195707321167,
"learning_rate": 0.0005952859638905067,
"loss": 4.5162,
"step": 1450
},
{
"epoch": 0.4366557987890079,
"grad_norm": 0.43522897362709045,
"learning_rate": 0.0005951112405358182,
"loss": 4.4957,
"step": 1500
},
{
"epoch": 0.45121099208197485,
"grad_norm": 0.43900424242019653,
"learning_rate": 0.0005949365171811299,
"loss": 4.4926,
"step": 1550
},
{
"epoch": 0.4657661853749418,
"grad_norm": 0.42797166109085083,
"learning_rate": 0.0005947617938264414,
"loss": 4.4513,
"step": 1600
},
{
"epoch": 0.4803213786679087,
"grad_norm": 0.4188319444656372,
"learning_rate": 0.000594587070471753,
"loss": 4.4385,
"step": 1650
},
{
"epoch": 0.49487657196087564,
"grad_norm": 0.4120756685733795,
"learning_rate": 0.0005944123471170646,
"loss": 4.4206,
"step": 1700
},
{
"epoch": 0.5094317652538426,
"grad_norm": 0.47579753398895264,
"learning_rate": 0.0005942376237623762,
"loss": 4.4176,
"step": 1750
},
{
"epoch": 0.5239869585468095,
"grad_norm": 0.43895846605300903,
"learning_rate": 0.0005940629004076878,
"loss": 4.3887,
"step": 1800
},
{
"epoch": 0.5385421518397764,
"grad_norm": 0.39533790946006775,
"learning_rate": 0.0005938881770529993,
"loss": 4.38,
"step": 1850
},
{
"epoch": 0.5530973451327433,
"grad_norm": 0.377754271030426,
"learning_rate": 0.000593713453698311,
"loss": 4.3735,
"step": 1900
},
{
"epoch": 0.5676525384257103,
"grad_norm": 0.39021018147468567,
"learning_rate": 0.0005935387303436226,
"loss": 4.3428,
"step": 1950
},
{
"epoch": 0.5822077317186772,
"grad_norm": 0.3872169554233551,
"learning_rate": 0.0005933640069889342,
"loss": 4.327,
"step": 2000
},
{
"epoch": 0.5822077317186772,
"eval_accuracy": 0.2996545519912005,
"eval_loss": 4.280758380889893,
"eval_runtime": 179.3832,
"eval_samples_per_second": 92.824,
"eval_steps_per_second": 5.803,
"step": 2000
},
{
"epoch": 0.5967629250116442,
"grad_norm": 0.42003875970840454,
"learning_rate": 0.0005931892836342457,
"loss": 4.3191,
"step": 2050
},
{
"epoch": 0.6113181183046111,
"grad_norm": 0.38116443157196045,
"learning_rate": 0.0005930145602795573,
"loss": 4.3089,
"step": 2100
},
{
"epoch": 0.625873311597578,
"grad_norm": 0.3790656328201294,
"learning_rate": 0.000592839836924869,
"loss": 4.2951,
"step": 2150
},
{
"epoch": 0.6404285048905449,
"grad_norm": 0.40056294202804565,
"learning_rate": 0.0005926651135701805,
"loss": 4.2845,
"step": 2200
},
{
"epoch": 0.6549836981835119,
"grad_norm": 0.3878796398639679,
"learning_rate": 0.0005924903902154921,
"loss": 4.2657,
"step": 2250
},
{
"epoch": 0.6695388914764788,
"grad_norm": 0.388304740190506,
"learning_rate": 0.0005923156668608037,
"loss": 4.2631,
"step": 2300
},
{
"epoch": 0.6840940847694458,
"grad_norm": 0.41076818108558655,
"learning_rate": 0.0005921409435061153,
"loss": 4.2714,
"step": 2350
},
{
"epoch": 0.6986492780624126,
"grad_norm": 0.3966241776943207,
"learning_rate": 0.0005919662201514268,
"loss": 4.2525,
"step": 2400
},
{
"epoch": 0.7132044713553796,
"grad_norm": 0.39895981550216675,
"learning_rate": 0.0005917914967967384,
"loss": 4.2249,
"step": 2450
},
{
"epoch": 0.7277596646483465,
"grad_norm": 0.37347468733787537,
"learning_rate": 0.0005916167734420501,
"loss": 4.2305,
"step": 2500
},
{
"epoch": 0.7423148579413135,
"grad_norm": 0.36039796471595764,
"learning_rate": 0.0005914420500873616,
"loss": 4.2042,
"step": 2550
},
{
"epoch": 0.7568700512342804,
"grad_norm": 0.37558725476264954,
"learning_rate": 0.0005912673267326732,
"loss": 4.208,
"step": 2600
},
{
"epoch": 0.7714252445272474,
"grad_norm": 0.38653722405433655,
"learning_rate": 0.0005910926033779848,
"loss": 4.1975,
"step": 2650
},
{
"epoch": 0.7859804378202142,
"grad_norm": 0.37924301624298096,
"learning_rate": 0.0005909178800232964,
"loss": 4.1807,
"step": 2700
},
{
"epoch": 0.8005356311131812,
"grad_norm": 0.378798246383667,
"learning_rate": 0.000590743156668608,
"loss": 4.1775,
"step": 2750
},
{
"epoch": 0.8150908244061481,
"grad_norm": 0.35967278480529785,
"learning_rate": 0.0005905684333139196,
"loss": 4.1639,
"step": 2800
},
{
"epoch": 0.8296460176991151,
"grad_norm": 0.3435153663158417,
"learning_rate": 0.0005903937099592312,
"loss": 4.1748,
"step": 2850
},
{
"epoch": 0.844201210992082,
"grad_norm": 0.3642827272415161,
"learning_rate": 0.0005902189866045427,
"loss": 4.1575,
"step": 2900
},
{
"epoch": 0.858756404285049,
"grad_norm": 0.34654879570007324,
"learning_rate": 0.0005900442632498543,
"loss": 4.1613,
"step": 2950
},
{
"epoch": 0.8733115975780158,
"grad_norm": 0.37901315093040466,
"learning_rate": 0.0005898695398951659,
"loss": 4.143,
"step": 3000
},
{
"epoch": 0.8733115975780158,
"eval_accuracy": 0.3158166719769421,
"eval_loss": 4.093630313873291,
"eval_runtime": 179.4838,
"eval_samples_per_second": 92.772,
"eval_steps_per_second": 5.8,
"step": 3000
},
{
"epoch": 0.8878667908709827,
"grad_norm": 0.35573050379753113,
"learning_rate": 0.0005896948165404776,
"loss": 4.1588,
"step": 3050
},
{
"epoch": 0.9024219841639497,
"grad_norm": 0.37339988350868225,
"learning_rate": 0.0005895200931857891,
"loss": 4.1301,
"step": 3100
},
{
"epoch": 0.9169771774569166,
"grad_norm": 0.36611881852149963,
"learning_rate": 0.0005893453698311007,
"loss": 4.1326,
"step": 3150
},
{
"epoch": 0.9315323707498836,
"grad_norm": 0.3458259403705597,
"learning_rate": 0.0005891706464764123,
"loss": 4.1098,
"step": 3200
},
{
"epoch": 0.9460875640428504,
"grad_norm": 0.34497207403182983,
"learning_rate": 0.0005889959231217238,
"loss": 4.11,
"step": 3250
},
{
"epoch": 0.9606427573358174,
"grad_norm": 0.3454968333244324,
"learning_rate": 0.0005888211997670355,
"loss": 4.0968,
"step": 3300
},
{
"epoch": 0.9751979506287843,
"grad_norm": 0.3598727285861969,
"learning_rate": 0.000588646476412347,
"loss": 4.0848,
"step": 3350
},
{
"epoch": 0.9897531439217513,
"grad_norm": 0.37226811051368713,
"learning_rate": 0.0005884717530576587,
"loss": 4.1,
"step": 3400
},
{
"epoch": 1.0040754541220307,
"grad_norm": 0.36724382638931274,
"learning_rate": 0.0005882970297029702,
"loss": 4.0639,
"step": 3450
},
{
"epoch": 1.0186306474149978,
"grad_norm": 0.3470579981803894,
"learning_rate": 0.0005881223063482818,
"loss": 4.0231,
"step": 3500
},
{
"epoch": 1.0331858407079646,
"grad_norm": 0.37876060605049133,
"learning_rate": 0.0005879475829935934,
"loss": 4.015,
"step": 3550
},
{
"epoch": 1.0477410340009314,
"grad_norm": 0.3453887104988098,
"learning_rate": 0.0005877728596389051,
"loss": 4.0009,
"step": 3600
},
{
"epoch": 1.0622962272938985,
"grad_norm": 0.35795602202415466,
"learning_rate": 0.0005875981362842166,
"loss": 4.0177,
"step": 3650
},
{
"epoch": 1.0768514205868653,
"grad_norm": 0.3534950017929077,
"learning_rate": 0.0005874234129295281,
"loss": 4.0121,
"step": 3700
},
{
"epoch": 1.0914066138798324,
"grad_norm": 0.3307143747806549,
"learning_rate": 0.0005872486895748398,
"loss": 4.0045,
"step": 3750
},
{
"epoch": 1.1059618071727992,
"grad_norm": 0.34968075156211853,
"learning_rate": 0.0005870739662201513,
"loss": 3.9982,
"step": 3800
},
{
"epoch": 1.120517000465766,
"grad_norm": 0.3467692732810974,
"learning_rate": 0.000586899242865463,
"loss": 3.9971,
"step": 3850
},
{
"epoch": 1.1350721937587331,
"grad_norm": 0.34137192368507385,
"learning_rate": 0.0005867245195107746,
"loss": 4.0123,
"step": 3900
},
{
"epoch": 1.1496273870517,
"grad_norm": 0.3370218276977539,
"learning_rate": 0.0005865497961560862,
"loss": 3.9948,
"step": 3950
},
{
"epoch": 1.164182580344667,
"grad_norm": 0.35244500637054443,
"learning_rate": 0.0005863750728013977,
"loss": 3.9875,
"step": 4000
},
{
"epoch": 1.164182580344667,
"eval_accuracy": 0.32581989104983733,
"eval_loss": 3.9877822399139404,
"eval_runtime": 179.4823,
"eval_samples_per_second": 92.772,
"eval_steps_per_second": 5.8,
"step": 4000
},
{
"epoch": 1.1787377736376339,
"grad_norm": 0.34883835911750793,
"learning_rate": 0.0005862003494467094,
"loss": 3.991,
"step": 4050
},
{
"epoch": 1.193292966930601,
"grad_norm": 0.33956724405288696,
"learning_rate": 0.0005860256260920209,
"loss": 3.971,
"step": 4100
},
{
"epoch": 1.2078481602235678,
"grad_norm": 0.3573682904243469,
"learning_rate": 0.0005858509027373325,
"loss": 3.97,
"step": 4150
},
{
"epoch": 1.2224033535165346,
"grad_norm": 0.340936541557312,
"learning_rate": 0.0005856761793826441,
"loss": 3.9766,
"step": 4200
},
{
"epoch": 1.2369585468095017,
"grad_norm": 0.34760385751724243,
"learning_rate": 0.0005855014560279557,
"loss": 3.9639,
"step": 4250
},
{
"epoch": 1.2515137401024685,
"grad_norm": 0.3335438668727875,
"learning_rate": 0.0005853267326732673,
"loss": 3.9661,
"step": 4300
},
{
"epoch": 1.2660689333954354,
"grad_norm": 0.3376685380935669,
"learning_rate": 0.0005851520093185788,
"loss": 3.965,
"step": 4350
},
{
"epoch": 1.2806241266884024,
"grad_norm": 0.3536045253276825,
"learning_rate": 0.0005849772859638905,
"loss": 3.9526,
"step": 4400
},
{
"epoch": 1.2951793199813695,
"grad_norm": 0.3248458802700043,
"learning_rate": 0.0005848025626092021,
"loss": 3.9497,
"step": 4450
},
{
"epoch": 1.3097345132743363,
"grad_norm": 0.33945971727371216,
"learning_rate": 0.0005846278392545136,
"loss": 3.9642,
"step": 4500
},
{
"epoch": 1.3242897065673032,
"grad_norm": 0.320038378238678,
"learning_rate": 0.0005844531158998252,
"loss": 3.9586,
"step": 4550
},
{
"epoch": 1.3388448998602702,
"grad_norm": 0.3388155996799469,
"learning_rate": 0.0005842783925451368,
"loss": 3.9434,
"step": 4600
},
{
"epoch": 1.353400093153237,
"grad_norm": 0.34264621138572693,
"learning_rate": 0.0005841036691904484,
"loss": 3.9458,
"step": 4650
},
{
"epoch": 1.367955286446204,
"grad_norm": 0.3136458992958069,
"learning_rate": 0.00058392894583576,
"loss": 3.9506,
"step": 4700
},
{
"epoch": 1.382510479739171,
"grad_norm": 0.34289732575416565,
"learning_rate": 0.0005837542224810716,
"loss": 3.9497,
"step": 4750
},
{
"epoch": 1.3970656730321378,
"grad_norm": 0.36164990067481995,
"learning_rate": 0.0005835794991263832,
"loss": 3.9397,
"step": 4800
},
{
"epoch": 1.4116208663251049,
"grad_norm": 0.34485942125320435,
"learning_rate": 0.0005834047757716948,
"loss": 3.9358,
"step": 4850
},
{
"epoch": 1.4261760596180717,
"grad_norm": 0.3529173731803894,
"learning_rate": 0.0005832300524170063,
"loss": 3.9443,
"step": 4900
},
{
"epoch": 1.4407312529110388,
"grad_norm": 0.32209211587905884,
"learning_rate": 0.0005830553290623179,
"loss": 3.9202,
"step": 4950
},
{
"epoch": 1.4552864462040056,
"grad_norm": 0.3291449546813965,
"learning_rate": 0.0005828806057076296,
"loss": 3.9281,
"step": 5000
},
{
"epoch": 1.4552864462040056,
"eval_accuracy": 0.3324244555048086,
"eval_loss": 3.9115827083587646,
"eval_runtime": 179.3171,
"eval_samples_per_second": 92.858,
"eval_steps_per_second": 5.805,
"step": 5000
},
{
"epoch": 1.4698416394969724,
"grad_norm": 0.32161611318588257,
"learning_rate": 0.0005827058823529411,
"loss": 3.92,
"step": 5050
},
{
"epoch": 1.4843968327899395,
"grad_norm": 0.321431428194046,
"learning_rate": 0.0005825311589982527,
"loss": 3.914,
"step": 5100
},
{
"epoch": 1.4989520260829063,
"grad_norm": 0.33909738063812256,
"learning_rate": 0.0005823564356435643,
"loss": 3.9124,
"step": 5150
},
{
"epoch": 1.5135072193758732,
"grad_norm": 0.3282327353954315,
"learning_rate": 0.0005821817122888759,
"loss": 3.9224,
"step": 5200
},
{
"epoch": 1.5280624126688402,
"grad_norm": 0.32862597703933716,
"learning_rate": 0.0005820069889341875,
"loss": 3.9115,
"step": 5250
},
{
"epoch": 1.5426176059618073,
"grad_norm": 0.33926936984062195,
"learning_rate": 0.000581832265579499,
"loss": 3.9111,
"step": 5300
},
{
"epoch": 1.5571727992547741,
"grad_norm": 0.31386932730674744,
"learning_rate": 0.0005816575422248107,
"loss": 3.9139,
"step": 5350
},
{
"epoch": 1.571727992547741,
"grad_norm": 0.32861968874931335,
"learning_rate": 0.0005814828188701222,
"loss": 3.904,
"step": 5400
},
{
"epoch": 1.586283185840708,
"grad_norm": 0.3435116112232208,
"learning_rate": 0.0005813080955154338,
"loss": 3.9043,
"step": 5450
},
{
"epoch": 1.6008383791336749,
"grad_norm": 0.3164297640323639,
"learning_rate": 0.0005811333721607454,
"loss": 3.9041,
"step": 5500
},
{
"epoch": 1.6153935724266417,
"grad_norm": 0.325747549533844,
"learning_rate": 0.0005809586488060571,
"loss": 3.8981,
"step": 5550
},
{
"epoch": 1.6299487657196088,
"grad_norm": 0.3419976532459259,
"learning_rate": 0.0005807839254513686,
"loss": 3.8924,
"step": 5600
},
{
"epoch": 1.6445039590125758,
"grad_norm": 0.3399415612220764,
"learning_rate": 0.0005806092020966802,
"loss": 3.8939,
"step": 5650
},
{
"epoch": 1.6590591523055425,
"grad_norm": 0.31317266821861267,
"learning_rate": 0.0005804344787419918,
"loss": 3.8892,
"step": 5700
},
{
"epoch": 1.6736143455985095,
"grad_norm": 0.33786946535110474,
"learning_rate": 0.0005802597553873033,
"loss": 3.9072,
"step": 5750
},
{
"epoch": 1.6881695388914766,
"grad_norm": 0.299448162317276,
"learning_rate": 0.000580085032032615,
"loss": 3.8842,
"step": 5800
},
{
"epoch": 1.7027247321844434,
"grad_norm": 0.32242727279663086,
"learning_rate": 0.0005799103086779265,
"loss": 3.8825,
"step": 5850
},
{
"epoch": 1.7172799254774103,
"grad_norm": 0.3023882508277893,
"learning_rate": 0.0005797355853232382,
"loss": 3.8735,
"step": 5900
},
{
"epoch": 1.7318351187703773,
"grad_norm": 0.34403297305107117,
"learning_rate": 0.0005795608619685497,
"loss": 3.8863,
"step": 5950
},
{
"epoch": 1.7463903120633442,
"grad_norm": 0.3128809928894043,
"learning_rate": 0.0005793861386138614,
"loss": 3.8772,
"step": 6000
},
{
"epoch": 1.7463903120633442,
"eval_accuracy": 0.33771024606574407,
"eval_loss": 3.8521087169647217,
"eval_runtime": 179.351,
"eval_samples_per_second": 92.84,
"eval_steps_per_second": 5.804,
"step": 6000
},
{
"epoch": 1.760945505356311,
"grad_norm": 0.32059556245803833,
"learning_rate": 0.0005792114152591729,
"loss": 3.8643,
"step": 6050
},
{
"epoch": 1.775500698649278,
"grad_norm": 0.3130369186401367,
"learning_rate": 0.0005790366919044846,
"loss": 3.8665,
"step": 6100
},
{
"epoch": 1.7900558919422451,
"grad_norm": 0.3213827908039093,
"learning_rate": 0.0005788619685497961,
"loss": 3.8755,
"step": 6150
},
{
"epoch": 1.804611085235212,
"grad_norm": 0.2993370294570923,
"learning_rate": 0.0005786872451951077,
"loss": 3.8674,
"step": 6200
},
{
"epoch": 1.8191662785281788,
"grad_norm": 0.3117568790912628,
"learning_rate": 0.0005785125218404193,
"loss": 3.8717,
"step": 6250
},
{
"epoch": 1.8337214718211459,
"grad_norm": 0.3372628092765808,
"learning_rate": 0.0005783377984857308,
"loss": 3.8741,
"step": 6300
},
{
"epoch": 1.8482766651141127,
"grad_norm": 0.3122817575931549,
"learning_rate": 0.0005781630751310425,
"loss": 3.8657,
"step": 6350
},
{
"epoch": 1.8628318584070795,
"grad_norm": 0.32614654302597046,
"learning_rate": 0.0005779883517763541,
"loss": 3.8523,
"step": 6400
},
{
"epoch": 1.8773870517000466,
"grad_norm": 0.3281755745410919,
"learning_rate": 0.0005778136284216657,
"loss": 3.849,
"step": 6450
},
{
"epoch": 1.8919422449930137,
"grad_norm": 0.32258889079093933,
"learning_rate": 0.0005776389050669772,
"loss": 3.8664,
"step": 6500
},
{
"epoch": 1.9064974382859803,
"grad_norm": 0.3219572603702545,
"learning_rate": 0.0005774641817122889,
"loss": 3.851,
"step": 6550
},
{
"epoch": 1.9210526315789473,
"grad_norm": 0.3125177025794983,
"learning_rate": 0.0005772894583576004,
"loss": 3.8439,
"step": 6600
},
{
"epoch": 1.9356078248719144,
"grad_norm": 0.3299446403980255,
"learning_rate": 0.000577114735002912,
"loss": 3.8437,
"step": 6650
},
{
"epoch": 1.9501630181648812,
"grad_norm": 0.29954883456230164,
"learning_rate": 0.0005769400116482236,
"loss": 3.8372,
"step": 6700
},
{
"epoch": 1.964718211457848,
"grad_norm": 0.3428015112876892,
"learning_rate": 0.0005767652882935352,
"loss": 3.842,
"step": 6750
},
{
"epoch": 1.9792734047508151,
"grad_norm": 0.30913880467414856,
"learning_rate": 0.0005765905649388468,
"loss": 3.8458,
"step": 6800
},
{
"epoch": 1.993828598043782,
"grad_norm": 0.32585445046424866,
"learning_rate": 0.0005764158415841583,
"loss": 3.8499,
"step": 6850
},
{
"epoch": 2.0081509082440614,
"grad_norm": 0.33030200004577637,
"learning_rate": 0.00057624111822947,
"loss": 3.7942,
"step": 6900
},
{
"epoch": 2.0227061015370285,
"grad_norm": 0.34908726811408997,
"learning_rate": 0.0005760663948747816,
"loss": 3.7466,
"step": 6950
},
{
"epoch": 2.0372612948299955,
"grad_norm": 0.3279435336589813,
"learning_rate": 0.0005758916715200931,
"loss": 3.7465,
"step": 7000
},
{
"epoch": 2.0372612948299955,
"eval_accuracy": 0.34202185279211383,
"eval_loss": 3.8094260692596436,
"eval_runtime": 179.4152,
"eval_samples_per_second": 92.807,
"eval_steps_per_second": 5.802,
"step": 7000
},
{
"epoch": 2.051816488122962,
"grad_norm": 0.32543864846229553,
"learning_rate": 0.0005757169481654047,
"loss": 3.7445,
"step": 7050
},
{
"epoch": 2.066371681415929,
"grad_norm": 0.3198238015174866,
"learning_rate": 0.0005755422248107163,
"loss": 3.7343,
"step": 7100
},
{
"epoch": 2.0809268747088963,
"grad_norm": 0.3203284442424774,
"learning_rate": 0.0005753675014560279,
"loss": 3.7459,
"step": 7150
},
{
"epoch": 2.095482068001863,
"grad_norm": 0.3182157576084137,
"learning_rate": 0.0005751927781013395,
"loss": 3.7434,
"step": 7200
},
{
"epoch": 2.11003726129483,
"grad_norm": 0.3111145794391632,
"learning_rate": 0.0005750180547466511,
"loss": 3.7489,
"step": 7250
},
{
"epoch": 2.124592454587797,
"grad_norm": 0.3280778229236603,
"learning_rate": 0.0005748433313919627,
"loss": 3.7433,
"step": 7300
},
{
"epoch": 2.139147647880764,
"grad_norm": 0.31418535113334656,
"learning_rate": 0.0005746686080372743,
"loss": 3.7502,
"step": 7350
},
{
"epoch": 2.1537028411737307,
"grad_norm": 0.30446282029151917,
"learning_rate": 0.0005744938846825858,
"loss": 3.7584,
"step": 7400
},
{
"epoch": 2.1682580344666977,
"grad_norm": 0.3233334422111511,
"learning_rate": 0.0005743191613278974,
"loss": 3.7633,
"step": 7450
},
{
"epoch": 2.182813227759665,
"grad_norm": 0.33291539549827576,
"learning_rate": 0.0005741444379732091,
"loss": 3.7499,
"step": 7500
},
{
"epoch": 2.1973684210526314,
"grad_norm": 0.30665940046310425,
"learning_rate": 0.0005739697146185206,
"loss": 3.7578,
"step": 7550
},
{
"epoch": 2.2119236143455985,
"grad_norm": 0.33006513118743896,
"learning_rate": 0.0005737949912638322,
"loss": 3.7439,
"step": 7600
},
{
"epoch": 2.2264788076385655,
"grad_norm": 0.33310437202453613,
"learning_rate": 0.0005736202679091438,
"loss": 3.7458,
"step": 7650
},
{
"epoch": 2.241034000931532,
"grad_norm": 0.3161093294620514,
"learning_rate": 0.0005734455445544554,
"loss": 3.7432,
"step": 7700
},
{
"epoch": 2.255589194224499,
"grad_norm": 0.3161999583244324,
"learning_rate": 0.000573270821199767,
"loss": 3.7499,
"step": 7750
},
{
"epoch": 2.2701443875174663,
"grad_norm": 0.32372191548347473,
"learning_rate": 0.0005730960978450785,
"loss": 3.7527,
"step": 7800
},
{
"epoch": 2.2846995808104333,
"grad_norm": 0.31958499550819397,
"learning_rate": 0.0005729213744903902,
"loss": 3.7505,
"step": 7850
},
{
"epoch": 2.2992547741034,
"grad_norm": 0.3293428421020508,
"learning_rate": 0.0005727466511357017,
"loss": 3.7562,
"step": 7900
},
{
"epoch": 2.313809967396367,
"grad_norm": 0.3318077623844147,
"learning_rate": 0.0005725719277810134,
"loss": 3.7479,
"step": 7950
},
{
"epoch": 2.328365160689334,
"grad_norm": 0.31161871552467346,
"learning_rate": 0.0005723972044263249,
"loss": 3.7597,
"step": 8000
},
{
"epoch": 2.328365160689334,
"eval_accuracy": 0.3449051501758032,
"eval_loss": 3.7793445587158203,
"eval_runtime": 179.445,
"eval_samples_per_second": 92.792,
"eval_steps_per_second": 5.801,
"step": 8000
},
{
"epoch": 2.3429203539823007,
"grad_norm": 0.3241608440876007,
"learning_rate": 0.0005722224810716366,
"loss": 3.731,
"step": 8050
},
{
"epoch": 2.3574755472752678,
"grad_norm": 0.302915096282959,
"learning_rate": 0.0005720477577169481,
"loss": 3.7498,
"step": 8100
},
{
"epoch": 2.372030740568235,
"grad_norm": 0.30697834491729736,
"learning_rate": 0.0005718730343622598,
"loss": 3.7483,
"step": 8150
},
{
"epoch": 2.386585933861202,
"grad_norm": 0.30782854557037354,
"learning_rate": 0.0005716983110075713,
"loss": 3.7506,
"step": 8200
},
{
"epoch": 2.4011411271541685,
"grad_norm": 0.3194926083087921,
"learning_rate": 0.0005715235876528828,
"loss": 3.7471,
"step": 8250
},
{
"epoch": 2.4156963204471356,
"grad_norm": 0.32422101497650146,
"learning_rate": 0.0005713488642981945,
"loss": 3.7533,
"step": 8300
},
{
"epoch": 2.4302515137401026,
"grad_norm": 0.3300728499889374,
"learning_rate": 0.0005711741409435061,
"loss": 3.7488,
"step": 8350
},
{
"epoch": 2.4448067070330692,
"grad_norm": 0.3104513883590698,
"learning_rate": 0.0005709994175888177,
"loss": 3.7511,
"step": 8400
},
{
"epoch": 2.4593619003260363,
"grad_norm": 0.3395399749279022,
"learning_rate": 0.0005708246942341292,
"loss": 3.755,
"step": 8450
},
{
"epoch": 2.4739170936190034,
"grad_norm": 0.321796715259552,
"learning_rate": 0.0005706499708794409,
"loss": 3.7581,
"step": 8500
},
{
"epoch": 2.4884722869119704,
"grad_norm": 0.33986401557922363,
"learning_rate": 0.0005704752475247524,
"loss": 3.7325,
"step": 8550
},
{
"epoch": 2.503027480204937,
"grad_norm": 0.3136991262435913,
"learning_rate": 0.0005703005241700641,
"loss": 3.7543,
"step": 8600
},
{
"epoch": 2.517582673497904,
"grad_norm": 0.3213069438934326,
"learning_rate": 0.0005701258008153756,
"loss": 3.7426,
"step": 8650
},
{
"epoch": 2.5321378667908707,
"grad_norm": 0.32211968302726746,
"learning_rate": 0.0005699510774606872,
"loss": 3.7248,
"step": 8700
},
{
"epoch": 2.546693060083838,
"grad_norm": 0.304557204246521,
"learning_rate": 0.0005697763541059988,
"loss": 3.7378,
"step": 8750
},
{
"epoch": 2.561248253376805,
"grad_norm": 0.32637786865234375,
"learning_rate": 0.0005696016307513103,
"loss": 3.7449,
"step": 8800
},
{
"epoch": 2.575803446669772,
"grad_norm": 0.31502121686935425,
"learning_rate": 0.000569426907396622,
"loss": 3.7256,
"step": 8850
},
{
"epoch": 2.590358639962739,
"grad_norm": 0.32065874338150024,
"learning_rate": 0.0005692521840419336,
"loss": 3.7397,
"step": 8900
},
{
"epoch": 2.6049138332557056,
"grad_norm": 0.3020254671573639,
"learning_rate": 0.0005690774606872452,
"loss": 3.7328,
"step": 8950
},
{
"epoch": 2.6194690265486726,
"grad_norm": 0.32158565521240234,
"learning_rate": 0.0005689027373325567,
"loss": 3.7427,
"step": 9000
},
{
"epoch": 2.6194690265486726,
"eval_accuracy": 0.347388149557257,
"eval_loss": 3.7533369064331055,
"eval_runtime": 179.4903,
"eval_samples_per_second": 92.768,
"eval_steps_per_second": 5.8,
"step": 9000
},
{
"epoch": 2.6340242198416393,
"grad_norm": 0.32961833477020264,
"learning_rate": 0.0005687280139778683,
"loss": 3.7421,
"step": 9050
},
{
"epoch": 2.6485794131346063,
"grad_norm": 0.34298452734947205,
"learning_rate": 0.0005685532906231799,
"loss": 3.7278,
"step": 9100
},
{
"epoch": 2.6631346064275734,
"grad_norm": 0.3249497711658478,
"learning_rate": 0.0005683785672684915,
"loss": 3.7257,
"step": 9150
},
{
"epoch": 2.6776897997205404,
"grad_norm": 0.3133349120616913,
"learning_rate": 0.0005682038439138031,
"loss": 3.7441,
"step": 9200
},
{
"epoch": 2.692244993013507,
"grad_norm": 0.300086110830307,
"learning_rate": 0.0005680291205591147,
"loss": 3.7475,
"step": 9250
},
{
"epoch": 2.706800186306474,
"grad_norm": 0.3237035870552063,
"learning_rate": 0.0005678543972044263,
"loss": 3.7394,
"step": 9300
},
{
"epoch": 2.721355379599441,
"grad_norm": 0.32211583852767944,
"learning_rate": 0.0005676796738497378,
"loss": 3.7323,
"step": 9350
},
{
"epoch": 2.735910572892408,
"grad_norm": 0.31221750378608704,
"learning_rate": 0.0005675049504950495,
"loss": 3.7259,
"step": 9400
},
{
"epoch": 2.750465766185375,
"grad_norm": 0.3241555690765381,
"learning_rate": 0.0005673302271403611,
"loss": 3.7383,
"step": 9450
},
{
"epoch": 2.765020959478342,
"grad_norm": 0.29803764820098877,
"learning_rate": 0.0005671555037856726,
"loss": 3.7216,
"step": 9500
},
{
"epoch": 2.779576152771309,
"grad_norm": 0.3058941662311554,
"learning_rate": 0.0005669807804309842,
"loss": 3.7273,
"step": 9550
},
{
"epoch": 2.7941313460642756,
"grad_norm": 0.3076792061328888,
"learning_rate": 0.0005668060570762958,
"loss": 3.729,
"step": 9600
},
{
"epoch": 2.8086865393572427,
"grad_norm": 0.3100437819957733,
"learning_rate": 0.0005666313337216074,
"loss": 3.7223,
"step": 9650
},
{
"epoch": 2.8232417326502097,
"grad_norm": 0.312364786863327,
"learning_rate": 0.000566456610366919,
"loss": 3.7305,
"step": 9700
},
{
"epoch": 2.8377969259431763,
"grad_norm": 0.31409692764282227,
"learning_rate": 0.0005662818870122306,
"loss": 3.7071,
"step": 9750
},
{
"epoch": 2.8523521192361434,
"grad_norm": 0.30859071016311646,
"learning_rate": 0.0005661071636575422,
"loss": 3.7384,
"step": 9800
},
{
"epoch": 2.8669073125291105,
"grad_norm": 0.3208765685558319,
"learning_rate": 0.0005659324403028537,
"loss": 3.7277,
"step": 9850
},
{
"epoch": 2.8814625058220775,
"grad_norm": 0.29655808210372925,
"learning_rate": 0.0005657577169481653,
"loss": 3.7154,
"step": 9900
},
{
"epoch": 2.896017699115044,
"grad_norm": 0.3313380777835846,
"learning_rate": 0.0005655829935934769,
"loss": 3.7181,
"step": 9950
},
{
"epoch": 2.910572892408011,
"grad_norm": 0.2961195707321167,
"learning_rate": 0.0005654082702387886,
"loss": 3.7251,
"step": 10000
},
{
"epoch": 2.910572892408011,
"eval_accuracy": 0.34988078617775464,
"eval_loss": 3.723102331161499,
"eval_runtime": 179.4175,
"eval_samples_per_second": 92.806,
"eval_steps_per_second": 5.802,
"step": 10000
},
{
"epoch": 2.9251280857009783,
"grad_norm": 0.30046647787094116,
"learning_rate": 0.0005652335468841001,
"loss": 3.6131,
"step": 10050
},
{
"epoch": 2.939683278993945,
"grad_norm": 0.33831682801246643,
"learning_rate": 0.0005650588235294117,
"loss": 3.6069,
"step": 10100
},
{
"epoch": 2.954238472286912,
"grad_norm": 0.3195353150367737,
"learning_rate": 0.0005648841001747233,
"loss": 3.6146,
"step": 10150
},
{
"epoch": 2.968793665579879,
"grad_norm": 0.30412888526916504,
"learning_rate": 0.0005647093768200349,
"loss": 3.6043,
"step": 10200
},
{
"epoch": 2.983348858872846,
"grad_norm": 0.3138900399208069,
"learning_rate": 0.0005645346534653465,
"loss": 3.6143,
"step": 10250
},
{
"epoch": 2.9979040521658127,
"grad_norm": 0.31442713737487793,
"learning_rate": 0.0005643599301106582,
"loss": 3.6146,
"step": 10300
},
{
"epoch": 3.0125174662319516,
"grad_norm": 0.33579516410827637,
"learning_rate": 0.0005641852067559697,
"loss": 3.6742,
"step": 10350
},
{
"epoch": 3.0270726595249187,
"grad_norm": 0.32205700874328613,
"learning_rate": 0.0005640104834012812,
"loss": 3.6072,
"step": 10400
},
{
"epoch": 3.0416278528178853,
"grad_norm": 0.3189714550971985,
"learning_rate": 0.0005638357600465929,
"loss": 3.6164,
"step": 10450
},
{
"epoch": 3.0561830461108523,
"grad_norm": 0.3315085172653198,
"learning_rate": 0.0005636610366919044,
"loss": 3.6037,
"step": 10500
},
{
"epoch": 3.0707382394038194,
"grad_norm": 0.3438396155834198,
"learning_rate": 0.0005634863133372161,
"loss": 3.6141,
"step": 10550
},
{
"epoch": 3.085293432696786,
"grad_norm": 0.32832789421081543,
"learning_rate": 0.0005633115899825276,
"loss": 3.6291,
"step": 10600
},
{
"epoch": 3.099848625989753,
"grad_norm": 0.33390843868255615,
"learning_rate": 0.0005631368666278393,
"loss": 3.6353,
"step": 10650
},
{
"epoch": 3.11440381928272,
"grad_norm": 0.32860368490219116,
"learning_rate": 0.0005629621432731508,
"loss": 3.628,
"step": 10700
},
{
"epoch": 3.128959012575687,
"grad_norm": 0.32481637597084045,
"learning_rate": 0.0005627874199184623,
"loss": 3.6332,
"step": 10750
},
{
"epoch": 3.143514205868654,
"grad_norm": 0.32326942682266235,
"learning_rate": 0.000562612696563774,
"loss": 3.6268,
"step": 10800
},
{
"epoch": 3.158069399161621,
"grad_norm": 0.3113882541656494,
"learning_rate": 0.0005624379732090856,
"loss": 3.6337,
"step": 10850
},
{
"epoch": 3.172624592454588,
"grad_norm": 0.32441622018814087,
"learning_rate": 0.0005622632498543972,
"loss": 3.6364,
"step": 10900
},
{
"epoch": 3.1871797857475546,
"grad_norm": 0.31897199153900146,
"learning_rate": 0.0005620885264997087,
"loss": 3.6257,
"step": 10950
},
{
"epoch": 3.2017349790405216,
"grad_norm": 0.3312537968158722,
"learning_rate": 0.0005619138031450204,
"loss": 3.629,
"step": 11000
},
{
"epoch": 3.2017349790405216,
"eval_accuracy": 0.3520870087549616,
"eval_loss": 3.7162582874298096,
"eval_runtime": 181.7923,
"eval_samples_per_second": 91.594,
"eval_steps_per_second": 5.726,
"step": 11000
},
{
"epoch": 3.2162901723334887,
"grad_norm": 0.32515862584114075,
"learning_rate": 0.0005617390797903319,
"loss": 3.6315,
"step": 11050
},
{
"epoch": 3.2308453656264557,
"grad_norm": 0.2998684048652649,
"learning_rate": 0.0005615643564356436,
"loss": 3.64,
"step": 11100
},
{
"epoch": 3.2454005589194224,
"grad_norm": 0.3279356062412262,
"learning_rate": 0.0005613896330809551,
"loss": 3.6487,
"step": 11150
},
{
"epoch": 3.2599557522123894,
"grad_norm": 0.31287768483161926,
"learning_rate": 0.0005612149097262667,
"loss": 3.6286,
"step": 11200
},
{
"epoch": 3.2745109455053565,
"grad_norm": 0.34031912684440613,
"learning_rate": 0.0005610401863715783,
"loss": 3.6535,
"step": 11250
},
{
"epoch": 3.289066138798323,
"grad_norm": 0.32775530219078064,
"learning_rate": 0.0005608654630168898,
"loss": 3.6403,
"step": 11300
},
{
"epoch": 3.30362133209129,
"grad_norm": 0.30365148186683655,
"learning_rate": 0.0005606907396622015,
"loss": 3.6398,
"step": 11350
},
{
"epoch": 3.3181765253842572,
"grad_norm": 0.31511229276657104,
"learning_rate": 0.0005605160163075131,
"loss": 3.6436,
"step": 11400
},
{
"epoch": 3.332731718677224,
"grad_norm": 0.3300502896308899,
"learning_rate": 0.0005603412929528247,
"loss": 3.6464,
"step": 11450
},
{
"epoch": 3.347286911970191,
"grad_norm": 0.3103245198726654,
"learning_rate": 0.0005601665695981362,
"loss": 3.636,
"step": 11500
},
{
"epoch": 3.361842105263158,
"grad_norm": 0.322672039270401,
"learning_rate": 0.0005599918462434478,
"loss": 3.6354,
"step": 11550
},
{
"epoch": 3.3763972985561246,
"grad_norm": 0.3106880486011505,
"learning_rate": 0.0005598171228887594,
"loss": 3.6439,
"step": 11600
},
{
"epoch": 3.3909524918490916,
"grad_norm": 0.3212052583694458,
"learning_rate": 0.0005596423995340709,
"loss": 3.6357,
"step": 11650
},
{
"epoch": 3.4055076851420587,
"grad_norm": 0.3229770362377167,
"learning_rate": 0.0005594676761793826,
"loss": 3.6416,
"step": 11700
},
{
"epoch": 3.4200628784350258,
"grad_norm": 0.31980758905410767,
"learning_rate": 0.0005592929528246942,
"loss": 3.6531,
"step": 11750
},
{
"epoch": 3.4346180717279924,
"grad_norm": 0.3255523145198822,
"learning_rate": 0.0005591182294700058,
"loss": 3.6435,
"step": 11800
},
{
"epoch": 3.4491732650209594,
"grad_norm": 0.3042266368865967,
"learning_rate": 0.0005589435061153173,
"loss": 3.6418,
"step": 11850
},
{
"epoch": 3.4637284583139265,
"grad_norm": 0.32445093989372253,
"learning_rate": 0.000558768782760629,
"loss": 3.6515,
"step": 11900
},
{
"epoch": 3.478283651606893,
"grad_norm": 0.31971192359924316,
"learning_rate": 0.0005585940594059406,
"loss": 3.6484,
"step": 11950
},
{
"epoch": 3.49283884489986,
"grad_norm": 0.3065326511859894,
"learning_rate": 0.0005584193360512521,
"loss": 3.6484,
"step": 12000
},
{
"epoch": 3.49283884489986,
"eval_accuracy": 0.35326992108394023,
"eval_loss": 3.6962692737579346,
"eval_runtime": 180.8346,
"eval_samples_per_second": 92.079,
"eval_steps_per_second": 5.757,
"step": 12000
},
{
"epoch": 3.5073940381928272,
"grad_norm": 0.3139113485813141,
"learning_rate": 0.0005582446126965637,
"loss": 3.6446,
"step": 12050
},
{
"epoch": 3.5219492314857943,
"grad_norm": 0.32031387090682983,
"learning_rate": 0.0005580698893418753,
"loss": 3.6495,
"step": 12100
},
{
"epoch": 3.536504424778761,
"grad_norm": 0.3141990602016449,
"learning_rate": 0.0005578951659871869,
"loss": 3.6398,
"step": 12150
},
{
"epoch": 3.551059618071728,
"grad_norm": 0.3226875066757202,
"learning_rate": 0.0005577204426324985,
"loss": 3.6488,
"step": 12200
},
{
"epoch": 3.565614811364695,
"grad_norm": 0.30300959944725037,
"learning_rate": 0.0005575457192778101,
"loss": 3.6313,
"step": 12250
},
{
"epoch": 3.5801700046576617,
"grad_norm": 0.3211573660373688,
"learning_rate": 0.0005573709959231217,
"loss": 3.6463,
"step": 12300
},
{
"epoch": 3.5947251979506287,
"grad_norm": 0.31371423602104187,
"learning_rate": 0.0005571962725684332,
"loss": 3.6475,
"step": 12350
},
{
"epoch": 3.609280391243596,
"grad_norm": 0.31352680921554565,
"learning_rate": 0.0005570215492137449,
"loss": 3.6435,
"step": 12400
},
{
"epoch": 3.623835584536563,
"grad_norm": 0.29655176401138306,
"learning_rate": 0.0005568468258590564,
"loss": 3.6431,
"step": 12450
},
{
"epoch": 3.6383907778295295,
"grad_norm": 0.32901984453201294,
"learning_rate": 0.0005566721025043681,
"loss": 3.632,
"step": 12500
},
{
"epoch": 3.6529459711224965,
"grad_norm": 0.29682856798171997,
"learning_rate": 0.0005564973791496796,
"loss": 3.649,
"step": 12550
},
{
"epoch": 3.6675011644154636,
"grad_norm": 0.32371819019317627,
"learning_rate": 0.0005563226557949913,
"loss": 3.6322,
"step": 12600
},
{
"epoch": 3.68205635770843,
"grad_norm": 0.3183552324771881,
"learning_rate": 0.0005561479324403028,
"loss": 3.6362,
"step": 12650
},
{
"epoch": 3.6966115510013973,
"grad_norm": 0.31535252928733826,
"learning_rate": 0.0005559732090856144,
"loss": 3.6283,
"step": 12700
},
{
"epoch": 3.7111667442943643,
"grad_norm": 0.30772748589515686,
"learning_rate": 0.000555798485730926,
"loss": 3.6552,
"step": 12750
},
{
"epoch": 3.7257219375873314,
"grad_norm": 0.32342010736465454,
"learning_rate": 0.0005556237623762376,
"loss": 3.6481,
"step": 12800
},
{
"epoch": 3.740277130880298,
"grad_norm": 0.3001164495944977,
"learning_rate": 0.0005554490390215492,
"loss": 3.632,
"step": 12850
},
{
"epoch": 3.754832324173265,
"grad_norm": 0.3260549306869507,
"learning_rate": 0.0005552743156668607,
"loss": 3.6338,
"step": 12900
},
{
"epoch": 3.7693875174662317,
"grad_norm": 0.325056254863739,
"learning_rate": 0.0005550995923121724,
"loss": 3.6336,
"step": 12950
},
{
"epoch": 3.7839427107591987,
"grad_norm": 0.3091551959514618,
"learning_rate": 0.0005549248689574839,
"loss": 3.6332,
"step": 13000
},
{
"epoch": 3.7839427107591987,
"eval_accuracy": 0.354759579680046,
"eval_loss": 3.6787266731262207,
"eval_runtime": 180.7938,
"eval_samples_per_second": 92.099,
"eval_steps_per_second": 5.758,
"step": 13000
},
{
"epoch": 3.798497904052166,
"grad_norm": 0.3215680718421936,
"learning_rate": 0.0005547501456027955,
"loss": 3.646,
"step": 13050
},
{
"epoch": 3.813053097345133,
"grad_norm": 0.30009904503822327,
"learning_rate": 0.0005545754222481071,
"loss": 3.6497,
"step": 13100
},
{
"epoch": 3.8276082906381,
"grad_norm": 0.31329527497291565,
"learning_rate": 0.0005544006988934188,
"loss": 3.6408,
"step": 13150
},
{
"epoch": 3.8421634839310665,
"grad_norm": 0.3503079116344452,
"learning_rate": 0.0005542259755387303,
"loss": 3.6464,
"step": 13200
},
{
"epoch": 3.8567186772240336,
"grad_norm": 0.32458430528640747,
"learning_rate": 0.0005540512521840418,
"loss": 3.6416,
"step": 13250
},
{
"epoch": 3.8712738705170002,
"grad_norm": 0.28616443276405334,
"learning_rate": 0.0005538765288293535,
"loss": 3.6359,
"step": 13300
},
{
"epoch": 3.8858290638099673,
"grad_norm": 0.310008704662323,
"learning_rate": 0.0005537018054746651,
"loss": 3.6406,
"step": 13350
},
{
"epoch": 3.9003842571029343,
"grad_norm": 0.3077920377254486,
"learning_rate": 0.0005535270821199767,
"loss": 3.6404,
"step": 13400
},
{
"epoch": 3.9149394503959014,
"grad_norm": 0.3122752904891968,
"learning_rate": 0.0005533523587652882,
"loss": 3.6297,
"step": 13450
},
{
"epoch": 3.929494643688868,
"grad_norm": 0.30146297812461853,
"learning_rate": 0.0005531776354105999,
"loss": 3.6378,
"step": 13500
},
{
"epoch": 3.944049836981835,
"grad_norm": 0.3015786409378052,
"learning_rate": 0.0005530029120559114,
"loss": 3.6414,
"step": 13550
},
{
"epoch": 3.958605030274802,
"grad_norm": 0.3123229444026947,
"learning_rate": 0.0005528281887012229,
"loss": 3.64,
"step": 13600
},
{
"epoch": 3.9731602235677688,
"grad_norm": 0.3401547968387604,
"learning_rate": 0.0005526534653465346,
"loss": 3.6367,
"step": 13650
},
{
"epoch": 3.987715416860736,
"grad_norm": 0.327626496553421,
"learning_rate": 0.0005524787419918462,
"loss": 3.6363,
"step": 13700
},
{
"epoch": 4.0020377270610155,
"grad_norm": 0.3363268971443176,
"learning_rate": 0.0005523040186371578,
"loss": 3.6228,
"step": 13750
},
{
"epoch": 4.0165929203539825,
"grad_norm": 0.3198484480381012,
"learning_rate": 0.0005521292952824693,
"loss": 3.5255,
"step": 13800
},
{
"epoch": 4.03114811364695,
"grad_norm": 0.3151451349258423,
"learning_rate": 0.000551954571927781,
"loss": 3.5266,
"step": 13850
},
{
"epoch": 4.045703306939916,
"grad_norm": 0.3302767872810364,
"learning_rate": 0.0005517798485730926,
"loss": 3.5197,
"step": 13900
},
{
"epoch": 4.060258500232883,
"grad_norm": 0.3168678283691406,
"learning_rate": 0.0005516051252184042,
"loss": 3.528,
"step": 13950
},
{
"epoch": 4.07481369352585,
"grad_norm": 0.33349886536598206,
"learning_rate": 0.0005514304018637157,
"loss": 3.5315,
"step": 14000
},
{
"epoch": 4.07481369352585,
"eval_accuracy": 0.3561975262617702,
"eval_loss": 3.6728265285491943,
"eval_runtime": 180.7141,
"eval_samples_per_second": 92.14,
"eval_steps_per_second": 5.76,
"step": 14000
},
{
"epoch": 4.089368886818817,
"grad_norm": 0.32183805108070374,
"learning_rate": 0.0005512556785090273,
"loss": 3.5357,
"step": 14050
},
{
"epoch": 4.103924080111784,
"grad_norm": 0.31829333305358887,
"learning_rate": 0.0005510809551543389,
"loss": 3.5455,
"step": 14100
},
{
"epoch": 4.118479273404751,
"grad_norm": 0.30199795961380005,
"learning_rate": 0.0005509062317996504,
"loss": 3.5482,
"step": 14150
},
{
"epoch": 4.133034466697718,
"grad_norm": 0.311040461063385,
"learning_rate": 0.0005507315084449621,
"loss": 3.5449,
"step": 14200
},
{
"epoch": 4.147589659990684,
"grad_norm": 0.33546823263168335,
"learning_rate": 0.0005505567850902737,
"loss": 3.5483,
"step": 14250
},
{
"epoch": 4.162144853283651,
"grad_norm": 0.33755001425743103,
"learning_rate": 0.0005503820617355853,
"loss": 3.5585,
"step": 14300
},
{
"epoch": 4.176700046576618,
"grad_norm": 0.3277694582939148,
"learning_rate": 0.0005502073383808969,
"loss": 3.566,
"step": 14350
},
{
"epoch": 4.1912552398695855,
"grad_norm": 0.32476261258125305,
"learning_rate": 0.0005500326150262085,
"loss": 3.551,
"step": 14400
},
{
"epoch": 4.2058104331625525,
"grad_norm": 0.35436880588531494,
"learning_rate": 0.00054985789167152,
"loss": 3.5592,
"step": 14450
},
{
"epoch": 4.22036562645552,
"grad_norm": 0.32852813601493835,
"learning_rate": 0.0005496831683168316,
"loss": 3.5695,
"step": 14500
},
{
"epoch": 4.234920819748487,
"grad_norm": 0.31876441836357117,
"learning_rate": 0.0005495084449621433,
"loss": 3.5533,
"step": 14550
},
{
"epoch": 4.249476013041453,
"grad_norm": 0.3397602140903473,
"learning_rate": 0.0005493337216074548,
"loss": 3.5484,
"step": 14600
},
{
"epoch": 4.26403120633442,
"grad_norm": 0.30222707986831665,
"learning_rate": 0.0005491589982527664,
"loss": 3.5632,
"step": 14650
},
{
"epoch": 4.278586399627387,
"grad_norm": 0.3110881745815277,
"learning_rate": 0.000548984274898078,
"loss": 3.5593,
"step": 14700
},
{
"epoch": 4.293141592920354,
"grad_norm": 0.32440635561943054,
"learning_rate": 0.0005488095515433897,
"loss": 3.556,
"step": 14750
},
{
"epoch": 4.307696786213321,
"grad_norm": 0.31460386514663696,
"learning_rate": 0.0005486348281887012,
"loss": 3.567,
"step": 14800
},
{
"epoch": 4.322251979506288,
"grad_norm": 0.2991078495979309,
"learning_rate": 0.0005484601048340127,
"loss": 3.563,
"step": 14850
},
{
"epoch": 4.336807172799254,
"grad_norm": 0.343728631734848,
"learning_rate": 0.0005482853814793244,
"loss": 3.5608,
"step": 14900
},
{
"epoch": 4.351362366092221,
"grad_norm": 0.32072120904922485,
"learning_rate": 0.0005481106581246359,
"loss": 3.5685,
"step": 14950
},
{
"epoch": 4.365917559385188,
"grad_norm": 0.3259231448173523,
"learning_rate": 0.0005479359347699475,
"loss": 3.5681,
"step": 15000
},
{
"epoch": 4.365917559385188,
"eval_accuracy": 0.3573054561698956,
"eval_loss": 3.6608378887176514,
"eval_runtime": 180.9212,
"eval_samples_per_second": 92.035,
"eval_steps_per_second": 5.754,
"step": 15000
},
{
"epoch": 4.3804727526781555,
"grad_norm": 0.3365843892097473,
"learning_rate": 0.0005477612114152591,
"loss": 3.5708,
"step": 15050
},
{
"epoch": 4.395027945971123,
"grad_norm": 0.32022547721862793,
"learning_rate": 0.0005475864880605708,
"loss": 3.5736,
"step": 15100
},
{
"epoch": 4.40958313926409,
"grad_norm": 0.3090197741985321,
"learning_rate": 0.0005474117647058823,
"loss": 3.5685,
"step": 15150
},
{
"epoch": 4.424138332557057,
"grad_norm": 0.31479737162590027,
"learning_rate": 0.0005472370413511939,
"loss": 3.5693,
"step": 15200
},
{
"epoch": 4.438693525850024,
"grad_norm": 0.3122880458831787,
"learning_rate": 0.0005470623179965055,
"loss": 3.5662,
"step": 15250
},
{
"epoch": 4.45324871914299,
"grad_norm": 0.31671661138534546,
"learning_rate": 0.0005468875946418171,
"loss": 3.5729,
"step": 15300
},
{
"epoch": 4.467803912435957,
"grad_norm": 0.32296523451805115,
"learning_rate": 0.0005467128712871287,
"loss": 3.5622,
"step": 15350
},
{
"epoch": 4.482359105728924,
"grad_norm": 0.315123975276947,
"learning_rate": 0.0005465381479324402,
"loss": 3.5623,
"step": 15400
},
{
"epoch": 4.496914299021891,
"grad_norm": 0.3224656581878662,
"learning_rate": 0.0005463634245777519,
"loss": 3.5768,
"step": 15450
},
{
"epoch": 4.511469492314858,
"grad_norm": 0.329997181892395,
"learning_rate": 0.0005461887012230634,
"loss": 3.5794,
"step": 15500
},
{
"epoch": 4.526024685607825,
"grad_norm": 0.3304203748703003,
"learning_rate": 0.000546013977868375,
"loss": 3.5778,
"step": 15550
},
{
"epoch": 4.540579878900791,
"grad_norm": 0.3303874433040619,
"learning_rate": 0.0005458392545136866,
"loss": 3.5719,
"step": 15600
},
{
"epoch": 4.5551350721937585,
"grad_norm": 0.31413403153419495,
"learning_rate": 0.0005456645311589983,
"loss": 3.5651,
"step": 15650
},
{
"epoch": 4.5696902654867255,
"grad_norm": 0.3068625032901764,
"learning_rate": 0.0005454898078043098,
"loss": 3.5817,
"step": 15700
},
{
"epoch": 4.584245458779693,
"grad_norm": 0.30968910455703735,
"learning_rate": 0.0005453150844496213,
"loss": 3.5769,
"step": 15750
},
{
"epoch": 4.59880065207266,
"grad_norm": 0.34386196732521057,
"learning_rate": 0.000545140361094933,
"loss": 3.581,
"step": 15800
},
{
"epoch": 4.613355845365627,
"grad_norm": 0.32992249727249146,
"learning_rate": 0.0005449656377402445,
"loss": 3.5871,
"step": 15850
},
{
"epoch": 4.627911038658594,
"grad_norm": 0.31546783447265625,
"learning_rate": 0.0005447909143855562,
"loss": 3.5648,
"step": 15900
},
{
"epoch": 4.64246623195156,
"grad_norm": 0.3235871493816376,
"learning_rate": 0.0005446161910308677,
"loss": 3.5675,
"step": 15950
},
{
"epoch": 4.657021425244527,
"grad_norm": 0.32952284812927246,
"learning_rate": 0.0005444414676761794,
"loss": 3.5781,
"step": 16000
},
{
"epoch": 4.657021425244527,
"eval_accuracy": 0.3587489265349742,
"eval_loss": 3.6463191509246826,
"eval_runtime": 180.9022,
"eval_samples_per_second": 92.044,
"eval_steps_per_second": 5.754,
"step": 16000
},
{
"epoch": 4.671576618537494,
"grad_norm": 0.2940504848957062,
"learning_rate": 0.0005442667443214909,
"loss": 3.583,
"step": 16050
},
{
"epoch": 4.686131811830461,
"grad_norm": 0.3155236840248108,
"learning_rate": 0.0005440920209668024,
"loss": 3.5748,
"step": 16100
},
{
"epoch": 4.700687005123428,
"grad_norm": 0.30486825108528137,
"learning_rate": 0.0005439172976121141,
"loss": 3.5818,
"step": 16150
},
{
"epoch": 4.715242198416395,
"grad_norm": 0.3136492669582367,
"learning_rate": 0.0005437425742574257,
"loss": 3.5634,
"step": 16200
},
{
"epoch": 4.729797391709361,
"grad_norm": 0.30183354020118713,
"learning_rate": 0.0005435678509027373,
"loss": 3.5841,
"step": 16250
},
{
"epoch": 4.7443525850023285,
"grad_norm": 0.3134796619415283,
"learning_rate": 0.0005433931275480488,
"loss": 3.5798,
"step": 16300
},
{
"epoch": 4.7589077782952955,
"grad_norm": 0.3129316568374634,
"learning_rate": 0.0005432184041933605,
"loss": 3.5788,
"step": 16350
},
{
"epoch": 4.773462971588263,
"grad_norm": 0.3630964457988739,
"learning_rate": 0.000543043680838672,
"loss": 3.5812,
"step": 16400
},
{
"epoch": 4.78801816488123,
"grad_norm": 0.3073498606681824,
"learning_rate": 0.0005428689574839837,
"loss": 3.5714,
"step": 16450
},
{
"epoch": 4.802573358174197,
"grad_norm": 0.31628504395484924,
"learning_rate": 0.0005426942341292952,
"loss": 3.5832,
"step": 16500
},
{
"epoch": 4.817128551467164,
"grad_norm": 0.3307545483112335,
"learning_rate": 0.0005425195107746068,
"loss": 3.577,
"step": 16550
},
{
"epoch": 4.831683744760131,
"grad_norm": 0.30894699692726135,
"learning_rate": 0.0005423447874199184,
"loss": 3.5857,
"step": 16600
},
{
"epoch": 4.846238938053097,
"grad_norm": 0.3408315181732178,
"learning_rate": 0.00054217006406523,
"loss": 3.57,
"step": 16650
},
{
"epoch": 4.860794131346064,
"grad_norm": 0.3083992600440979,
"learning_rate": 0.0005419953407105417,
"loss": 3.5818,
"step": 16700
},
{
"epoch": 4.875349324639031,
"grad_norm": 0.29149574041366577,
"learning_rate": 0.0005418206173558532,
"loss": 3.5844,
"step": 16750
},
{
"epoch": 4.889904517931998,
"grad_norm": 0.31696999073028564,
"learning_rate": 0.0005416458940011648,
"loss": 3.5859,
"step": 16800
},
{
"epoch": 4.904459711224965,
"grad_norm": 0.30741801857948303,
"learning_rate": 0.0005414711706464764,
"loss": 3.5762,
"step": 16850
},
{
"epoch": 4.919014904517932,
"grad_norm": 0.3022885322570801,
"learning_rate": 0.000541296447291788,
"loss": 3.5817,
"step": 16900
},
{
"epoch": 4.9335700978108985,
"grad_norm": 0.3171398937702179,
"learning_rate": 0.0005411217239370995,
"loss": 3.5774,
"step": 16950
},
{
"epoch": 4.948125291103866,
"grad_norm": 0.29663971066474915,
"learning_rate": 0.0005409470005824111,
"loss": 3.5613,
"step": 17000
},
{
"epoch": 4.948125291103866,
"eval_accuracy": 0.3598574440796266,
"eval_loss": 3.633816957473755,
"eval_runtime": 180.8783,
"eval_samples_per_second": 92.056,
"eval_steps_per_second": 5.755,
"step": 17000
},
{
"epoch": 4.962680484396833,
"grad_norm": 0.3096961975097656,
"learning_rate": 0.0005407722772277228,
"loss": 3.5658,
"step": 17050
},
{
"epoch": 4.9772356776898,
"grad_norm": 0.31567344069480896,
"learning_rate": 0.0005405975538730343,
"loss": 3.5664,
"step": 17100
},
{
"epoch": 4.991790870982767,
"grad_norm": 0.35638725757598877,
"learning_rate": 0.0005404228305183459,
"loss": 3.5741,
"step": 17150
},
{
"epoch": 5.006113181183046,
"grad_norm": 0.3229467272758484,
"learning_rate": 0.0005402481071636575,
"loss": 3.5306,
"step": 17200
},
{
"epoch": 5.020668374476013,
"grad_norm": 0.32992422580718994,
"learning_rate": 0.0005400733838089692,
"loss": 3.4672,
"step": 17250
},
{
"epoch": 5.03522356776898,
"grad_norm": 0.3403879404067993,
"learning_rate": 0.0005398986604542807,
"loss": 3.4667,
"step": 17300
},
{
"epoch": 5.049778761061947,
"grad_norm": 0.32500892877578735,
"learning_rate": 0.0005397239370995922,
"loss": 3.4752,
"step": 17350
},
{
"epoch": 5.064333954354914,
"grad_norm": 0.2997499108314514,
"learning_rate": 0.0005395492137449039,
"loss": 3.4724,
"step": 17400
},
{
"epoch": 5.078889147647881,
"grad_norm": 0.3481856882572174,
"learning_rate": 0.0005393744903902154,
"loss": 3.4825,
"step": 17450
},
{
"epoch": 5.093444340940848,
"grad_norm": 0.338044136762619,
"learning_rate": 0.000539199767035527,
"loss": 3.4807,
"step": 17500
},
{
"epoch": 5.107999534233815,
"grad_norm": 0.3250182867050171,
"learning_rate": 0.0005390250436808386,
"loss": 3.4797,
"step": 17550
},
{
"epoch": 5.122554727526782,
"grad_norm": 0.3101995885372162,
"learning_rate": 0.0005388503203261503,
"loss": 3.5014,
"step": 17600
},
{
"epoch": 5.137109920819748,
"grad_norm": 0.32210931181907654,
"learning_rate": 0.0005386755969714618,
"loss": 3.4837,
"step": 17650
},
{
"epoch": 5.151665114112715,
"grad_norm": 0.32669687271118164,
"learning_rate": 0.0005385008736167733,
"loss": 3.4861,
"step": 17700
},
{
"epoch": 5.166220307405682,
"grad_norm": 0.333234041929245,
"learning_rate": 0.000538326150262085,
"loss": 3.4951,
"step": 17750
},
{
"epoch": 5.180775500698649,
"grad_norm": 0.3409346640110016,
"learning_rate": 0.0005381514269073965,
"loss": 3.4741,
"step": 17800
},
{
"epoch": 5.195330693991616,
"grad_norm": 0.3232422471046448,
"learning_rate": 0.0005379767035527082,
"loss": 3.4955,
"step": 17850
},
{
"epoch": 5.2098858872845835,
"grad_norm": 0.32606247067451477,
"learning_rate": 0.0005378019801980197,
"loss": 3.5095,
"step": 17900
},
{
"epoch": 5.2244410805775505,
"grad_norm": 0.3317588269710541,
"learning_rate": 0.0005376272568433314,
"loss": 3.4932,
"step": 17950
},
{
"epoch": 5.238996273870517,
"grad_norm": 0.3223241865634918,
"learning_rate": 0.0005374525334886429,
"loss": 3.4987,
"step": 18000
},
{
"epoch": 5.238996273870517,
"eval_accuracy": 0.3603531742538573,
"eval_loss": 3.6327548027038574,
"eval_runtime": 181.0812,
"eval_samples_per_second": 91.953,
"eval_steps_per_second": 5.749,
"step": 18000
},
{
"epoch": 5.253551467163484,
"grad_norm": 0.3261396884918213,
"learning_rate": 0.0005372778101339545,
"loss": 3.5104,
"step": 18050
},
{
"epoch": 5.268106660456451,
"grad_norm": 0.31246986985206604,
"learning_rate": 0.0005371030867792661,
"loss": 3.5021,
"step": 18100
},
{
"epoch": 5.282661853749418,
"grad_norm": 0.320519357919693,
"learning_rate": 0.0005369283634245778,
"loss": 3.4982,
"step": 18150
},
{
"epoch": 5.297217047042385,
"grad_norm": 0.3356594443321228,
"learning_rate": 0.0005367536400698893,
"loss": 3.5051,
"step": 18200
},
{
"epoch": 5.311772240335352,
"grad_norm": 0.32634884119033813,
"learning_rate": 0.0005365789167152008,
"loss": 3.517,
"step": 18250
},
{
"epoch": 5.326327433628318,
"grad_norm": 0.31914022564888,
"learning_rate": 0.0005364041933605125,
"loss": 3.5203,
"step": 18300
},
{
"epoch": 5.340882626921285,
"grad_norm": 0.3101184070110321,
"learning_rate": 0.000536229470005824,
"loss": 3.5101,
"step": 18350
},
{
"epoch": 5.355437820214252,
"grad_norm": 0.313850075006485,
"learning_rate": 0.0005360547466511357,
"loss": 3.5077,
"step": 18400
},
{
"epoch": 5.369993013507219,
"grad_norm": 0.2964019477367401,
"learning_rate": 0.0005358800232964472,
"loss": 3.5284,
"step": 18450
},
{
"epoch": 5.384548206800186,
"grad_norm": 0.320024311542511,
"learning_rate": 0.0005357052999417589,
"loss": 3.5195,
"step": 18500
},
{
"epoch": 5.3991034000931535,
"grad_norm": 0.3183858394622803,
"learning_rate": 0.0005355305765870704,
"loss": 3.503,
"step": 18550
},
{
"epoch": 5.4136585933861205,
"grad_norm": 0.3233298659324646,
"learning_rate": 0.000535355853232382,
"loss": 3.5258,
"step": 18600
},
{
"epoch": 5.428213786679087,
"grad_norm": 0.3176547586917877,
"learning_rate": 0.0005351811298776936,
"loss": 3.5182,
"step": 18650
},
{
"epoch": 5.442768979972054,
"grad_norm": 0.32504311203956604,
"learning_rate": 0.0005350064065230052,
"loss": 3.5212,
"step": 18700
},
{
"epoch": 5.457324173265021,
"grad_norm": 0.32569703459739685,
"learning_rate": 0.0005348316831683168,
"loss": 3.5192,
"step": 18750
},
{
"epoch": 5.471879366557988,
"grad_norm": 0.31813210248947144,
"learning_rate": 0.0005346569598136284,
"loss": 3.516,
"step": 18800
},
{
"epoch": 5.486434559850955,
"grad_norm": 0.32232987880706787,
"learning_rate": 0.00053448223645894,
"loss": 3.5212,
"step": 18850
},
{
"epoch": 5.500989753143922,
"grad_norm": 0.33343178033828735,
"learning_rate": 0.0005343075131042515,
"loss": 3.5267,
"step": 18900
},
{
"epoch": 5.515544946436888,
"grad_norm": 0.33466091752052307,
"learning_rate": 0.0005341327897495632,
"loss": 3.5215,
"step": 18950
},
{
"epoch": 5.530100139729855,
"grad_norm": 0.3394691050052643,
"learning_rate": 0.0005339580663948748,
"loss": 3.5154,
"step": 19000
},
{
"epoch": 5.530100139729855,
"eval_accuracy": 0.36140610138304957,
"eval_loss": 3.622549533843994,
"eval_runtime": 180.8942,
"eval_samples_per_second": 92.048,
"eval_steps_per_second": 5.755,
"step": 19000
},
{
"epoch": 5.544655333022822,
"grad_norm": 0.31924381852149963,
"learning_rate": 0.0005337833430401863,
"loss": 3.5189,
"step": 19050
},
{
"epoch": 5.559210526315789,
"grad_norm": 0.30619359016418457,
"learning_rate": 0.0005336086196854979,
"loss": 3.522,
"step": 19100
},
{
"epoch": 5.573765719608756,
"grad_norm": 0.3263663947582245,
"learning_rate": 0.0005334338963308095,
"loss": 3.5163,
"step": 19150
},
{
"epoch": 5.5883209129017235,
"grad_norm": 0.32492828369140625,
"learning_rate": 0.0005332591729761211,
"loss": 3.5419,
"step": 19200
},
{
"epoch": 5.602876106194691,
"grad_norm": 0.30963876843452454,
"learning_rate": 0.0005330844496214327,
"loss": 3.5158,
"step": 19250
},
{
"epoch": 5.617431299487658,
"grad_norm": 0.33830997347831726,
"learning_rate": 0.0005329097262667443,
"loss": 3.5281,
"step": 19300
},
{
"epoch": 5.631986492780624,
"grad_norm": 0.3114701211452484,
"learning_rate": 0.0005327350029120559,
"loss": 3.5288,
"step": 19350
},
{
"epoch": 5.646541686073591,
"grad_norm": 0.32258859276771545,
"learning_rate": 0.0005325602795573674,
"loss": 3.5321,
"step": 19400
},
{
"epoch": 5.661096879366558,
"grad_norm": 0.342039555311203,
"learning_rate": 0.000532385556202679,
"loss": 3.5308,
"step": 19450
},
{
"epoch": 5.675652072659525,
"grad_norm": 0.3170706629753113,
"learning_rate": 0.0005322108328479906,
"loss": 3.5177,
"step": 19500
},
{
"epoch": 5.690207265952492,
"grad_norm": 0.3374808728694916,
"learning_rate": 0.0005320361094933023,
"loss": 3.5252,
"step": 19550
},
{
"epoch": 5.704762459245459,
"grad_norm": 0.29898542165756226,
"learning_rate": 0.0005318613861386138,
"loss": 3.5209,
"step": 19600
},
{
"epoch": 5.719317652538425,
"grad_norm": 0.3294072151184082,
"learning_rate": 0.0005316866627839254,
"loss": 3.5228,
"step": 19650
},
{
"epoch": 5.733872845831392,
"grad_norm": 0.30430126190185547,
"learning_rate": 0.000531511939429237,
"loss": 3.5207,
"step": 19700
},
{
"epoch": 5.748428039124359,
"grad_norm": 0.33038702607154846,
"learning_rate": 0.0005313372160745486,
"loss": 3.5294,
"step": 19750
},
{
"epoch": 5.7629832324173265,
"grad_norm": 0.3005201518535614,
"learning_rate": 0.0005311624927198602,
"loss": 3.5347,
"step": 19800
},
{
"epoch": 5.7775384257102935,
"grad_norm": 0.3165362775325775,
"learning_rate": 0.0005309877693651717,
"loss": 3.5212,
"step": 19850
},
{
"epoch": 5.792093619003261,
"grad_norm": 0.3144179880619049,
"learning_rate": 0.0005308130460104834,
"loss": 3.5243,
"step": 19900
},
{
"epoch": 5.806648812296228,
"grad_norm": 0.3042346239089966,
"learning_rate": 0.0005306383226557949,
"loss": 3.5196,
"step": 19950
},
{
"epoch": 5.821204005589195,
"grad_norm": 0.3238431513309479,
"learning_rate": 0.0005304635993011065,
"loss": 3.528,
"step": 20000
},
{
"epoch": 5.821204005589195,
"eval_accuracy": 0.36223972256034176,
"eval_loss": 3.6133012771606445,
"eval_runtime": 180.9991,
"eval_samples_per_second": 91.995,
"eval_steps_per_second": 5.751,
"step": 20000
},
{
"epoch": 5.835759198882161,
"grad_norm": 0.32478225231170654,
"learning_rate": 0.0005302888759464181,
"loss": 3.5305,
"step": 20050
},
{
"epoch": 5.850314392175128,
"grad_norm": 0.3156353831291199,
"learning_rate": 0.0005301141525917298,
"loss": 3.524,
"step": 20100
},
{
"epoch": 5.864869585468095,
"grad_norm": 0.3088098466396332,
"learning_rate": 0.0005299394292370413,
"loss": 3.5311,
"step": 20150
},
{
"epoch": 5.879424778761062,
"grad_norm": 0.32470381259918213,
"learning_rate": 0.0005297647058823528,
"loss": 3.5493,
"step": 20200
},
{
"epoch": 5.893979972054029,
"grad_norm": 0.3192812204360962,
"learning_rate": 0.0005295899825276645,
"loss": 3.5181,
"step": 20250
},
{
"epoch": 5.908535165346996,
"grad_norm": 0.3229912519454956,
"learning_rate": 0.000529415259172976,
"loss": 3.5267,
"step": 20300
},
{
"epoch": 5.923090358639962,
"grad_norm": 0.31635522842407227,
"learning_rate": 0.0005292405358182877,
"loss": 3.5297,
"step": 20350
},
{
"epoch": 5.937645551932929,
"grad_norm": 0.3278980851173401,
"learning_rate": 0.0005290658124635992,
"loss": 3.5257,
"step": 20400
},
{
"epoch": 5.9522007452258965,
"grad_norm": 0.3188556134700775,
"learning_rate": 0.0005288910891089109,
"loss": 3.5308,
"step": 20450
},
{
"epoch": 5.9667559385188635,
"grad_norm": 0.3289923071861267,
"learning_rate": 0.0005287163657542224,
"loss": 3.5367,
"step": 20500
},
{
"epoch": 5.981311131811831,
"grad_norm": 0.2996402680873871,
"learning_rate": 0.000528541642399534,
"loss": 3.5295,
"step": 20550
},
{
"epoch": 5.995866325104798,
"grad_norm": 0.3355785608291626,
"learning_rate": 0.0005283669190448456,
"loss": 3.5302,
"step": 20600
},
{
"epoch": 6.010188635305076,
"grad_norm": 0.3253796696662903,
"learning_rate": 0.0005281921956901572,
"loss": 3.4522,
"step": 20650
},
{
"epoch": 6.0247438285980435,
"grad_norm": 0.31267303228378296,
"learning_rate": 0.0005280174723354688,
"loss": 3.4193,
"step": 20700
},
{
"epoch": 6.0392990218910105,
"grad_norm": 0.35141482949256897,
"learning_rate": 0.0005278427489807804,
"loss": 3.4292,
"step": 20750
},
{
"epoch": 6.053854215183978,
"grad_norm": 0.31395936012268066,
"learning_rate": 0.000527668025626092,
"loss": 3.4291,
"step": 20800
},
{
"epoch": 6.068409408476945,
"grad_norm": 0.32290589809417725,
"learning_rate": 0.0005274933022714035,
"loss": 3.4305,
"step": 20850
},
{
"epoch": 6.082964601769912,
"grad_norm": 0.3760620951652527,
"learning_rate": 0.0005273185789167152,
"loss": 3.4281,
"step": 20900
},
{
"epoch": 6.097519795062879,
"grad_norm": 0.3305610120296478,
"learning_rate": 0.0005271438555620268,
"loss": 3.4387,
"step": 20950
},
{
"epoch": 6.112074988355845,
"grad_norm": 0.330484002828598,
"learning_rate": 0.0005269691322073384,
"loss": 3.442,
"step": 21000
},
{
"epoch": 6.112074988355845,
"eval_accuracy": 0.3625605721041184,
"eval_loss": 3.6160457134246826,
"eval_runtime": 181.0216,
"eval_samples_per_second": 91.984,
"eval_steps_per_second": 5.751,
"step": 21000
},
{
"epoch": 6.126630181648812,
"grad_norm": 0.3237796127796173,
"learning_rate": 0.0005267944088526499,
"loss": 3.4373,
"step": 21050
},
{
"epoch": 6.141185374941779,
"grad_norm": 0.33345699310302734,
"learning_rate": 0.0005266196854979615,
"loss": 3.4453,
"step": 21100
},
{
"epoch": 6.155740568234746,
"grad_norm": 0.3279930353164673,
"learning_rate": 0.0005264449621432731,
"loss": 3.4346,
"step": 21150
},
{
"epoch": 6.170295761527713,
"grad_norm": 0.33429208397865295,
"learning_rate": 0.0005262702387885847,
"loss": 3.4517,
"step": 21200
},
{
"epoch": 6.18485095482068,
"grad_norm": 0.32830730080604553,
"learning_rate": 0.0005260955154338963,
"loss": 3.4596,
"step": 21250
},
{
"epoch": 6.199406148113647,
"grad_norm": 0.3260050117969513,
"learning_rate": 0.0005259207920792079,
"loss": 3.4487,
"step": 21300
},
{
"epoch": 6.2139613414066135,
"grad_norm": 0.3051382005214691,
"learning_rate": 0.0005257460687245195,
"loss": 3.4669,
"step": 21350
},
{
"epoch": 6.2285165346995806,
"grad_norm": 0.35948219895362854,
"learning_rate": 0.000525571345369831,
"loss": 3.4666,
"step": 21400
},
{
"epoch": 6.243071727992548,
"grad_norm": 0.33564814925193787,
"learning_rate": 0.0005253966220151426,
"loss": 3.4627,
"step": 21450
},
{
"epoch": 6.257626921285515,
"grad_norm": 0.31916743516921997,
"learning_rate": 0.0005252218986604543,
"loss": 3.4698,
"step": 21500
},
{
"epoch": 6.272182114578482,
"grad_norm": 0.31790220737457275,
"learning_rate": 0.0005250471753057658,
"loss": 3.4645,
"step": 21550
},
{
"epoch": 6.286737307871449,
"grad_norm": 0.32677316665649414,
"learning_rate": 0.0005248724519510774,
"loss": 3.4663,
"step": 21600
},
{
"epoch": 6.301292501164416,
"grad_norm": 0.3288456201553345,
"learning_rate": 0.000524697728596389,
"loss": 3.4605,
"step": 21650
},
{
"epoch": 6.315847694457382,
"grad_norm": 0.33529654145240784,
"learning_rate": 0.0005245230052417006,
"loss": 3.4611,
"step": 21700
},
{
"epoch": 6.330402887750349,
"grad_norm": 0.3224530816078186,
"learning_rate": 0.0005243482818870122,
"loss": 3.4751,
"step": 21750
},
{
"epoch": 6.344958081043316,
"grad_norm": 0.30517300963401794,
"learning_rate": 0.0005241735585323238,
"loss": 3.4797,
"step": 21800
},
{
"epoch": 6.359513274336283,
"grad_norm": 0.31948354840278625,
"learning_rate": 0.0005239988351776354,
"loss": 3.4618,
"step": 21850
},
{
"epoch": 6.37406846762925,
"grad_norm": 0.3166545331478119,
"learning_rate": 0.0005238241118229469,
"loss": 3.4755,
"step": 21900
},
{
"epoch": 6.388623660922217,
"grad_norm": 0.30832865834236145,
"learning_rate": 0.0005236493884682585,
"loss": 3.4632,
"step": 21950
},
{
"epoch": 6.403178854215184,
"grad_norm": 0.31591275334358215,
"learning_rate": 0.0005234746651135701,
"loss": 3.4671,
"step": 22000
},
{
"epoch": 6.403178854215184,
"eval_accuracy": 0.36362748498265474,
"eval_loss": 3.609940767288208,
"eval_runtime": 180.7493,
"eval_samples_per_second": 92.122,
"eval_steps_per_second": 5.759,
"step": 22000
},
{
"epoch": 6.417734047508151,
"grad_norm": 0.31404587626457214,
"learning_rate": 0.0005232999417588818,
"loss": 3.4756,
"step": 22050
},
{
"epoch": 6.432289240801118,
"grad_norm": 0.3089274764060974,
"learning_rate": 0.0005231252184041933,
"loss": 3.4688,
"step": 22100
},
{
"epoch": 6.446844434094085,
"grad_norm": 0.32326218485832214,
"learning_rate": 0.0005229504950495049,
"loss": 3.4659,
"step": 22150
},
{
"epoch": 6.461399627387052,
"grad_norm": 0.3198147714138031,
"learning_rate": 0.0005227757716948165,
"loss": 3.4657,
"step": 22200
},
{
"epoch": 6.475954820680019,
"grad_norm": 0.36542582511901855,
"learning_rate": 0.000522601048340128,
"loss": 3.4748,
"step": 22250
},
{
"epoch": 6.490510013972986,
"grad_norm": 0.34132885932922363,
"learning_rate": 0.0005224263249854397,
"loss": 3.4711,
"step": 22300
},
{
"epoch": 6.505065207265952,
"grad_norm": 0.3270913362503052,
"learning_rate": 0.0005222516016307512,
"loss": 3.4922,
"step": 22350
},
{
"epoch": 6.519620400558919,
"grad_norm": 0.32891523838043213,
"learning_rate": 0.0005220768782760629,
"loss": 3.4697,
"step": 22400
},
{
"epoch": 6.534175593851886,
"grad_norm": 0.3459501564502716,
"learning_rate": 0.0005219021549213744,
"loss": 3.4809,
"step": 22450
},
{
"epoch": 6.548730787144853,
"grad_norm": 0.29732978343963623,
"learning_rate": 0.000521727431566686,
"loss": 3.4849,
"step": 22500
},
{
"epoch": 6.56328598043782,
"grad_norm": 0.3077053129673004,
"learning_rate": 0.0005215527082119976,
"loss": 3.4712,
"step": 22550
},
{
"epoch": 6.577841173730787,
"grad_norm": 0.314859002828598,
"learning_rate": 0.0005213779848573093,
"loss": 3.4753,
"step": 22600
},
{
"epoch": 6.592396367023754,
"grad_norm": 0.35085999965667725,
"learning_rate": 0.0005212032615026208,
"loss": 3.4777,
"step": 22650
},
{
"epoch": 6.6069515603167215,
"grad_norm": 0.33542004227638245,
"learning_rate": 0.0005210285381479323,
"loss": 3.4806,
"step": 22700
},
{
"epoch": 6.621506753609688,
"grad_norm": 0.3448728024959564,
"learning_rate": 0.000520853814793244,
"loss": 3.5001,
"step": 22750
},
{
"epoch": 6.636061946902655,
"grad_norm": 0.32117313146591187,
"learning_rate": 0.0005206790914385555,
"loss": 3.4917,
"step": 22800
},
{
"epoch": 6.650617140195622,
"grad_norm": 0.33129552006721497,
"learning_rate": 0.0005205043680838672,
"loss": 3.484,
"step": 22850
},
{
"epoch": 6.665172333488589,
"grad_norm": 0.3124433755874634,
"learning_rate": 0.0005203296447291787,
"loss": 3.4768,
"step": 22900
},
{
"epoch": 6.679727526781556,
"grad_norm": 0.3298899829387665,
"learning_rate": 0.0005201549213744904,
"loss": 3.4987,
"step": 22950
},
{
"epoch": 6.694282720074523,
"grad_norm": 0.32636532187461853,
"learning_rate": 0.0005199801980198019,
"loss": 3.4893,
"step": 23000
},
{
"epoch": 6.694282720074523,
"eval_accuracy": 0.3642555509027801,
"eval_loss": 3.598724842071533,
"eval_runtime": 180.9694,
"eval_samples_per_second": 92.01,
"eval_steps_per_second": 5.752,
"step": 23000
},
{
"epoch": 6.708837913367489,
"grad_norm": 0.32573258876800537,
"learning_rate": 0.0005198054746651136,
"loss": 3.4969,
"step": 23050
},
{
"epoch": 6.723393106660456,
"grad_norm": 0.32142332196235657,
"learning_rate": 0.0005196307513104251,
"loss": 3.4879,
"step": 23100
},
{
"epoch": 6.737948299953423,
"grad_norm": 0.3339453637599945,
"learning_rate": 0.0005194560279557367,
"loss": 3.4853,
"step": 23150
},
{
"epoch": 6.75250349324639,
"grad_norm": 0.3392508625984192,
"learning_rate": 0.0005192813046010483,
"loss": 3.4777,
"step": 23200
},
{
"epoch": 6.767058686539357,
"grad_norm": 0.3458227515220642,
"learning_rate": 0.0005191065812463599,
"loss": 3.4887,
"step": 23250
},
{
"epoch": 6.781613879832324,
"grad_norm": 0.3117659389972687,
"learning_rate": 0.0005189318578916715,
"loss": 3.4842,
"step": 23300
},
{
"epoch": 6.7961690731252915,
"grad_norm": 0.3235318958759308,
"learning_rate": 0.000518757134536983,
"loss": 3.4785,
"step": 23350
},
{
"epoch": 6.810724266418258,
"grad_norm": 0.32054176926612854,
"learning_rate": 0.0005185824111822947,
"loss": 3.4813,
"step": 23400
},
{
"epoch": 6.825279459711225,
"grad_norm": 0.319515585899353,
"learning_rate": 0.0005184076878276063,
"loss": 3.4716,
"step": 23450
},
{
"epoch": 6.839834653004192,
"grad_norm": 0.3443147540092468,
"learning_rate": 0.0005182329644729179,
"loss": 3.4967,
"step": 23500
},
{
"epoch": 6.854389846297159,
"grad_norm": 0.3200646936893463,
"learning_rate": 0.0005180582411182294,
"loss": 3.4893,
"step": 23550
},
{
"epoch": 6.868945039590126,
"grad_norm": 0.33306822180747986,
"learning_rate": 0.000517883517763541,
"loss": 3.5065,
"step": 23600
},
{
"epoch": 6.883500232883093,
"grad_norm": 0.3191053867340088,
"learning_rate": 0.0005177087944088526,
"loss": 3.5058,
"step": 23650
},
{
"epoch": 6.898055426176059,
"grad_norm": 0.32728537917137146,
"learning_rate": 0.0005175340710541642,
"loss": 3.488,
"step": 23700
},
{
"epoch": 6.912610619469026,
"grad_norm": 0.315595418214798,
"learning_rate": 0.0005173593476994758,
"loss": 3.4906,
"step": 23750
},
{
"epoch": 6.927165812761993,
"grad_norm": 0.3276635408401489,
"learning_rate": 0.0005171846243447874,
"loss": 3.4941,
"step": 23800
},
{
"epoch": 6.94172100605496,
"grad_norm": 0.31373319029808044,
"learning_rate": 0.000517009900990099,
"loss": 3.4911,
"step": 23850
},
{
"epoch": 6.956276199347927,
"grad_norm": 0.33638837933540344,
"learning_rate": 0.0005168351776354105,
"loss": 3.4843,
"step": 23900
},
{
"epoch": 6.9708313926408945,
"grad_norm": 0.33347102999687195,
"learning_rate": 0.0005166604542807221,
"loss": 3.4993,
"step": 23950
},
{
"epoch": 6.9853865859338615,
"grad_norm": 0.3285515606403351,
"learning_rate": 0.0005164857309260338,
"loss": 3.4985,
"step": 24000
},
{
"epoch": 6.9853865859338615,
"eval_accuracy": 0.3649057119563231,
"eval_loss": 3.588160514831543,
"eval_runtime": 180.8076,
"eval_samples_per_second": 92.092,
"eval_steps_per_second": 5.758,
"step": 24000
},
{
"epoch": 6.999941779226829,
"grad_norm": 0.3126235604286194,
"learning_rate": 0.0005163110075713453,
"loss": 3.4946,
"step": 24050
},
{
"epoch": 7.014264089427107,
"grad_norm": 0.3227352499961853,
"learning_rate": 0.0005161362842166569,
"loss": 3.3874,
"step": 24100
},
{
"epoch": 7.028819282720074,
"grad_norm": 0.33581089973449707,
"learning_rate": 0.0005159615608619685,
"loss": 3.3799,
"step": 24150
},
{
"epoch": 7.0433744760130415,
"grad_norm": 0.33446604013442993,
"learning_rate": 0.0005157868375072801,
"loss": 3.3827,
"step": 24200
},
{
"epoch": 7.0579296693060085,
"grad_norm": 0.32778769731521606,
"learning_rate": 0.0005156121141525917,
"loss": 3.3804,
"step": 24250
},
{
"epoch": 7.072484862598976,
"grad_norm": 0.3300207555294037,
"learning_rate": 0.0005154373907979033,
"loss": 3.3909,
"step": 24300
},
{
"epoch": 7.087040055891943,
"grad_norm": 0.3466382920742035,
"learning_rate": 0.0005152626674432149,
"loss": 3.3882,
"step": 24350
},
{
"epoch": 7.101595249184909,
"grad_norm": 0.3209831714630127,
"learning_rate": 0.0005150879440885264,
"loss": 3.4095,
"step": 24400
},
{
"epoch": 7.116150442477876,
"grad_norm": 0.31116464734077454,
"learning_rate": 0.000514913220733838,
"loss": 3.405,
"step": 24450
},
{
"epoch": 7.130705635770843,
"grad_norm": 0.32435309886932373,
"learning_rate": 0.0005147384973791496,
"loss": 3.3998,
"step": 24500
},
{
"epoch": 7.14526082906381,
"grad_norm": 0.3378669321537018,
"learning_rate": 0.0005145637740244613,
"loss": 3.4078,
"step": 24550
},
{
"epoch": 7.159816022356777,
"grad_norm": 0.3956412672996521,
"learning_rate": 0.0005143890506697728,
"loss": 3.4078,
"step": 24600
},
{
"epoch": 7.174371215649744,
"grad_norm": 0.34623751044273376,
"learning_rate": 0.0005142143273150844,
"loss": 3.4175,
"step": 24650
},
{
"epoch": 7.188926408942711,
"grad_norm": 0.3511153757572174,
"learning_rate": 0.000514039603960396,
"loss": 3.4144,
"step": 24700
},
{
"epoch": 7.203481602235677,
"grad_norm": 0.30973565578460693,
"learning_rate": 0.0005138648806057075,
"loss": 3.4264,
"step": 24750
},
{
"epoch": 7.218036795528644,
"grad_norm": 0.3381648063659668,
"learning_rate": 0.0005136901572510192,
"loss": 3.4011,
"step": 24800
},
{
"epoch": 7.2325919888216115,
"grad_norm": 0.33070191740989685,
"learning_rate": 0.0005135154338963307,
"loss": 3.4212,
"step": 24850
},
{
"epoch": 7.2471471821145785,
"grad_norm": 0.3330495357513428,
"learning_rate": 0.0005133407105416424,
"loss": 3.4229,
"step": 24900
},
{
"epoch": 7.261702375407546,
"grad_norm": 0.3378702700138092,
"learning_rate": 0.0005131659871869539,
"loss": 3.4219,
"step": 24950
},
{
"epoch": 7.276257568700513,
"grad_norm": 0.35611557960510254,
"learning_rate": 0.0005129912638322656,
"loss": 3.4264,
"step": 25000
},
{
"epoch": 7.276257568700513,
"eval_accuracy": 0.36439411559586166,
"eval_loss": 3.598292350769043,
"eval_runtime": 180.9485,
"eval_samples_per_second": 92.021,
"eval_steps_per_second": 5.753,
"step": 25000
},
{
"epoch": 7.290812761993479,
"grad_norm": 0.3345873951911926,
"learning_rate": 0.0005128165404775771,
"loss": 3.433,
"step": 25050
},
{
"epoch": 7.305367955286446,
"grad_norm": 0.33987024426460266,
"learning_rate": 0.0005126418171228888,
"loss": 3.4204,
"step": 25100
},
{
"epoch": 7.319923148579413,
"grad_norm": 0.3445035219192505,
"learning_rate": 0.0005124670937682003,
"loss": 3.4285,
"step": 25150
},
{
"epoch": 7.33447834187238,
"grad_norm": 0.345106840133667,
"learning_rate": 0.000512292370413512,
"loss": 3.4366,
"step": 25200
},
{
"epoch": 7.349033535165347,
"grad_norm": 0.3592861294746399,
"learning_rate": 0.0005121176470588235,
"loss": 3.4408,
"step": 25250
},
{
"epoch": 7.363588728458314,
"grad_norm": 0.33465760946273804,
"learning_rate": 0.000511942923704135,
"loss": 3.4377,
"step": 25300
},
{
"epoch": 7.378143921751281,
"grad_norm": 0.361323744058609,
"learning_rate": 0.0005117682003494467,
"loss": 3.4457,
"step": 25350
},
{
"epoch": 7.392699115044248,
"grad_norm": 0.3065403997898102,
"learning_rate": 0.0005115934769947583,
"loss": 3.4445,
"step": 25400
},
{
"epoch": 7.407254308337214,
"grad_norm": 0.3310743570327759,
"learning_rate": 0.0005114187536400699,
"loss": 3.4393,
"step": 25450
},
{
"epoch": 7.4218095016301815,
"grad_norm": 0.34009453654289246,
"learning_rate": 0.0005112440302853814,
"loss": 3.4302,
"step": 25500
},
{
"epoch": 7.4363646949231486,
"grad_norm": 0.3367420732975006,
"learning_rate": 0.0005110693069306931,
"loss": 3.4376,
"step": 25550
},
{
"epoch": 7.450919888216116,
"grad_norm": 0.3324291706085205,
"learning_rate": 0.0005108945835760046,
"loss": 3.4399,
"step": 25600
},
{
"epoch": 7.465475081509083,
"grad_norm": 0.32936856150627136,
"learning_rate": 0.0005107198602213162,
"loss": 3.4438,
"step": 25650
},
{
"epoch": 7.48003027480205,
"grad_norm": 0.3215152323246002,
"learning_rate": 0.0005105451368666278,
"loss": 3.4416,
"step": 25700
},
{
"epoch": 7.494585468095016,
"grad_norm": 0.3312242925167084,
"learning_rate": 0.0005103704135119394,
"loss": 3.4529,
"step": 25750
},
{
"epoch": 7.509140661387983,
"grad_norm": 0.34947583079338074,
"learning_rate": 0.000510195690157251,
"loss": 3.4315,
"step": 25800
},
{
"epoch": 7.52369585468095,
"grad_norm": 0.3578192889690399,
"learning_rate": 0.0005100209668025625,
"loss": 3.4518,
"step": 25850
},
{
"epoch": 7.538251047973917,
"grad_norm": 0.3474888801574707,
"learning_rate": 0.0005098462434478742,
"loss": 3.4521,
"step": 25900
},
{
"epoch": 7.552806241266884,
"grad_norm": 0.314382404088974,
"learning_rate": 0.0005096715200931858,
"loss": 3.4394,
"step": 25950
},
{
"epoch": 7.567361434559851,
"grad_norm": 0.34033820033073425,
"learning_rate": 0.0005094967967384974,
"loss": 3.4448,
"step": 26000
},
{
"epoch": 7.567361434559851,
"eval_accuracy": 0.3653323360749711,
"eval_loss": 3.592108726501465,
"eval_runtime": 181.0772,
"eval_samples_per_second": 91.955,
"eval_steps_per_second": 5.749,
"step": 26000
},
{
"epoch": 7.581916627852818,
"grad_norm": 0.3230842053890228,
"learning_rate": 0.0005093220733838089,
"loss": 3.4504,
"step": 26050
},
{
"epoch": 7.5964718211457845,
"grad_norm": 0.3355337977409363,
"learning_rate": 0.0005091473500291205,
"loss": 3.439,
"step": 26100
},
{
"epoch": 7.6110270144387515,
"grad_norm": 0.34470894932746887,
"learning_rate": 0.0005089726266744321,
"loss": 3.4568,
"step": 26150
},
{
"epoch": 7.625582207731719,
"grad_norm": 0.3162420094013214,
"learning_rate": 0.0005087979033197437,
"loss": 3.4457,
"step": 26200
},
{
"epoch": 7.640137401024686,
"grad_norm": 0.325000524520874,
"learning_rate": 0.0005086231799650553,
"loss": 3.4556,
"step": 26250
},
{
"epoch": 7.654692594317653,
"grad_norm": 0.32103052735328674,
"learning_rate": 0.0005084484566103669,
"loss": 3.4577,
"step": 26300
},
{
"epoch": 7.66924778761062,
"grad_norm": 0.34083518385887146,
"learning_rate": 0.0005082737332556785,
"loss": 3.4528,
"step": 26350
},
{
"epoch": 7.683802980903586,
"grad_norm": 0.3336086869239807,
"learning_rate": 0.00050809900990099,
"loss": 3.4556,
"step": 26400
},
{
"epoch": 7.698358174196553,
"grad_norm": 0.3343296945095062,
"learning_rate": 0.0005079242865463016,
"loss": 3.4525,
"step": 26450
},
{
"epoch": 7.71291336748952,
"grad_norm": 0.3320295512676239,
"learning_rate": 0.0005077495631916133,
"loss": 3.4512,
"step": 26500
},
{
"epoch": 7.727468560782487,
"grad_norm": 0.3143779933452606,
"learning_rate": 0.0005075748398369248,
"loss": 3.4488,
"step": 26550
},
{
"epoch": 7.742023754075454,
"grad_norm": 0.32744669914245605,
"learning_rate": 0.0005074001164822364,
"loss": 3.4614,
"step": 26600
},
{
"epoch": 7.756578947368421,
"grad_norm": 0.3253073990345001,
"learning_rate": 0.000507225393127548,
"loss": 3.4551,
"step": 26650
},
{
"epoch": 7.771134140661388,
"grad_norm": 0.3403880000114441,
"learning_rate": 0.0005070506697728596,
"loss": 3.4575,
"step": 26700
},
{
"epoch": 7.785689333954355,
"grad_norm": 0.3548356592655182,
"learning_rate": 0.0005068759464181711,
"loss": 3.4624,
"step": 26750
},
{
"epoch": 7.8002445272473215,
"grad_norm": 0.33326730132102966,
"learning_rate": 0.0005067012230634828,
"loss": 3.4634,
"step": 26800
},
{
"epoch": 7.814799720540289,
"grad_norm": 0.35093986988067627,
"learning_rate": 0.0005065264997087944,
"loss": 3.4566,
"step": 26850
},
{
"epoch": 7.829354913833256,
"grad_norm": 0.3309878706932068,
"learning_rate": 0.0005063517763541059,
"loss": 3.4525,
"step": 26900
},
{
"epoch": 7.843910107126223,
"grad_norm": 0.33384084701538086,
"learning_rate": 0.0005061770529994175,
"loss": 3.4692,
"step": 26950
},
{
"epoch": 7.85846530041919,
"grad_norm": 0.3291708827018738,
"learning_rate": 0.0005060023296447291,
"loss": 3.4691,
"step": 27000
},
{
"epoch": 7.85846530041919,
"eval_accuracy": 0.36616501703382,
"eval_loss": 3.5818653106689453,
"eval_runtime": 181.056,
"eval_samples_per_second": 91.966,
"eval_steps_per_second": 5.75,
"step": 27000
},
{
"epoch": 7.873020493712157,
"grad_norm": 0.3469105660915375,
"learning_rate": 0.0005058276062900408,
"loss": 3.4533,
"step": 27050
},
{
"epoch": 7.887575687005123,
"grad_norm": 0.3380752205848694,
"learning_rate": 0.0005056528829353523,
"loss": 3.4577,
"step": 27100
},
{
"epoch": 7.90213088029809,
"grad_norm": 0.3110817074775696,
"learning_rate": 0.000505478159580664,
"loss": 3.471,
"step": 27150
},
{
"epoch": 7.916686073591057,
"grad_norm": 0.32800230383872986,
"learning_rate": 0.0005053034362259755,
"loss": 3.4548,
"step": 27200
},
{
"epoch": 7.931241266884024,
"grad_norm": 0.3017429709434509,
"learning_rate": 0.000505128712871287,
"loss": 3.4505,
"step": 27250
},
{
"epoch": 7.945796460176991,
"grad_norm": 0.33647236227989197,
"learning_rate": 0.0005049539895165987,
"loss": 3.4667,
"step": 27300
},
{
"epoch": 7.960351653469958,
"grad_norm": 0.34851449728012085,
"learning_rate": 0.0005047792661619103,
"loss": 3.4613,
"step": 27350
},
{
"epoch": 7.974906846762925,
"grad_norm": 0.33686619997024536,
"learning_rate": 0.0005046045428072219,
"loss": 3.4554,
"step": 27400
},
{
"epoch": 7.989462040055892,
"grad_norm": 0.30195578932762146,
"learning_rate": 0.0005044298194525334,
"loss": 3.4715,
"step": 27450
},
{
"epoch": 8.003784350256172,
"grad_norm": 0.3394927680492401,
"learning_rate": 0.0005042550960978451,
"loss": 3.4395,
"step": 27500
},
{
"epoch": 8.01833954354914,
"grad_norm": 0.32353535294532776,
"learning_rate": 0.0005040803727431566,
"loss": 3.3566,
"step": 27550
},
{
"epoch": 8.032894736842104,
"grad_norm": 0.3227969706058502,
"learning_rate": 0.0005039056493884683,
"loss": 3.3603,
"step": 27600
},
{
"epoch": 8.047449930135071,
"grad_norm": 0.31982970237731934,
"learning_rate": 0.0005037309260337798,
"loss": 3.348,
"step": 27650
},
{
"epoch": 8.062005123428039,
"grad_norm": 0.3820800483226776,
"learning_rate": 0.0005035562026790914,
"loss": 3.3559,
"step": 27700
},
{
"epoch": 8.076560316721006,
"grad_norm": 0.3284827470779419,
"learning_rate": 0.000503381479324403,
"loss": 3.377,
"step": 27750
},
{
"epoch": 8.091115510013973,
"grad_norm": 0.33183208107948303,
"learning_rate": 0.0005032067559697145,
"loss": 3.362,
"step": 27800
},
{
"epoch": 8.10567070330694,
"grad_norm": 0.3440621495246887,
"learning_rate": 0.0005030320326150262,
"loss": 3.3727,
"step": 27850
},
{
"epoch": 8.120225896599907,
"grad_norm": 0.3330867290496826,
"learning_rate": 0.0005028573092603378,
"loss": 3.3693,
"step": 27900
},
{
"epoch": 8.134781089892874,
"grad_norm": 0.3781154155731201,
"learning_rate": 0.0005026825859056494,
"loss": 3.378,
"step": 27950
},
{
"epoch": 8.149336283185841,
"grad_norm": 0.3525262773036957,
"learning_rate": 0.0005025078625509609,
"loss": 3.3775,
"step": 28000
},
{
"epoch": 8.149336283185841,
"eval_accuracy": 0.36626068426042596,
"eval_loss": 3.590256929397583,
"eval_runtime": 180.9876,
"eval_samples_per_second": 92.001,
"eval_steps_per_second": 5.752,
"step": 28000
},
{
"epoch": 8.163891476478808,
"grad_norm": 0.3500209450721741,
"learning_rate": 0.0005023331391962726,
"loss": 3.3713,
"step": 28050
},
{
"epoch": 8.178446669771775,
"grad_norm": 0.32972395420074463,
"learning_rate": 0.0005021584158415841,
"loss": 3.3718,
"step": 28100
},
{
"epoch": 8.193001863064742,
"grad_norm": 0.3366987705230713,
"learning_rate": 0.0005019836924868956,
"loss": 3.3944,
"step": 28150
},
{
"epoch": 8.20755705635771,
"grad_norm": 0.32175183296203613,
"learning_rate": 0.0005018089691322073,
"loss": 3.374,
"step": 28200
},
{
"epoch": 8.222112249650676,
"grad_norm": 0.325791597366333,
"learning_rate": 0.0005016342457775189,
"loss": 3.3832,
"step": 28250
},
{
"epoch": 8.236667442943642,
"grad_norm": 0.34130793809890747,
"learning_rate": 0.0005014595224228305,
"loss": 3.3921,
"step": 28300
},
{
"epoch": 8.251222636236609,
"grad_norm": 0.321627676486969,
"learning_rate": 0.000501284799068142,
"loss": 3.3985,
"step": 28350
},
{
"epoch": 8.265777829529576,
"grad_norm": 0.3398359417915344,
"learning_rate": 0.0005011100757134537,
"loss": 3.3839,
"step": 28400
},
{
"epoch": 8.280333022822543,
"grad_norm": 0.3410577178001404,
"learning_rate": 0.0005009353523587653,
"loss": 3.3938,
"step": 28450
},
{
"epoch": 8.29488821611551,
"grad_norm": 0.3390825390815735,
"learning_rate": 0.0005007606290040768,
"loss": 3.4063,
"step": 28500
},
{
"epoch": 8.309443409408477,
"grad_norm": 0.3315296471118927,
"learning_rate": 0.0005005859056493884,
"loss": 3.4016,
"step": 28550
},
{
"epoch": 8.323998602701444,
"grad_norm": 0.34398385882377625,
"learning_rate": 0.0005004111822947,
"loss": 3.4099,
"step": 28600
},
{
"epoch": 8.338553795994411,
"grad_norm": 0.3477787971496582,
"learning_rate": 0.0005002364589400116,
"loss": 3.4103,
"step": 28650
},
{
"epoch": 8.353108989287378,
"grad_norm": 0.3421306908130646,
"learning_rate": 0.0005000617355853231,
"loss": 3.3944,
"step": 28700
},
{
"epoch": 8.367664182580345,
"grad_norm": 0.33684462308883667,
"learning_rate": 0.0004998870122306348,
"loss": 3.409,
"step": 28750
},
{
"epoch": 8.382219375873312,
"grad_norm": 0.314606636762619,
"learning_rate": 0.0004997122888759464,
"loss": 3.4149,
"step": 28800
},
{
"epoch": 8.39677456916628,
"grad_norm": 0.35119929909706116,
"learning_rate": 0.000499537565521258,
"loss": 3.4165,
"step": 28850
},
{
"epoch": 8.411329762459246,
"grad_norm": 0.35205650329589844,
"learning_rate": 0.0004993628421665695,
"loss": 3.4171,
"step": 28900
},
{
"epoch": 8.425884955752213,
"grad_norm": 0.3453122079372406,
"learning_rate": 0.0004991881188118811,
"loss": 3.409,
"step": 28950
},
{
"epoch": 8.440440149045179,
"grad_norm": 0.3224046230316162,
"learning_rate": 0.0004990133954571928,
"loss": 3.4149,
"step": 29000
},
{
"epoch": 8.440440149045179,
"eval_accuracy": 0.3668040129933488,
"eval_loss": 3.584907293319702,
"eval_runtime": 180.9411,
"eval_samples_per_second": 92.024,
"eval_steps_per_second": 5.753,
"step": 29000
},
{
"epoch": 8.454995342338146,
"grad_norm": 0.33683833479881287,
"learning_rate": 0.0004988386721025043,
"loss": 3.4179,
"step": 29050
},
{
"epoch": 8.469550535631113,
"grad_norm": 0.31904080510139465,
"learning_rate": 0.0004986639487478159,
"loss": 3.4262,
"step": 29100
},
{
"epoch": 8.48410572892408,
"grad_norm": 0.34945112466812134,
"learning_rate": 0.0004984892253931275,
"loss": 3.4071,
"step": 29150
},
{
"epoch": 8.498660922217047,
"grad_norm": 0.3821871876716614,
"learning_rate": 0.0004983145020384391,
"loss": 3.412,
"step": 29200
},
{
"epoch": 8.513216115510014,
"grad_norm": 0.33756762742996216,
"learning_rate": 0.0004981397786837507,
"loss": 3.4187,
"step": 29250
},
{
"epoch": 8.527771308802981,
"grad_norm": 0.33595606684684753,
"learning_rate": 0.0004979650553290622,
"loss": 3.4251,
"step": 29300
},
{
"epoch": 8.542326502095948,
"grad_norm": 0.3365992307662964,
"learning_rate": 0.0004977903319743739,
"loss": 3.4241,
"step": 29350
},
{
"epoch": 8.556881695388915,
"grad_norm": 0.3620302677154541,
"learning_rate": 0.0004976156086196854,
"loss": 3.4122,
"step": 29400
},
{
"epoch": 8.571436888681882,
"grad_norm": 0.3167871832847595,
"learning_rate": 0.0004974408852649971,
"loss": 3.4281,
"step": 29450
},
{
"epoch": 8.58599208197485,
"grad_norm": 0.3546694815158844,
"learning_rate": 0.0004972661619103086,
"loss": 3.4266,
"step": 29500
},
{
"epoch": 8.600547275267816,
"grad_norm": 0.33703261613845825,
"learning_rate": 0.0004970914385556202,
"loss": 3.4098,
"step": 29550
},
{
"epoch": 8.615102468560782,
"grad_norm": 0.3471863567829132,
"learning_rate": 0.0004969167152009318,
"loss": 3.4201,
"step": 29600
},
{
"epoch": 8.629657661853749,
"grad_norm": 0.34901905059814453,
"learning_rate": 0.0004967419918462435,
"loss": 3.4237,
"step": 29650
},
{
"epoch": 8.644212855146716,
"grad_norm": 0.32548439502716064,
"learning_rate": 0.000496567268491555,
"loss": 3.4217,
"step": 29700
},
{
"epoch": 8.658768048439683,
"grad_norm": 0.3385307192802429,
"learning_rate": 0.0004963925451368665,
"loss": 3.4222,
"step": 29750
},
{
"epoch": 8.67332324173265,
"grad_norm": 0.33839151263237,
"learning_rate": 0.0004962178217821782,
"loss": 3.4217,
"step": 29800
},
{
"epoch": 8.687878435025617,
"grad_norm": 0.3228484094142914,
"learning_rate": 0.0004960430984274898,
"loss": 3.4194,
"step": 29850
},
{
"epoch": 8.702433628318584,
"grad_norm": 0.3400082290172577,
"learning_rate": 0.0004958683750728014,
"loss": 3.4276,
"step": 29900
},
{
"epoch": 8.716988821611551,
"grad_norm": 0.3327467739582062,
"learning_rate": 0.0004956936517181129,
"loss": 3.4376,
"step": 29950
},
{
"epoch": 8.731544014904518,
"grad_norm": 0.3484002649784088,
"learning_rate": 0.0004955189283634246,
"loss": 3.4252,
"step": 30000
},
{
"epoch": 8.731544014904518,
"eval_accuracy": 0.36735321809154226,
"eval_loss": 3.576219320297241,
"eval_runtime": 181.0445,
"eval_samples_per_second": 91.972,
"eval_steps_per_second": 5.75,
"step": 30000
},
{
"epoch": 8.746099208197485,
"grad_norm": 0.345060259103775,
"learning_rate": 0.0004953442050087361,
"loss": 3.4264,
"step": 30050
},
{
"epoch": 8.760654401490452,
"grad_norm": 0.34570980072021484,
"learning_rate": 0.0004951694816540476,
"loss": 3.4343,
"step": 30100
},
{
"epoch": 8.77520959478342,
"grad_norm": 0.357194721698761,
"learning_rate": 0.0004949947582993593,
"loss": 3.4181,
"step": 30150
},
{
"epoch": 8.789764788076386,
"grad_norm": 0.3243914544582367,
"learning_rate": 0.0004948200349446709,
"loss": 3.4204,
"step": 30200
},
{
"epoch": 8.804319981369353,
"grad_norm": 0.32040515542030334,
"learning_rate": 0.0004946453115899825,
"loss": 3.4296,
"step": 30250
},
{
"epoch": 8.81887517466232,
"grad_norm": 0.34954166412353516,
"learning_rate": 0.000494470588235294,
"loss": 3.4327,
"step": 30300
},
{
"epoch": 8.833430367955286,
"grad_norm": 0.3481607735157013,
"learning_rate": 0.0004942958648806057,
"loss": 3.4328,
"step": 30350
},
{
"epoch": 8.847985561248253,
"grad_norm": 0.3422067165374756,
"learning_rate": 0.0004941211415259173,
"loss": 3.4195,
"step": 30400
},
{
"epoch": 8.86254075454122,
"grad_norm": 0.33856475353240967,
"learning_rate": 0.0004939464181712289,
"loss": 3.4212,
"step": 30450
},
{
"epoch": 8.877095947834187,
"grad_norm": 0.3440602719783783,
"learning_rate": 0.0004937716948165404,
"loss": 3.4384,
"step": 30500
},
{
"epoch": 8.891651141127154,
"grad_norm": 0.3470291793346405,
"learning_rate": 0.000493596971461852,
"loss": 3.4268,
"step": 30550
},
{
"epoch": 8.906206334420121,
"grad_norm": 0.3485357463359833,
"learning_rate": 0.0004934222481071636,
"loss": 3.4337,
"step": 30600
},
{
"epoch": 8.920761527713088,
"grad_norm": 0.35263752937316895,
"learning_rate": 0.0004932475247524751,
"loss": 3.4405,
"step": 30650
},
{
"epoch": 8.935316721006055,
"grad_norm": 0.3400113582611084,
"learning_rate": 0.0004930728013977868,
"loss": 3.4308,
"step": 30700
},
{
"epoch": 8.949871914299022,
"grad_norm": 0.33924058079719543,
"learning_rate": 0.0004928980780430984,
"loss": 3.4263,
"step": 30750
},
{
"epoch": 8.96442710759199,
"grad_norm": 0.33143338561058044,
"learning_rate": 0.00049272335468841,
"loss": 3.4255,
"step": 30800
},
{
"epoch": 8.978982300884956,
"grad_norm": 0.334474116563797,
"learning_rate": 0.0004925486313337215,
"loss": 3.4441,
"step": 30850
},
{
"epoch": 8.993537494177923,
"grad_norm": 0.3472048044204712,
"learning_rate": 0.0004923739079790332,
"loss": 3.4464,
"step": 30900
},
{
"epoch": 9.007859804378203,
"grad_norm": 0.3369418680667877,
"learning_rate": 0.0004921991846243447,
"loss": 3.3817,
"step": 30950
},
{
"epoch": 9.022414997671168,
"grad_norm": 0.36469414830207825,
"learning_rate": 0.0004920244612696563,
"loss": 3.3288,
"step": 31000
},
{
"epoch": 9.022414997671168,
"eval_accuracy": 0.3673842453001712,
"eval_loss": 3.5799498558044434,
"eval_runtime": 181.0711,
"eval_samples_per_second": 91.958,
"eval_steps_per_second": 5.749,
"step": 31000
},
{
"epoch": 9.036970190964135,
"grad_norm": 0.32988861203193665,
"learning_rate": 0.0004918497379149679,
"loss": 3.3293,
"step": 31050
},
{
"epoch": 9.051525384257102,
"grad_norm": 0.3383880853652954,
"learning_rate": 0.0004916750145602795,
"loss": 3.3384,
"step": 31100
},
{
"epoch": 9.06608057755007,
"grad_norm": 0.32971954345703125,
"learning_rate": 0.0004915002912055911,
"loss": 3.3286,
"step": 31150
},
{
"epoch": 9.080635770843037,
"grad_norm": 0.3277758061885834,
"learning_rate": 0.0004913255678509026,
"loss": 3.3446,
"step": 31200
},
{
"epoch": 9.095190964136004,
"grad_norm": 0.36317312717437744,
"learning_rate": 0.0004911508444962143,
"loss": 3.3389,
"step": 31250
},
{
"epoch": 9.10974615742897,
"grad_norm": 0.368633896112442,
"learning_rate": 0.0004909761211415259,
"loss": 3.347,
"step": 31300
},
{
"epoch": 9.124301350721938,
"grad_norm": 0.3367079198360443,
"learning_rate": 0.0004908013977868375,
"loss": 3.3551,
"step": 31350
},
{
"epoch": 9.138856544014905,
"grad_norm": 0.36006441712379456,
"learning_rate": 0.0004906266744321491,
"loss": 3.3436,
"step": 31400
},
{
"epoch": 9.153411737307872,
"grad_norm": 0.3398313820362091,
"learning_rate": 0.0004904519510774606,
"loss": 3.3646,
"step": 31450
},
{
"epoch": 9.167966930600839,
"grad_norm": 0.3552166819572449,
"learning_rate": 0.0004902772277227722,
"loss": 3.351,
"step": 31500
},
{
"epoch": 9.182522123893806,
"grad_norm": 0.3333413600921631,
"learning_rate": 0.0004901025043680838,
"loss": 3.358,
"step": 31550
},
{
"epoch": 9.197077317186773,
"grad_norm": 0.3085155189037323,
"learning_rate": 0.0004899277810133955,
"loss": 3.3575,
"step": 31600
},
{
"epoch": 9.211632510479738,
"grad_norm": 0.3518926501274109,
"learning_rate": 0.000489753057658707,
"loss": 3.3689,
"step": 31650
},
{
"epoch": 9.226187703772705,
"grad_norm": 0.3818323612213135,
"learning_rate": 0.0004895783343040186,
"loss": 3.3583,
"step": 31700
},
{
"epoch": 9.240742897065672,
"grad_norm": 0.3476870357990265,
"learning_rate": 0.0004894036109493302,
"loss": 3.3651,
"step": 31750
},
{
"epoch": 9.25529809035864,
"grad_norm": 0.3409420847892761,
"learning_rate": 0.0004892288875946419,
"loss": 3.3707,
"step": 31800
},
{
"epoch": 9.269853283651607,
"grad_norm": 0.32737651467323303,
"learning_rate": 0.0004890541642399534,
"loss": 3.3694,
"step": 31850
},
{
"epoch": 9.284408476944574,
"grad_norm": 0.3541163206100464,
"learning_rate": 0.0004888794408852649,
"loss": 3.3713,
"step": 31900
},
{
"epoch": 9.29896367023754,
"grad_norm": 0.3474277853965759,
"learning_rate": 0.0004887047175305766,
"loss": 3.3778,
"step": 31950
},
{
"epoch": 9.313518863530508,
"grad_norm": 0.32477056980133057,
"learning_rate": 0.0004885299941758881,
"loss": 3.3776,
"step": 32000
},
{
"epoch": 9.313518863530508,
"eval_accuracy": 0.36725214460888733,
"eval_loss": 3.5803961753845215,
"eval_runtime": 181.1965,
"eval_samples_per_second": 91.895,
"eval_steps_per_second": 5.745,
"step": 32000
},
{
"epoch": 9.328074056823475,
"grad_norm": 0.3401997685432434,
"learning_rate": 0.0004883552708211997,
"loss": 3.3788,
"step": 32050
},
{
"epoch": 9.342629250116442,
"grad_norm": 0.3674589693546295,
"learning_rate": 0.00048818054746651137,
"loss": 3.3687,
"step": 32100
},
{
"epoch": 9.357184443409409,
"grad_norm": 0.3335835337638855,
"learning_rate": 0.0004880058241118229,
"loss": 3.3841,
"step": 32150
},
{
"epoch": 9.371739636702376,
"grad_norm": 0.3415198028087616,
"learning_rate": 0.0004878311007571345,
"loss": 3.3814,
"step": 32200
},
{
"epoch": 9.386294829995343,
"grad_norm": 0.3449629247188568,
"learning_rate": 0.0004876563774024461,
"loss": 3.3807,
"step": 32250
},
{
"epoch": 9.40085002328831,
"grad_norm": 0.3588092625141144,
"learning_rate": 0.00048748165404775763,
"loss": 3.38,
"step": 32300
},
{
"epoch": 9.415405216581275,
"grad_norm": 0.3250521421432495,
"learning_rate": 0.0004873069306930693,
"loss": 3.3881,
"step": 32350
},
{
"epoch": 9.429960409874242,
"grad_norm": 0.34006109833717346,
"learning_rate": 0.0004871322073383809,
"loss": 3.3951,
"step": 32400
},
{
"epoch": 9.44451560316721,
"grad_norm": 0.34316784143447876,
"learning_rate": 0.00048695748398369247,
"loss": 3.3704,
"step": 32450
},
{
"epoch": 9.459070796460177,
"grad_norm": 0.34885114431381226,
"learning_rate": 0.000486782760629004,
"loss": 3.3855,
"step": 32500
},
{
"epoch": 9.473625989753144,
"grad_norm": 0.36461853981018066,
"learning_rate": 0.0004866080372743156,
"loss": 3.3787,
"step": 32550
},
{
"epoch": 9.48818118304611,
"grad_norm": 0.3631289005279541,
"learning_rate": 0.0004864333139196272,
"loss": 3.3854,
"step": 32600
},
{
"epoch": 9.502736376339078,
"grad_norm": 0.3386242985725403,
"learning_rate": 0.00048625859056493885,
"loss": 3.382,
"step": 32650
},
{
"epoch": 9.517291569632045,
"grad_norm": 0.3215729594230652,
"learning_rate": 0.0004860838672102504,
"loss": 3.3778,
"step": 32700
},
{
"epoch": 9.531846762925012,
"grad_norm": 0.32870006561279297,
"learning_rate": 0.000485909143855562,
"loss": 3.394,
"step": 32750
},
{
"epoch": 9.546401956217979,
"grad_norm": 0.34010738134384155,
"learning_rate": 0.0004857344205008736,
"loss": 3.393,
"step": 32800
},
{
"epoch": 9.560957149510946,
"grad_norm": 0.32582029700279236,
"learning_rate": 0.00048555969714618517,
"loss": 3.3919,
"step": 32850
},
{
"epoch": 9.575512342803913,
"grad_norm": 0.3527780771255493,
"learning_rate": 0.0004853849737914967,
"loss": 3.4016,
"step": 32900
},
{
"epoch": 9.59006753609688,
"grad_norm": 0.3426138162612915,
"learning_rate": 0.00048521025043680836,
"loss": 3.3947,
"step": 32950
},
{
"epoch": 9.604622729389845,
"grad_norm": 0.3326384425163269,
"learning_rate": 0.00048503552708211995,
"loss": 3.3921,
"step": 33000
},
{
"epoch": 9.604622729389845,
"eval_accuracy": 0.36779582592372645,
"eval_loss": 3.571920394897461,
"eval_runtime": 180.8342,
"eval_samples_per_second": 92.079,
"eval_steps_per_second": 5.757,
"step": 33000
},
{
"epoch": 9.619177922682812,
"grad_norm": 0.3384208679199219,
"learning_rate": 0.00048486080372743155,
"loss": 3.4002,
"step": 33050
},
{
"epoch": 9.63373311597578,
"grad_norm": 0.3510007858276367,
"learning_rate": 0.0004846860803727431,
"loss": 3.403,
"step": 33100
},
{
"epoch": 9.648288309268747,
"grad_norm": 0.35957035422325134,
"learning_rate": 0.0004845113570180547,
"loss": 3.3999,
"step": 33150
},
{
"epoch": 9.662843502561714,
"grad_norm": 0.3399753272533417,
"learning_rate": 0.00048433663366336633,
"loss": 3.3905,
"step": 33200
},
{
"epoch": 9.67739869585468,
"grad_norm": 0.3327416479587555,
"learning_rate": 0.0004841619103086779,
"loss": 3.4005,
"step": 33250
},
{
"epoch": 9.691953889147648,
"grad_norm": 0.35132935643196106,
"learning_rate": 0.00048398718695398947,
"loss": 3.4131,
"step": 33300
},
{
"epoch": 9.706509082440615,
"grad_norm": 0.3293425440788269,
"learning_rate": 0.00048381246359930106,
"loss": 3.3982,
"step": 33350
},
{
"epoch": 9.721064275733582,
"grad_norm": 0.3519092798233032,
"learning_rate": 0.00048363774024461265,
"loss": 3.4061,
"step": 33400
},
{
"epoch": 9.735619469026549,
"grad_norm": 0.3286028802394867,
"learning_rate": 0.0004834630168899242,
"loss": 3.3979,
"step": 33450
},
{
"epoch": 9.750174662319516,
"grad_norm": 0.3378731310367584,
"learning_rate": 0.00048328829353523584,
"loss": 3.388,
"step": 33500
},
{
"epoch": 9.764729855612483,
"grad_norm": 0.3523999750614166,
"learning_rate": 0.00048311357018054744,
"loss": 3.4064,
"step": 33550
},
{
"epoch": 9.77928504890545,
"grad_norm": 0.3340063989162445,
"learning_rate": 0.00048293884682585903,
"loss": 3.41,
"step": 33600
},
{
"epoch": 9.793840242198417,
"grad_norm": 0.3363160490989685,
"learning_rate": 0.00048276412347117057,
"loss": 3.4068,
"step": 33650
},
{
"epoch": 9.808395435491384,
"grad_norm": 0.3576688766479492,
"learning_rate": 0.00048258940011648217,
"loss": 3.409,
"step": 33700
},
{
"epoch": 9.82295062878435,
"grad_norm": 0.34224554896354675,
"learning_rate": 0.0004824146767617938,
"loss": 3.4088,
"step": 33750
},
{
"epoch": 9.837505822077317,
"grad_norm": 0.3352258801460266,
"learning_rate": 0.0004822399534071054,
"loss": 3.4147,
"step": 33800
},
{
"epoch": 9.852061015370284,
"grad_norm": 0.32171106338500977,
"learning_rate": 0.00048206523005241695,
"loss": 3.4071,
"step": 33850
},
{
"epoch": 9.86661620866325,
"grad_norm": 0.32649844884872437,
"learning_rate": 0.00048189050669772854,
"loss": 3.4206,
"step": 33900
},
{
"epoch": 9.881171401956218,
"grad_norm": 0.3591761589050293,
"learning_rate": 0.00048171578334304014,
"loss": 3.405,
"step": 33950
},
{
"epoch": 9.895726595249185,
"grad_norm": 0.34346505999565125,
"learning_rate": 0.00048154105998835173,
"loss": 3.4212,
"step": 34000
},
{
"epoch": 9.895726595249185,
"eval_accuracy": 0.368610760259458,
"eval_loss": 3.5636332035064697,
"eval_runtime": 180.8038,
"eval_samples_per_second": 92.094,
"eval_steps_per_second": 5.758,
"step": 34000
},
{
"epoch": 9.910281788542152,
"grad_norm": 0.33863702416419983,
"learning_rate": 0.0004813663366336633,
"loss": 3.41,
"step": 34050
},
{
"epoch": 9.924836981835119,
"grad_norm": 0.3576464056968689,
"learning_rate": 0.0004811916132789749,
"loss": 3.4112,
"step": 34100
},
{
"epoch": 9.939392175128086,
"grad_norm": 0.35173270106315613,
"learning_rate": 0.0004810168899242865,
"loss": 3.4118,
"step": 34150
},
{
"epoch": 9.953947368421053,
"grad_norm": 0.3575360178947449,
"learning_rate": 0.0004808421665695981,
"loss": 3.4144,
"step": 34200
},
{
"epoch": 9.96850256171402,
"grad_norm": 0.34621626138687134,
"learning_rate": 0.00048066744321490965,
"loss": 3.4117,
"step": 34250
},
{
"epoch": 9.983057755006987,
"grad_norm": 0.3460022211074829,
"learning_rate": 0.00048049271986022124,
"loss": 3.4004,
"step": 34300
},
{
"epoch": 9.997612948299953,
"grad_norm": 0.3510585427284241,
"learning_rate": 0.0004803179965055329,
"loss": 3.4003,
"step": 34350
},
{
"epoch": 10.011935258500232,
"grad_norm": 0.3488433361053467,
"learning_rate": 0.0004801432731508445,
"loss": 3.3164,
"step": 34400
},
{
"epoch": 10.0264904517932,
"grad_norm": 0.34920355677604675,
"learning_rate": 0.000479968549796156,
"loss": 3.2976,
"step": 34450
},
{
"epoch": 10.041045645086166,
"grad_norm": 0.33996644616127014,
"learning_rate": 0.0004797938264414676,
"loss": 3.3053,
"step": 34500
},
{
"epoch": 10.055600838379133,
"grad_norm": 0.3447495102882385,
"learning_rate": 0.0004796191030867792,
"loss": 3.3029,
"step": 34550
},
{
"epoch": 10.0701560316721,
"grad_norm": 0.3605789244174957,
"learning_rate": 0.00047944437973209086,
"loss": 3.3058,
"step": 34600
},
{
"epoch": 10.084711224965067,
"grad_norm": 0.34010049700737,
"learning_rate": 0.0004792696563774024,
"loss": 3.3146,
"step": 34650
},
{
"epoch": 10.099266418258035,
"grad_norm": 0.35634300112724304,
"learning_rate": 0.000479094933022714,
"loss": 3.3178,
"step": 34700
},
{
"epoch": 10.113821611551002,
"grad_norm": 0.3321462571620941,
"learning_rate": 0.0004789202096680256,
"loss": 3.2983,
"step": 34750
},
{
"epoch": 10.128376804843969,
"grad_norm": 0.3477012813091278,
"learning_rate": 0.00047874548631333713,
"loss": 3.3279,
"step": 34800
},
{
"epoch": 10.142931998136936,
"grad_norm": 0.3322581946849823,
"learning_rate": 0.0004785707629586487,
"loss": 3.3356,
"step": 34850
},
{
"epoch": 10.157487191429903,
"grad_norm": 0.3342720866203308,
"learning_rate": 0.0004783960396039604,
"loss": 3.3184,
"step": 34900
},
{
"epoch": 10.17204238472287,
"grad_norm": 0.35528644919395447,
"learning_rate": 0.00047822131624927197,
"loss": 3.3323,
"step": 34950
},
{
"epoch": 10.186597578015837,
"grad_norm": 0.32312092185020447,
"learning_rate": 0.0004780465928945835,
"loss": 3.3329,
"step": 35000
},
{
"epoch": 10.186597578015837,
"eval_accuracy": 0.3682773352940022,
"eval_loss": 3.570481300354004,
"eval_runtime": 181.1752,
"eval_samples_per_second": 91.905,
"eval_steps_per_second": 5.746,
"step": 35000
},
{
"epoch": 10.201152771308802,
"grad_norm": 0.3692251443862915,
"learning_rate": 0.0004778718695398951,
"loss": 3.358,
"step": 35050
},
{
"epoch": 10.21570796460177,
"grad_norm": 0.3326742351055145,
"learning_rate": 0.0004776971461852067,
"loss": 3.3378,
"step": 35100
},
{
"epoch": 10.230263157894736,
"grad_norm": 0.3492138683795929,
"learning_rate": 0.00047752242283051835,
"loss": 3.335,
"step": 35150
},
{
"epoch": 10.244818351187703,
"grad_norm": 0.34393349289894104,
"learning_rate": 0.00047734769947582994,
"loss": 3.3608,
"step": 35200
},
{
"epoch": 10.25937354448067,
"grad_norm": 0.3394237458705902,
"learning_rate": 0.0004771729761211415,
"loss": 3.3426,
"step": 35250
},
{
"epoch": 10.273928737773637,
"grad_norm": 0.352071613073349,
"learning_rate": 0.0004769982527664531,
"loss": 3.3391,
"step": 35300
},
{
"epoch": 10.288483931066605,
"grad_norm": 0.35096055269241333,
"learning_rate": 0.00047682352941176467,
"loss": 3.3605,
"step": 35350
},
{
"epoch": 10.303039124359572,
"grad_norm": 0.368264377117157,
"learning_rate": 0.0004766488060570762,
"loss": 3.3471,
"step": 35400
},
{
"epoch": 10.317594317652539,
"grad_norm": 0.3177869915962219,
"learning_rate": 0.00047647408270238786,
"loss": 3.3564,
"step": 35450
},
{
"epoch": 10.332149510945506,
"grad_norm": 0.36875662207603455,
"learning_rate": 0.00047629935934769945,
"loss": 3.3499,
"step": 35500
},
{
"epoch": 10.346704704238473,
"grad_norm": 0.34088993072509766,
"learning_rate": 0.00047612463599301105,
"loss": 3.3666,
"step": 35550
},
{
"epoch": 10.36125989753144,
"grad_norm": 0.35179397463798523,
"learning_rate": 0.0004759499126383226,
"loss": 3.3562,
"step": 35600
},
{
"epoch": 10.375815090824407,
"grad_norm": 0.3595273792743683,
"learning_rate": 0.0004757751892836342,
"loss": 3.36,
"step": 35650
},
{
"epoch": 10.390370284117374,
"grad_norm": 0.3881017565727234,
"learning_rate": 0.0004756004659289458,
"loss": 3.3608,
"step": 35700
},
{
"epoch": 10.40492547741034,
"grad_norm": 0.3473113179206848,
"learning_rate": 0.0004754257425742574,
"loss": 3.3539,
"step": 35750
},
{
"epoch": 10.419480670703306,
"grad_norm": 0.331753134727478,
"learning_rate": 0.00047525101921956896,
"loss": 3.3595,
"step": 35800
},
{
"epoch": 10.434035863996273,
"grad_norm": 0.3324008882045746,
"learning_rate": 0.00047507629586488056,
"loss": 3.3727,
"step": 35850
},
{
"epoch": 10.44859105728924,
"grad_norm": 0.3285880982875824,
"learning_rate": 0.00047490157251019215,
"loss": 3.3584,
"step": 35900
},
{
"epoch": 10.463146250582207,
"grad_norm": 0.35034114122390747,
"learning_rate": 0.0004747268491555037,
"loss": 3.3613,
"step": 35950
},
{
"epoch": 10.477701443875175,
"grad_norm": 0.3423924148082733,
"learning_rate": 0.00047455212580081534,
"loss": 3.367,
"step": 36000
},
{
"epoch": 10.477701443875175,
"eval_accuracy": 0.3687240565818758,
"eval_loss": 3.565606117248535,
"eval_runtime": 181.094,
"eval_samples_per_second": 91.947,
"eval_steps_per_second": 5.748,
"step": 36000
},
{
"epoch": 10.492256637168142,
"grad_norm": 0.342632532119751,
"learning_rate": 0.00047437740244612694,
"loss": 3.3662,
"step": 36050
},
{
"epoch": 10.506811830461109,
"grad_norm": 0.3601525127887726,
"learning_rate": 0.00047420267909143853,
"loss": 3.3598,
"step": 36100
},
{
"epoch": 10.521367023754076,
"grad_norm": 0.3272550702095032,
"learning_rate": 0.0004740279557367501,
"loss": 3.3775,
"step": 36150
},
{
"epoch": 10.535922217047043,
"grad_norm": 0.3502359688282013,
"learning_rate": 0.00047385323238206166,
"loss": 3.3747,
"step": 36200
},
{
"epoch": 10.55047741034001,
"grad_norm": 0.34133386611938477,
"learning_rate": 0.00047367850902737326,
"loss": 3.3727,
"step": 36250
},
{
"epoch": 10.565032603632977,
"grad_norm": 0.3748016953468323,
"learning_rate": 0.0004735037856726849,
"loss": 3.378,
"step": 36300
},
{
"epoch": 10.579587796925944,
"grad_norm": 0.3498660922050476,
"learning_rate": 0.0004733290623179965,
"loss": 3.3736,
"step": 36350
},
{
"epoch": 10.59414299021891,
"grad_norm": 0.33588114380836487,
"learning_rate": 0.00047315433896330804,
"loss": 3.373,
"step": 36400
},
{
"epoch": 10.608698183511876,
"grad_norm": 0.3481428325176239,
"learning_rate": 0.00047297961560861964,
"loss": 3.3796,
"step": 36450
},
{
"epoch": 10.623253376804843,
"grad_norm": 0.3276902437210083,
"learning_rate": 0.00047280489225393123,
"loss": 3.3677,
"step": 36500
},
{
"epoch": 10.63780857009781,
"grad_norm": 0.3404669165611267,
"learning_rate": 0.0004726301688992429,
"loss": 3.3745,
"step": 36550
},
{
"epoch": 10.652363763390778,
"grad_norm": 0.34793999791145325,
"learning_rate": 0.0004724554455445544,
"loss": 3.3816,
"step": 36600
},
{
"epoch": 10.666918956683745,
"grad_norm": 0.32581639289855957,
"learning_rate": 0.000472280722189866,
"loss": 3.3818,
"step": 36650
},
{
"epoch": 10.681474149976712,
"grad_norm": 0.3253845274448395,
"learning_rate": 0.0004721059988351776,
"loss": 3.376,
"step": 36700
},
{
"epoch": 10.696029343269679,
"grad_norm": 0.341382771730423,
"learning_rate": 0.00047193127548048915,
"loss": 3.3828,
"step": 36750
},
{
"epoch": 10.710584536562646,
"grad_norm": 0.36883407831192017,
"learning_rate": 0.00047175655212580074,
"loss": 3.3844,
"step": 36800
},
{
"epoch": 10.725139729855613,
"grad_norm": 0.3668972849845886,
"learning_rate": 0.0004715818287711124,
"loss": 3.3904,
"step": 36850
},
{
"epoch": 10.73969492314858,
"grad_norm": 0.35706183314323425,
"learning_rate": 0.000471407105416424,
"loss": 3.3934,
"step": 36900
},
{
"epoch": 10.754250116441547,
"grad_norm": 0.33550548553466797,
"learning_rate": 0.0004712323820617355,
"loss": 3.3862,
"step": 36950
},
{
"epoch": 10.768805309734514,
"grad_norm": 0.3570643961429596,
"learning_rate": 0.0004710576587070471,
"loss": 3.3889,
"step": 37000
},
{
"epoch": 10.768805309734514,
"eval_accuracy": 0.3696497016393061,
"eval_loss": 3.5564565658569336,
"eval_runtime": 181.0728,
"eval_samples_per_second": 91.958,
"eval_steps_per_second": 5.749,
"step": 37000
},
{
"epoch": 10.783360503027481,
"grad_norm": 0.3547266125679016,
"learning_rate": 0.0004708829353523587,
"loss": 3.3802,
"step": 37050
},
{
"epoch": 10.797915696320446,
"grad_norm": 0.3460685908794403,
"learning_rate": 0.0004707082119976703,
"loss": 3.3797,
"step": 37100
},
{
"epoch": 10.812470889613413,
"grad_norm": 0.32947590947151184,
"learning_rate": 0.0004705334886429819,
"loss": 3.3932,
"step": 37150
},
{
"epoch": 10.82702608290638,
"grad_norm": 0.33554786443710327,
"learning_rate": 0.0004703587652882935,
"loss": 3.3781,
"step": 37200
},
{
"epoch": 10.841581276199348,
"grad_norm": 0.33499568700790405,
"learning_rate": 0.0004701840419336051,
"loss": 3.3761,
"step": 37250
},
{
"epoch": 10.856136469492315,
"grad_norm": 0.3423595428466797,
"learning_rate": 0.0004700093185789167,
"loss": 3.3915,
"step": 37300
},
{
"epoch": 10.870691662785282,
"grad_norm": 0.33275771141052246,
"learning_rate": 0.0004698345952242282,
"loss": 3.3884,
"step": 37350
},
{
"epoch": 10.885246856078249,
"grad_norm": 0.3475160300731659,
"learning_rate": 0.00046965987186953987,
"loss": 3.3939,
"step": 37400
},
{
"epoch": 10.899802049371216,
"grad_norm": 0.34615978598594666,
"learning_rate": 0.00046948514851485147,
"loss": 3.3877,
"step": 37450
},
{
"epoch": 10.914357242664183,
"grad_norm": 0.3393753468990326,
"learning_rate": 0.00046931042516016306,
"loss": 3.3759,
"step": 37500
},
{
"epoch": 10.92891243595715,
"grad_norm": 0.36012470722198486,
"learning_rate": 0.0004691357018054746,
"loss": 3.3791,
"step": 37550
},
{
"epoch": 10.943467629250117,
"grad_norm": 0.33901482820510864,
"learning_rate": 0.0004689609784507862,
"loss": 3.3967,
"step": 37600
},
{
"epoch": 10.958022822543084,
"grad_norm": 0.32990455627441406,
"learning_rate": 0.0004687862550960978,
"loss": 3.4031,
"step": 37650
},
{
"epoch": 10.972578015836051,
"grad_norm": 0.379361093044281,
"learning_rate": 0.00046861153174140944,
"loss": 3.3777,
"step": 37700
},
{
"epoch": 10.987133209129016,
"grad_norm": 0.33749642968177795,
"learning_rate": 0.000468436808386721,
"loss": 3.3842,
"step": 37750
},
{
"epoch": 11.001455519329296,
"grad_norm": 0.37495532631874084,
"learning_rate": 0.0004682620850320326,
"loss": 3.3758,
"step": 37800
},
{
"epoch": 11.016010712622263,
"grad_norm": 0.3221411406993866,
"learning_rate": 0.00046808736167734417,
"loss": 3.2876,
"step": 37850
},
{
"epoch": 11.03056590591523,
"grad_norm": 0.34512948989868164,
"learning_rate": 0.0004679126383226557,
"loss": 3.2815,
"step": 37900
},
{
"epoch": 11.045121099208197,
"grad_norm": 0.3403730094432831,
"learning_rate": 0.00046773791496796736,
"loss": 3.3007,
"step": 37950
},
{
"epoch": 11.059676292501164,
"grad_norm": 0.33120304346084595,
"learning_rate": 0.00046756319161327895,
"loss": 3.2762,
"step": 38000
},
{
"epoch": 11.059676292501164,
"eval_accuracy": 0.3692904206666595,
"eval_loss": 3.568734645843506,
"eval_runtime": 181.2074,
"eval_samples_per_second": 91.889,
"eval_steps_per_second": 5.745,
"step": 38000
},
{
"epoch": 11.074231485794131,
"grad_norm": 0.3604724407196045,
"learning_rate": 0.00046738846825859054,
"loss": 3.2975,
"step": 38050
},
{
"epoch": 11.088786679087098,
"grad_norm": 0.3622620403766632,
"learning_rate": 0.0004672137449039021,
"loss": 3.2926,
"step": 38100
},
{
"epoch": 11.103341872380065,
"grad_norm": 0.35401245951652527,
"learning_rate": 0.0004670390215492137,
"loss": 3.2959,
"step": 38150
},
{
"epoch": 11.117897065673032,
"grad_norm": 0.3392096161842346,
"learning_rate": 0.0004668642981945253,
"loss": 3.2822,
"step": 38200
},
{
"epoch": 11.132452258966,
"grad_norm": 0.33189859986305237,
"learning_rate": 0.0004666895748398369,
"loss": 3.289,
"step": 38250
},
{
"epoch": 11.147007452258967,
"grad_norm": 0.3406781554222107,
"learning_rate": 0.00046651485148514846,
"loss": 3.3034,
"step": 38300
},
{
"epoch": 11.161562645551934,
"grad_norm": 0.34871330857276917,
"learning_rate": 0.00046634012813046006,
"loss": 3.3274,
"step": 38350
},
{
"epoch": 11.1761178388449,
"grad_norm": 0.36096620559692383,
"learning_rate": 0.00046616540477577165,
"loss": 3.3217,
"step": 38400
},
{
"epoch": 11.190673032137866,
"grad_norm": 0.3496626913547516,
"learning_rate": 0.00046599068142108324,
"loss": 3.3182,
"step": 38450
},
{
"epoch": 11.205228225430833,
"grad_norm": 0.3420194089412689,
"learning_rate": 0.0004658159580663948,
"loss": 3.3136,
"step": 38500
},
{
"epoch": 11.2197834187238,
"grad_norm": 0.3360060155391693,
"learning_rate": 0.00046564123471170643,
"loss": 3.3192,
"step": 38550
},
{
"epoch": 11.234338612016767,
"grad_norm": 0.3573230803012848,
"learning_rate": 0.00046546651135701803,
"loss": 3.3189,
"step": 38600
},
{
"epoch": 11.248893805309734,
"grad_norm": 0.3675867021083832,
"learning_rate": 0.0004652917880023296,
"loss": 3.319,
"step": 38650
},
{
"epoch": 11.263448998602701,
"grad_norm": 0.3547263443470001,
"learning_rate": 0.00046511706464764116,
"loss": 3.3308,
"step": 38700
},
{
"epoch": 11.278004191895668,
"grad_norm": 0.34671658277511597,
"learning_rate": 0.00046494234129295276,
"loss": 3.329,
"step": 38750
},
{
"epoch": 11.292559385188635,
"grad_norm": 0.35404467582702637,
"learning_rate": 0.0004647676179382644,
"loss": 3.3238,
"step": 38800
},
{
"epoch": 11.307114578481603,
"grad_norm": 0.3403128385543823,
"learning_rate": 0.000464592894583576,
"loss": 3.3302,
"step": 38850
},
{
"epoch": 11.32166977177457,
"grad_norm": 0.3744731843471527,
"learning_rate": 0.00046441817122888754,
"loss": 3.3324,
"step": 38900
},
{
"epoch": 11.336224965067537,
"grad_norm": 0.3594568073749542,
"learning_rate": 0.00046424344787419913,
"loss": 3.343,
"step": 38950
},
{
"epoch": 11.350780158360504,
"grad_norm": 0.3566104769706726,
"learning_rate": 0.00046406872451951073,
"loss": 3.3424,
"step": 39000
},
{
"epoch": 11.350780158360504,
"eval_accuracy": 0.3692883051751621,
"eval_loss": 3.566476345062256,
"eval_runtime": 181.1872,
"eval_samples_per_second": 91.899,
"eval_steps_per_second": 5.745,
"step": 39000
},
{
"epoch": 11.36533535165347,
"grad_norm": 0.3528974950313568,
"learning_rate": 0.00046389400116482227,
"loss": 3.3309,
"step": 39050
},
{
"epoch": 11.379890544946436,
"grad_norm": 0.35445570945739746,
"learning_rate": 0.0004637192778101339,
"loss": 3.3369,
"step": 39100
},
{
"epoch": 11.394445738239403,
"grad_norm": 0.3597460985183716,
"learning_rate": 0.0004635445544554455,
"loss": 3.3324,
"step": 39150
},
{
"epoch": 11.40900093153237,
"grad_norm": 0.33990415930747986,
"learning_rate": 0.0004633698311007571,
"loss": 3.3461,
"step": 39200
},
{
"epoch": 11.423556124825337,
"grad_norm": 0.3448801040649414,
"learning_rate": 0.0004631951077460687,
"loss": 3.3306,
"step": 39250
},
{
"epoch": 11.438111318118304,
"grad_norm": 0.3563622534275055,
"learning_rate": 0.00046302038439138024,
"loss": 3.336,
"step": 39300
},
{
"epoch": 11.452666511411271,
"grad_norm": 0.3600659668445587,
"learning_rate": 0.0004628456610366919,
"loss": 3.3442,
"step": 39350
},
{
"epoch": 11.467221704704238,
"grad_norm": 0.32568398118019104,
"learning_rate": 0.0004626709376820035,
"loss": 3.3392,
"step": 39400
},
{
"epoch": 11.481776897997205,
"grad_norm": 0.35172879695892334,
"learning_rate": 0.0004624962143273151,
"loss": 3.3516,
"step": 39450
},
{
"epoch": 11.496332091290173,
"grad_norm": 0.3447670638561249,
"learning_rate": 0.0004623214909726266,
"loss": 3.3333,
"step": 39500
},
{
"epoch": 11.51088728458314,
"grad_norm": 0.34315159916877747,
"learning_rate": 0.0004621467676179382,
"loss": 3.3565,
"step": 39550
},
{
"epoch": 11.525442477876107,
"grad_norm": 0.3695205748081207,
"learning_rate": 0.0004619720442632498,
"loss": 3.3595,
"step": 39600
},
{
"epoch": 11.539997671169074,
"grad_norm": 0.34323710203170776,
"learning_rate": 0.00046179732090856145,
"loss": 3.3469,
"step": 39650
},
{
"epoch": 11.55455286446204,
"grad_norm": 0.33781781792640686,
"learning_rate": 0.000461622597553873,
"loss": 3.3469,
"step": 39700
},
{
"epoch": 11.569108057755006,
"grad_norm": 0.3478993773460388,
"learning_rate": 0.0004614478741991846,
"loss": 3.3564,
"step": 39750
},
{
"epoch": 11.583663251047973,
"grad_norm": 0.34204310178756714,
"learning_rate": 0.0004612731508444962,
"loss": 3.3526,
"step": 39800
},
{
"epoch": 11.59821844434094,
"grad_norm": 0.36109477281570435,
"learning_rate": 0.0004610984274898077,
"loss": 3.3517,
"step": 39850
},
{
"epoch": 11.612773637633907,
"grad_norm": 0.34194666147232056,
"learning_rate": 0.00046092370413511937,
"loss": 3.3634,
"step": 39900
},
{
"epoch": 11.627328830926874,
"grad_norm": 0.3449431359767914,
"learning_rate": 0.00046074898078043096,
"loss": 3.3512,
"step": 39950
},
{
"epoch": 11.641884024219841,
"grad_norm": 0.3200359046459198,
"learning_rate": 0.00046057425742574256,
"loss": 3.3651,
"step": 40000
},
{
"epoch": 11.641884024219841,
"eval_accuracy": 0.3697300903162084,
"eval_loss": 3.5560121536254883,
"eval_runtime": 181.1724,
"eval_samples_per_second": 91.907,
"eval_steps_per_second": 5.746,
"step": 40000
},
{
"epoch": 11.656439217512808,
"grad_norm": 0.3647145628929138,
"learning_rate": 0.0004603995340710541,
"loss": 3.3446,
"step": 40050
},
{
"epoch": 11.670994410805775,
"grad_norm": 0.37295401096343994,
"learning_rate": 0.0004602248107163657,
"loss": 3.3714,
"step": 40100
},
{
"epoch": 11.685549604098743,
"grad_norm": 0.3441055119037628,
"learning_rate": 0.0004600500873616773,
"loss": 3.3631,
"step": 40150
},
{
"epoch": 11.70010479739171,
"grad_norm": 0.3755488097667694,
"learning_rate": 0.00045987536400698894,
"loss": 3.3652,
"step": 40200
},
{
"epoch": 11.714659990684677,
"grad_norm": 0.3308577239513397,
"learning_rate": 0.0004597006406523005,
"loss": 3.367,
"step": 40250
},
{
"epoch": 11.729215183977644,
"grad_norm": 0.3554903268814087,
"learning_rate": 0.00045952591729761207,
"loss": 3.3616,
"step": 40300
},
{
"epoch": 11.74377037727061,
"grad_norm": 0.3407958745956421,
"learning_rate": 0.00045935119394292367,
"loss": 3.3621,
"step": 40350
},
{
"epoch": 11.758325570563578,
"grad_norm": 0.3670002520084381,
"learning_rate": 0.00045917647058823526,
"loss": 3.3642,
"step": 40400
},
{
"epoch": 11.772880763856545,
"grad_norm": 0.3373028337955475,
"learning_rate": 0.0004590017472335468,
"loss": 3.3577,
"step": 40450
},
{
"epoch": 11.78743595714951,
"grad_norm": 0.3528793752193451,
"learning_rate": 0.00045882702387885845,
"loss": 3.3686,
"step": 40500
},
{
"epoch": 11.801991150442477,
"grad_norm": 0.3317636549472809,
"learning_rate": 0.00045865230052417004,
"loss": 3.3664,
"step": 40550
},
{
"epoch": 11.816546343735444,
"grad_norm": 0.3258965313434601,
"learning_rate": 0.00045847757716948164,
"loss": 3.3817,
"step": 40600
},
{
"epoch": 11.831101537028411,
"grad_norm": 0.348578542470932,
"learning_rate": 0.0004583028538147932,
"loss": 3.3611,
"step": 40650
},
{
"epoch": 11.845656730321378,
"grad_norm": 0.3560961186885834,
"learning_rate": 0.00045812813046010477,
"loss": 3.3698,
"step": 40700
},
{
"epoch": 11.860211923614346,
"grad_norm": 0.3377649486064911,
"learning_rate": 0.0004579534071054164,
"loss": 3.3704,
"step": 40750
},
{
"epoch": 11.874767116907313,
"grad_norm": 0.33449798822402954,
"learning_rate": 0.000457778683750728,
"loss": 3.3793,
"step": 40800
},
{
"epoch": 11.88932231020028,
"grad_norm": 0.3655473589897156,
"learning_rate": 0.00045760396039603955,
"loss": 3.3721,
"step": 40850
},
{
"epoch": 11.903877503493247,
"grad_norm": 0.3865300416946411,
"learning_rate": 0.00045742923704135115,
"loss": 3.3725,
"step": 40900
},
{
"epoch": 11.918432696786214,
"grad_norm": 0.3576103150844574,
"learning_rate": 0.00045725451368666274,
"loss": 3.3717,
"step": 40950
},
{
"epoch": 11.93298789007918,
"grad_norm": 0.32919180393218994,
"learning_rate": 0.0004570797903319743,
"loss": 3.3767,
"step": 41000
},
{
"epoch": 11.93298789007918,
"eval_accuracy": 0.370469572121865,
"eval_loss": 3.5477712154388428,
"eval_runtime": 181.325,
"eval_samples_per_second": 91.83,
"eval_steps_per_second": 5.741,
"step": 41000
},
{
"epoch": 11.947543083372148,
"grad_norm": 0.32170921564102173,
"learning_rate": 0.00045690506697728593,
"loss": 3.3728,
"step": 41050
},
{
"epoch": 11.962098276665115,
"grad_norm": 0.3520164489746094,
"learning_rate": 0.0004567303436225975,
"loss": 3.37,
"step": 41100
},
{
"epoch": 11.97665346995808,
"grad_norm": 0.34390193223953247,
"learning_rate": 0.0004565556202679091,
"loss": 3.3699,
"step": 41150
},
{
"epoch": 11.991208663251047,
"grad_norm": 0.341876357793808,
"learning_rate": 0.00045638089691322066,
"loss": 3.3689,
"step": 41200
},
{
"epoch": 12.005530973451327,
"grad_norm": 0.36311545968055725,
"learning_rate": 0.00045620617355853225,
"loss": 3.3312,
"step": 41250
},
{
"epoch": 12.020086166744294,
"grad_norm": 0.3760519325733185,
"learning_rate": 0.0004560314502038439,
"loss": 3.2632,
"step": 41300
},
{
"epoch": 12.034641360037261,
"grad_norm": 0.35626885294914246,
"learning_rate": 0.0004558567268491555,
"loss": 3.2631,
"step": 41350
},
{
"epoch": 12.049196553330228,
"grad_norm": 0.3312586545944214,
"learning_rate": 0.00045568200349446704,
"loss": 3.2653,
"step": 41400
},
{
"epoch": 12.063751746623195,
"grad_norm": 0.33731192350387573,
"learning_rate": 0.00045550728013977863,
"loss": 3.2639,
"step": 41450
},
{
"epoch": 12.078306939916162,
"grad_norm": 0.3921743929386139,
"learning_rate": 0.0004553325567850902,
"loss": 3.2823,
"step": 41500
},
{
"epoch": 12.09286213320913,
"grad_norm": 0.35110893845558167,
"learning_rate": 0.0004551578334304018,
"loss": 3.2822,
"step": 41550
},
{
"epoch": 12.107417326502096,
"grad_norm": 0.32425668835639954,
"learning_rate": 0.00045498311007571347,
"loss": 3.2891,
"step": 41600
},
{
"epoch": 12.121972519795063,
"grad_norm": 0.35301515460014343,
"learning_rate": 0.000454808386721025,
"loss": 3.2916,
"step": 41650
},
{
"epoch": 12.13652771308803,
"grad_norm": 0.35211730003356934,
"learning_rate": 0.0004546336633663366,
"loss": 3.2885,
"step": 41700
},
{
"epoch": 12.151082906380998,
"grad_norm": 0.35022473335266113,
"learning_rate": 0.0004544589400116482,
"loss": 3.2871,
"step": 41750
},
{
"epoch": 12.165638099673963,
"grad_norm": 0.33055204153060913,
"learning_rate": 0.00045428421665695974,
"loss": 3.2922,
"step": 41800
},
{
"epoch": 12.18019329296693,
"grad_norm": 0.3559176027774811,
"learning_rate": 0.00045410949330227133,
"loss": 3.301,
"step": 41850
},
{
"epoch": 12.194748486259897,
"grad_norm": 0.3754260838031769,
"learning_rate": 0.000453934769947583,
"loss": 3.3022,
"step": 41900
},
{
"epoch": 12.209303679552864,
"grad_norm": 0.33595168590545654,
"learning_rate": 0.0004537600465928946,
"loss": 3.3005,
"step": 41950
},
{
"epoch": 12.223858872845831,
"grad_norm": 0.3507530987262726,
"learning_rate": 0.0004535853232382061,
"loss": 3.3124,
"step": 42000
},
{
"epoch": 12.223858872845831,
"eval_accuracy": 0.3698600755159948,
"eval_loss": 3.563331127166748,
"eval_runtime": 181.337,
"eval_samples_per_second": 91.824,
"eval_steps_per_second": 5.741,
"step": 42000
},
{
"epoch": 12.238414066138798,
"grad_norm": 0.35963207483291626,
"learning_rate": 0.0004534105998835177,
"loss": 3.3003,
"step": 42050
},
{
"epoch": 12.252969259431765,
"grad_norm": 0.3779832124710083,
"learning_rate": 0.0004532358765288293,
"loss": 3.297,
"step": 42100
},
{
"epoch": 12.267524452724732,
"grad_norm": 0.38168567419052124,
"learning_rate": 0.00045306115317414095,
"loss": 3.3092,
"step": 42150
},
{
"epoch": 12.2820796460177,
"grad_norm": 0.3286755681037903,
"learning_rate": 0.0004528864298194525,
"loss": 3.311,
"step": 42200
},
{
"epoch": 12.296634839310666,
"grad_norm": 0.3363521993160248,
"learning_rate": 0.0004527117064647641,
"loss": 3.3073,
"step": 42250
},
{
"epoch": 12.311190032603633,
"grad_norm": 0.35366666316986084,
"learning_rate": 0.0004525369831100757,
"loss": 3.303,
"step": 42300
},
{
"epoch": 12.3257452258966,
"grad_norm": 0.35679879784584045,
"learning_rate": 0.0004523622597553872,
"loss": 3.331,
"step": 42350
},
{
"epoch": 12.340300419189568,
"grad_norm": 0.35085782408714294,
"learning_rate": 0.0004521875364006988,
"loss": 3.31,
"step": 42400
},
{
"epoch": 12.354855612482535,
"grad_norm": 0.36564141511917114,
"learning_rate": 0.00045201281304601046,
"loss": 3.3175,
"step": 42450
},
{
"epoch": 12.3694108057755,
"grad_norm": 0.3681644797325134,
"learning_rate": 0.00045183808969132206,
"loss": 3.3174,
"step": 42500
},
{
"epoch": 12.383965999068467,
"grad_norm": 0.363091379404068,
"learning_rate": 0.00045166336633663365,
"loss": 3.3259,
"step": 42550
},
{
"epoch": 12.398521192361434,
"grad_norm": 0.38171902298927307,
"learning_rate": 0.0004514886429819452,
"loss": 3.314,
"step": 42600
},
{
"epoch": 12.413076385654401,
"grad_norm": 0.3879528343677521,
"learning_rate": 0.0004513139196272568,
"loss": 3.3278,
"step": 42650
},
{
"epoch": 12.427631578947368,
"grad_norm": 0.34305742383003235,
"learning_rate": 0.00045113919627256843,
"loss": 3.3164,
"step": 42700
},
{
"epoch": 12.442186772240335,
"grad_norm": 0.3577149510383606,
"learning_rate": 0.00045096447291788003,
"loss": 3.3243,
"step": 42750
},
{
"epoch": 12.456741965533302,
"grad_norm": 0.33020853996276855,
"learning_rate": 0.00045078974956319157,
"loss": 3.3335,
"step": 42800
},
{
"epoch": 12.47129715882627,
"grad_norm": 0.36573347449302673,
"learning_rate": 0.00045061502620850316,
"loss": 3.3238,
"step": 42850
},
{
"epoch": 12.485852352119236,
"grad_norm": 0.3603604733943939,
"learning_rate": 0.00045044030285381476,
"loss": 3.3252,
"step": 42900
},
{
"epoch": 12.500407545412203,
"grad_norm": 0.3663039207458496,
"learning_rate": 0.0004502655794991263,
"loss": 3.3292,
"step": 42950
},
{
"epoch": 12.51496273870517,
"grad_norm": 0.33189836144447327,
"learning_rate": 0.00045009085614443795,
"loss": 3.3323,
"step": 43000
},
{
"epoch": 12.51496273870517,
"eval_accuracy": 0.37042279625431074,
"eval_loss": 3.5531504154205322,
"eval_runtime": 181.3654,
"eval_samples_per_second": 91.809,
"eval_steps_per_second": 5.74,
"step": 43000
},
{
"epoch": 12.529517931998138,
"grad_norm": 0.39830282330513,
"learning_rate": 0.00044991613278974954,
"loss": 3.3356,
"step": 43050
},
{
"epoch": 12.544073125291105,
"grad_norm": 0.3498653173446655,
"learning_rate": 0.00044974140943506113,
"loss": 3.3376,
"step": 43100
},
{
"epoch": 12.55862831858407,
"grad_norm": 0.3489161431789398,
"learning_rate": 0.0004495666860803727,
"loss": 3.3268,
"step": 43150
},
{
"epoch": 12.573183511877037,
"grad_norm": 0.36358317732810974,
"learning_rate": 0.00044939196272568427,
"loss": 3.334,
"step": 43200
},
{
"epoch": 12.587738705170004,
"grad_norm": 0.35750314593315125,
"learning_rate": 0.00044921723937099586,
"loss": 3.332,
"step": 43250
},
{
"epoch": 12.602293898462971,
"grad_norm": 0.3437962830066681,
"learning_rate": 0.0004490425160163075,
"loss": 3.3463,
"step": 43300
},
{
"epoch": 12.616849091755938,
"grad_norm": 0.36788836121559143,
"learning_rate": 0.00044886779266161905,
"loss": 3.3322,
"step": 43350
},
{
"epoch": 12.631404285048905,
"grad_norm": 0.3601713180541992,
"learning_rate": 0.00044869306930693065,
"loss": 3.3432,
"step": 43400
},
{
"epoch": 12.645959478341872,
"grad_norm": 0.3093632459640503,
"learning_rate": 0.00044851834595224224,
"loss": 3.3397,
"step": 43450
},
{
"epoch": 12.66051467163484,
"grad_norm": 0.34150272607803345,
"learning_rate": 0.00044834362259755383,
"loss": 3.3498,
"step": 43500
},
{
"epoch": 12.675069864927806,
"grad_norm": 0.3343871533870697,
"learning_rate": 0.00044816889924286543,
"loss": 3.3388,
"step": 43550
},
{
"epoch": 12.689625058220773,
"grad_norm": 0.33877092599868774,
"learning_rate": 0.000447994175888177,
"loss": 3.3481,
"step": 43600
},
{
"epoch": 12.70418025151374,
"grad_norm": 0.35952886939048767,
"learning_rate": 0.0004478194525334886,
"loss": 3.3494,
"step": 43650
},
{
"epoch": 12.718735444806708,
"grad_norm": 0.3660774827003479,
"learning_rate": 0.0004476447291788002,
"loss": 3.3384,
"step": 43700
},
{
"epoch": 12.733290638099675,
"grad_norm": 0.35078608989715576,
"learning_rate": 0.00044747000582411175,
"loss": 3.3374,
"step": 43750
},
{
"epoch": 12.747845831392642,
"grad_norm": 0.3302287459373474,
"learning_rate": 0.00044729528246942335,
"loss": 3.3384,
"step": 43800
},
{
"epoch": 12.762401024685607,
"grad_norm": 0.3574998378753662,
"learning_rate": 0.000447120559114735,
"loss": 3.3324,
"step": 43850
},
{
"epoch": 12.776956217978574,
"grad_norm": 0.3275946080684662,
"learning_rate": 0.0004469458357600466,
"loss": 3.3458,
"step": 43900
},
{
"epoch": 12.791511411271541,
"grad_norm": 0.3478822112083435,
"learning_rate": 0.00044677111240535813,
"loss": 3.3481,
"step": 43950
},
{
"epoch": 12.806066604564508,
"grad_norm": 0.3444738984107971,
"learning_rate": 0.0004465963890506697,
"loss": 3.3498,
"step": 44000
},
{
"epoch": 12.806066604564508,
"eval_accuracy": 0.37071473408095584,
"eval_loss": 3.550508975982666,
"eval_runtime": 181.4057,
"eval_samples_per_second": 91.789,
"eval_steps_per_second": 5.739,
"step": 44000
},
{
"epoch": 12.820621797857475,
"grad_norm": 0.3442026674747467,
"learning_rate": 0.0004464216656959813,
"loss": 3.3616,
"step": 44050
},
{
"epoch": 12.835176991150442,
"grad_norm": 0.3639858067035675,
"learning_rate": 0.00044624694234129297,
"loss": 3.3583,
"step": 44100
},
{
"epoch": 12.84973218444341,
"grad_norm": 0.35911089181900024,
"learning_rate": 0.0004460722189866045,
"loss": 3.3487,
"step": 44150
},
{
"epoch": 12.864287377736376,
"grad_norm": 0.3291614353656769,
"learning_rate": 0.0004458974956319161,
"loss": 3.3515,
"step": 44200
},
{
"epoch": 12.878842571029343,
"grad_norm": 0.3494235575199127,
"learning_rate": 0.0004457227722772277,
"loss": 3.3511,
"step": 44250
},
{
"epoch": 12.89339776432231,
"grad_norm": 0.3374604880809784,
"learning_rate": 0.00044554804892253923,
"loss": 3.35,
"step": 44300
},
{
"epoch": 12.907952957615278,
"grad_norm": 0.3468201160430908,
"learning_rate": 0.00044537332556785083,
"loss": 3.3465,
"step": 44350
},
{
"epoch": 12.922508150908245,
"grad_norm": 0.35529449582099915,
"learning_rate": 0.0004451986022131625,
"loss": 3.3596,
"step": 44400
},
{
"epoch": 12.937063344201212,
"grad_norm": 0.3489094376564026,
"learning_rate": 0.00044502387885847407,
"loss": 3.3567,
"step": 44450
},
{
"epoch": 12.951618537494177,
"grad_norm": 0.37239301204681396,
"learning_rate": 0.0004448491555037856,
"loss": 3.3536,
"step": 44500
},
{
"epoch": 12.966173730787144,
"grad_norm": 0.40329527854919434,
"learning_rate": 0.0004446744321490972,
"loss": 3.3503,
"step": 44550
},
{
"epoch": 12.980728924080111,
"grad_norm": 0.3665720820426941,
"learning_rate": 0.0004444997087944088,
"loss": 3.3592,
"step": 44600
},
{
"epoch": 12.995284117373078,
"grad_norm": 0.3658985197544098,
"learning_rate": 0.0004443249854397204,
"loss": 3.3493,
"step": 44650
},
{
"epoch": 13.009606427573358,
"grad_norm": 0.34227049350738525,
"learning_rate": 0.000444150262085032,
"loss": 3.2852,
"step": 44700
},
{
"epoch": 13.024161620866325,
"grad_norm": 0.3462371230125427,
"learning_rate": 0.0004439755387303436,
"loss": 3.2481,
"step": 44750
},
{
"epoch": 13.038716814159292,
"grad_norm": 0.36165347695350647,
"learning_rate": 0.0004438008153756552,
"loss": 3.2525,
"step": 44800
},
{
"epoch": 13.053272007452259,
"grad_norm": 0.3508550822734833,
"learning_rate": 0.00044362609202096677,
"loss": 3.259,
"step": 44850
},
{
"epoch": 13.067827200745226,
"grad_norm": 0.3499922752380371,
"learning_rate": 0.0004434513686662783,
"loss": 3.2469,
"step": 44900
},
{
"epoch": 13.082382394038193,
"grad_norm": 0.3820374011993408,
"learning_rate": 0.00044327664531158996,
"loss": 3.2486,
"step": 44950
},
{
"epoch": 13.09693758733116,
"grad_norm": 0.34704211354255676,
"learning_rate": 0.00044310192195690155,
"loss": 3.2658,
"step": 45000
},
{
"epoch": 13.09693758733116,
"eval_accuracy": 0.37041080846915864,
"eval_loss": 3.5606274604797363,
"eval_runtime": 181.4519,
"eval_samples_per_second": 91.765,
"eval_steps_per_second": 5.737,
"step": 45000
},
{
"epoch": 13.111492780624127,
"grad_norm": 0.3327675461769104,
"learning_rate": 0.00044292719860221315,
"loss": 3.265,
"step": 45050
},
{
"epoch": 13.126047973917094,
"grad_norm": 0.32860827445983887,
"learning_rate": 0.0004427524752475247,
"loss": 3.2626,
"step": 45100
},
{
"epoch": 13.140603167210061,
"grad_norm": 0.3338595926761627,
"learning_rate": 0.0004425777518928363,
"loss": 3.2655,
"step": 45150
},
{
"epoch": 13.155158360503027,
"grad_norm": 0.3572806715965271,
"learning_rate": 0.0004424030285381479,
"loss": 3.2779,
"step": 45200
},
{
"epoch": 13.169713553795994,
"grad_norm": 0.36495622992515564,
"learning_rate": 0.0004422283051834595,
"loss": 3.2692,
"step": 45250
},
{
"epoch": 13.18426874708896,
"grad_norm": 0.39937713742256165,
"learning_rate": 0.00044205358182877107,
"loss": 3.2933,
"step": 45300
},
{
"epoch": 13.198823940381928,
"grad_norm": 0.3623490333557129,
"learning_rate": 0.00044187885847408266,
"loss": 3.2924,
"step": 45350
},
{
"epoch": 13.213379133674895,
"grad_norm": 0.3873792588710785,
"learning_rate": 0.00044170413511939425,
"loss": 3.2905,
"step": 45400
},
{
"epoch": 13.227934326967862,
"grad_norm": 0.36040782928466797,
"learning_rate": 0.0004415294117647058,
"loss": 3.2894,
"step": 45450
},
{
"epoch": 13.242489520260829,
"grad_norm": 0.3621423542499542,
"learning_rate": 0.00044135468841001744,
"loss": 3.2964,
"step": 45500
},
{
"epoch": 13.257044713553796,
"grad_norm": 0.3496916592121124,
"learning_rate": 0.00044117996505532904,
"loss": 3.2837,
"step": 45550
},
{
"epoch": 13.271599906846763,
"grad_norm": 0.4022481441497803,
"learning_rate": 0.00044100524170064063,
"loss": 3.2878,
"step": 45600
},
{
"epoch": 13.28615510013973,
"grad_norm": 0.38104742765426636,
"learning_rate": 0.0004408305183459522,
"loss": 3.2934,
"step": 45650
},
{
"epoch": 13.300710293432697,
"grad_norm": 0.37204158306121826,
"learning_rate": 0.00044065579499126377,
"loss": 3.2973,
"step": 45700
},
{
"epoch": 13.315265486725664,
"grad_norm": 0.3469567894935608,
"learning_rate": 0.00044048107163657536,
"loss": 3.305,
"step": 45750
},
{
"epoch": 13.329820680018631,
"grad_norm": 0.34882915019989014,
"learning_rate": 0.000440306348281887,
"loss": 3.3061,
"step": 45800
},
{
"epoch": 13.344375873311598,
"grad_norm": 0.37024763226509094,
"learning_rate": 0.0004401316249271986,
"loss": 3.3038,
"step": 45850
},
{
"epoch": 13.358931066604564,
"grad_norm": 0.37024813890457153,
"learning_rate": 0.00043995690157251014,
"loss": 3.3044,
"step": 45900
},
{
"epoch": 13.37348625989753,
"grad_norm": 0.36839035153388977,
"learning_rate": 0.00043978217821782174,
"loss": 3.3079,
"step": 45950
},
{
"epoch": 13.388041453190498,
"grad_norm": 0.3722715377807617,
"learning_rate": 0.00043960745486313333,
"loss": 3.3054,
"step": 46000
},
{
"epoch": 13.388041453190498,
"eval_accuracy": 0.3708863239468584,
"eval_loss": 3.5549628734588623,
"eval_runtime": 181.4286,
"eval_samples_per_second": 91.777,
"eval_steps_per_second": 5.738,
"step": 46000
},
{
"epoch": 13.402596646483465,
"grad_norm": 0.3347816467285156,
"learning_rate": 0.00043943273150844487,
"loss": 3.3029,
"step": 46050
},
{
"epoch": 13.417151839776432,
"grad_norm": 0.3504791259765625,
"learning_rate": 0.0004392580081537565,
"loss": 3.3015,
"step": 46100
},
{
"epoch": 13.431707033069399,
"grad_norm": 0.37427136301994324,
"learning_rate": 0.0004390832847990681,
"loss": 3.3096,
"step": 46150
},
{
"epoch": 13.446262226362366,
"grad_norm": 0.3400617241859436,
"learning_rate": 0.0004389085614443797,
"loss": 3.307,
"step": 46200
},
{
"epoch": 13.460817419655333,
"grad_norm": 0.37000834941864014,
"learning_rate": 0.00043873383808969125,
"loss": 3.2937,
"step": 46250
},
{
"epoch": 13.4753726129483,
"grad_norm": 0.34978291392326355,
"learning_rate": 0.00043855911473500284,
"loss": 3.3096,
"step": 46300
},
{
"epoch": 13.489927806241267,
"grad_norm": 0.3517734706401825,
"learning_rate": 0.0004383843913803145,
"loss": 3.3093,
"step": 46350
},
{
"epoch": 13.504482999534234,
"grad_norm": 0.3470326364040375,
"learning_rate": 0.0004382096680256261,
"loss": 3.3038,
"step": 46400
},
{
"epoch": 13.519038192827201,
"grad_norm": 0.37263551354408264,
"learning_rate": 0.0004380349446709376,
"loss": 3.3167,
"step": 46450
},
{
"epoch": 13.533593386120168,
"grad_norm": 0.3580203056335449,
"learning_rate": 0.0004378602213162492,
"loss": 3.3184,
"step": 46500
},
{
"epoch": 13.548148579413134,
"grad_norm": 0.3770522177219391,
"learning_rate": 0.0004376854979615608,
"loss": 3.3158,
"step": 46550
},
{
"epoch": 13.5627037727061,
"grad_norm": 0.37715944647789,
"learning_rate": 0.0004375107746068724,
"loss": 3.3239,
"step": 46600
},
{
"epoch": 13.577258965999068,
"grad_norm": 0.3532266616821289,
"learning_rate": 0.000437336051252184,
"loss": 3.3219,
"step": 46650
},
{
"epoch": 13.591814159292035,
"grad_norm": 0.37203213572502136,
"learning_rate": 0.0004371613278974956,
"loss": 3.3188,
"step": 46700
},
{
"epoch": 13.606369352585002,
"grad_norm": 0.370649129152298,
"learning_rate": 0.0004369866045428072,
"loss": 3.3148,
"step": 46750
},
{
"epoch": 13.620924545877969,
"grad_norm": 0.3378678858280182,
"learning_rate": 0.0004368118811881188,
"loss": 3.3152,
"step": 46800
},
{
"epoch": 13.635479739170936,
"grad_norm": 0.3273431360721588,
"learning_rate": 0.0004366371578334303,
"loss": 3.3226,
"step": 46850
},
{
"epoch": 13.650034932463903,
"grad_norm": 0.349479079246521,
"learning_rate": 0.000436462434478742,
"loss": 3.3402,
"step": 46900
},
{
"epoch": 13.66459012575687,
"grad_norm": 0.4112270772457123,
"learning_rate": 0.00043628771112405357,
"loss": 3.331,
"step": 46950
},
{
"epoch": 13.679145319049837,
"grad_norm": 0.3594585061073303,
"learning_rate": 0.00043611298776936516,
"loss": 3.3295,
"step": 47000
},
{
"epoch": 13.679145319049837,
"eval_accuracy": 0.3710210102388613,
"eval_loss": 3.5495879650115967,
"eval_runtime": 181.3002,
"eval_samples_per_second": 91.842,
"eval_steps_per_second": 5.742,
"step": 47000
},
{
"epoch": 13.693700512342804,
"grad_norm": 0.3493228852748871,
"learning_rate": 0.0004359382644146767,
"loss": 3.3297,
"step": 47050
},
{
"epoch": 13.708255705635771,
"grad_norm": 0.34904351830482483,
"learning_rate": 0.0004357635410599883,
"loss": 3.3279,
"step": 47100
},
{
"epoch": 13.722810898928739,
"grad_norm": 0.34903162717819214,
"learning_rate": 0.0004355888177052999,
"loss": 3.3357,
"step": 47150
},
{
"epoch": 13.737366092221706,
"grad_norm": 0.34454700350761414,
"learning_rate": 0.00043541409435061154,
"loss": 3.3259,
"step": 47200
},
{
"epoch": 13.75192128551467,
"grad_norm": 0.35162457823753357,
"learning_rate": 0.0004352393709959231,
"loss": 3.3325,
"step": 47250
},
{
"epoch": 13.766476478807638,
"grad_norm": 0.3408013880252838,
"learning_rate": 0.0004350646476412347,
"loss": 3.3389,
"step": 47300
},
{
"epoch": 13.781031672100605,
"grad_norm": 0.354308545589447,
"learning_rate": 0.00043488992428654627,
"loss": 3.3408,
"step": 47350
},
{
"epoch": 13.795586865393572,
"grad_norm": 0.40119293332099915,
"learning_rate": 0.0004347152009318578,
"loss": 3.3234,
"step": 47400
},
{
"epoch": 13.810142058686539,
"grad_norm": 0.3471970558166504,
"learning_rate": 0.00043454047757716946,
"loss": 3.333,
"step": 47450
},
{
"epoch": 13.824697251979506,
"grad_norm": 0.36737313866615295,
"learning_rate": 0.00043436575422248105,
"loss": 3.333,
"step": 47500
},
{
"epoch": 13.839252445272473,
"grad_norm": 0.3637086749076843,
"learning_rate": 0.00043419103086779265,
"loss": 3.3328,
"step": 47550
},
{
"epoch": 13.85380763856544,
"grad_norm": 0.3733203113079071,
"learning_rate": 0.0004340163075131042,
"loss": 3.3223,
"step": 47600
},
{
"epoch": 13.868362831858407,
"grad_norm": 0.3271060883998871,
"learning_rate": 0.0004338415841584158,
"loss": 3.3405,
"step": 47650
},
{
"epoch": 13.882918025151374,
"grad_norm": 0.35861730575561523,
"learning_rate": 0.0004336668608037274,
"loss": 3.3473,
"step": 47700
},
{
"epoch": 13.897473218444341,
"grad_norm": 0.3527565598487854,
"learning_rate": 0.000433492137449039,
"loss": 3.3341,
"step": 47750
},
{
"epoch": 13.912028411737309,
"grad_norm": 0.40213543176651,
"learning_rate": 0.00043331741409435056,
"loss": 3.3284,
"step": 47800
},
{
"epoch": 13.926583605030276,
"grad_norm": 0.34782031178474426,
"learning_rate": 0.00043314269073966216,
"loss": 3.3332,
"step": 47850
},
{
"epoch": 13.94113879832324,
"grad_norm": 0.3511146903038025,
"learning_rate": 0.00043296796738497375,
"loss": 3.3264,
"step": 47900
},
{
"epoch": 13.955693991616208,
"grad_norm": 0.37038230895996094,
"learning_rate": 0.00043279324403028535,
"loss": 3.331,
"step": 47950
},
{
"epoch": 13.970249184909175,
"grad_norm": 0.37031859159469604,
"learning_rate": 0.0004326185206755969,
"loss": 3.3345,
"step": 48000
},
{
"epoch": 13.970249184909175,
"eval_accuracy": 0.37165318961467614,
"eval_loss": 3.5406837463378906,
"eval_runtime": 181.3711,
"eval_samples_per_second": 91.806,
"eval_steps_per_second": 5.74,
"step": 48000
},
{
"epoch": 13.984804378202142,
"grad_norm": 0.3739323019981384,
"learning_rate": 0.00043244379732090854,
"loss": 3.3379,
"step": 48050
},
{
"epoch": 13.99935957149511,
"grad_norm": 0.341511070728302,
"learning_rate": 0.00043226907396622013,
"loss": 3.3512,
"step": 48100
},
{
"epoch": 14.013681881695389,
"grad_norm": 0.37125998735427856,
"learning_rate": 0.0004320943506115317,
"loss": 3.2204,
"step": 48150
},
{
"epoch": 14.028237074988356,
"grad_norm": 0.3771876394748688,
"learning_rate": 0.00043191962725684326,
"loss": 3.2392,
"step": 48200
},
{
"epoch": 14.042792268281323,
"grad_norm": 0.35821080207824707,
"learning_rate": 0.00043174490390215486,
"loss": 3.2436,
"step": 48250
},
{
"epoch": 14.05734746157429,
"grad_norm": 0.370997816324234,
"learning_rate": 0.0004315701805474665,
"loss": 3.2457,
"step": 48300
},
{
"epoch": 14.071902654867257,
"grad_norm": 0.3405769169330597,
"learning_rate": 0.0004313954571927781,
"loss": 3.2555,
"step": 48350
},
{
"epoch": 14.086457848160224,
"grad_norm": 0.35191479325294495,
"learning_rate": 0.00043122073383808964,
"loss": 3.2478,
"step": 48400
},
{
"epoch": 14.101013041453191,
"grad_norm": 0.34124019742012024,
"learning_rate": 0.00043104601048340124,
"loss": 3.2366,
"step": 48450
},
{
"epoch": 14.115568234746158,
"grad_norm": 0.3625926375389099,
"learning_rate": 0.00043087128712871283,
"loss": 3.252,
"step": 48500
},
{
"epoch": 14.130123428039123,
"grad_norm": 0.364780068397522,
"learning_rate": 0.00043069656377402437,
"loss": 3.2408,
"step": 48550
},
{
"epoch": 14.14467862133209,
"grad_norm": 0.35787588357925415,
"learning_rate": 0.000430521840419336,
"loss": 3.255,
"step": 48600
},
{
"epoch": 14.159233814625058,
"grad_norm": 0.3852957785129547,
"learning_rate": 0.0004303471170646476,
"loss": 3.2672,
"step": 48650
},
{
"epoch": 14.173789007918025,
"grad_norm": 0.3464197516441345,
"learning_rate": 0.0004301723937099592,
"loss": 3.2624,
"step": 48700
},
{
"epoch": 14.188344201210992,
"grad_norm": 0.36321866512298584,
"learning_rate": 0.00042999767035527075,
"loss": 3.2599,
"step": 48750
},
{
"epoch": 14.202899394503959,
"grad_norm": 0.3656206429004669,
"learning_rate": 0.00042982294700058234,
"loss": 3.2667,
"step": 48800
},
{
"epoch": 14.217454587796926,
"grad_norm": 0.3317374885082245,
"learning_rate": 0.000429648223645894,
"loss": 3.2667,
"step": 48850
},
{
"epoch": 14.232009781089893,
"grad_norm": 0.34478551149368286,
"learning_rate": 0.0004294735002912056,
"loss": 3.2794,
"step": 48900
},
{
"epoch": 14.24656497438286,
"grad_norm": 0.38702359795570374,
"learning_rate": 0.0004292987769365172,
"loss": 3.2622,
"step": 48950
},
{
"epoch": 14.261120167675827,
"grad_norm": 0.4132826030254364,
"learning_rate": 0.0004291240535818287,
"loss": 3.2725,
"step": 49000
},
{
"epoch": 14.261120167675827,
"eval_accuracy": 0.37083684495127966,
"eval_loss": 3.559509515762329,
"eval_runtime": 181.4988,
"eval_samples_per_second": 91.742,
"eval_steps_per_second": 5.736,
"step": 49000
},
{
"epoch": 14.275675360968794,
"grad_norm": 0.3839230537414551,
"learning_rate": 0.0004289493302271403,
"loss": 3.2781,
"step": 49050
},
{
"epoch": 14.290230554261761,
"grad_norm": 0.3690083920955658,
"learning_rate": 0.0004287746068724519,
"loss": 3.2722,
"step": 49100
},
{
"epoch": 14.304785747554728,
"grad_norm": 0.3392127752304077,
"learning_rate": 0.00042859988351776356,
"loss": 3.2868,
"step": 49150
},
{
"epoch": 14.319340940847695,
"grad_norm": 0.3598864674568176,
"learning_rate": 0.0004284251601630751,
"loss": 3.2953,
"step": 49200
},
{
"epoch": 14.33389613414066,
"grad_norm": 0.3549182116985321,
"learning_rate": 0.0004282504368083867,
"loss": 3.2795,
"step": 49250
},
{
"epoch": 14.348451327433628,
"grad_norm": 0.3696673512458801,
"learning_rate": 0.0004280757134536983,
"loss": 3.2852,
"step": 49300
},
{
"epoch": 14.363006520726595,
"grad_norm": 0.3530607521533966,
"learning_rate": 0.0004279009900990098,
"loss": 3.288,
"step": 49350
},
{
"epoch": 14.377561714019562,
"grad_norm": 0.37303102016448975,
"learning_rate": 0.0004277262667443214,
"loss": 3.2851,
"step": 49400
},
{
"epoch": 14.392116907312529,
"grad_norm": 0.3705254793167114,
"learning_rate": 0.00042755154338963307,
"loss": 3.2887,
"step": 49450
},
{
"epoch": 14.406672100605496,
"grad_norm": 0.3585062623023987,
"learning_rate": 0.00042737682003494466,
"loss": 3.293,
"step": 49500
},
{
"epoch": 14.421227293898463,
"grad_norm": 0.3528754711151123,
"learning_rate": 0.0004272020966802562,
"loss": 3.2798,
"step": 49550
},
{
"epoch": 14.43578248719143,
"grad_norm": 0.3506697714328766,
"learning_rate": 0.0004270273733255678,
"loss": 3.2775,
"step": 49600
},
{
"epoch": 14.450337680484397,
"grad_norm": 0.3489326238632202,
"learning_rate": 0.0004268526499708794,
"loss": 3.2826,
"step": 49650
},
{
"epoch": 14.464892873777364,
"grad_norm": 0.3633829653263092,
"learning_rate": 0.00042667792661619104,
"loss": 3.2986,
"step": 49700
},
{
"epoch": 14.479448067070331,
"grad_norm": 0.38335442543029785,
"learning_rate": 0.0004265032032615026,
"loss": 3.2962,
"step": 49750
},
{
"epoch": 14.494003260363298,
"grad_norm": 0.3436025083065033,
"learning_rate": 0.0004263284799068142,
"loss": 3.2912,
"step": 49800
},
{
"epoch": 14.508558453656265,
"grad_norm": 0.3850632905960083,
"learning_rate": 0.00042615375655212577,
"loss": 3.3004,
"step": 49850
},
{
"epoch": 14.52311364694923,
"grad_norm": 0.34339576959609985,
"learning_rate": 0.00042597903319743736,
"loss": 3.2974,
"step": 49900
},
{
"epoch": 14.537668840242198,
"grad_norm": 0.3751101791858673,
"learning_rate": 0.0004258043098427489,
"loss": 3.3055,
"step": 49950
},
{
"epoch": 14.552224033535165,
"grad_norm": 0.4216310679912567,
"learning_rate": 0.00042562958648806055,
"loss": 3.3044,
"step": 50000
},
{
"epoch": 14.552224033535165,
"eval_accuracy": 0.37140920292863944,
"eval_loss": 3.5514838695526123,
"eval_runtime": 181.3812,
"eval_samples_per_second": 91.801,
"eval_steps_per_second": 5.739,
"step": 50000
},
{
"epoch": 14.566779226828132,
"grad_norm": 0.34719032049179077,
"learning_rate": 0.00042545486313337214,
"loss": 3.307,
"step": 50050
},
{
"epoch": 14.581334420121099,
"grad_norm": 0.3421182930469513,
"learning_rate": 0.00042528013977868374,
"loss": 3.2997,
"step": 50100
},
{
"epoch": 14.595889613414066,
"grad_norm": 0.3395073711872101,
"learning_rate": 0.0004251054164239953,
"loss": 3.3123,
"step": 50150
},
{
"epoch": 14.610444806707033,
"grad_norm": 0.39163294434547424,
"learning_rate": 0.0004249306930693069,
"loss": 3.3125,
"step": 50200
},
{
"epoch": 14.625,
"grad_norm": 0.3609045445919037,
"learning_rate": 0.0004247559697146185,
"loss": 3.3094,
"step": 50250
},
{
"epoch": 14.639555193292967,
"grad_norm": 0.3469443917274475,
"learning_rate": 0.0004245812463599301,
"loss": 3.3116,
"step": 50300
},
{
"epoch": 14.654110386585934,
"grad_norm": 0.34178218245506287,
"learning_rate": 0.00042440652300524166,
"loss": 3.3141,
"step": 50350
},
{
"epoch": 14.668665579878901,
"grad_norm": 0.3778707683086395,
"learning_rate": 0.00042423179965055325,
"loss": 3.319,
"step": 50400
},
{
"epoch": 14.683220773171868,
"grad_norm": 0.3536614179611206,
"learning_rate": 0.00042405707629586484,
"loss": 3.3098,
"step": 50450
},
{
"epoch": 14.697775966464835,
"grad_norm": 0.3431127667427063,
"learning_rate": 0.0004238823529411764,
"loss": 3.3091,
"step": 50500
},
{
"epoch": 14.712331159757802,
"grad_norm": 0.41096821427345276,
"learning_rate": 0.00042370762958648803,
"loss": 3.3104,
"step": 50550
},
{
"epoch": 14.72688635305077,
"grad_norm": 0.3570299446582794,
"learning_rate": 0.00042353290623179963,
"loss": 3.3163,
"step": 50600
},
{
"epoch": 14.741441546343735,
"grad_norm": 0.3669699728488922,
"learning_rate": 0.0004233581828771112,
"loss": 3.318,
"step": 50650
},
{
"epoch": 14.755996739636702,
"grad_norm": 0.3691817820072174,
"learning_rate": 0.00042318345952242276,
"loss": 3.3236,
"step": 50700
},
{
"epoch": 14.770551932929669,
"grad_norm": 0.381858229637146,
"learning_rate": 0.00042300873616773436,
"loss": 3.3118,
"step": 50750
},
{
"epoch": 14.785107126222636,
"grad_norm": 0.3354921340942383,
"learning_rate": 0.00042283401281304595,
"loss": 3.3126,
"step": 50800
},
{
"epoch": 14.799662319515603,
"grad_norm": 0.377597838640213,
"learning_rate": 0.0004226592894583576,
"loss": 3.3184,
"step": 50850
},
{
"epoch": 14.81421751280857,
"grad_norm": 0.3841676414012909,
"learning_rate": 0.00042248456610366914,
"loss": 3.3036,
"step": 50900
},
{
"epoch": 14.828772706101537,
"grad_norm": 0.36683332920074463,
"learning_rate": 0.00042230984274898073,
"loss": 3.3153,
"step": 50950
},
{
"epoch": 14.843327899394504,
"grad_norm": 0.3898123800754547,
"learning_rate": 0.00042213511939429233,
"loss": 3.3212,
"step": 51000
},
{
"epoch": 14.843327899394504,
"eval_accuracy": 0.3720270439731939,
"eval_loss": 3.5415496826171875,
"eval_runtime": 181.548,
"eval_samples_per_second": 91.717,
"eval_steps_per_second": 5.734,
"step": 51000
},
{
"epoch": 14.857883092687471,
"grad_norm": 0.35734856128692627,
"learning_rate": 0.0004219603960396039,
"loss": 3.318,
"step": 51050
},
{
"epoch": 14.872438285980438,
"grad_norm": 0.35610759258270264,
"learning_rate": 0.0004217856726849155,
"loss": 3.3265,
"step": 51100
},
{
"epoch": 14.886993479273405,
"grad_norm": 0.37935730814933777,
"learning_rate": 0.0004216109493302271,
"loss": 3.3323,
"step": 51150
},
{
"epoch": 14.901548672566372,
"grad_norm": 0.353719562292099,
"learning_rate": 0.0004214362259755387,
"loss": 3.3173,
"step": 51200
},
{
"epoch": 14.916103865859338,
"grad_norm": 0.3561287522315979,
"learning_rate": 0.0004212615026208503,
"loss": 3.3289,
"step": 51250
},
{
"epoch": 14.930659059152305,
"grad_norm": 0.3487884998321533,
"learning_rate": 0.00042108677926616184,
"loss": 3.3334,
"step": 51300
},
{
"epoch": 14.945214252445272,
"grad_norm": 0.3653643727302551,
"learning_rate": 0.00042091205591147343,
"loss": 3.3138,
"step": 51350
},
{
"epoch": 14.959769445738239,
"grad_norm": 0.3630554974079132,
"learning_rate": 0.0004207373325567851,
"loss": 3.3223,
"step": 51400
},
{
"epoch": 14.974324639031206,
"grad_norm": 0.3576192259788513,
"learning_rate": 0.0004205626092020967,
"loss": 3.3339,
"step": 51450
},
{
"epoch": 14.988879832324173,
"grad_norm": 0.3210159242153168,
"learning_rate": 0.0004203878858474082,
"loss": 3.338,
"step": 51500
},
{
"epoch": 15.003202142524453,
"grad_norm": 0.3387458920478821,
"learning_rate": 0.0004202131624927198,
"loss": 3.3036,
"step": 51550
},
{
"epoch": 15.01775733581742,
"grad_norm": 0.3409118950366974,
"learning_rate": 0.0004200384391380314,
"loss": 3.2119,
"step": 51600
},
{
"epoch": 15.032312529110387,
"grad_norm": 0.353676438331604,
"learning_rate": 0.00041986371578334305,
"loss": 3.2213,
"step": 51650
},
{
"epoch": 15.046867722403354,
"grad_norm": 0.3806433379650116,
"learning_rate": 0.0004196889924286546,
"loss": 3.2249,
"step": 51700
},
{
"epoch": 15.06142291569632,
"grad_norm": 0.3820704519748688,
"learning_rate": 0.0004195142690739662,
"loss": 3.2229,
"step": 51750
},
{
"epoch": 15.075978108989288,
"grad_norm": 0.3519267141819,
"learning_rate": 0.0004193395457192778,
"loss": 3.2291,
"step": 51800
},
{
"epoch": 15.090533302282255,
"grad_norm": 0.34987378120422363,
"learning_rate": 0.0004191648223645893,
"loss": 3.229,
"step": 51850
},
{
"epoch": 15.105088495575222,
"grad_norm": 0.352450430393219,
"learning_rate": 0.0004189900990099009,
"loss": 3.2363,
"step": 51900
},
{
"epoch": 15.119643688868187,
"grad_norm": 0.35399603843688965,
"learning_rate": 0.00041881537565521256,
"loss": 3.2366,
"step": 51950
},
{
"epoch": 15.134198882161154,
"grad_norm": 0.38175299763679504,
"learning_rate": 0.00041864065230052416,
"loss": 3.2308,
"step": 52000
},
{
"epoch": 15.134198882161154,
"eval_accuracy": 0.3711719152990112,
"eval_loss": 3.5574145317077637,
"eval_runtime": 181.4096,
"eval_samples_per_second": 91.787,
"eval_steps_per_second": 5.738,
"step": 52000
},
{
"epoch": 15.148754075454121,
"grad_norm": 0.36907628178596497,
"learning_rate": 0.00041846592894583575,
"loss": 3.2495,
"step": 52050
},
{
"epoch": 15.163309268747089,
"grad_norm": 0.3483155071735382,
"learning_rate": 0.0004182912055911473,
"loss": 3.243,
"step": 52100
},
{
"epoch": 15.177864462040056,
"grad_norm": 0.3661825656890869,
"learning_rate": 0.0004181164822364589,
"loss": 3.2469,
"step": 52150
},
{
"epoch": 15.192419655333023,
"grad_norm": 0.3702370822429657,
"learning_rate": 0.0004179417588817705,
"loss": 3.2505,
"step": 52200
},
{
"epoch": 15.20697484862599,
"grad_norm": 0.3534018099308014,
"learning_rate": 0.00041776703552708213,
"loss": 3.259,
"step": 52250
},
{
"epoch": 15.221530041918957,
"grad_norm": 0.3597104549407959,
"learning_rate": 0.00041759231217239367,
"loss": 3.2638,
"step": 52300
},
{
"epoch": 15.236085235211924,
"grad_norm": 0.43977323174476624,
"learning_rate": 0.00041741758881770527,
"loss": 3.2585,
"step": 52350
},
{
"epoch": 15.25064042850489,
"grad_norm": 0.36657196283340454,
"learning_rate": 0.00041724286546301686,
"loss": 3.2616,
"step": 52400
},
{
"epoch": 15.265195621797858,
"grad_norm": 0.34935930371284485,
"learning_rate": 0.0004170681421083284,
"loss": 3.2552,
"step": 52450
},
{
"epoch": 15.279750815090825,
"grad_norm": 0.36236581206321716,
"learning_rate": 0.00041689341875364005,
"loss": 3.2619,
"step": 52500
},
{
"epoch": 15.294306008383792,
"grad_norm": 0.3543926775455475,
"learning_rate": 0.00041671869539895164,
"loss": 3.2646,
"step": 52550
},
{
"epoch": 15.30886120167676,
"grad_norm": 0.36431822180747986,
"learning_rate": 0.00041654397204426324,
"loss": 3.2691,
"step": 52600
},
{
"epoch": 15.323416394969724,
"grad_norm": 0.3814825415611267,
"learning_rate": 0.0004163692486895748,
"loss": 3.2755,
"step": 52650
},
{
"epoch": 15.337971588262691,
"grad_norm": 0.3722764551639557,
"learning_rate": 0.00041619452533488637,
"loss": 3.2726,
"step": 52700
},
{
"epoch": 15.352526781555659,
"grad_norm": 0.366241991519928,
"learning_rate": 0.00041601980198019797,
"loss": 3.271,
"step": 52750
},
{
"epoch": 15.367081974848626,
"grad_norm": 0.3627392053604126,
"learning_rate": 0.0004158450786255096,
"loss": 3.2759,
"step": 52800
},
{
"epoch": 15.381637168141593,
"grad_norm": 0.3613549768924713,
"learning_rate": 0.00041567035527082115,
"loss": 3.2707,
"step": 52850
},
{
"epoch": 15.39619236143456,
"grad_norm": 0.37892386317253113,
"learning_rate": 0.00041549563191613275,
"loss": 3.2789,
"step": 52900
},
{
"epoch": 15.410747554727527,
"grad_norm": 0.36703553795814514,
"learning_rate": 0.00041532090856144434,
"loss": 3.2885,
"step": 52950
},
{
"epoch": 15.425302748020494,
"grad_norm": 0.3401341736316681,
"learning_rate": 0.00041514618520675594,
"loss": 3.288,
"step": 53000
},
{
"epoch": 15.425302748020494,
"eval_accuracy": 0.3716571855430602,
"eval_loss": 3.5500683784484863,
"eval_runtime": 181.3534,
"eval_samples_per_second": 91.815,
"eval_steps_per_second": 5.74,
"step": 53000
},
{
"epoch": 15.439857941313461,
"grad_norm": 0.4023256003856659,
"learning_rate": 0.00041497146185206753,
"loss": 3.2749,
"step": 53050
},
{
"epoch": 15.454413134606428,
"grad_norm": 0.37236660718917847,
"learning_rate": 0.0004147967384973791,
"loss": 3.2778,
"step": 53100
},
{
"epoch": 15.468968327899395,
"grad_norm": 0.37774816155433655,
"learning_rate": 0.0004146220151426907,
"loss": 3.2788,
"step": 53150
},
{
"epoch": 15.483523521192362,
"grad_norm": 0.3866797983646393,
"learning_rate": 0.0004144472917880023,
"loss": 3.279,
"step": 53200
},
{
"epoch": 15.49807871448533,
"grad_norm": 0.3538343012332916,
"learning_rate": 0.00041427256843331385,
"loss": 3.2892,
"step": 53250
},
{
"epoch": 15.512633907778294,
"grad_norm": 0.35101500153541565,
"learning_rate": 0.00041409784507862545,
"loss": 3.2939,
"step": 53300
},
{
"epoch": 15.527189101071261,
"grad_norm": 0.3746303915977478,
"learning_rate": 0.0004139231217239371,
"loss": 3.2691,
"step": 53350
},
{
"epoch": 15.541744294364229,
"grad_norm": 0.33723315596580505,
"learning_rate": 0.0004137483983692487,
"loss": 3.2896,
"step": 53400
},
{
"epoch": 15.556299487657196,
"grad_norm": 0.3884109556674957,
"learning_rate": 0.00041357367501456023,
"loss": 3.2849,
"step": 53450
},
{
"epoch": 15.570854680950163,
"grad_norm": 0.37353938817977905,
"learning_rate": 0.0004133989516598718,
"loss": 3.2796,
"step": 53500
},
{
"epoch": 15.58540987424313,
"grad_norm": 0.38491731882095337,
"learning_rate": 0.0004132242283051834,
"loss": 3.2967,
"step": 53550
},
{
"epoch": 15.599965067536097,
"grad_norm": 0.37890195846557617,
"learning_rate": 0.00041304950495049496,
"loss": 3.3009,
"step": 53600
},
{
"epoch": 15.614520260829064,
"grad_norm": 0.351624995470047,
"learning_rate": 0.0004128747815958066,
"loss": 3.3001,
"step": 53650
},
{
"epoch": 15.629075454122031,
"grad_norm": 0.3783181309700012,
"learning_rate": 0.0004127000582411182,
"loss": 3.2972,
"step": 53700
},
{
"epoch": 15.643630647414998,
"grad_norm": 0.35349804162979126,
"learning_rate": 0.0004125253348864298,
"loss": 3.2933,
"step": 53750
},
{
"epoch": 15.658185840707965,
"grad_norm": 0.3687611520290375,
"learning_rate": 0.00041235061153174134,
"loss": 3.2993,
"step": 53800
},
{
"epoch": 15.672741034000932,
"grad_norm": 0.3746609091758728,
"learning_rate": 0.00041217588817705293,
"loss": 3.2854,
"step": 53850
},
{
"epoch": 15.6872962272939,
"grad_norm": 0.3506755828857422,
"learning_rate": 0.0004120011648223646,
"loss": 3.3065,
"step": 53900
},
{
"epoch": 15.701851420586866,
"grad_norm": 0.3722805678844452,
"learning_rate": 0.0004118264414676762,
"loss": 3.287,
"step": 53950
},
{
"epoch": 15.716406613879832,
"grad_norm": 0.3747727572917938,
"learning_rate": 0.0004116517181129877,
"loss": 3.2921,
"step": 54000
},
{
"epoch": 15.716406613879832,
"eval_accuracy": 0.3723015877586379,
"eval_loss": 3.541686534881592,
"eval_runtime": 181.4595,
"eval_samples_per_second": 91.762,
"eval_steps_per_second": 5.737,
"step": 54000
},
{
"epoch": 15.730961807172799,
"grad_norm": 0.39570698142051697,
"learning_rate": 0.0004114769947582993,
"loss": 3.2945,
"step": 54050
},
{
"epoch": 15.745517000465766,
"grad_norm": 0.35266363620758057,
"learning_rate": 0.0004113022714036109,
"loss": 3.3048,
"step": 54100
},
{
"epoch": 15.760072193758733,
"grad_norm": 0.38566699624061584,
"learning_rate": 0.0004111275480489225,
"loss": 3.2989,
"step": 54150
},
{
"epoch": 15.7746273870517,
"grad_norm": 0.3701910078525543,
"learning_rate": 0.0004109528246942341,
"loss": 3.3014,
"step": 54200
},
{
"epoch": 15.789182580344667,
"grad_norm": 0.37288132309913635,
"learning_rate": 0.0004107781013395457,
"loss": 3.3057,
"step": 54250
},
{
"epoch": 15.803737773637634,
"grad_norm": 0.34713712334632874,
"learning_rate": 0.0004106033779848573,
"loss": 3.3007,
"step": 54300
},
{
"epoch": 15.818292966930601,
"grad_norm": 0.3697455823421478,
"learning_rate": 0.0004104286546301689,
"loss": 3.3109,
"step": 54350
},
{
"epoch": 15.832848160223568,
"grad_norm": 0.3423786163330078,
"learning_rate": 0.0004102539312754804,
"loss": 3.2992,
"step": 54400
},
{
"epoch": 15.847403353516535,
"grad_norm": 0.3706797957420349,
"learning_rate": 0.00041007920792079206,
"loss": 3.3111,
"step": 54450
},
{
"epoch": 15.861958546809502,
"grad_norm": 0.3587998151779175,
"learning_rate": 0.00040990448456610366,
"loss": 3.3116,
"step": 54500
},
{
"epoch": 15.87651374010247,
"grad_norm": 0.3819167912006378,
"learning_rate": 0.00040972976121141525,
"loss": 3.3038,
"step": 54550
},
{
"epoch": 15.891068933395436,
"grad_norm": 0.37546828389167786,
"learning_rate": 0.0004095550378567268,
"loss": 3.3088,
"step": 54600
},
{
"epoch": 15.905624126688402,
"grad_norm": 0.42444273829460144,
"learning_rate": 0.0004093803145020384,
"loss": 3.315,
"step": 54650
},
{
"epoch": 15.920179319981369,
"grad_norm": 0.3654751777648926,
"learning_rate": 0.00040920559114735,
"loss": 3.3072,
"step": 54700
},
{
"epoch": 15.934734513274336,
"grad_norm": 0.38066214323043823,
"learning_rate": 0.00040903086779266163,
"loss": 3.3202,
"step": 54750
},
{
"epoch": 15.949289706567303,
"grad_norm": 0.37775856256484985,
"learning_rate": 0.00040885614443797317,
"loss": 3.3169,
"step": 54800
},
{
"epoch": 15.96384489986027,
"grad_norm": 0.37143778800964355,
"learning_rate": 0.00040868142108328476,
"loss": 3.3098,
"step": 54850
},
{
"epoch": 15.978400093153237,
"grad_norm": 0.34987694025039673,
"learning_rate": 0.00040850669772859636,
"loss": 3.3059,
"step": 54900
},
{
"epoch": 15.992955286446204,
"grad_norm": 0.36554640531539917,
"learning_rate": 0.0004083319743739079,
"loss": 3.3089,
"step": 54950
},
{
"epoch": 16.00727759664648,
"grad_norm": 0.354849636554718,
"learning_rate": 0.0004081572510192195,
"loss": 3.2591,
"step": 55000
},
{
"epoch": 16.00727759664648,
"eval_accuracy": 0.3717398072387653,
"eval_loss": 3.5502583980560303,
"eval_runtime": 181.3395,
"eval_samples_per_second": 91.822,
"eval_steps_per_second": 5.741,
"step": 55000
},
{
"epoch": 16.02183278993945,
"grad_norm": 0.371589720249176,
"learning_rate": 0.00040798252766453114,
"loss": 3.2068,
"step": 55050
},
{
"epoch": 16.036387983232416,
"grad_norm": 0.3663135766983032,
"learning_rate": 0.00040780780430984273,
"loss": 3.2002,
"step": 55100
},
{
"epoch": 16.050943176525383,
"grad_norm": 0.351938396692276,
"learning_rate": 0.0004076330809551543,
"loss": 3.2036,
"step": 55150
},
{
"epoch": 16.06549836981835,
"grad_norm": 0.360164076089859,
"learning_rate": 0.00040745835760046587,
"loss": 3.2188,
"step": 55200
},
{
"epoch": 16.080053563111317,
"grad_norm": 0.35275858640670776,
"learning_rate": 0.00040728363424577746,
"loss": 3.2085,
"step": 55250
},
{
"epoch": 16.094608756404284,
"grad_norm": 0.38582101464271545,
"learning_rate": 0.0004071089108910891,
"loss": 3.2275,
"step": 55300
},
{
"epoch": 16.10916394969725,
"grad_norm": 0.37165072560310364,
"learning_rate": 0.0004069341875364007,
"loss": 3.234,
"step": 55350
},
{
"epoch": 16.12371914299022,
"grad_norm": 0.37573471665382385,
"learning_rate": 0.00040675946418171225,
"loss": 3.22,
"step": 55400
},
{
"epoch": 16.138274336283185,
"grad_norm": 0.36840030550956726,
"learning_rate": 0.00040658474082702384,
"loss": 3.2255,
"step": 55450
},
{
"epoch": 16.152829529576152,
"grad_norm": 0.36897239089012146,
"learning_rate": 0.00040641001747233543,
"loss": 3.2238,
"step": 55500
},
{
"epoch": 16.16738472286912,
"grad_norm": 0.3556951582431793,
"learning_rate": 0.000406235294117647,
"loss": 3.2226,
"step": 55550
},
{
"epoch": 16.181939916162086,
"grad_norm": 0.3834228813648224,
"learning_rate": 0.0004060605707629586,
"loss": 3.2351,
"step": 55600
},
{
"epoch": 16.196495109455054,
"grad_norm": 0.43296241760253906,
"learning_rate": 0.0004058858474082702,
"loss": 3.2453,
"step": 55650
},
{
"epoch": 16.21105030274802,
"grad_norm": 0.38260507583618164,
"learning_rate": 0.0004057111240535818,
"loss": 3.2455,
"step": 55700
},
{
"epoch": 16.225605496040988,
"grad_norm": 0.37897780537605286,
"learning_rate": 0.00040553640069889335,
"loss": 3.2406,
"step": 55750
},
{
"epoch": 16.240160689333955,
"grad_norm": 0.33175134658813477,
"learning_rate": 0.00040536167734420495,
"loss": 3.2564,
"step": 55800
},
{
"epoch": 16.254715882626922,
"grad_norm": 0.37075909972190857,
"learning_rate": 0.0004051869539895166,
"loss": 3.2521,
"step": 55850
},
{
"epoch": 16.26927107591989,
"grad_norm": 0.34951481223106384,
"learning_rate": 0.0004050122306348282,
"loss": 3.2413,
"step": 55900
},
{
"epoch": 16.283826269212856,
"grad_norm": 0.3745467960834503,
"learning_rate": 0.00040483750728013973,
"loss": 3.2515,
"step": 55950
},
{
"epoch": 16.298381462505823,
"grad_norm": 0.36970576643943787,
"learning_rate": 0.0004046627839254513,
"loss": 3.2485,
"step": 56000
},
{
"epoch": 16.298381462505823,
"eval_accuracy": 0.37181514224153484,
"eval_loss": 3.5516862869262695,
"eval_runtime": 181.2522,
"eval_samples_per_second": 91.866,
"eval_steps_per_second": 5.743,
"step": 56000
},
{
"epoch": 16.31293665579879,
"grad_norm": 0.4097478985786438,
"learning_rate": 0.0004044880605707629,
"loss": 3.2624,
"step": 56050
},
{
"epoch": 16.327491849091757,
"grad_norm": 0.3872320353984833,
"learning_rate": 0.00040431333721607446,
"loss": 3.2618,
"step": 56100
},
{
"epoch": 16.342047042384724,
"grad_norm": 0.38024234771728516,
"learning_rate": 0.0004041386138613861,
"loss": 3.2555,
"step": 56150
},
{
"epoch": 16.35660223567769,
"grad_norm": 0.35469555854797363,
"learning_rate": 0.0004039638905066977,
"loss": 3.2618,
"step": 56200
},
{
"epoch": 16.37115742897066,
"grad_norm": 0.37726670503616333,
"learning_rate": 0.0004037891671520093,
"loss": 3.2522,
"step": 56250
},
{
"epoch": 16.385712622263625,
"grad_norm": 0.39300525188446045,
"learning_rate": 0.0004036144437973209,
"loss": 3.249,
"step": 56300
},
{
"epoch": 16.40026781555659,
"grad_norm": 0.3485693633556366,
"learning_rate": 0.00040343972044263243,
"loss": 3.265,
"step": 56350
},
{
"epoch": 16.414823008849556,
"grad_norm": 0.3581498861312866,
"learning_rate": 0.0004032649970879441,
"loss": 3.2626,
"step": 56400
},
{
"epoch": 16.429378202142523,
"grad_norm": 0.3752081096172333,
"learning_rate": 0.00040309027373325567,
"loss": 3.2641,
"step": 56450
},
{
"epoch": 16.44393339543549,
"grad_norm": 0.392609179019928,
"learning_rate": 0.00040291555037856727,
"loss": 3.2616,
"step": 56500
},
{
"epoch": 16.458488588728457,
"grad_norm": 0.41342949867248535,
"learning_rate": 0.0004027408270238788,
"loss": 3.2656,
"step": 56550
},
{
"epoch": 16.473043782021424,
"grad_norm": 0.3561092019081116,
"learning_rate": 0.0004025661036691904,
"loss": 3.2831,
"step": 56600
},
{
"epoch": 16.48759897531439,
"grad_norm": 0.3804037868976593,
"learning_rate": 0.000402391380314502,
"loss": 3.2761,
"step": 56650
},
{
"epoch": 16.50215416860736,
"grad_norm": 0.3736860454082489,
"learning_rate": 0.00040221665695981364,
"loss": 3.2722,
"step": 56700
},
{
"epoch": 16.516709361900325,
"grad_norm": 0.39770689606666565,
"learning_rate": 0.0004020419336051252,
"loss": 3.2705,
"step": 56750
},
{
"epoch": 16.531264555193292,
"grad_norm": 0.376775860786438,
"learning_rate": 0.0004018672102504368,
"loss": 3.2794,
"step": 56800
},
{
"epoch": 16.54581974848626,
"grad_norm": 0.375805526971817,
"learning_rate": 0.00040169248689574837,
"loss": 3.2835,
"step": 56850
},
{
"epoch": 16.560374941779227,
"grad_norm": 0.3479491174221039,
"learning_rate": 0.0004015177635410599,
"loss": 3.2802,
"step": 56900
},
{
"epoch": 16.574930135072194,
"grad_norm": 0.3600131869316101,
"learning_rate": 0.0004013430401863715,
"loss": 3.2787,
"step": 56950
},
{
"epoch": 16.58948532836516,
"grad_norm": 0.38802334666252136,
"learning_rate": 0.00040116831683168315,
"loss": 3.2739,
"step": 57000
},
{
"epoch": 16.58948532836516,
"eval_accuracy": 0.3727245685308182,
"eval_loss": 3.5458953380584717,
"eval_runtime": 181.516,
"eval_samples_per_second": 91.733,
"eval_steps_per_second": 5.735,
"step": 57000
},
{
"epoch": 16.604040521658128,
"grad_norm": 0.39461463689804077,
"learning_rate": 0.00040099359347699475,
"loss": 3.2684,
"step": 57050
},
{
"epoch": 16.618595714951095,
"grad_norm": 0.3506292998790741,
"learning_rate": 0.0004008188701223063,
"loss": 3.2691,
"step": 57100
},
{
"epoch": 16.633150908244062,
"grad_norm": 0.3793623745441437,
"learning_rate": 0.0004006441467676179,
"loss": 3.2824,
"step": 57150
},
{
"epoch": 16.64770610153703,
"grad_norm": 0.4112677574157715,
"learning_rate": 0.0004004694234129295,
"loss": 3.2887,
"step": 57200
},
{
"epoch": 16.662261294829996,
"grad_norm": 0.3722013235092163,
"learning_rate": 0.0004002947000582411,
"loss": 3.2864,
"step": 57250
},
{
"epoch": 16.676816488122963,
"grad_norm": 0.4040554165840149,
"learning_rate": 0.00040011997670355267,
"loss": 3.2794,
"step": 57300
},
{
"epoch": 16.69137168141593,
"grad_norm": 0.3532615005970001,
"learning_rate": 0.00039994525334886426,
"loss": 3.2882,
"step": 57350
},
{
"epoch": 16.705926874708897,
"grad_norm": 0.3731229901313782,
"learning_rate": 0.00039977052999417585,
"loss": 3.2935,
"step": 57400
},
{
"epoch": 16.720482068001864,
"grad_norm": 0.3732684552669525,
"learning_rate": 0.00039959580663948745,
"loss": 3.2832,
"step": 57450
},
{
"epoch": 16.73503726129483,
"grad_norm": 0.3691073954105377,
"learning_rate": 0.000399421083284799,
"loss": 3.2842,
"step": 57500
},
{
"epoch": 16.7495924545878,
"grad_norm": 0.3791075646877289,
"learning_rate": 0.00039924635993011064,
"loss": 3.2934,
"step": 57550
},
{
"epoch": 16.764147647880765,
"grad_norm": 0.3449566066265106,
"learning_rate": 0.00039907163657542223,
"loss": 3.2974,
"step": 57600
},
{
"epoch": 16.778702841173732,
"grad_norm": 0.3655012547969818,
"learning_rate": 0.0003988969132207338,
"loss": 3.2847,
"step": 57650
},
{
"epoch": 16.793258034466696,
"grad_norm": 0.36870133876800537,
"learning_rate": 0.00039872218986604537,
"loss": 3.2877,
"step": 57700
},
{
"epoch": 16.807813227759663,
"grad_norm": 0.37629836797714233,
"learning_rate": 0.00039854746651135696,
"loss": 3.2832,
"step": 57750
},
{
"epoch": 16.82236842105263,
"grad_norm": 0.3698217272758484,
"learning_rate": 0.0003983727431566686,
"loss": 3.2885,
"step": 57800
},
{
"epoch": 16.836923614345597,
"grad_norm": 0.3338254988193512,
"learning_rate": 0.0003981980198019802,
"loss": 3.2886,
"step": 57850
},
{
"epoch": 16.851478807638564,
"grad_norm": 0.36517441272735596,
"learning_rate": 0.00039802329644729174,
"loss": 3.3047,
"step": 57900
},
{
"epoch": 16.86603400093153,
"grad_norm": 0.3859759569168091,
"learning_rate": 0.00039784857309260334,
"loss": 3.2912,
"step": 57950
},
{
"epoch": 16.8805891942245,
"grad_norm": 0.37025895714759827,
"learning_rate": 0.00039767384973791493,
"loss": 3.3028,
"step": 58000
},
{
"epoch": 16.8805891942245,
"eval_accuracy": 0.3730944269609519,
"eval_loss": 3.533510208129883,
"eval_runtime": 181.2342,
"eval_samples_per_second": 91.876,
"eval_steps_per_second": 5.744,
"step": 58000
},
{
"epoch": 16.895144387517465,
"grad_norm": 0.37400710582733154,
"learning_rate": 0.00039749912638322647,
"loss": 3.308,
"step": 58050
},
{
"epoch": 16.909699580810432,
"grad_norm": 0.36596307158470154,
"learning_rate": 0.0003973244030285381,
"loss": 3.2998,
"step": 58100
},
{
"epoch": 16.9242547741034,
"grad_norm": 0.3551575839519501,
"learning_rate": 0.0003971496796738497,
"loss": 3.3001,
"step": 58150
},
{
"epoch": 16.938809967396367,
"grad_norm": 0.3748112618923187,
"learning_rate": 0.0003969749563191613,
"loss": 3.3063,
"step": 58200
},
{
"epoch": 16.953365160689334,
"grad_norm": 0.36766737699508667,
"learning_rate": 0.00039680023296447285,
"loss": 3.2918,
"step": 58250
},
{
"epoch": 16.9679203539823,
"grad_norm": 0.362453818321228,
"learning_rate": 0.00039662550960978444,
"loss": 3.2946,
"step": 58300
},
{
"epoch": 16.982475547275268,
"grad_norm": 0.43086424469947815,
"learning_rate": 0.00039645078625509604,
"loss": 3.2994,
"step": 58350
},
{
"epoch": 16.997030740568235,
"grad_norm": 0.3485860526561737,
"learning_rate": 0.0003962760629004077,
"loss": 3.2895,
"step": 58400
},
{
"epoch": 17.011353050768513,
"grad_norm": 0.3714965581893921,
"learning_rate": 0.0003961013395457193,
"loss": 3.2196,
"step": 58450
},
{
"epoch": 17.02590824406148,
"grad_norm": 0.3558444678783417,
"learning_rate": 0.0003959266161910308,
"loss": 3.1859,
"step": 58500
},
{
"epoch": 17.040463437354447,
"grad_norm": 0.42480990290641785,
"learning_rate": 0.0003957518928363424,
"loss": 3.1988,
"step": 58550
},
{
"epoch": 17.055018630647414,
"grad_norm": 0.36877307295799255,
"learning_rate": 0.000395577169481654,
"loss": 3.1951,
"step": 58600
},
{
"epoch": 17.06957382394038,
"grad_norm": 0.35925930738449097,
"learning_rate": 0.00039540244612696566,
"loss": 3.2045,
"step": 58650
},
{
"epoch": 17.084129017233348,
"grad_norm": 0.41032111644744873,
"learning_rate": 0.0003952277227722772,
"loss": 3.1916,
"step": 58700
},
{
"epoch": 17.098684210526315,
"grad_norm": 0.40329569578170776,
"learning_rate": 0.0003950529994175888,
"loss": 3.2121,
"step": 58750
},
{
"epoch": 17.113239403819282,
"grad_norm": 0.38334032893180847,
"learning_rate": 0.0003948782760629004,
"loss": 3.212,
"step": 58800
},
{
"epoch": 17.12779459711225,
"grad_norm": 0.3426177501678467,
"learning_rate": 0.0003947035527082119,
"loss": 3.2076,
"step": 58850
},
{
"epoch": 17.142349790405216,
"grad_norm": 0.3961309790611267,
"learning_rate": 0.0003945288293535235,
"loss": 3.2136,
"step": 58900
},
{
"epoch": 17.156904983698183,
"grad_norm": 0.386750727891922,
"learning_rate": 0.00039435410599883517,
"loss": 3.2282,
"step": 58950
},
{
"epoch": 17.17146017699115,
"grad_norm": 0.3924585282802582,
"learning_rate": 0.00039417938264414676,
"loss": 3.2237,
"step": 59000
},
{
"epoch": 17.17146017699115,
"eval_accuracy": 0.3720184644798988,
"eval_loss": 3.5546765327453613,
"eval_runtime": 180.9754,
"eval_samples_per_second": 92.007,
"eval_steps_per_second": 5.752,
"step": 59000
},
{
"epoch": 17.186015370284117,
"grad_norm": 0.361965149641037,
"learning_rate": 0.0003940046592894583,
"loss": 3.2363,
"step": 59050
},
{
"epoch": 17.200570563577084,
"grad_norm": 0.4293610453605652,
"learning_rate": 0.0003938299359347699,
"loss": 3.2393,
"step": 59100
},
{
"epoch": 17.21512575687005,
"grad_norm": 0.36987578868865967,
"learning_rate": 0.0003936552125800815,
"loss": 3.221,
"step": 59150
},
{
"epoch": 17.22968095016302,
"grad_norm": 0.37416261434555054,
"learning_rate": 0.00039348048922539314,
"loss": 3.2321,
"step": 59200
},
{
"epoch": 17.244236143455986,
"grad_norm": 0.37772485613822937,
"learning_rate": 0.0003933057658707047,
"loss": 3.2202,
"step": 59250
},
{
"epoch": 17.258791336748953,
"grad_norm": 0.37424352765083313,
"learning_rate": 0.0003931310425160163,
"loss": 3.2311,
"step": 59300
},
{
"epoch": 17.27334653004192,
"grad_norm": 0.4201411306858063,
"learning_rate": 0.00039295631916132787,
"loss": 3.2314,
"step": 59350
},
{
"epoch": 17.287901723334887,
"grad_norm": 0.3957693874835968,
"learning_rate": 0.00039278159580663946,
"loss": 3.2417,
"step": 59400
},
{
"epoch": 17.302456916627854,
"grad_norm": 0.38037219643592834,
"learning_rate": 0.000392606872451951,
"loss": 3.2581,
"step": 59450
},
{
"epoch": 17.31701210992082,
"grad_norm": 0.3617865741252899,
"learning_rate": 0.00039243214909726265,
"loss": 3.2369,
"step": 59500
},
{
"epoch": 17.331567303213788,
"grad_norm": 0.3748098611831665,
"learning_rate": 0.00039225742574257425,
"loss": 3.2427,
"step": 59550
},
{
"epoch": 17.346122496506755,
"grad_norm": 0.4334610104560852,
"learning_rate": 0.00039208270238788584,
"loss": 3.2533,
"step": 59600
},
{
"epoch": 17.360677689799722,
"grad_norm": 0.38959386944770813,
"learning_rate": 0.0003919079790331974,
"loss": 3.2523,
"step": 59650
},
{
"epoch": 17.37523288309269,
"grad_norm": 0.37862473726272583,
"learning_rate": 0.000391733255678509,
"loss": 3.2447,
"step": 59700
},
{
"epoch": 17.389788076385653,
"grad_norm": 0.38255590200424194,
"learning_rate": 0.00039155853232382057,
"loss": 3.2471,
"step": 59750
},
{
"epoch": 17.40434326967862,
"grad_norm": 0.38841813802719116,
"learning_rate": 0.0003913838089691322,
"loss": 3.2485,
"step": 59800
},
{
"epoch": 17.418898462971587,
"grad_norm": 0.3490172028541565,
"learning_rate": 0.00039120908561444376,
"loss": 3.2544,
"step": 59850
},
{
"epoch": 17.433453656264554,
"grad_norm": 0.40020763874053955,
"learning_rate": 0.00039103436225975535,
"loss": 3.2498,
"step": 59900
},
{
"epoch": 17.44800884955752,
"grad_norm": 0.39921924471855164,
"learning_rate": 0.00039085963890506695,
"loss": 3.2444,
"step": 59950
},
{
"epoch": 17.462564042850488,
"grad_norm": 0.3600024878978729,
"learning_rate": 0.0003906849155503785,
"loss": 3.2656,
"step": 60000
},
{
"epoch": 17.462564042850488,
"eval_accuracy": 0.3723973725125493,
"eval_loss": 3.548039436340332,
"eval_runtime": 181.4348,
"eval_samples_per_second": 91.774,
"eval_steps_per_second": 5.738,
"step": 60000
},
{
"epoch": 17.477119236143455,
"grad_norm": 0.38024771213531494,
"learning_rate": 0.00039051019219569014,
"loss": 3.2521,
"step": 60050
},
{
"epoch": 17.491674429436422,
"grad_norm": 0.3785065710544586,
"learning_rate": 0.00039033546884100173,
"loss": 3.2533,
"step": 60100
},
{
"epoch": 17.50622962272939,
"grad_norm": 0.3751404285430908,
"learning_rate": 0.0003901607454863133,
"loss": 3.2598,
"step": 60150
},
{
"epoch": 17.520784816022356,
"grad_norm": 0.3761499524116516,
"learning_rate": 0.00038998602213162486,
"loss": 3.2586,
"step": 60200
},
{
"epoch": 17.535340009315323,
"grad_norm": 0.3843153715133667,
"learning_rate": 0.00038981129877693646,
"loss": 3.2766,
"step": 60250
},
{
"epoch": 17.54989520260829,
"grad_norm": 0.3554283082485199,
"learning_rate": 0.00038963657542224805,
"loss": 3.2603,
"step": 60300
},
{
"epoch": 17.564450395901257,
"grad_norm": 0.3592737019062042,
"learning_rate": 0.0003894618520675597,
"loss": 3.2806,
"step": 60350
},
{
"epoch": 17.579005589194225,
"grad_norm": 0.4006965160369873,
"learning_rate": 0.00038928712871287124,
"loss": 3.2658,
"step": 60400
},
{
"epoch": 17.59356078248719,
"grad_norm": 0.352741003036499,
"learning_rate": 0.00038911240535818284,
"loss": 3.2588,
"step": 60450
},
{
"epoch": 17.60811597578016,
"grad_norm": 0.4061281681060791,
"learning_rate": 0.00038893768200349443,
"loss": 3.266,
"step": 60500
},
{
"epoch": 17.622671169073126,
"grad_norm": 0.39381715655326843,
"learning_rate": 0.000388762958648806,
"loss": 3.266,
"step": 60550
},
{
"epoch": 17.637226362366093,
"grad_norm": 0.3868790566921234,
"learning_rate": 0.0003885882352941176,
"loss": 3.2583,
"step": 60600
},
{
"epoch": 17.65178155565906,
"grad_norm": 0.3857836425304413,
"learning_rate": 0.0003884135119394292,
"loss": 3.2715,
"step": 60650
},
{
"epoch": 17.666336748952027,
"grad_norm": 0.3806355893611908,
"learning_rate": 0.0003882387885847408,
"loss": 3.2631,
"step": 60700
},
{
"epoch": 17.680891942244994,
"grad_norm": 0.3960031569004059,
"learning_rate": 0.0003880640652300524,
"loss": 3.2652,
"step": 60750
},
{
"epoch": 17.69544713553796,
"grad_norm": 0.35661599040031433,
"learning_rate": 0.00038788934187536394,
"loss": 3.2893,
"step": 60800
},
{
"epoch": 17.710002328830928,
"grad_norm": 0.4023573100566864,
"learning_rate": 0.00038771461852067554,
"loss": 3.2819,
"step": 60850
},
{
"epoch": 17.724557522123895,
"grad_norm": 0.3875479996204376,
"learning_rate": 0.0003875398951659872,
"loss": 3.2738,
"step": 60900
},
{
"epoch": 17.739112715416862,
"grad_norm": 0.3642485439777374,
"learning_rate": 0.0003873651718112988,
"loss": 3.2685,
"step": 60950
},
{
"epoch": 17.75366790870983,
"grad_norm": 0.3675863742828369,
"learning_rate": 0.0003871904484566103,
"loss": 3.2953,
"step": 61000
},
{
"epoch": 17.75366790870983,
"eval_accuracy": 0.3730756225920859,
"eval_loss": 3.542440176010132,
"eval_runtime": 181.1914,
"eval_samples_per_second": 91.897,
"eval_steps_per_second": 5.745,
"step": 61000
},
{
"epoch": 17.768223102002796,
"grad_norm": 0.3717908561229706,
"learning_rate": 0.0003870157251019219,
"loss": 3.2746,
"step": 61050
},
{
"epoch": 17.78277829529576,
"grad_norm": 0.37193652987480164,
"learning_rate": 0.0003868410017472335,
"loss": 3.2775,
"step": 61100
},
{
"epoch": 17.797333488588727,
"grad_norm": 0.37539011240005493,
"learning_rate": 0.00038666627839254505,
"loss": 3.264,
"step": 61150
},
{
"epoch": 17.811888681881694,
"grad_norm": 0.4052475392818451,
"learning_rate": 0.0003864915550378567,
"loss": 3.2752,
"step": 61200
},
{
"epoch": 17.82644387517466,
"grad_norm": 0.37633731961250305,
"learning_rate": 0.0003863168316831683,
"loss": 3.2831,
"step": 61250
},
{
"epoch": 17.840999068467628,
"grad_norm": 0.3629589378833771,
"learning_rate": 0.0003861421083284799,
"loss": 3.2811,
"step": 61300
},
{
"epoch": 17.855554261760595,
"grad_norm": 0.35611119866371155,
"learning_rate": 0.0003859673849737914,
"loss": 3.2793,
"step": 61350
},
{
"epoch": 17.870109455053562,
"grad_norm": 0.3677215278148651,
"learning_rate": 0.000385792661619103,
"loss": 3.2767,
"step": 61400
},
{
"epoch": 17.88466464834653,
"grad_norm": 0.34761014580726624,
"learning_rate": 0.00038561793826441467,
"loss": 3.2798,
"step": 61450
},
{
"epoch": 17.899219841639496,
"grad_norm": 0.3875502049922943,
"learning_rate": 0.00038544321490972626,
"loss": 3.3052,
"step": 61500
},
{
"epoch": 17.913775034932463,
"grad_norm": 0.3841918110847473,
"learning_rate": 0.0003852684915550378,
"loss": 3.2864,
"step": 61550
},
{
"epoch": 17.92833022822543,
"grad_norm": 0.4031478762626648,
"learning_rate": 0.0003850937682003494,
"loss": 3.2933,
"step": 61600
},
{
"epoch": 17.942885421518397,
"grad_norm": 0.3854524791240692,
"learning_rate": 0.000384919044845661,
"loss": 3.2834,
"step": 61650
},
{
"epoch": 17.957440614811365,
"grad_norm": 0.3800252676010132,
"learning_rate": 0.0003847443214909726,
"loss": 3.2959,
"step": 61700
},
{
"epoch": 17.97199580810433,
"grad_norm": 0.33671489357948303,
"learning_rate": 0.00038456959813628423,
"loss": 3.292,
"step": 61750
},
{
"epoch": 17.9865510013973,
"grad_norm": 0.40121355652809143,
"learning_rate": 0.0003843948747815958,
"loss": 3.2815,
"step": 61800
},
{
"epoch": 18.000873311597577,
"grad_norm": 0.41646862030029297,
"learning_rate": 0.00038422015142690737,
"loss": 3.2722,
"step": 61850
},
{
"epoch": 18.015428504890544,
"grad_norm": 0.37640494108200073,
"learning_rate": 0.00038404542807221896,
"loss": 3.1641,
"step": 61900
},
{
"epoch": 18.02998369818351,
"grad_norm": 0.3735230267047882,
"learning_rate": 0.0003838707047175305,
"loss": 3.185,
"step": 61950
},
{
"epoch": 18.044538891476478,
"grad_norm": 0.3715892732143402,
"learning_rate": 0.00038369598136284215,
"loss": 3.1951,
"step": 62000
},
{
"epoch": 18.044538891476478,
"eval_accuracy": 0.3726110371537895,
"eval_loss": 3.549298048019409,
"eval_runtime": 181.408,
"eval_samples_per_second": 91.788,
"eval_steps_per_second": 5.738,
"step": 62000
},
{
"epoch": 18.059094084769445,
"grad_norm": 0.382521390914917,
"learning_rate": 0.00038352125800815374,
"loss": 3.196,
"step": 62050
},
{
"epoch": 18.073649278062412,
"grad_norm": 0.3650335371494293,
"learning_rate": 0.00038334653465346534,
"loss": 3.1978,
"step": 62100
},
{
"epoch": 18.08820447135538,
"grad_norm": 0.38035500049591064,
"learning_rate": 0.0003831718112987769,
"loss": 3.1877,
"step": 62150
},
{
"epoch": 18.102759664648346,
"grad_norm": 0.37595996260643005,
"learning_rate": 0.0003829970879440885,
"loss": 3.1956,
"step": 62200
},
{
"epoch": 18.117314857941313,
"grad_norm": 0.36986905336380005,
"learning_rate": 0.00038282236458940007,
"loss": 3.2052,
"step": 62250
},
{
"epoch": 18.13187005123428,
"grad_norm": 0.3834410309791565,
"learning_rate": 0.0003826476412347117,
"loss": 3.1984,
"step": 62300
},
{
"epoch": 18.146425244527247,
"grad_norm": 0.42468026280403137,
"learning_rate": 0.00038247291788002326,
"loss": 3.2093,
"step": 62350
},
{
"epoch": 18.160980437820214,
"grad_norm": 0.3679347336292267,
"learning_rate": 0.00038229819452533485,
"loss": 3.2119,
"step": 62400
},
{
"epoch": 18.17553563111318,
"grad_norm": 0.36006471514701843,
"learning_rate": 0.00038212347117064644,
"loss": 3.2029,
"step": 62450
},
{
"epoch": 18.19009082440615,
"grad_norm": 0.4088258445262909,
"learning_rate": 0.000381948747815958,
"loss": 3.2286,
"step": 62500
},
{
"epoch": 18.204646017699115,
"grad_norm": 0.35326552391052246,
"learning_rate": 0.0003817740244612696,
"loss": 3.2145,
"step": 62550
},
{
"epoch": 18.219201210992082,
"grad_norm": 0.4518735408782959,
"learning_rate": 0.00038159930110658123,
"loss": 3.2278,
"step": 62600
},
{
"epoch": 18.23375640428505,
"grad_norm": 0.3534115254878998,
"learning_rate": 0.0003814245777518928,
"loss": 3.2222,
"step": 62650
},
{
"epoch": 18.248311597578017,
"grad_norm": 0.3919824957847595,
"learning_rate": 0.0003812498543972044,
"loss": 3.2244,
"step": 62700
},
{
"epoch": 18.262866790870984,
"grad_norm": 0.38733547925949097,
"learning_rate": 0.00038107513104251596,
"loss": 3.2259,
"step": 62750
},
{
"epoch": 18.27742198416395,
"grad_norm": 0.36184003949165344,
"learning_rate": 0.00038090040768782755,
"loss": 3.2251,
"step": 62800
},
{
"epoch": 18.291977177456918,
"grad_norm": 0.3423421084880829,
"learning_rate": 0.0003807256843331392,
"loss": 3.2321,
"step": 62850
},
{
"epoch": 18.306532370749885,
"grad_norm": 0.40122663974761963,
"learning_rate": 0.0003805509609784508,
"loss": 3.2213,
"step": 62900
},
{
"epoch": 18.321087564042852,
"grad_norm": 0.3773971199989319,
"learning_rate": 0.00038037623762376233,
"loss": 3.2357,
"step": 62950
},
{
"epoch": 18.33564275733582,
"grad_norm": 0.3741346001625061,
"learning_rate": 0.00038020151426907393,
"loss": 3.2331,
"step": 63000
},
{
"epoch": 18.33564275733582,
"eval_accuracy": 0.3728661889338405,
"eval_loss": 3.5485711097717285,
"eval_runtime": 181.173,
"eval_samples_per_second": 91.907,
"eval_steps_per_second": 5.746,
"step": 63000
},
{
"epoch": 18.350197950628786,
"grad_norm": 0.37724754214286804,
"learning_rate": 0.0003800267909143855,
"loss": 3.239,
"step": 63050
},
{
"epoch": 18.364753143921753,
"grad_norm": 0.34363576769828796,
"learning_rate": 0.00037985206755969706,
"loss": 3.2444,
"step": 63100
},
{
"epoch": 18.379308337214717,
"grad_norm": 0.3696479797363281,
"learning_rate": 0.0003796773442050087,
"loss": 3.234,
"step": 63150
},
{
"epoch": 18.393863530507684,
"grad_norm": 0.3984706997871399,
"learning_rate": 0.0003795026208503203,
"loss": 3.2427,
"step": 63200
},
{
"epoch": 18.40841872380065,
"grad_norm": 0.39606693387031555,
"learning_rate": 0.0003793278974956319,
"loss": 3.2468,
"step": 63250
},
{
"epoch": 18.422973917093618,
"grad_norm": 0.4036598801612854,
"learning_rate": 0.00037915317414094344,
"loss": 3.2501,
"step": 63300
},
{
"epoch": 18.437529110386585,
"grad_norm": 0.34738820791244507,
"learning_rate": 0.00037897845078625503,
"loss": 3.2571,
"step": 63350
},
{
"epoch": 18.452084303679552,
"grad_norm": 0.36835139989852905,
"learning_rate": 0.0003788037274315667,
"loss": 3.249,
"step": 63400
},
{
"epoch": 18.46663949697252,
"grad_norm": 0.3729427456855774,
"learning_rate": 0.0003786290040768783,
"loss": 3.2576,
"step": 63450
},
{
"epoch": 18.481194690265486,
"grad_norm": 0.38652747869491577,
"learning_rate": 0.0003784542807221898,
"loss": 3.2487,
"step": 63500
},
{
"epoch": 18.495749883558453,
"grad_norm": 0.3909215033054352,
"learning_rate": 0.0003782795573675014,
"loss": 3.2474,
"step": 63550
},
{
"epoch": 18.51030507685142,
"grad_norm": 0.3667212426662445,
"learning_rate": 0.000378104834012813,
"loss": 3.2556,
"step": 63600
},
{
"epoch": 18.524860270144387,
"grad_norm": 0.36072465777397156,
"learning_rate": 0.0003779301106581246,
"loss": 3.2449,
"step": 63650
},
{
"epoch": 18.539415463437354,
"grad_norm": 0.375143438577652,
"learning_rate": 0.0003777553873034362,
"loss": 3.2497,
"step": 63700
},
{
"epoch": 18.55397065673032,
"grad_norm": 0.3688351511955261,
"learning_rate": 0.0003775806639487478,
"loss": 3.2531,
"step": 63750
},
{
"epoch": 18.56852585002329,
"grad_norm": 0.3653005361557007,
"learning_rate": 0.0003774059405940594,
"loss": 3.2502,
"step": 63800
},
{
"epoch": 18.583081043316255,
"grad_norm": 0.3686894178390503,
"learning_rate": 0.000377231217239371,
"loss": 3.2417,
"step": 63850
},
{
"epoch": 18.597636236609222,
"grad_norm": 0.3764590620994568,
"learning_rate": 0.0003770564938846825,
"loss": 3.2539,
"step": 63900
},
{
"epoch": 18.61219142990219,
"grad_norm": 0.395633727312088,
"learning_rate": 0.00037688177052999416,
"loss": 3.264,
"step": 63950
},
{
"epoch": 18.626746623195157,
"grad_norm": 0.3682929575443268,
"learning_rate": 0.00037670704717530576,
"loss": 3.2593,
"step": 64000
},
{
"epoch": 18.626746623195157,
"eval_accuracy": 0.3727078796534496,
"eval_loss": 3.540853500366211,
"eval_runtime": 181.1321,
"eval_samples_per_second": 91.927,
"eval_steps_per_second": 5.747,
"step": 64000
},
{
"epoch": 18.641301816488124,
"grad_norm": 0.4006809592247009,
"learning_rate": 0.00037653232382061735,
"loss": 3.2549,
"step": 64050
},
{
"epoch": 18.65585700978109,
"grad_norm": 0.3779636025428772,
"learning_rate": 0.0003763576004659289,
"loss": 3.2531,
"step": 64100
},
{
"epoch": 18.670412203074058,
"grad_norm": 0.3712959289550781,
"learning_rate": 0.0003761828771112405,
"loss": 3.2488,
"step": 64150
},
{
"epoch": 18.684967396367025,
"grad_norm": 0.39823856949806213,
"learning_rate": 0.0003760081537565521,
"loss": 3.2561,
"step": 64200
},
{
"epoch": 18.699522589659992,
"grad_norm": 0.4071546494960785,
"learning_rate": 0.00037583343040186373,
"loss": 3.2578,
"step": 64250
},
{
"epoch": 18.71407778295296,
"grad_norm": 0.3799132704734802,
"learning_rate": 0.00037565870704717527,
"loss": 3.2622,
"step": 64300
},
{
"epoch": 18.728632976245926,
"grad_norm": 0.394061416387558,
"learning_rate": 0.00037548398369248687,
"loss": 3.2412,
"step": 64350
},
{
"epoch": 18.743188169538893,
"grad_norm": 0.39441555738449097,
"learning_rate": 0.00037530926033779846,
"loss": 3.2708,
"step": 64400
},
{
"epoch": 18.75774336283186,
"grad_norm": 0.38645127415657043,
"learning_rate": 0.00037513453698311,
"loss": 3.2749,
"step": 64450
},
{
"epoch": 18.772298556124824,
"grad_norm": 0.3764881193637848,
"learning_rate": 0.0003749598136284216,
"loss": 3.2553,
"step": 64500
},
{
"epoch": 18.78685374941779,
"grad_norm": 0.3740571439266205,
"learning_rate": 0.00037478509027373324,
"loss": 3.2532,
"step": 64550
},
{
"epoch": 18.801408942710758,
"grad_norm": 0.36852291226387024,
"learning_rate": 0.00037461036691904484,
"loss": 3.2661,
"step": 64600
},
{
"epoch": 18.815964136003725,
"grad_norm": 0.36674243211746216,
"learning_rate": 0.0003744356435643564,
"loss": 3.2765,
"step": 64650
},
{
"epoch": 18.830519329296692,
"grad_norm": 0.3778327703475952,
"learning_rate": 0.00037426092020966797,
"loss": 3.2664,
"step": 64700
},
{
"epoch": 18.84507452258966,
"grad_norm": 0.38024887442588806,
"learning_rate": 0.00037408619685497957,
"loss": 3.2679,
"step": 64750
},
{
"epoch": 18.859629715882626,
"grad_norm": 0.38589903712272644,
"learning_rate": 0.0003739114735002912,
"loss": 3.2735,
"step": 64800
},
{
"epoch": 18.874184909175593,
"grad_norm": 0.3877568542957306,
"learning_rate": 0.0003737367501456028,
"loss": 3.2718,
"step": 64850
},
{
"epoch": 18.88874010246856,
"grad_norm": 0.3728450536727905,
"learning_rate": 0.00037356202679091435,
"loss": 3.2666,
"step": 64900
},
{
"epoch": 18.903295295761527,
"grad_norm": 0.38396206498146057,
"learning_rate": 0.00037338730343622594,
"loss": 3.2741,
"step": 64950
},
{
"epoch": 18.917850489054494,
"grad_norm": 0.37104254961013794,
"learning_rate": 0.00037321258008153754,
"loss": 3.2761,
"step": 65000
},
{
"epoch": 18.917850489054494,
"eval_accuracy": 0.37353950286654974,
"eval_loss": 3.5339014530181885,
"eval_runtime": 181.2275,
"eval_samples_per_second": 91.879,
"eval_steps_per_second": 5.744,
"step": 65000
},
{
"epoch": 18.93240568234746,
"grad_norm": 0.3607959449291229,
"learning_rate": 0.0003730378567268491,
"loss": 3.2735,
"step": 65050
},
{
"epoch": 18.94696087564043,
"grad_norm": 0.3835141956806183,
"learning_rate": 0.0003728631333721607,
"loss": 3.2632,
"step": 65100
},
{
"epoch": 18.961516068933395,
"grad_norm": 0.4325149357318878,
"learning_rate": 0.0003726884100174723,
"loss": 3.2719,
"step": 65150
},
{
"epoch": 18.976071262226363,
"grad_norm": 0.3760964274406433,
"learning_rate": 0.0003725136866627839,
"loss": 3.2746,
"step": 65200
},
{
"epoch": 18.99062645551933,
"grad_norm": 0.38402265310287476,
"learning_rate": 0.00037233896330809545,
"loss": 3.2712,
"step": 65250
},
{
"epoch": 19.004948765719607,
"grad_norm": 0.38246601819992065,
"learning_rate": 0.00037216423995340705,
"loss": 3.2424,
"step": 65300
},
{
"epoch": 19.019503959012575,
"grad_norm": 0.3741970360279083,
"learning_rate": 0.0003719895165987187,
"loss": 3.1709,
"step": 65350
},
{
"epoch": 19.03405915230554,
"grad_norm": 0.3912128508090973,
"learning_rate": 0.0003718147932440303,
"loss": 3.1856,
"step": 65400
},
{
"epoch": 19.04861434559851,
"grad_norm": 0.3777288496494293,
"learning_rate": 0.00037164006988934183,
"loss": 3.1709,
"step": 65450
},
{
"epoch": 19.063169538891476,
"grad_norm": 0.38531720638275146,
"learning_rate": 0.0003714653465346534,
"loss": 3.189,
"step": 65500
},
{
"epoch": 19.077724732184443,
"grad_norm": 0.36882486939430237,
"learning_rate": 0.000371290623179965,
"loss": 3.1823,
"step": 65550
},
{
"epoch": 19.09227992547741,
"grad_norm": 0.39064404368400574,
"learning_rate": 0.00037111589982527656,
"loss": 3.1747,
"step": 65600
},
{
"epoch": 19.106835118770377,
"grad_norm": 0.43773752450942993,
"learning_rate": 0.0003709411764705882,
"loss": 3.1819,
"step": 65650
},
{
"epoch": 19.121390312063344,
"grad_norm": 0.4174773693084717,
"learning_rate": 0.0003707664531158998,
"loss": 3.1938,
"step": 65700
},
{
"epoch": 19.13594550535631,
"grad_norm": 0.38426661491394043,
"learning_rate": 0.0003705917297612114,
"loss": 3.1937,
"step": 65750
},
{
"epoch": 19.150500698649278,
"grad_norm": 0.37709906697273254,
"learning_rate": 0.000370417006406523,
"loss": 3.1885,
"step": 65800
},
{
"epoch": 19.165055891942245,
"grad_norm": 0.42608362436294556,
"learning_rate": 0.00037024228305183453,
"loss": 3.1975,
"step": 65850
},
{
"epoch": 19.179611085235212,
"grad_norm": 0.4232037365436554,
"learning_rate": 0.0003700675596971461,
"loss": 3.2081,
"step": 65900
},
{
"epoch": 19.19416627852818,
"grad_norm": 0.39610755443573,
"learning_rate": 0.0003698928363424578,
"loss": 3.2042,
"step": 65950
},
{
"epoch": 19.208721471821146,
"grad_norm": 0.38242438435554504,
"learning_rate": 0.00036971811298776937,
"loss": 3.2115,
"step": 66000
},
{
"epoch": 19.208721471821146,
"eval_accuracy": 0.37263524777870455,
"eval_loss": 3.552818775177002,
"eval_runtime": 181.5094,
"eval_samples_per_second": 91.736,
"eval_steps_per_second": 5.735,
"step": 66000
},
{
"epoch": 19.223276665114113,
"grad_norm": 0.40275952219963074,
"learning_rate": 0.0003695433896330809,
"loss": 3.211,
"step": 66050
},
{
"epoch": 19.23783185840708,
"grad_norm": 0.38202306628227234,
"learning_rate": 0.0003693686662783925,
"loss": 3.2086,
"step": 66100
},
{
"epoch": 19.252387051700047,
"grad_norm": 0.3808169960975647,
"learning_rate": 0.0003691939429237041,
"loss": 3.2087,
"step": 66150
},
{
"epoch": 19.266942244993015,
"grad_norm": 0.3790864646434784,
"learning_rate": 0.00036901921956901575,
"loss": 3.2216,
"step": 66200
},
{
"epoch": 19.28149743828598,
"grad_norm": 0.4077747166156769,
"learning_rate": 0.0003688444962143273,
"loss": 3.2075,
"step": 66250
},
{
"epoch": 19.29605263157895,
"grad_norm": 0.3944033980369568,
"learning_rate": 0.0003686697728596389,
"loss": 3.2227,
"step": 66300
},
{
"epoch": 19.310607824871916,
"grad_norm": 0.3898009955883026,
"learning_rate": 0.0003684950495049505,
"loss": 3.2192,
"step": 66350
},
{
"epoch": 19.325163018164883,
"grad_norm": 0.3814161717891693,
"learning_rate": 0.000368320326150262,
"loss": 3.2139,
"step": 66400
},
{
"epoch": 19.33971821145785,
"grad_norm": 0.38406553864479065,
"learning_rate": 0.0003681456027955736,
"loss": 3.2201,
"step": 66450
},
{
"epoch": 19.354273404750813,
"grad_norm": 0.376544326543808,
"learning_rate": 0.00036797087944088526,
"loss": 3.2216,
"step": 66500
},
{
"epoch": 19.36882859804378,
"grad_norm": 0.372443825006485,
"learning_rate": 0.00036779615608619685,
"loss": 3.2304,
"step": 66550
},
{
"epoch": 19.383383791336747,
"grad_norm": 0.40152785181999207,
"learning_rate": 0.0003676214327315084,
"loss": 3.23,
"step": 66600
},
{
"epoch": 19.397938984629715,
"grad_norm": 0.41309258341789246,
"learning_rate": 0.00036744670937682,
"loss": 3.2264,
"step": 66650
},
{
"epoch": 19.41249417792268,
"grad_norm": 0.37425634264945984,
"learning_rate": 0.0003672719860221316,
"loss": 3.2374,
"step": 66700
},
{
"epoch": 19.42704937121565,
"grad_norm": 0.3656749725341797,
"learning_rate": 0.00036709726266744323,
"loss": 3.2347,
"step": 66750
},
{
"epoch": 19.441604564508616,
"grad_norm": 0.383222371339798,
"learning_rate": 0.00036692253931275477,
"loss": 3.2161,
"step": 66800
},
{
"epoch": 19.456159757801583,
"grad_norm": 0.39083653688430786,
"learning_rate": 0.00036674781595806636,
"loss": 3.2301,
"step": 66850
},
{
"epoch": 19.47071495109455,
"grad_norm": 0.3562709093093872,
"learning_rate": 0.00036657309260337796,
"loss": 3.2478,
"step": 66900
},
{
"epoch": 19.485270144387517,
"grad_norm": 0.34880685806274414,
"learning_rate": 0.00036639836924868955,
"loss": 3.241,
"step": 66950
},
{
"epoch": 19.499825337680484,
"grad_norm": 0.40873685479164124,
"learning_rate": 0.0003662236458940011,
"loss": 3.253,
"step": 67000
},
{
"epoch": 19.499825337680484,
"eval_accuracy": 0.373206665537621,
"eval_loss": 3.5417890548706055,
"eval_runtime": 181.1986,
"eval_samples_per_second": 91.894,
"eval_steps_per_second": 5.745,
"step": 67000
},
{
"epoch": 19.51438053097345,
"grad_norm": 0.379344642162323,
"learning_rate": 0.00036604892253931274,
"loss": 3.2529,
"step": 67050
},
{
"epoch": 19.528935724266418,
"grad_norm": 0.3883436620235443,
"learning_rate": 0.00036587419918462433,
"loss": 3.2466,
"step": 67100
},
{
"epoch": 19.543490917559385,
"grad_norm": 0.3720000088214874,
"learning_rate": 0.00036569947582993593,
"loss": 3.2464,
"step": 67150
},
{
"epoch": 19.558046110852352,
"grad_norm": 0.40469080209732056,
"learning_rate": 0.00036552475247524747,
"loss": 3.2474,
"step": 67200
},
{
"epoch": 19.57260130414532,
"grad_norm": 0.3750063478946686,
"learning_rate": 0.00036535002912055906,
"loss": 3.2367,
"step": 67250
},
{
"epoch": 19.587156497438286,
"grad_norm": 0.38467472791671753,
"learning_rate": 0.00036517530576587066,
"loss": 3.2446,
"step": 67300
},
{
"epoch": 19.601711690731253,
"grad_norm": 0.3896279036998749,
"learning_rate": 0.0003650005824111823,
"loss": 3.2494,
"step": 67350
},
{
"epoch": 19.61626688402422,
"grad_norm": 0.4054372012615204,
"learning_rate": 0.00036482585905649385,
"loss": 3.2491,
"step": 67400
},
{
"epoch": 19.630822077317188,
"grad_norm": 0.3839910328388214,
"learning_rate": 0.00036465113570180544,
"loss": 3.2424,
"step": 67450
},
{
"epoch": 19.645377270610155,
"grad_norm": 0.3664180040359497,
"learning_rate": 0.00036447641234711703,
"loss": 3.2461,
"step": 67500
},
{
"epoch": 19.65993246390312,
"grad_norm": 0.36931681632995605,
"learning_rate": 0.0003643016889924286,
"loss": 3.2386,
"step": 67550
},
{
"epoch": 19.67448765719609,
"grad_norm": 0.3919360935688019,
"learning_rate": 0.0003641269656377402,
"loss": 3.2551,
"step": 67600
},
{
"epoch": 19.689042850489056,
"grad_norm": 0.39192062616348267,
"learning_rate": 0.0003639522422830518,
"loss": 3.2434,
"step": 67650
},
{
"epoch": 19.703598043782023,
"grad_norm": 0.3523690104484558,
"learning_rate": 0.0003637775189283634,
"loss": 3.2539,
"step": 67700
},
{
"epoch": 19.71815323707499,
"grad_norm": 0.3749070167541504,
"learning_rate": 0.00036360279557367495,
"loss": 3.2427,
"step": 67750
},
{
"epoch": 19.732708430367957,
"grad_norm": 0.36370980739593506,
"learning_rate": 0.00036342807221898655,
"loss": 3.2471,
"step": 67800
},
{
"epoch": 19.74726362366092,
"grad_norm": 0.4108258783817291,
"learning_rate": 0.00036325334886429814,
"loss": 3.2508,
"step": 67850
},
{
"epoch": 19.761818816953888,
"grad_norm": 0.4331919848918915,
"learning_rate": 0.0003630786255096098,
"loss": 3.2479,
"step": 67900
},
{
"epoch": 19.776374010246855,
"grad_norm": 0.36273193359375,
"learning_rate": 0.00036290390215492133,
"loss": 3.2536,
"step": 67950
},
{
"epoch": 19.79092920353982,
"grad_norm": 0.39168423414230347,
"learning_rate": 0.0003627291788002329,
"loss": 3.2466,
"step": 68000
},
{
"epoch": 19.79092920353982,
"eval_accuracy": 0.3734319653820971,
"eval_loss": 3.537184715270996,
"eval_runtime": 181.2507,
"eval_samples_per_second": 91.867,
"eval_steps_per_second": 5.743,
"step": 68000
},
{
"epoch": 19.80548439683279,
"grad_norm": 0.38999927043914795,
"learning_rate": 0.0003625544554455445,
"loss": 3.2529,
"step": 68050
},
{
"epoch": 19.820039590125756,
"grad_norm": 0.3893345296382904,
"learning_rate": 0.0003623797320908561,
"loss": 3.253,
"step": 68100
},
{
"epoch": 19.834594783418723,
"grad_norm": 0.3700770437717438,
"learning_rate": 0.00036220500873616776,
"loss": 3.2616,
"step": 68150
},
{
"epoch": 19.84914997671169,
"grad_norm": 0.4178517162799835,
"learning_rate": 0.0003620302853814793,
"loss": 3.2551,
"step": 68200
},
{
"epoch": 19.863705170004657,
"grad_norm": 0.394083708524704,
"learning_rate": 0.0003618555620267909,
"loss": 3.2592,
"step": 68250
},
{
"epoch": 19.878260363297624,
"grad_norm": 0.3745860457420349,
"learning_rate": 0.0003616808386721025,
"loss": 3.2504,
"step": 68300
},
{
"epoch": 19.89281555659059,
"grad_norm": 0.39990267157554626,
"learning_rate": 0.00036150611531741403,
"loss": 3.2545,
"step": 68350
},
{
"epoch": 19.907370749883558,
"grad_norm": 0.37312051653862,
"learning_rate": 0.0003613313919627256,
"loss": 3.2596,
"step": 68400
},
{
"epoch": 19.921925943176525,
"grad_norm": 0.38254302740097046,
"learning_rate": 0.00036115666860803727,
"loss": 3.2645,
"step": 68450
},
{
"epoch": 19.936481136469492,
"grad_norm": 0.41094058752059937,
"learning_rate": 0.00036098194525334887,
"loss": 3.2695,
"step": 68500
},
{
"epoch": 19.95103632976246,
"grad_norm": 0.3820090889930725,
"learning_rate": 0.0003608072218986604,
"loss": 3.2599,
"step": 68550
},
{
"epoch": 19.965591523055426,
"grad_norm": 0.3915736675262451,
"learning_rate": 0.000360632498543972,
"loss": 3.2675,
"step": 68600
},
{
"epoch": 19.980146716348393,
"grad_norm": 0.3794993758201599,
"learning_rate": 0.0003604577751892836,
"loss": 3.2657,
"step": 68650
},
{
"epoch": 19.99470190964136,
"grad_norm": 0.38000932335853577,
"learning_rate": 0.00036028305183459513,
"loss": 3.2654,
"step": 68700
},
{
"epoch": 20.00902421984164,
"grad_norm": 0.3856370151042938,
"learning_rate": 0.0003601083284799068,
"loss": 3.1988,
"step": 68750
},
{
"epoch": 20.023579413134605,
"grad_norm": 0.39030441641807556,
"learning_rate": 0.0003599336051252184,
"loss": 3.155,
"step": 68800
},
{
"epoch": 20.038134606427572,
"grad_norm": 0.38833680748939514,
"learning_rate": 0.00035975888177052997,
"loss": 3.1615,
"step": 68850
},
{
"epoch": 20.05268979972054,
"grad_norm": 0.38570043444633484,
"learning_rate": 0.0003595841584158415,
"loss": 3.1685,
"step": 68900
},
{
"epoch": 20.067244993013507,
"grad_norm": 0.3800423741340637,
"learning_rate": 0.0003594094350611531,
"loss": 3.1791,
"step": 68950
},
{
"epoch": 20.081800186306474,
"grad_norm": 0.4435599148273468,
"learning_rate": 0.00035923471170646475,
"loss": 3.1752,
"step": 69000
},
{
"epoch": 20.081800186306474,
"eval_accuracy": 0.3730468284022598,
"eval_loss": 3.548830986022949,
"eval_runtime": 181.2712,
"eval_samples_per_second": 91.857,
"eval_steps_per_second": 5.743,
"step": 69000
},
{
"epoch": 20.09635537959944,
"grad_norm": 0.40220603346824646,
"learning_rate": 0.00035905998835177635,
"loss": 3.1679,
"step": 69050
},
{
"epoch": 20.110910572892408,
"grad_norm": 0.37997275590896606,
"learning_rate": 0.00035888526499708794,
"loss": 3.1812,
"step": 69100
},
{
"epoch": 20.125465766185375,
"grad_norm": 0.3896423876285553,
"learning_rate": 0.0003587105416423995,
"loss": 3.1766,
"step": 69150
},
{
"epoch": 20.140020959478342,
"grad_norm": 0.38264891505241394,
"learning_rate": 0.0003585358182877111,
"loss": 3.1838,
"step": 69200
},
{
"epoch": 20.15457615277131,
"grad_norm": 0.4277355372905731,
"learning_rate": 0.00035836109493302267,
"loss": 3.181,
"step": 69250
},
{
"epoch": 20.169131346064276,
"grad_norm": 0.3833529055118561,
"learning_rate": 0.0003581863715783343,
"loss": 3.1973,
"step": 69300
},
{
"epoch": 20.183686539357243,
"grad_norm": 0.4084775447845459,
"learning_rate": 0.00035801164822364586,
"loss": 3.1851,
"step": 69350
},
{
"epoch": 20.19824173265021,
"grad_norm": 0.3731643855571747,
"learning_rate": 0.00035783692486895745,
"loss": 3.1818,
"step": 69400
},
{
"epoch": 20.212796925943177,
"grad_norm": 0.3852198123931885,
"learning_rate": 0.00035766220151426905,
"loss": 3.2066,
"step": 69450
},
{
"epoch": 20.227352119236144,
"grad_norm": 0.44227173924446106,
"learning_rate": 0.0003574874781595806,
"loss": 3.2011,
"step": 69500
},
{
"epoch": 20.24190731252911,
"grad_norm": 0.3953424394130707,
"learning_rate": 0.00035731275480489224,
"loss": 3.2069,
"step": 69550
},
{
"epoch": 20.25646250582208,
"grad_norm": 0.39885902404785156,
"learning_rate": 0.00035713803145020383,
"loss": 3.2066,
"step": 69600
},
{
"epoch": 20.271017699115045,
"grad_norm": 0.3794148862361908,
"learning_rate": 0.0003569633080955154,
"loss": 3.2042,
"step": 69650
},
{
"epoch": 20.285572892408013,
"grad_norm": 0.3893904983997345,
"learning_rate": 0.00035678858474082697,
"loss": 3.2075,
"step": 69700
},
{
"epoch": 20.30012808570098,
"grad_norm": 0.4011684060096741,
"learning_rate": 0.00035661386138613856,
"loss": 3.2043,
"step": 69750
},
{
"epoch": 20.314683278993947,
"grad_norm": 0.4057433307170868,
"learning_rate": 0.00035643913803145015,
"loss": 3.2102,
"step": 69800
},
{
"epoch": 20.32923847228691,
"grad_norm": 0.36967119574546814,
"learning_rate": 0.0003562644146767618,
"loss": 3.2193,
"step": 69850
},
{
"epoch": 20.343793665579877,
"grad_norm": 0.379490464925766,
"learning_rate": 0.00035608969132207334,
"loss": 3.214,
"step": 69900
},
{
"epoch": 20.358348858872844,
"grad_norm": 0.4042377769947052,
"learning_rate": 0.00035591496796738494,
"loss": 3.2138,
"step": 69950
},
{
"epoch": 20.37290405216581,
"grad_norm": 0.4063800871372223,
"learning_rate": 0.00035574024461269653,
"loss": 3.2125,
"step": 70000
},
{
"epoch": 20.37290405216581,
"eval_accuracy": 0.37332101960578756,
"eval_loss": 3.5475003719329834,
"eval_runtime": 181.2622,
"eval_samples_per_second": 91.861,
"eval_steps_per_second": 5.743,
"step": 70000
},
{
"epoch": 20.38745924545878,
"grad_norm": 0.39343348145484924,
"learning_rate": 0.0003555655212580081,
"loss": 3.224,
"step": 70050
},
{
"epoch": 20.402014438751745,
"grad_norm": 0.3925335109233856,
"learning_rate": 0.00035539079790331967,
"loss": 3.2262,
"step": 70100
},
{
"epoch": 20.416569632044713,
"grad_norm": 0.3898204565048218,
"learning_rate": 0.0003552160745486313,
"loss": 3.2303,
"step": 70150
},
{
"epoch": 20.43112482533768,
"grad_norm": 0.4369237720966339,
"learning_rate": 0.0003550413511939429,
"loss": 3.2281,
"step": 70200
},
{
"epoch": 20.445680018630647,
"grad_norm": 0.4013806879520416,
"learning_rate": 0.0003548666278392545,
"loss": 3.2124,
"step": 70250
},
{
"epoch": 20.460235211923614,
"grad_norm": 0.4022632837295532,
"learning_rate": 0.00035469190448456604,
"loss": 3.2142,
"step": 70300
},
{
"epoch": 20.47479040521658,
"grad_norm": 0.38561418652534485,
"learning_rate": 0.00035451718112987764,
"loss": 3.2286,
"step": 70350
},
{
"epoch": 20.489345598509548,
"grad_norm": 0.38620567321777344,
"learning_rate": 0.0003543424577751893,
"loss": 3.2225,
"step": 70400
},
{
"epoch": 20.503900791802515,
"grad_norm": 0.38152825832366943,
"learning_rate": 0.0003541677344205009,
"loss": 3.2291,
"step": 70450
},
{
"epoch": 20.518455985095482,
"grad_norm": 0.39362338185310364,
"learning_rate": 0.0003539930110658124,
"loss": 3.223,
"step": 70500
},
{
"epoch": 20.53301117838845,
"grad_norm": 0.3903370797634125,
"learning_rate": 0.000353818287711124,
"loss": 3.2341,
"step": 70550
},
{
"epoch": 20.547566371681416,
"grad_norm": 0.3791417181491852,
"learning_rate": 0.0003536435643564356,
"loss": 3.2477,
"step": 70600
},
{
"epoch": 20.562121564974383,
"grad_norm": 0.38322868943214417,
"learning_rate": 0.00035346884100174715,
"loss": 3.2317,
"step": 70650
},
{
"epoch": 20.57667675826735,
"grad_norm": 0.36763378977775574,
"learning_rate": 0.0003532941176470588,
"loss": 3.2345,
"step": 70700
},
{
"epoch": 20.591231951560317,
"grad_norm": 0.39628368616104126,
"learning_rate": 0.0003531193942923704,
"loss": 3.2353,
"step": 70750
},
{
"epoch": 20.605787144853284,
"grad_norm": 0.38605067133903503,
"learning_rate": 0.000352944670937682,
"loss": 3.2283,
"step": 70800
},
{
"epoch": 20.62034233814625,
"grad_norm": 0.40509700775146484,
"learning_rate": 0.0003527699475829935,
"loss": 3.24,
"step": 70850
},
{
"epoch": 20.63489753143922,
"grad_norm": 0.3870425820350647,
"learning_rate": 0.0003525952242283051,
"loss": 3.2256,
"step": 70900
},
{
"epoch": 20.649452724732186,
"grad_norm": 0.3848491907119751,
"learning_rate": 0.00035242050087361677,
"loss": 3.2362,
"step": 70950
},
{
"epoch": 20.664007918025153,
"grad_norm": 0.3847205340862274,
"learning_rate": 0.00035224577751892836,
"loss": 3.2385,
"step": 71000
},
{
"epoch": 20.664007918025153,
"eval_accuracy": 0.37386528855715373,
"eval_loss": 3.5357706546783447,
"eval_runtime": 181.0515,
"eval_samples_per_second": 91.968,
"eval_steps_per_second": 5.75,
"step": 71000
},
{
"epoch": 20.67856311131812,
"grad_norm": 0.40746915340423584,
"learning_rate": 0.0003520710541642399,
"loss": 3.2387,
"step": 71050
},
{
"epoch": 20.693118304611087,
"grad_norm": 0.39892295002937317,
"learning_rate": 0.0003518963308095515,
"loss": 3.2394,
"step": 71100
},
{
"epoch": 20.707673497904054,
"grad_norm": 0.3839132487773895,
"learning_rate": 0.0003517216074548631,
"loss": 3.2403,
"step": 71150
},
{
"epoch": 20.722228691197017,
"grad_norm": 0.37013792991638184,
"learning_rate": 0.0003515468841001747,
"loss": 3.224,
"step": 71200
},
{
"epoch": 20.736783884489984,
"grad_norm": 0.3782537877559662,
"learning_rate": 0.00035137216074548634,
"loss": 3.2499,
"step": 71250
},
{
"epoch": 20.75133907778295,
"grad_norm": 0.40229395031929016,
"learning_rate": 0.0003511974373907979,
"loss": 3.2352,
"step": 71300
},
{
"epoch": 20.76589427107592,
"grad_norm": 0.4002828598022461,
"learning_rate": 0.00035102271403610947,
"loss": 3.244,
"step": 71350
},
{
"epoch": 20.780449464368886,
"grad_norm": 0.3857050836086273,
"learning_rate": 0.00035084799068142106,
"loss": 3.2383,
"step": 71400
},
{
"epoch": 20.795004657661853,
"grad_norm": 0.3786620795726776,
"learning_rate": 0.0003506732673267326,
"loss": 3.2368,
"step": 71450
},
{
"epoch": 20.80955985095482,
"grad_norm": 0.38194626569747925,
"learning_rate": 0.00035049854397204425,
"loss": 3.2364,
"step": 71500
},
{
"epoch": 20.824115044247787,
"grad_norm": 0.3764079511165619,
"learning_rate": 0.00035032382061735585,
"loss": 3.2616,
"step": 71550
},
{
"epoch": 20.838670237540754,
"grad_norm": 0.37126094102859497,
"learning_rate": 0.00035014909726266744,
"loss": 3.2465,
"step": 71600
},
{
"epoch": 20.85322543083372,
"grad_norm": 0.38610726594924927,
"learning_rate": 0.000349974373907979,
"loss": 3.2539,
"step": 71650
},
{
"epoch": 20.867780624126688,
"grad_norm": 0.4158027470111847,
"learning_rate": 0.0003497996505532906,
"loss": 3.2367,
"step": 71700
},
{
"epoch": 20.882335817419655,
"grad_norm": 0.4391123652458191,
"learning_rate": 0.00034962492719860217,
"loss": 3.2588,
"step": 71750
},
{
"epoch": 20.896891010712622,
"grad_norm": 0.39286503195762634,
"learning_rate": 0.0003494502038439138,
"loss": 3.2484,
"step": 71800
},
{
"epoch": 20.91144620400559,
"grad_norm": 0.400235652923584,
"learning_rate": 0.00034927548048922536,
"loss": 3.2569,
"step": 71850
},
{
"epoch": 20.926001397298556,
"grad_norm": 0.382038414478302,
"learning_rate": 0.00034910075713453695,
"loss": 3.2467,
"step": 71900
},
{
"epoch": 20.940556590591523,
"grad_norm": 0.38471704721450806,
"learning_rate": 0.00034892603377984855,
"loss": 3.2527,
"step": 71950
},
{
"epoch": 20.95511178388449,
"grad_norm": 0.369704931974411,
"learning_rate": 0.0003487513104251601,
"loss": 3.251,
"step": 72000
},
{
"epoch": 20.95511178388449,
"eval_accuracy": 0.3739182933718948,
"eval_loss": 3.5311179161071777,
"eval_runtime": 181.3153,
"eval_samples_per_second": 91.835,
"eval_steps_per_second": 5.741,
"step": 72000
},
{
"epoch": 20.969666977177457,
"grad_norm": 0.41386517882347107,
"learning_rate": 0.0003485765870704717,
"loss": 3.2598,
"step": 72050
},
{
"epoch": 20.984222170470424,
"grad_norm": 0.39041802287101746,
"learning_rate": 0.00034840186371578333,
"loss": 3.2575,
"step": 72100
},
{
"epoch": 20.99877736376339,
"grad_norm": 0.39289528131484985,
"learning_rate": 0.0003482271403610949,
"loss": 3.2441,
"step": 72150
},
{
"epoch": 21.01309967396367,
"grad_norm": 0.408292680978775,
"learning_rate": 0.0003480524170064065,
"loss": 3.1657,
"step": 72200
},
{
"epoch": 21.027654867256636,
"grad_norm": 0.3830980956554413,
"learning_rate": 0.00034787769365171806,
"loss": 3.1464,
"step": 72250
},
{
"epoch": 21.042210060549603,
"grad_norm": 0.3988492488861084,
"learning_rate": 0.00034770297029702965,
"loss": 3.1504,
"step": 72300
},
{
"epoch": 21.05676525384257,
"grad_norm": 0.38097283244132996,
"learning_rate": 0.0003475282469423413,
"loss": 3.1562,
"step": 72350
},
{
"epoch": 21.071320447135538,
"grad_norm": 0.38642212748527527,
"learning_rate": 0.0003473535235876529,
"loss": 3.1615,
"step": 72400
},
{
"epoch": 21.085875640428505,
"grad_norm": 0.39626067876815796,
"learning_rate": 0.00034717880023296444,
"loss": 3.1706,
"step": 72450
},
{
"epoch": 21.10043083372147,
"grad_norm": 0.4080914855003357,
"learning_rate": 0.00034700407687827603,
"loss": 3.1722,
"step": 72500
},
{
"epoch": 21.11498602701444,
"grad_norm": 0.38640791177749634,
"learning_rate": 0.0003468293535235876,
"loss": 3.1707,
"step": 72550
},
{
"epoch": 21.129541220307406,
"grad_norm": null,
"learning_rate": 0.00034665463016889916,
"loss": 3.1776,
"step": 72600
},
{
"epoch": 21.144096413600373,
"grad_norm": 0.3920632302761078,
"learning_rate": 0.0003464799068142108,
"loss": 3.177,
"step": 72650
},
{
"epoch": 21.15865160689334,
"grad_norm": 0.407919317483902,
"learning_rate": 0.0003463051834595224,
"loss": 3.173,
"step": 72700
},
{
"epoch": 21.173206800186307,
"grad_norm": 0.3902853727340698,
"learning_rate": 0.000346130460104834,
"loss": 3.1717,
"step": 72750
},
{
"epoch": 21.187761993479274,
"grad_norm": 0.38187336921691895,
"learning_rate": 0.00034595573675014554,
"loss": 3.1878,
"step": 72800
},
{
"epoch": 21.20231718677224,
"grad_norm": 0.4419068992137909,
"learning_rate": 0.00034578101339545714,
"loss": 3.1885,
"step": 72850
},
{
"epoch": 21.216872380065208,
"grad_norm": 0.4039742350578308,
"learning_rate": 0.0003456062900407688,
"loss": 3.1867,
"step": 72900
},
{
"epoch": 21.231427573358175,
"grad_norm": 0.3693220317363739,
"learning_rate": 0.0003454315666860804,
"loss": 3.2003,
"step": 72950
},
{
"epoch": 21.245982766651142,
"grad_norm": 0.41296273469924927,
"learning_rate": 0.0003452568433313919,
"loss": 3.1939,
"step": 73000
},
{
"epoch": 21.245982766651142,
"eval_accuracy": 0.37289486559636115,
"eval_loss": 3.548574447631836,
"eval_runtime": 181.1623,
"eval_samples_per_second": 91.912,
"eval_steps_per_second": 5.746,
"step": 73000
},
{
"epoch": 21.26053795994411,
"grad_norm": 0.38118070363998413,
"learning_rate": 0.0003450821199767035,
"loss": 3.1947,
"step": 73050
},
{
"epoch": 21.275093153237076,
"grad_norm": 0.3997765779495239,
"learning_rate": 0.0003449073966220151,
"loss": 3.206,
"step": 73100
},
{
"epoch": 21.289648346530043,
"grad_norm": 0.4266320466995239,
"learning_rate": 0.0003447326732673267,
"loss": 3.1859,
"step": 73150
},
{
"epoch": 21.30420353982301,
"grad_norm": 0.3923620581626892,
"learning_rate": 0.0003445579499126383,
"loss": 3.2073,
"step": 73200
},
{
"epoch": 21.318758733115978,
"grad_norm": 0.38114598393440247,
"learning_rate": 0.0003443832265579499,
"loss": 3.1972,
"step": 73250
},
{
"epoch": 21.33331392640894,
"grad_norm": 0.3909752368927002,
"learning_rate": 0.0003442085032032615,
"loss": 3.2098,
"step": 73300
},
{
"epoch": 21.347869119701908,
"grad_norm": 0.37029755115509033,
"learning_rate": 0.0003440337798485731,
"loss": 3.2042,
"step": 73350
},
{
"epoch": 21.362424312994875,
"grad_norm": 0.3990894556045532,
"learning_rate": 0.0003438590564938846,
"loss": 3.2002,
"step": 73400
},
{
"epoch": 21.376979506287842,
"grad_norm": 0.4198392927646637,
"learning_rate": 0.0003436843331391962,
"loss": 3.1933,
"step": 73450
},
{
"epoch": 21.39153469958081,
"grad_norm": 0.3842645287513733,
"learning_rate": 0.00034350960978450786,
"loss": 3.2007,
"step": 73500
},
{
"epoch": 21.406089892873776,
"grad_norm": 0.39544111490249634,
"learning_rate": 0.00034333488642981946,
"loss": 3.2013,
"step": 73550
},
{
"epoch": 21.420645086166743,
"grad_norm": 0.42064350843429565,
"learning_rate": 0.000343160163075131,
"loss": 3.2129,
"step": 73600
},
{
"epoch": 21.43520027945971,
"grad_norm": 0.38665804266929626,
"learning_rate": 0.0003429854397204426,
"loss": 3.2248,
"step": 73650
},
{
"epoch": 21.449755472752678,
"grad_norm": 0.3785528838634491,
"learning_rate": 0.0003428107163657542,
"loss": 3.2113,
"step": 73700
},
{
"epoch": 21.464310666045645,
"grad_norm": 0.3859052360057831,
"learning_rate": 0.00034263599301106583,
"loss": 3.203,
"step": 73750
},
{
"epoch": 21.47886585933861,
"grad_norm": 0.4086976647377014,
"learning_rate": 0.0003424612696563774,
"loss": 3.2091,
"step": 73800
},
{
"epoch": 21.49342105263158,
"grad_norm": 0.41609588265419006,
"learning_rate": 0.00034228654630168897,
"loss": 3.2185,
"step": 73850
},
{
"epoch": 21.507976245924546,
"grad_norm": 0.37916097044944763,
"learning_rate": 0.00034211182294700056,
"loss": 3.211,
"step": 73900
},
{
"epoch": 21.522531439217513,
"grad_norm": 0.3910274803638458,
"learning_rate": 0.0003419370995923121,
"loss": 3.2232,
"step": 73950
},
{
"epoch": 21.53708663251048,
"grad_norm": 0.41739487648010254,
"learning_rate": 0.0003417623762376237,
"loss": 3.2151,
"step": 74000
},
{
"epoch": 21.53708663251048,
"eval_accuracy": 0.3736458650779482,
"eval_loss": 3.5441434383392334,
"eval_runtime": 181.3399,
"eval_samples_per_second": 91.822,
"eval_steps_per_second": 5.741,
"step": 74000
},
{
"epoch": 21.551641825803447,
"grad_norm": 0.40151894092559814,
"learning_rate": 0.00034158765288293534,
"loss": 3.2244,
"step": 74050
},
{
"epoch": 21.566197019096414,
"grad_norm": 0.38283640146255493,
"learning_rate": 0.00034141292952824694,
"loss": 3.2235,
"step": 74100
},
{
"epoch": 21.58075221238938,
"grad_norm": 0.38787081837654114,
"learning_rate": 0.0003412382061735585,
"loss": 3.2332,
"step": 74150
},
{
"epoch": 21.595307405682348,
"grad_norm": 0.43580567836761475,
"learning_rate": 0.0003410634828188701,
"loss": 3.2224,
"step": 74200
},
{
"epoch": 21.609862598975315,
"grad_norm": 0.40886545181274414,
"learning_rate": 0.00034088875946418167,
"loss": 3.2235,
"step": 74250
},
{
"epoch": 21.624417792268282,
"grad_norm": 0.37370070815086365,
"learning_rate": 0.0003407140361094933,
"loss": 3.2321,
"step": 74300
},
{
"epoch": 21.63897298556125,
"grad_norm": 0.3833332061767578,
"learning_rate": 0.00034053931275480486,
"loss": 3.2174,
"step": 74350
},
{
"epoch": 21.653528178854216,
"grad_norm": 0.40175867080688477,
"learning_rate": 0.00034036458940011645,
"loss": 3.2303,
"step": 74400
},
{
"epoch": 21.668083372147183,
"grad_norm": 0.43237751722335815,
"learning_rate": 0.00034018986604542804,
"loss": 3.2193,
"step": 74450
},
{
"epoch": 21.68263856544015,
"grad_norm": 0.38853147625923157,
"learning_rate": 0.00034001514269073964,
"loss": 3.2311,
"step": 74500
},
{
"epoch": 21.697193758733118,
"grad_norm": 0.39983823895454407,
"learning_rate": 0.0003398404193360512,
"loss": 3.2339,
"step": 74550
},
{
"epoch": 21.711748952026085,
"grad_norm": 0.39615949988365173,
"learning_rate": 0.00033966569598136283,
"loss": 3.234,
"step": 74600
},
{
"epoch": 21.726304145319048,
"grad_norm": 0.40111032128334045,
"learning_rate": 0.0003394909726266744,
"loss": 3.2341,
"step": 74650
},
{
"epoch": 21.740859338612015,
"grad_norm": 0.3891691565513611,
"learning_rate": 0.000339316249271986,
"loss": 3.2286,
"step": 74700
},
{
"epoch": 21.755414531904982,
"grad_norm": 0.37335634231567383,
"learning_rate": 0.00033914152591729756,
"loss": 3.2335,
"step": 74750
},
{
"epoch": 21.76996972519795,
"grad_norm": 0.3801760971546173,
"learning_rate": 0.00033896680256260915,
"loss": 3.2348,
"step": 74800
},
{
"epoch": 21.784524918490916,
"grad_norm": 0.4102132022380829,
"learning_rate": 0.00033879207920792074,
"loss": 3.2321,
"step": 74850
},
{
"epoch": 21.799080111783883,
"grad_norm": 0.40376681089401245,
"learning_rate": 0.0003386173558532324,
"loss": 3.2337,
"step": 74900
},
{
"epoch": 21.81363530507685,
"grad_norm": 0.3670161962509155,
"learning_rate": 0.00033844263249854393,
"loss": 3.2325,
"step": 74950
},
{
"epoch": 21.828190498369818,
"grad_norm": 0.4087563455104828,
"learning_rate": 0.00033826790914385553,
"loss": 3.2295,
"step": 75000
},
{
"epoch": 21.828190498369818,
"eval_accuracy": 0.37407201908737464,
"eval_loss": 3.5371601581573486,
"eval_runtime": 180.9832,
"eval_samples_per_second": 92.003,
"eval_steps_per_second": 5.752,
"step": 75000
},
{
"epoch": 21.842745691662785,
"grad_norm": 0.37676188349723816,
"learning_rate": 0.0003380931857891671,
"loss": 3.2364,
"step": 75050
},
{
"epoch": 21.85730088495575,
"grad_norm": 0.36632487177848816,
"learning_rate": 0.00033791846243447866,
"loss": 3.2412,
"step": 75100
},
{
"epoch": 21.87185607824872,
"grad_norm": 0.4055189788341522,
"learning_rate": 0.0003377437390797903,
"loss": 3.2476,
"step": 75150
},
{
"epoch": 21.886411271541686,
"grad_norm": 0.4044891595840454,
"learning_rate": 0.0003375690157251019,
"loss": 3.2275,
"step": 75200
},
{
"epoch": 21.900966464834653,
"grad_norm": 0.38007423281669617,
"learning_rate": 0.0003373942923704135,
"loss": 3.2427,
"step": 75250
},
{
"epoch": 21.91552165812762,
"grad_norm": 0.4170713424682617,
"learning_rate": 0.00033721956901572504,
"loss": 3.2442,
"step": 75300
},
{
"epoch": 21.930076851420587,
"grad_norm": 0.3902817964553833,
"learning_rate": 0.00033704484566103663,
"loss": 3.2375,
"step": 75350
},
{
"epoch": 21.944632044713554,
"grad_norm": 0.40278515219688416,
"learning_rate": 0.00033687012230634823,
"loss": 3.2522,
"step": 75400
},
{
"epoch": 21.95918723800652,
"grad_norm": 0.36589857935905457,
"learning_rate": 0.0003366953989516599,
"loss": 3.2372,
"step": 75450
},
{
"epoch": 21.97374243129949,
"grad_norm": 0.403898686170578,
"learning_rate": 0.00033652067559697147,
"loss": 3.2346,
"step": 75500
},
{
"epoch": 21.988297624592455,
"grad_norm": 0.4070855975151062,
"learning_rate": 0.000336345952242283,
"loss": 3.2396,
"step": 75550
},
{
"epoch": 22.002619934792733,
"grad_norm": 0.3859020173549652,
"learning_rate": 0.0003361712288875946,
"loss": 3.2177,
"step": 75600
},
{
"epoch": 22.0171751280857,
"grad_norm": 0.3986937999725342,
"learning_rate": 0.0003359965055329062,
"loss": 3.1351,
"step": 75650
},
{
"epoch": 22.031730321378667,
"grad_norm": 0.39367422461509705,
"learning_rate": 0.00033582178217821785,
"loss": 3.1475,
"step": 75700
},
{
"epoch": 22.046285514671634,
"grad_norm": 0.40651074051856995,
"learning_rate": 0.0003356470588235294,
"loss": 3.149,
"step": 75750
},
{
"epoch": 22.0608407079646,
"grad_norm": 0.38006383180618286,
"learning_rate": 0.000335472335468841,
"loss": 3.1562,
"step": 75800
},
{
"epoch": 22.07539590125757,
"grad_norm": 0.40139079093933105,
"learning_rate": 0.0003352976121141526,
"loss": 3.1548,
"step": 75850
},
{
"epoch": 22.089951094550536,
"grad_norm": 0.3921520411968231,
"learning_rate": 0.0003351228887594641,
"loss": 3.1596,
"step": 75900
},
{
"epoch": 22.104506287843503,
"grad_norm": 0.4345844089984894,
"learning_rate": 0.0003349481654047757,
"loss": 3.1602,
"step": 75950
},
{
"epoch": 22.11906148113647,
"grad_norm": 0.38527101278305054,
"learning_rate": 0.00033477344205008736,
"loss": 3.1566,
"step": 76000
},
{
"epoch": 22.11906148113647,
"eval_accuracy": 0.373341939466151,
"eval_loss": 3.554121732711792,
"eval_runtime": 182.5103,
"eval_samples_per_second": 91.233,
"eval_steps_per_second": 5.704,
"step": 76000
},
{
"epoch": 22.133616674429437,
"grad_norm": 0.4101022183895111,
"learning_rate": 0.00033459871869539895,
"loss": 3.167,
"step": 76050
},
{
"epoch": 22.148171867722404,
"grad_norm": 0.4076803922653198,
"learning_rate": 0.0003344239953407105,
"loss": 3.1731,
"step": 76100
},
{
"epoch": 22.16272706101537,
"grad_norm": 0.4150278866291046,
"learning_rate": 0.0003342492719860221,
"loss": 3.168,
"step": 76150
},
{
"epoch": 22.177282254308338,
"grad_norm": 0.39549461007118225,
"learning_rate": 0.0003340745486313337,
"loss": 3.1759,
"step": 76200
},
{
"epoch": 22.191837447601305,
"grad_norm": 0.43638280034065247,
"learning_rate": 0.0003338998252766453,
"loss": 3.1879,
"step": 76250
},
{
"epoch": 22.206392640894272,
"grad_norm": 0.4284520745277405,
"learning_rate": 0.00033372510192195687,
"loss": 3.1696,
"step": 76300
},
{
"epoch": 22.22094783418724,
"grad_norm": 0.39070242643356323,
"learning_rate": 0.00033355037856726847,
"loss": 3.1726,
"step": 76350
},
{
"epoch": 22.235503027480206,
"grad_norm": 0.4118767976760864,
"learning_rate": 0.00033337565521258006,
"loss": 3.1793,
"step": 76400
},
{
"epoch": 22.250058220773173,
"grad_norm": 0.41845229268074036,
"learning_rate": 0.00033320093185789165,
"loss": 3.1821,
"step": 76450
},
{
"epoch": 22.26461341406614,
"grad_norm": 0.39468759298324585,
"learning_rate": 0.0003330262085032032,
"loss": 3.1824,
"step": 76500
},
{
"epoch": 22.279168607359107,
"grad_norm": 0.45352470874786377,
"learning_rate": 0.00033285148514851484,
"loss": 3.1823,
"step": 76550
},
{
"epoch": 22.293723800652074,
"grad_norm": 0.39214208722114563,
"learning_rate": 0.00033267676179382644,
"loss": 3.1936,
"step": 76600
},
{
"epoch": 22.308278993945038,
"grad_norm": 0.39688727259635925,
"learning_rate": 0.00033250203843913803,
"loss": 3.1897,
"step": 76650
},
{
"epoch": 22.322834187238005,
"grad_norm": 0.3960036337375641,
"learning_rate": 0.00033232731508444957,
"loss": 3.1958,
"step": 76700
},
{
"epoch": 22.337389380530972,
"grad_norm": 0.42960265278816223,
"learning_rate": 0.00033215259172976117,
"loss": 3.1911,
"step": 76750
},
{
"epoch": 22.35194457382394,
"grad_norm": 0.39636436104774475,
"learning_rate": 0.00033197786837507276,
"loss": 3.1924,
"step": 76800
},
{
"epoch": 22.366499767116906,
"grad_norm": 0.3948790729045868,
"learning_rate": 0.0003318031450203844,
"loss": 3.1953,
"step": 76850
},
{
"epoch": 22.381054960409873,
"grad_norm": 0.3913504183292389,
"learning_rate": 0.00033162842166569595,
"loss": 3.1863,
"step": 76900
},
{
"epoch": 22.39561015370284,
"grad_norm": 0.3883304297924042,
"learning_rate": 0.00033145369831100754,
"loss": 3.2014,
"step": 76950
},
{
"epoch": 22.410165346995807,
"grad_norm": 0.39272409677505493,
"learning_rate": 0.00033127897495631914,
"loss": 3.2004,
"step": 77000
},
{
"epoch": 22.410165346995807,
"eval_accuracy": 0.3737291919374858,
"eval_loss": 3.545900583267212,
"eval_runtime": 181.6134,
"eval_samples_per_second": 91.684,
"eval_steps_per_second": 5.732,
"step": 77000
},
{
"epoch": 22.424720540288774,
"grad_norm": 0.4007713794708252,
"learning_rate": 0.0003311042516016307,
"loss": 3.2017,
"step": 77050
},
{
"epoch": 22.43927573358174,
"grad_norm": 0.4593736529350281,
"learning_rate": 0.0003309295282469423,
"loss": 3.1982,
"step": 77100
},
{
"epoch": 22.45383092687471,
"grad_norm": 0.3807480037212372,
"learning_rate": 0.0003307548048922539,
"loss": 3.2012,
"step": 77150
},
{
"epoch": 22.468386120167676,
"grad_norm": 0.433830201625824,
"learning_rate": 0.0003305800815375655,
"loss": 3.2042,
"step": 77200
},
{
"epoch": 22.482941313460643,
"grad_norm": 0.3865396976470947,
"learning_rate": 0.00033040535818287705,
"loss": 3.2004,
"step": 77250
},
{
"epoch": 22.49749650675361,
"grad_norm": 0.42019572854042053,
"learning_rate": 0.00033023063482818865,
"loss": 3.2034,
"step": 77300
},
{
"epoch": 22.512051700046577,
"grad_norm": 0.43494096398353577,
"learning_rate": 0.00033005591147350024,
"loss": 3.2009,
"step": 77350
},
{
"epoch": 22.526606893339544,
"grad_norm": 0.4004029929637909,
"learning_rate": 0.0003298811881188119,
"loss": 3.2162,
"step": 77400
},
{
"epoch": 22.54116208663251,
"grad_norm": 0.41305696964263916,
"learning_rate": 0.00032970646476412343,
"loss": 3.1953,
"step": 77450
},
{
"epoch": 22.555717279925478,
"grad_norm": 0.4445006549358368,
"learning_rate": 0.000329531741409435,
"loss": 3.2142,
"step": 77500
},
{
"epoch": 22.570272473218445,
"grad_norm": 0.40516790747642517,
"learning_rate": 0.0003293570180547466,
"loss": 3.2041,
"step": 77550
},
{
"epoch": 22.584827666511412,
"grad_norm": 0.43369901180267334,
"learning_rate": 0.0003291822947000582,
"loss": 3.2064,
"step": 77600
},
{
"epoch": 22.59938285980438,
"grad_norm": 0.4414738714694977,
"learning_rate": 0.00032900757134536975,
"loss": 3.2142,
"step": 77650
},
{
"epoch": 22.613938053097346,
"grad_norm": 0.41799449920654297,
"learning_rate": 0.0003288328479906814,
"loss": 3.2172,
"step": 77700
},
{
"epoch": 22.628493246390313,
"grad_norm": 0.3902125656604767,
"learning_rate": 0.000328658124635993,
"loss": 3.2141,
"step": 77750
},
{
"epoch": 22.64304843968328,
"grad_norm": 0.4232015013694763,
"learning_rate": 0.0003284834012813046,
"loss": 3.2053,
"step": 77800
},
{
"epoch": 22.657603632976247,
"grad_norm": 0.3856189250946045,
"learning_rate": 0.00032830867792661613,
"loss": 3.2154,
"step": 77850
},
{
"epoch": 22.672158826269214,
"grad_norm": 0.4207291007041931,
"learning_rate": 0.0003281339545719277,
"loss": 3.2233,
"step": 77900
},
{
"epoch": 22.68671401956218,
"grad_norm": 0.4035766124725342,
"learning_rate": 0.0003279592312172394,
"loss": 3.2174,
"step": 77950
},
{
"epoch": 22.701269212855145,
"grad_norm": 0.3936694860458374,
"learning_rate": 0.00032778450786255097,
"loss": 3.2214,
"step": 78000
},
{
"epoch": 22.701269212855145,
"eval_accuracy": 0.37412055786451004,
"eval_loss": 3.540379047393799,
"eval_runtime": 183.076,
"eval_samples_per_second": 90.951,
"eval_steps_per_second": 5.686,
"step": 78000
},
{
"epoch": 22.715824406148112,
"grad_norm": 0.39343857765197754,
"learning_rate": 0.0003276097845078625,
"loss": 3.2047,
"step": 78050
},
{
"epoch": 22.73037959944108,
"grad_norm": 0.43328389525413513,
"learning_rate": 0.0003274350611531741,
"loss": 3.2277,
"step": 78100
},
{
"epoch": 22.744934792734046,
"grad_norm": 0.39509138464927673,
"learning_rate": 0.0003272603377984857,
"loss": 3.2203,
"step": 78150
},
{
"epoch": 22.759489986027013,
"grad_norm": 0.4134383499622345,
"learning_rate": 0.00032708561444379724,
"loss": 3.2174,
"step": 78200
},
{
"epoch": 22.77404517931998,
"grad_norm": 0.4262104332447052,
"learning_rate": 0.0003269108910891089,
"loss": 3.2249,
"step": 78250
},
{
"epoch": 22.788600372612947,
"grad_norm": 0.3714088201522827,
"learning_rate": 0.0003267361677344205,
"loss": 3.2223,
"step": 78300
},
{
"epoch": 22.803155565905914,
"grad_norm": 0.38759660720825195,
"learning_rate": 0.0003265614443797321,
"loss": 3.2291,
"step": 78350
},
{
"epoch": 22.81771075919888,
"grad_norm": 0.3645568788051605,
"learning_rate": 0.0003263867210250436,
"loss": 3.2222,
"step": 78400
},
{
"epoch": 22.83226595249185,
"grad_norm": 0.3883237838745117,
"learning_rate": 0.0003262119976703552,
"loss": 3.2259,
"step": 78450
},
{
"epoch": 22.846821145784816,
"grad_norm": 0.4034235179424286,
"learning_rate": 0.00032603727431566686,
"loss": 3.2299,
"step": 78500
},
{
"epoch": 22.861376339077783,
"grad_norm": 0.42304471135139465,
"learning_rate": 0.00032586255096097845,
"loss": 3.2408,
"step": 78550
},
{
"epoch": 22.87593153237075,
"grad_norm": 0.3993287980556488,
"learning_rate": 0.00032568782760629005,
"loss": 3.2307,
"step": 78600
},
{
"epoch": 22.890486725663717,
"grad_norm": 0.40124648809432983,
"learning_rate": 0.0003255131042516016,
"loss": 3.2391,
"step": 78650
},
{
"epoch": 22.905041918956684,
"grad_norm": 0.39223048090934753,
"learning_rate": 0.0003253383808969132,
"loss": 3.2249,
"step": 78700
},
{
"epoch": 22.91959711224965,
"grad_norm": 0.38387811183929443,
"learning_rate": 0.0003251636575422248,
"loss": 3.2355,
"step": 78750
},
{
"epoch": 22.934152305542618,
"grad_norm": 0.4031982421875,
"learning_rate": 0.0003249889341875364,
"loss": 3.2232,
"step": 78800
},
{
"epoch": 22.948707498835585,
"grad_norm": 0.396810382604599,
"learning_rate": 0.00032481421083284796,
"loss": 3.2344,
"step": 78850
},
{
"epoch": 22.963262692128552,
"grad_norm": 0.4282434582710266,
"learning_rate": 0.00032463948747815956,
"loss": 3.2466,
"step": 78900
},
{
"epoch": 22.97781788542152,
"grad_norm": 0.4010005593299866,
"learning_rate": 0.00032446476412347115,
"loss": 3.2302,
"step": 78950
},
{
"epoch": 22.992373078714486,
"grad_norm": 0.39370250701904297,
"learning_rate": 0.0003242900407687827,
"loss": 3.2437,
"step": 79000
},
{
"epoch": 22.992373078714486,
"eval_accuracy": 0.3744414074082867,
"eval_loss": 3.5298874378204346,
"eval_runtime": 181.1381,
"eval_samples_per_second": 91.924,
"eval_steps_per_second": 5.747,
"step": 79000
},
{
"epoch": 23.006695388914764,
"grad_norm": 0.4299910068511963,
"learning_rate": 0.00032411531741409434,
"loss": 3.188,
"step": 79050
},
{
"epoch": 23.02125058220773,
"grad_norm": 0.3970222771167755,
"learning_rate": 0.00032394059405940593,
"loss": 3.1256,
"step": 79100
},
{
"epoch": 23.035805775500698,
"grad_norm": 0.43236055970191956,
"learning_rate": 0.00032376587070471753,
"loss": 3.1269,
"step": 79150
},
{
"epoch": 23.050360968793665,
"grad_norm": 0.4045545756816864,
"learning_rate": 0.00032359114735002907,
"loss": 3.1355,
"step": 79200
},
{
"epoch": 23.064916162086632,
"grad_norm": 0.4147092401981354,
"learning_rate": 0.00032341642399534066,
"loss": 3.1496,
"step": 79250
},
{
"epoch": 23.0794713553796,
"grad_norm": 0.4010353088378906,
"learning_rate": 0.00032324170064065226,
"loss": 3.1275,
"step": 79300
},
{
"epoch": 23.094026548672566,
"grad_norm": 0.4142921566963196,
"learning_rate": 0.0003230669772859639,
"loss": 3.1604,
"step": 79350
},
{
"epoch": 23.108581741965533,
"grad_norm": 0.3941037058830261,
"learning_rate": 0.00032289225393127545,
"loss": 3.1492,
"step": 79400
},
{
"epoch": 23.1231369352585,
"grad_norm": 0.39037835597991943,
"learning_rate": 0.00032271753057658704,
"loss": 3.148,
"step": 79450
},
{
"epoch": 23.137692128551468,
"grad_norm": 0.4245337247848511,
"learning_rate": 0.00032254280722189863,
"loss": 3.1634,
"step": 79500
},
{
"epoch": 23.152247321844435,
"grad_norm": 0.4353491961956024,
"learning_rate": 0.00032236808386721023,
"loss": 3.1494,
"step": 79550
},
{
"epoch": 23.1668025151374,
"grad_norm": 0.41144099831581116,
"learning_rate": 0.00032219336051252177,
"loss": 3.1536,
"step": 79600
},
{
"epoch": 23.18135770843037,
"grad_norm": 0.4307765066623688,
"learning_rate": 0.0003220186371578334,
"loss": 3.1573,
"step": 79650
},
{
"epoch": 23.195912901723336,
"grad_norm": 0.42108792066574097,
"learning_rate": 0.000321843913803145,
"loss": 3.1741,
"step": 79700
},
{
"epoch": 23.210468095016303,
"grad_norm": 0.4174800515174866,
"learning_rate": 0.0003216691904484566,
"loss": 3.1674,
"step": 79750
},
{
"epoch": 23.22502328830927,
"grad_norm": 0.41323181986808777,
"learning_rate": 0.00032149446709376815,
"loss": 3.1739,
"step": 79800
},
{
"epoch": 23.239578481602237,
"grad_norm": 0.40551289916038513,
"learning_rate": 0.00032131974373907974,
"loss": 3.1751,
"step": 79850
},
{
"epoch": 23.254133674895204,
"grad_norm": 0.42636001110076904,
"learning_rate": 0.0003211450203843914,
"loss": 3.1804,
"step": 79900
},
{
"epoch": 23.26868886818817,
"grad_norm": 0.3733903467655182,
"learning_rate": 0.000320970297029703,
"loss": 3.1768,
"step": 79950
},
{
"epoch": 23.283244061481135,
"grad_norm": 0.4172973334789276,
"learning_rate": 0.0003207955736750145,
"loss": 3.1845,
"step": 80000
},
{
"epoch": 23.283244061481135,
"eval_accuracy": 0.37387151750434056,
"eval_loss": 3.5492820739746094,
"eval_runtime": 182.8566,
"eval_samples_per_second": 91.06,
"eval_steps_per_second": 5.693,
"step": 80000
},
{
"epoch": 23.2977992547741,
"grad_norm": 0.41692233085632324,
"learning_rate": 0.0003206208503203261,
"loss": 3.1795,
"step": 80050
},
{
"epoch": 23.31235444806707,
"grad_norm": 0.4022136330604553,
"learning_rate": 0.0003204461269656377,
"loss": 3.1751,
"step": 80100
},
{
"epoch": 23.326909641360036,
"grad_norm": 0.4068368971347809,
"learning_rate": 0.00032027140361094925,
"loss": 3.1923,
"step": 80150
},
{
"epoch": 23.341464834653003,
"grad_norm": 0.40505388379096985,
"learning_rate": 0.0003200966802562609,
"loss": 3.1891,
"step": 80200
},
{
"epoch": 23.35602002794597,
"grad_norm": 0.40903159976005554,
"learning_rate": 0.0003199219569015725,
"loss": 3.1887,
"step": 80250
},
{
"epoch": 23.370575221238937,
"grad_norm": 0.4178510904312134,
"learning_rate": 0.0003197472335468841,
"loss": 3.1795,
"step": 80300
},
{
"epoch": 23.385130414531904,
"grad_norm": 0.401119589805603,
"learning_rate": 0.00031957251019219563,
"loss": 3.1897,
"step": 80350
},
{
"epoch": 23.39968560782487,
"grad_norm": 0.3909188508987427,
"learning_rate": 0.0003193977868375072,
"loss": 3.1857,
"step": 80400
},
{
"epoch": 23.41424080111784,
"grad_norm": 0.4655745327472687,
"learning_rate": 0.00031922306348281887,
"loss": 3.1892,
"step": 80450
},
{
"epoch": 23.428795994410805,
"grad_norm": 0.4173809587955475,
"learning_rate": 0.00031904834012813047,
"loss": 3.1838,
"step": 80500
},
{
"epoch": 23.443351187703772,
"grad_norm": 0.4490557312965393,
"learning_rate": 0.000318873616773442,
"loss": 3.2014,
"step": 80550
},
{
"epoch": 23.45790638099674,
"grad_norm": 0.4582597315311432,
"learning_rate": 0.0003186988934187536,
"loss": 3.1982,
"step": 80600
},
{
"epoch": 23.472461574289706,
"grad_norm": 0.3948287069797516,
"learning_rate": 0.0003185241700640652,
"loss": 3.1967,
"step": 80650
},
{
"epoch": 23.487016767582674,
"grad_norm": 0.41583526134490967,
"learning_rate": 0.0003183494467093768,
"loss": 3.2026,
"step": 80700
},
{
"epoch": 23.50157196087564,
"grad_norm": 0.41763588786125183,
"learning_rate": 0.0003181747233546884,
"loss": 3.2021,
"step": 80750
},
{
"epoch": 23.516127154168608,
"grad_norm": 0.4259377121925354,
"learning_rate": 0.000318,
"loss": 3.2006,
"step": 80800
},
{
"epoch": 23.530682347461575,
"grad_norm": 0.409658282995224,
"learning_rate": 0.00031782527664531157,
"loss": 3.198,
"step": 80850
},
{
"epoch": 23.545237540754542,
"grad_norm": 0.43902572989463806,
"learning_rate": 0.00031765055329062317,
"loss": 3.2013,
"step": 80900
},
{
"epoch": 23.55979273404751,
"grad_norm": 0.40293756127357483,
"learning_rate": 0.0003174758299359347,
"loss": 3.2025,
"step": 80950
},
{
"epoch": 23.574347927340476,
"grad_norm": 0.4030076861381531,
"learning_rate": 0.0003173011065812463,
"loss": 3.1994,
"step": 81000
},
{
"epoch": 23.574347927340476,
"eval_accuracy": 0.37436407444132513,
"eval_loss": 3.5441434383392334,
"eval_runtime": 181.3124,
"eval_samples_per_second": 91.836,
"eval_steps_per_second": 5.741,
"step": 81000
},
{
"epoch": 23.588903120633443,
"grad_norm": 0.4462394118309021,
"learning_rate": 0.00031712638322655795,
"loss": 3.1878,
"step": 81050
},
{
"epoch": 23.60345831392641,
"grad_norm": 0.42426666617393494,
"learning_rate": 0.00031695165987186954,
"loss": 3.1984,
"step": 81100
},
{
"epoch": 23.618013507219377,
"grad_norm": 0.384657084941864,
"learning_rate": 0.0003167769365171811,
"loss": 3.2021,
"step": 81150
},
{
"epoch": 23.632568700512344,
"grad_norm": 0.3903048634529114,
"learning_rate": 0.0003166022131624927,
"loss": 3.205,
"step": 81200
},
{
"epoch": 23.64712389380531,
"grad_norm": 0.4122599959373474,
"learning_rate": 0.00031642748980780427,
"loss": 3.2009,
"step": 81250
},
{
"epoch": 23.66167908709828,
"grad_norm": 0.43961748480796814,
"learning_rate": 0.0003162527664531159,
"loss": 3.204,
"step": 81300
},
{
"epoch": 23.676234280391242,
"grad_norm": 0.4120798110961914,
"learning_rate": 0.00031607804309842746,
"loss": 3.1975,
"step": 81350
},
{
"epoch": 23.69078947368421,
"grad_norm": 0.40398430824279785,
"learning_rate": 0.00031590331974373905,
"loss": 3.2111,
"step": 81400
},
{
"epoch": 23.705344666977176,
"grad_norm": 0.42327627539634705,
"learning_rate": 0.00031572859638905065,
"loss": 3.2023,
"step": 81450
},
{
"epoch": 23.719899860270143,
"grad_norm": 0.4177756905555725,
"learning_rate": 0.0003155538730343622,
"loss": 3.2132,
"step": 81500
},
{
"epoch": 23.73445505356311,
"grad_norm": 0.4342160224914551,
"learning_rate": 0.0003153791496796738,
"loss": 3.224,
"step": 81550
},
{
"epoch": 23.749010246856077,
"grad_norm": 0.4283663034439087,
"learning_rate": 0.00031520442632498543,
"loss": 3.2156,
"step": 81600
},
{
"epoch": 23.763565440149044,
"grad_norm": 0.39514467120170593,
"learning_rate": 0.000315029702970297,
"loss": 3.2086,
"step": 81650
},
{
"epoch": 23.77812063344201,
"grad_norm": 0.4161839187145233,
"learning_rate": 0.0003148549796156086,
"loss": 3.2062,
"step": 81700
},
{
"epoch": 23.79267582673498,
"grad_norm": 0.44791948795318604,
"learning_rate": 0.00031468025626092016,
"loss": 3.212,
"step": 81750
},
{
"epoch": 23.807231020027945,
"grad_norm": 0.41878649592399597,
"learning_rate": 0.00031450553290623175,
"loss": 3.2154,
"step": 81800
},
{
"epoch": 23.821786213320912,
"grad_norm": 0.39344221353530884,
"learning_rate": 0.0003143308095515434,
"loss": 3.2188,
"step": 81850
},
{
"epoch": 23.83634140661388,
"grad_norm": 0.4187600910663605,
"learning_rate": 0.000314156086196855,
"loss": 3.2193,
"step": 81900
},
{
"epoch": 23.850896599906847,
"grad_norm": 0.41973283886909485,
"learning_rate": 0.00031398136284216654,
"loss": 3.2182,
"step": 81950
},
{
"epoch": 23.865451793199814,
"grad_norm": 0.4089304208755493,
"learning_rate": 0.00031380663948747813,
"loss": 3.2281,
"step": 82000
},
{
"epoch": 23.865451793199814,
"eval_accuracy": 0.37461628803874075,
"eval_loss": 3.532421588897705,
"eval_runtime": 181.294,
"eval_samples_per_second": 91.845,
"eval_steps_per_second": 5.742,
"step": 82000
},
{
"epoch": 23.88000698649278,
"grad_norm": 0.43405380845069885,
"learning_rate": 0.0003136319161327897,
"loss": 3.2204,
"step": 82050
},
{
"epoch": 23.894562179785748,
"grad_norm": 0.38422340154647827,
"learning_rate": 0.00031345719277810127,
"loss": 3.2313,
"step": 82100
},
{
"epoch": 23.909117373078715,
"grad_norm": 0.4141247272491455,
"learning_rate": 0.0003132824694234129,
"loss": 3.2268,
"step": 82150
},
{
"epoch": 23.923672566371682,
"grad_norm": 0.40371742844581604,
"learning_rate": 0.0003131077460687245,
"loss": 3.2232,
"step": 82200
},
{
"epoch": 23.93822775966465,
"grad_norm": 0.46508926153182983,
"learning_rate": 0.0003129330227140361,
"loss": 3.2214,
"step": 82250
},
{
"epoch": 23.952782952957616,
"grad_norm": 0.4180763065814972,
"learning_rate": 0.00031275829935934764,
"loss": 3.2215,
"step": 82300
},
{
"epoch": 23.967338146250583,
"grad_norm": 0.43056684732437134,
"learning_rate": 0.00031258357600465924,
"loss": 3.2203,
"step": 82350
},
{
"epoch": 23.98189333954355,
"grad_norm": 0.4081912636756897,
"learning_rate": 0.00031240885264997083,
"loss": 3.2369,
"step": 82400
},
{
"epoch": 23.996448532836517,
"grad_norm": 0.3891538083553314,
"learning_rate": 0.0003122341292952825,
"loss": 3.2197,
"step": 82450
},
{
"epoch": 24.010770843036795,
"grad_norm": 0.42958930134773254,
"learning_rate": 0.000312059405940594,
"loss": 3.1498,
"step": 82500
},
{
"epoch": 24.025326036329762,
"grad_norm": 0.4145827293395996,
"learning_rate": 0.0003118846825859056,
"loss": 3.1274,
"step": 82550
},
{
"epoch": 24.03988122962273,
"grad_norm": 0.4117175042629242,
"learning_rate": 0.0003117099592312172,
"loss": 3.1281,
"step": 82600
},
{
"epoch": 24.054436422915696,
"grad_norm": 0.4058437645435333,
"learning_rate": 0.0003115352358765288,
"loss": 3.1278,
"step": 82650
},
{
"epoch": 24.068991616208663,
"grad_norm": 0.4439738094806671,
"learning_rate": 0.0003113605125218404,
"loss": 3.1437,
"step": 82700
},
{
"epoch": 24.08354680950163,
"grad_norm": 0.4439554214477539,
"learning_rate": 0.000311185789167152,
"loss": 3.1419,
"step": 82750
},
{
"epoch": 24.098102002794597,
"grad_norm": 0.42919254302978516,
"learning_rate": 0.0003110110658124636,
"loss": 3.146,
"step": 82800
},
{
"epoch": 24.112657196087564,
"grad_norm": 0.4345749020576477,
"learning_rate": 0.0003108363424577752,
"loss": 3.1503,
"step": 82850
},
{
"epoch": 24.12721238938053,
"grad_norm": 0.419748455286026,
"learning_rate": 0.0003106616191030867,
"loss": 3.1431,
"step": 82900
},
{
"epoch": 24.1417675826735,
"grad_norm": 0.4344918429851532,
"learning_rate": 0.0003104868957483983,
"loss": 3.1408,
"step": 82950
},
{
"epoch": 24.156322775966466,
"grad_norm": 0.4116227328777313,
"learning_rate": 0.00031031217239370996,
"loss": 3.1537,
"step": 83000
},
{
"epoch": 24.156322775966466,
"eval_accuracy": 0.37389807867536384,
"eval_loss": 3.5518624782562256,
"eval_runtime": 181.3785,
"eval_samples_per_second": 91.803,
"eval_steps_per_second": 5.739,
"step": 83000
},
{
"epoch": 24.170877969259433,
"grad_norm": 0.43057700991630554,
"learning_rate": 0.00031013744903902156,
"loss": 3.1558,
"step": 83050
},
{
"epoch": 24.1854331625524,
"grad_norm": 0.4120100438594818,
"learning_rate": 0.0003099627256843331,
"loss": 3.1631,
"step": 83100
},
{
"epoch": 24.199988355845367,
"grad_norm": 0.43784815073013306,
"learning_rate": 0.0003097880023296447,
"loss": 3.1501,
"step": 83150
},
{
"epoch": 24.214543549138334,
"grad_norm": 0.41029947996139526,
"learning_rate": 0.0003096132789749563,
"loss": 3.1523,
"step": 83200
},
{
"epoch": 24.2290987424313,
"grad_norm": 0.40311041474342346,
"learning_rate": 0.00030943855562026794,
"loss": 3.1533,
"step": 83250
},
{
"epoch": 24.243653935724268,
"grad_norm": 0.42732328176498413,
"learning_rate": 0.0003092638322655795,
"loss": 3.1647,
"step": 83300
},
{
"epoch": 24.258209129017235,
"grad_norm": 0.42801928520202637,
"learning_rate": 0.00030908910891089107,
"loss": 3.1621,
"step": 83350
},
{
"epoch": 24.2727643223102,
"grad_norm": 0.4290964603424072,
"learning_rate": 0.00030891438555620266,
"loss": 3.1618,
"step": 83400
},
{
"epoch": 24.287319515603166,
"grad_norm": 0.4055120348930359,
"learning_rate": 0.0003087396622015142,
"loss": 3.1756,
"step": 83450
},
{
"epoch": 24.301874708896133,
"grad_norm": 0.42242538928985596,
"learning_rate": 0.0003085649388468258,
"loss": 3.1729,
"step": 83500
},
{
"epoch": 24.3164299021891,
"grad_norm": 0.4095151126384735,
"learning_rate": 0.00030839021549213745,
"loss": 3.1704,
"step": 83550
},
{
"epoch": 24.330985095482067,
"grad_norm": 0.40566420555114746,
"learning_rate": 0.00030821549213744904,
"loss": 3.166,
"step": 83600
},
{
"epoch": 24.345540288775034,
"grad_norm": 0.3966980278491974,
"learning_rate": 0.0003080407687827606,
"loss": 3.1797,
"step": 83650
},
{
"epoch": 24.360095482068,
"grad_norm": 0.4019142687320709,
"learning_rate": 0.0003078660454280722,
"loss": 3.1798,
"step": 83700
},
{
"epoch": 24.374650675360968,
"grad_norm": 0.4082522988319397,
"learning_rate": 0.00030769132207338377,
"loss": 3.1807,
"step": 83750
},
{
"epoch": 24.389205868653935,
"grad_norm": 0.4320712983608246,
"learning_rate": 0.00030751659871869536,
"loss": 3.1908,
"step": 83800
},
{
"epoch": 24.403761061946902,
"grad_norm": 0.41094931960105896,
"learning_rate": 0.00030734187536400696,
"loss": 3.1735,
"step": 83850
},
{
"epoch": 24.41831625523987,
"grad_norm": 0.42182329297065735,
"learning_rate": 0.00030716715200931855,
"loss": 3.1729,
"step": 83900
},
{
"epoch": 24.432871448532836,
"grad_norm": 0.4036419689655304,
"learning_rate": 0.00030699242865463015,
"loss": 3.1887,
"step": 83950
},
{
"epoch": 24.447426641825803,
"grad_norm": 0.41440510749816895,
"learning_rate": 0.00030681770529994174,
"loss": 3.1843,
"step": 84000
},
{
"epoch": 24.447426641825803,
"eval_accuracy": 0.37409999058606286,
"eval_loss": 3.544252395629883,
"eval_runtime": 181.2073,
"eval_samples_per_second": 91.889,
"eval_steps_per_second": 5.745,
"step": 84000
},
{
"epoch": 24.46198183511877,
"grad_norm": 0.4058787524700165,
"learning_rate": 0.0003066429819452533,
"loss": 3.1824,
"step": 84050
},
{
"epoch": 24.476537028411737,
"grad_norm": 0.43140995502471924,
"learning_rate": 0.00030646825859056493,
"loss": 3.1743,
"step": 84100
},
{
"epoch": 24.491092221704704,
"grad_norm": 0.4342149794101715,
"learning_rate": 0.0003062935352358765,
"loss": 3.1891,
"step": 84150
},
{
"epoch": 24.50564741499767,
"grad_norm": 0.43598994612693787,
"learning_rate": 0.0003061188118811881,
"loss": 3.1846,
"step": 84200
},
{
"epoch": 24.52020260829064,
"grad_norm": 0.41045066714286804,
"learning_rate": 0.00030594408852649966,
"loss": 3.1906,
"step": 84250
},
{
"epoch": 24.534757801583606,
"grad_norm": 0.42254623770713806,
"learning_rate": 0.00030576936517181125,
"loss": 3.1885,
"step": 84300
},
{
"epoch": 24.549312994876573,
"grad_norm": 0.4160739481449127,
"learning_rate": 0.00030559464181712285,
"loss": 3.1943,
"step": 84350
},
{
"epoch": 24.56386818816954,
"grad_norm": 0.4158286452293396,
"learning_rate": 0.0003054199184624345,
"loss": 3.1982,
"step": 84400
},
{
"epoch": 24.578423381462507,
"grad_norm": 0.4078032672405243,
"learning_rate": 0.00030524519510774604,
"loss": 3.1821,
"step": 84450
},
{
"epoch": 24.592978574755474,
"grad_norm": 0.4104495644569397,
"learning_rate": 0.00030507047175305763,
"loss": 3.1941,
"step": 84500
},
{
"epoch": 24.60753376804844,
"grad_norm": 0.42513424158096313,
"learning_rate": 0.0003048957483983692,
"loss": 3.1918,
"step": 84550
},
{
"epoch": 24.622088961341408,
"grad_norm": 0.4459233582019806,
"learning_rate": 0.00030472102504368076,
"loss": 3.2031,
"step": 84600
},
{
"epoch": 24.636644154634375,
"grad_norm": 0.39689961075782776,
"learning_rate": 0.0003045463016889924,
"loss": 3.1937,
"step": 84650
},
{
"epoch": 24.651199347927342,
"grad_norm": 0.43621814250946045,
"learning_rate": 0.000304371578334304,
"loss": 3.201,
"step": 84700
},
{
"epoch": 24.66575454122031,
"grad_norm": 0.3944084942340851,
"learning_rate": 0.0003041968549796156,
"loss": 3.1955,
"step": 84750
},
{
"epoch": 24.680309734513273,
"grad_norm": 0.398531436920166,
"learning_rate": 0.00030402213162492714,
"loss": 3.1965,
"step": 84800
},
{
"epoch": 24.69486492780624,
"grad_norm": 0.4374167323112488,
"learning_rate": 0.00030384740827023874,
"loss": 3.2008,
"step": 84850
},
{
"epoch": 24.709420121099207,
"grad_norm": 0.41527554392814636,
"learning_rate": 0.00030367268491555033,
"loss": 3.185,
"step": 84900
},
{
"epoch": 24.723975314392174,
"grad_norm": 0.4076023995876312,
"learning_rate": 0.000303497961560862,
"loss": 3.1992,
"step": 84950
},
{
"epoch": 24.73853050768514,
"grad_norm": 0.4005376696586609,
"learning_rate": 0.00030332323820617357,
"loss": 3.2124,
"step": 85000
},
{
"epoch": 24.73853050768514,
"eval_accuracy": 0.374507575281234,
"eval_loss": 3.53737735748291,
"eval_runtime": 181.2714,
"eval_samples_per_second": 91.857,
"eval_steps_per_second": 5.743,
"step": 85000
},
{
"epoch": 24.753085700978108,
"grad_norm": 0.4087483286857605,
"learning_rate": 0.0003031485148514851,
"loss": 3.2045,
"step": 85050
},
{
"epoch": 24.767640894271075,
"grad_norm": 0.4213216006755829,
"learning_rate": 0.0003029737914967967,
"loss": 3.2155,
"step": 85100
},
{
"epoch": 24.782196087564042,
"grad_norm": 0.4251973330974579,
"learning_rate": 0.0003027990681421083,
"loss": 3.2027,
"step": 85150
},
{
"epoch": 24.79675128085701,
"grad_norm": 0.4469214975833893,
"learning_rate": 0.00030262434478741984,
"loss": 3.2082,
"step": 85200
},
{
"epoch": 24.811306474149976,
"grad_norm": 0.40554749965667725,
"learning_rate": 0.0003024496214327315,
"loss": 3.2054,
"step": 85250
},
{
"epoch": 24.825861667442943,
"grad_norm": 0.4176938533782959,
"learning_rate": 0.0003022748980780431,
"loss": 3.1834,
"step": 85300
},
{
"epoch": 24.84041686073591,
"grad_norm": 0.41417908668518066,
"learning_rate": 0.0003021001747233547,
"loss": 3.2208,
"step": 85350
},
{
"epoch": 24.854972054028877,
"grad_norm": 0.42176830768585205,
"learning_rate": 0.0003019254513686662,
"loss": 3.2061,
"step": 85400
},
{
"epoch": 24.869527247321844,
"grad_norm": 0.42138761281967163,
"learning_rate": 0.0003017507280139778,
"loss": 3.218,
"step": 85450
},
{
"epoch": 24.88408244061481,
"grad_norm": 0.4482320249080658,
"learning_rate": 0.00030157600465928946,
"loss": 3.2105,
"step": 85500
},
{
"epoch": 24.89863763390778,
"grad_norm": 0.42444801330566406,
"learning_rate": 0.00030140128130460106,
"loss": 3.218,
"step": 85550
},
{
"epoch": 24.913192827200746,
"grad_norm": 0.39515721797943115,
"learning_rate": 0.0003012265579499126,
"loss": 3.2095,
"step": 85600
},
{
"epoch": 24.927748020493713,
"grad_norm": 0.44244998693466187,
"learning_rate": 0.0003010518345952242,
"loss": 3.213,
"step": 85650
},
{
"epoch": 24.94230321378668,
"grad_norm": 0.4148036539554596,
"learning_rate": 0.0003008771112405358,
"loss": 3.2165,
"step": 85700
},
{
"epoch": 24.956858407079647,
"grad_norm": 0.4045441746711731,
"learning_rate": 0.0003007023878858473,
"loss": 3.2167,
"step": 85750
},
{
"epoch": 24.971413600372614,
"grad_norm": 0.42188113927841187,
"learning_rate": 0.000300527664531159,
"loss": 3.2062,
"step": 85800
},
{
"epoch": 24.98596879366558,
"grad_norm": 0.4339165985584259,
"learning_rate": 0.00030035294117647057,
"loss": 3.2123,
"step": 85850
},
{
"epoch": 25.00029110386586,
"grad_norm": 0.42109426856040955,
"learning_rate": 0.00030017821782178216,
"loss": 3.2185,
"step": 85900
},
{
"epoch": 25.014846297158826,
"grad_norm": 0.39719924330711365,
"learning_rate": 0.00030000349446709376,
"loss": 3.1139,
"step": 85950
},
{
"epoch": 25.029401490451793,
"grad_norm": 0.39850518107414246,
"learning_rate": 0.00029982877111240535,
"loss": 3.1189,
"step": 86000
},
{
"epoch": 25.029401490451793,
"eval_accuracy": 0.37425959266681325,
"eval_loss": 3.546060800552368,
"eval_runtime": 181.2175,
"eval_samples_per_second": 91.884,
"eval_steps_per_second": 5.744,
"step": 86000
},
{
"epoch": 25.04395668374476,
"grad_norm": 0.43182504177093506,
"learning_rate": 0.00029965404775771694,
"loss": 3.1191,
"step": 86050
},
{
"epoch": 25.058511877037727,
"grad_norm": 0.4257315695285797,
"learning_rate": 0.0002994793244030285,
"loss": 3.1306,
"step": 86100
},
{
"epoch": 25.073067070330694,
"grad_norm": 0.426359623670578,
"learning_rate": 0.00029930460104834013,
"loss": 3.1286,
"step": 86150
},
{
"epoch": 25.08762226362366,
"grad_norm": 0.4430175721645355,
"learning_rate": 0.0002991298776936517,
"loss": 3.1293,
"step": 86200
},
{
"epoch": 25.10217745691663,
"grad_norm": 0.4329589605331421,
"learning_rate": 0.00029895515433896327,
"loss": 3.1417,
"step": 86250
},
{
"epoch": 25.116732650209595,
"grad_norm": 0.4352037012577057,
"learning_rate": 0.00029878043098427486,
"loss": 3.1352,
"step": 86300
},
{
"epoch": 25.131287843502562,
"grad_norm": 0.45002666115760803,
"learning_rate": 0.00029860570762958646,
"loss": 3.1452,
"step": 86350
},
{
"epoch": 25.14584303679553,
"grad_norm": 0.4303296208381653,
"learning_rate": 0.00029843098427489805,
"loss": 3.1434,
"step": 86400
},
{
"epoch": 25.160398230088497,
"grad_norm": 0.42453959584236145,
"learning_rate": 0.00029825626092020964,
"loss": 3.1406,
"step": 86450
},
{
"epoch": 25.174953423381464,
"grad_norm": 0.4066697955131531,
"learning_rate": 0.00029808153756552124,
"loss": 3.1375,
"step": 86500
},
{
"epoch": 25.18950861667443,
"grad_norm": 0.41828471422195435,
"learning_rate": 0.00029790681421083283,
"loss": 3.1537,
"step": 86550
},
{
"epoch": 25.204063809967398,
"grad_norm": 0.4148005545139313,
"learning_rate": 0.00029773209085614443,
"loss": 3.1439,
"step": 86600
},
{
"epoch": 25.218619003260365,
"grad_norm": 0.4200406074523926,
"learning_rate": 0.00029755736750145597,
"loss": 3.1465,
"step": 86650
},
{
"epoch": 25.233174196553332,
"grad_norm": 0.41274210810661316,
"learning_rate": 0.0002973826441467676,
"loss": 3.1425,
"step": 86700
},
{
"epoch": 25.2477293898463,
"grad_norm": 0.3914394974708557,
"learning_rate": 0.00029720792079207916,
"loss": 3.1553,
"step": 86750
},
{
"epoch": 25.262284583139262,
"grad_norm": 0.41851523518562317,
"learning_rate": 0.00029703319743739075,
"loss": 3.1555,
"step": 86800
},
{
"epoch": 25.27683977643223,
"grad_norm": 0.44641366600990295,
"learning_rate": 0.00029685847408270234,
"loss": 3.1664,
"step": 86850
},
{
"epoch": 25.291394969725197,
"grad_norm": 0.4330465495586395,
"learning_rate": 0.00029668375072801394,
"loss": 3.1587,
"step": 86900
},
{
"epoch": 25.305950163018164,
"grad_norm": 0.4315741956233978,
"learning_rate": 0.00029650902737332553,
"loss": 3.1571,
"step": 86950
},
{
"epoch": 25.32050535631113,
"grad_norm": 0.41501671075820923,
"learning_rate": 0.00029633430401863713,
"loss": 3.1644,
"step": 87000
},
{
"epoch": 25.32050535631113,
"eval_accuracy": 0.3738778639788329,
"eval_loss": 3.550408124923706,
"eval_runtime": 181.0862,
"eval_samples_per_second": 91.951,
"eval_steps_per_second": 5.749,
"step": 87000
},
{
"epoch": 25.335060549604098,
"grad_norm": 0.430515319108963,
"learning_rate": 0.0002961595806639487,
"loss": 3.1494,
"step": 87050
},
{
"epoch": 25.349615742897065,
"grad_norm": 0.4376870095729828,
"learning_rate": 0.0002959848573092603,
"loss": 3.1678,
"step": 87100
},
{
"epoch": 25.364170936190032,
"grad_norm": 0.39572909474372864,
"learning_rate": 0.0002958101339545719,
"loss": 3.1846,
"step": 87150
},
{
"epoch": 25.378726129483,
"grad_norm": 0.4197179079055786,
"learning_rate": 0.0002956354105998835,
"loss": 3.171,
"step": 87200
},
{
"epoch": 25.393281322775966,
"grad_norm": 0.4425291419029236,
"learning_rate": 0.0002954606872451951,
"loss": 3.1659,
"step": 87250
},
{
"epoch": 25.407836516068933,
"grad_norm": 0.4734668433666229,
"learning_rate": 0.0002952859638905067,
"loss": 3.1681,
"step": 87300
},
{
"epoch": 25.4223917093619,
"grad_norm": 0.4288901090621948,
"learning_rate": 0.00029511124053581823,
"loss": 3.1686,
"step": 87350
},
{
"epoch": 25.436946902654867,
"grad_norm": 0.41625797748565674,
"learning_rate": 0.0002949365171811299,
"loss": 3.1714,
"step": 87400
},
{
"epoch": 25.451502095947834,
"grad_norm": 0.47367292642593384,
"learning_rate": 0.0002947617938264414,
"loss": 3.1676,
"step": 87450
},
{
"epoch": 25.4660572892408,
"grad_norm": 0.42922917008399963,
"learning_rate": 0.000294587070471753,
"loss": 3.1726,
"step": 87500
},
{
"epoch": 25.48061248253377,
"grad_norm": 0.42955517768859863,
"learning_rate": 0.0002944123471170646,
"loss": 3.186,
"step": 87550
},
{
"epoch": 25.495167675826735,
"grad_norm": 0.4782566726207733,
"learning_rate": 0.0002942376237623762,
"loss": 3.197,
"step": 87600
},
{
"epoch": 25.509722869119702,
"grad_norm": 0.43603309988975525,
"learning_rate": 0.0002940629004076878,
"loss": 3.1809,
"step": 87650
},
{
"epoch": 25.52427806241267,
"grad_norm": 0.44062429666519165,
"learning_rate": 0.0002938881770529994,
"loss": 3.1796,
"step": 87700
},
{
"epoch": 25.538833255705637,
"grad_norm": 0.39834269881248474,
"learning_rate": 0.000293713453698311,
"loss": 3.1853,
"step": 87750
},
{
"epoch": 25.553388448998604,
"grad_norm": 0.4281576871871948,
"learning_rate": 0.0002935387303436226,
"loss": 3.1779,
"step": 87800
},
{
"epoch": 25.56794364229157,
"grad_norm": 0.4472132623195648,
"learning_rate": 0.0002933640069889342,
"loss": 3.1837,
"step": 87850
},
{
"epoch": 25.582498835584538,
"grad_norm": 0.42856264114379883,
"learning_rate": 0.0002931892836342457,
"loss": 3.1878,
"step": 87900
},
{
"epoch": 25.597054028877505,
"grad_norm": 0.4264664649963379,
"learning_rate": 0.00029301456027955736,
"loss": 3.1873,
"step": 87950
},
{
"epoch": 25.611609222170472,
"grad_norm": 0.40437066555023193,
"learning_rate": 0.0002928398369248689,
"loss": 3.1798,
"step": 88000
},
{
"epoch": 25.611609222170472,
"eval_accuracy": 0.374592429995742,
"eval_loss": 3.5399508476257324,
"eval_runtime": 181.2957,
"eval_samples_per_second": 91.844,
"eval_steps_per_second": 5.742,
"step": 88000
},
{
"epoch": 25.62616441546344,
"grad_norm": 0.466431587934494,
"learning_rate": 0.0002926651135701805,
"loss": 3.1867,
"step": 88050
},
{
"epoch": 25.640719608756406,
"grad_norm": 0.43069714307785034,
"learning_rate": 0.00029249039021549215,
"loss": 3.1875,
"step": 88100
},
{
"epoch": 25.65527480204937,
"grad_norm": 0.4674331843852997,
"learning_rate": 0.0002923156668608037,
"loss": 3.1889,
"step": 88150
},
{
"epoch": 25.669829995342337,
"grad_norm": 0.42163434624671936,
"learning_rate": 0.0002921409435061153,
"loss": 3.1871,
"step": 88200
},
{
"epoch": 25.684385188635304,
"grad_norm": 0.4694327116012573,
"learning_rate": 0.0002919662201514269,
"loss": 3.1816,
"step": 88250
},
{
"epoch": 25.69894038192827,
"grad_norm": 0.43342265486717224,
"learning_rate": 0.00029179149679673847,
"loss": 3.183,
"step": 88300
},
{
"epoch": 25.713495575221238,
"grad_norm": 0.436389684677124,
"learning_rate": 0.00029161677344205007,
"loss": 3.1894,
"step": 88350
},
{
"epoch": 25.728050768514205,
"grad_norm": 0.4253075420856476,
"learning_rate": 0.00029144205008736166,
"loss": 3.1994,
"step": 88400
},
{
"epoch": 25.742605961807172,
"grad_norm": 0.4468969404697418,
"learning_rate": 0.00029126732673267325,
"loss": 3.2013,
"step": 88450
},
{
"epoch": 25.75716115510014,
"grad_norm": 0.3974677324295044,
"learning_rate": 0.00029109260337798485,
"loss": 3.1971,
"step": 88500
},
{
"epoch": 25.771716348393106,
"grad_norm": 0.4659837782382965,
"learning_rate": 0.00029091788002329644,
"loss": 3.1899,
"step": 88550
},
{
"epoch": 25.786271541686073,
"grad_norm": 0.423094779253006,
"learning_rate": 0.000290743156668608,
"loss": 3.1907,
"step": 88600
},
{
"epoch": 25.80082673497904,
"grad_norm": 0.4330526292324066,
"learning_rate": 0.00029056843331391963,
"loss": 3.2041,
"step": 88650
},
{
"epoch": 25.815381928272007,
"grad_norm": 0.4233783185482025,
"learning_rate": 0.00029039370995923117,
"loss": 3.1941,
"step": 88700
},
{
"epoch": 25.829937121564974,
"grad_norm": 0.4054449498653412,
"learning_rate": 0.00029021898660454277,
"loss": 3.2012,
"step": 88750
},
{
"epoch": 25.84449231485794,
"grad_norm": 0.4207738935947418,
"learning_rate": 0.00029004426324985436,
"loss": 3.1995,
"step": 88800
},
{
"epoch": 25.85904750815091,
"grad_norm": 0.3913928270339966,
"learning_rate": 0.00028986953989516595,
"loss": 3.1966,
"step": 88850
},
{
"epoch": 25.873602701443875,
"grad_norm": 0.41812190413475037,
"learning_rate": 0.00028969481654047755,
"loss": 3.2021,
"step": 88900
},
{
"epoch": 25.888157894736842,
"grad_norm": 0.42711809277534485,
"learning_rate": 0.00028952009318578914,
"loss": 3.2104,
"step": 88950
},
{
"epoch": 25.90271308802981,
"grad_norm": 0.41786685585975647,
"learning_rate": 0.00028934536983110074,
"loss": 3.2071,
"step": 89000
},
{
"epoch": 25.90271308802981,
"eval_accuracy": 0.3751760705944214,
"eval_loss": 3.532879114151001,
"eval_runtime": 181.0754,
"eval_samples_per_second": 91.956,
"eval_steps_per_second": 5.749,
"step": 89000
},
{
"epoch": 25.917268281322777,
"grad_norm": 0.41919800639152527,
"learning_rate": 0.00028917064647641233,
"loss": 3.2159,
"step": 89050
},
{
"epoch": 25.931823474615744,
"grad_norm": 0.4346853792667389,
"learning_rate": 0.0002889959231217239,
"loss": 3.2086,
"step": 89100
},
{
"epoch": 25.94637866790871,
"grad_norm": 0.42549893260002136,
"learning_rate": 0.0002888211997670355,
"loss": 3.1917,
"step": 89150
},
{
"epoch": 25.960933861201678,
"grad_norm": 0.42451906204223633,
"learning_rate": 0.0002886464764123471,
"loss": 3.2051,
"step": 89200
},
{
"epoch": 25.975489054494645,
"grad_norm": 0.4404706358909607,
"learning_rate": 0.0002884717530576587,
"loss": 3.203,
"step": 89250
},
{
"epoch": 25.990044247787612,
"grad_norm": 0.41028639674186707,
"learning_rate": 0.00028829702970297025,
"loss": 3.1926,
"step": 89300
},
{
"epoch": 26.00436655798789,
"grad_norm": 0.48599815368652344,
"learning_rate": 0.0002881223063482819,
"loss": 3.1672,
"step": 89350
},
{
"epoch": 26.018921751280857,
"grad_norm": 0.42923834919929504,
"learning_rate": 0.00028794758299359344,
"loss": 3.102,
"step": 89400
},
{
"epoch": 26.033476944573824,
"grad_norm": 0.42139095067977905,
"learning_rate": 0.00028777285963890503,
"loss": 3.1074,
"step": 89450
},
{
"epoch": 26.04803213786679,
"grad_norm": 0.42988842725753784,
"learning_rate": 0.0002875981362842166,
"loss": 3.1249,
"step": 89500
},
{
"epoch": 26.062587331159758,
"grad_norm": 0.4336186945438385,
"learning_rate": 0.0002874234129295282,
"loss": 3.1217,
"step": 89550
},
{
"epoch": 26.077142524452725,
"grad_norm": 0.40431252121925354,
"learning_rate": 0.0002872486895748398,
"loss": 3.13,
"step": 89600
},
{
"epoch": 26.091697717745692,
"grad_norm": 0.4139500856399536,
"learning_rate": 0.0002870739662201514,
"loss": 3.13,
"step": 89650
},
{
"epoch": 26.10625291103866,
"grad_norm": 0.4227358400821686,
"learning_rate": 0.000286899242865463,
"loss": 3.119,
"step": 89700
},
{
"epoch": 26.120808104331626,
"grad_norm": 0.41390228271484375,
"learning_rate": 0.0002867245195107746,
"loss": 3.1207,
"step": 89750
},
{
"epoch": 26.135363297624593,
"grad_norm": 0.4205567538738251,
"learning_rate": 0.0002865497961560862,
"loss": 3.1274,
"step": 89800
},
{
"epoch": 26.14991849091756,
"grad_norm": 0.43315389752388,
"learning_rate": 0.00028637507280139773,
"loss": 3.1347,
"step": 89850
},
{
"epoch": 26.164473684210527,
"grad_norm": 0.4300045967102051,
"learning_rate": 0.0002862003494467094,
"loss": 3.1347,
"step": 89900
},
{
"epoch": 26.179028877503494,
"grad_norm": 0.419502854347229,
"learning_rate": 0.0002860256260920209,
"loss": 3.14,
"step": 89950
},
{
"epoch": 26.19358407079646,
"grad_norm": 0.4483463764190674,
"learning_rate": 0.0002858509027373325,
"loss": 3.1338,
"step": 90000
},
{
"epoch": 26.19358407079646,
"eval_accuracy": 0.373934864721958,
"eval_loss": 3.5498576164245605,
"eval_runtime": 180.9451,
"eval_samples_per_second": 92.022,
"eval_steps_per_second": 5.753,
"step": 90000
},
{
"epoch": 26.207848160223566,
"grad_norm": 0.4221210777759552,
"learning_rate": 0.0002856761793826441,
"loss": 3.1261,
"step": 90050
},
{
"epoch": 26.222403353516533,
"grad_norm": 0.42945337295532227,
"learning_rate": 0.0002855014560279557,
"loss": 3.1235,
"step": 90100
},
{
"epoch": 26.2369585468095,
"grad_norm": 0.421125590801239,
"learning_rate": 0.0002853267326732673,
"loss": 3.1238,
"step": 90150
},
{
"epoch": 26.251513740102467,
"grad_norm": 0.43803295493125916,
"learning_rate": 0.0002851520093185789,
"loss": 3.1205,
"step": 90200
},
{
"epoch": 26.266068933395434,
"grad_norm": 0.43245306611061096,
"learning_rate": 0.0002849772859638905,
"loss": 3.1183,
"step": 90250
},
{
"epoch": 26.2806241266884,
"grad_norm": 0.43539243936538696,
"learning_rate": 0.0002848025626092021,
"loss": 3.1116,
"step": 90300
},
{
"epoch": 26.29517931998137,
"grad_norm": 0.45199355483055115,
"learning_rate": 0.0002846278392545137,
"loss": 3.1217,
"step": 90350
},
{
"epoch": 26.309734513274336,
"grad_norm": 0.44650569558143616,
"learning_rate": 0.00028445311589982527,
"loss": 3.1276,
"step": 90400
},
{
"epoch": 26.324289706567303,
"grad_norm": 0.4259169101715088,
"learning_rate": 0.00028427839254513686,
"loss": 3.1345,
"step": 90450
},
{
"epoch": 26.33884489986027,
"grad_norm": 0.4632343053817749,
"learning_rate": 0.00028410366919044846,
"loss": 3.132,
"step": 90500
},
{
"epoch": 26.353400093153237,
"grad_norm": 0.43775540590286255,
"learning_rate": 0.00028392894583576,
"loss": 3.1356,
"step": 90550
},
{
"epoch": 26.367955286446204,
"grad_norm": 0.431986927986145,
"learning_rate": 0.00028375422248107165,
"loss": 3.1475,
"step": 90600
},
{
"epoch": 26.38251047973917,
"grad_norm": 0.4202793538570404,
"learning_rate": 0.0002835794991263832,
"loss": 3.1447,
"step": 90650
},
{
"epoch": 26.397065673032138,
"grad_norm": 0.4278116524219513,
"learning_rate": 0.0002834047757716948,
"loss": 3.1459,
"step": 90700
},
{
"epoch": 26.411620866325105,
"grad_norm": 0.4313576817512512,
"learning_rate": 0.0002832300524170064,
"loss": 3.1412,
"step": 90750
},
{
"epoch": 26.426176059618072,
"grad_norm": 0.45625731348991394,
"learning_rate": 0.00028305532906231797,
"loss": 3.1424,
"step": 90800
},
{
"epoch": 26.44073125291104,
"grad_norm": 0.4196455776691437,
"learning_rate": 0.00028288060570762956,
"loss": 3.156,
"step": 90850
},
{
"epoch": 26.455286446204006,
"grad_norm": 0.40867993235588074,
"learning_rate": 0.00028270588235294116,
"loss": 3.1573,
"step": 90900
},
{
"epoch": 26.469841639496973,
"grad_norm": 0.4437214136123657,
"learning_rate": 0.00028253115899825275,
"loss": 3.1435,
"step": 90950
},
{
"epoch": 26.48439683278994,
"grad_norm": 0.4219045341014862,
"learning_rate": 0.0002823564356435643,
"loss": 3.1636,
"step": 91000
},
{
"epoch": 26.48439683278994,
"eval_accuracy": 0.3740536848277302,
"eval_loss": 3.549358606338501,
"eval_runtime": 182.1405,
"eval_samples_per_second": 91.418,
"eval_steps_per_second": 5.715,
"step": 91000
},
{
"epoch": 26.498952026082907,
"grad_norm": 0.4574682116508484,
"learning_rate": 0.00028218171228887594,
"loss": 3.1436,
"step": 91050
},
{
"epoch": 26.513507219375875,
"grad_norm": 0.41283056139945984,
"learning_rate": 0.0002820069889341875,
"loss": 3.1572,
"step": 91100
},
{
"epoch": 26.52806241266884,
"grad_norm": 0.4420188069343567,
"learning_rate": 0.00028183226557949913,
"loss": 3.1582,
"step": 91150
},
{
"epoch": 26.54261760596181,
"grad_norm": 0.42842113971710205,
"learning_rate": 0.00028165754222481067,
"loss": 3.1653,
"step": 91200
},
{
"epoch": 26.557172799254776,
"grad_norm": 0.4411379396915436,
"learning_rate": 0.00028148281887012226,
"loss": 3.1639,
"step": 91250
},
{
"epoch": 26.571727992547743,
"grad_norm": 0.4473015069961548,
"learning_rate": 0.0002813080955154339,
"loss": 3.1546,
"step": 91300
},
{
"epoch": 26.586283185840706,
"grad_norm": 0.4217207729816437,
"learning_rate": 0.00028113337216074545,
"loss": 3.1581,
"step": 91350
},
{
"epoch": 26.600838379133673,
"grad_norm": 0.43184933066368103,
"learning_rate": 0.00028095864880605705,
"loss": 3.153,
"step": 91400
},
{
"epoch": 26.61539357242664,
"grad_norm": 0.4376067817211151,
"learning_rate": 0.00028078392545136864,
"loss": 3.1572,
"step": 91450
},
{
"epoch": 26.629948765719607,
"grad_norm": 0.46065959334373474,
"learning_rate": 0.00028060920209668023,
"loss": 3.1616,
"step": 91500
},
{
"epoch": 26.644503959012575,
"grad_norm": 0.42812418937683105,
"learning_rate": 0.00028043447874199183,
"loss": 3.1661,
"step": 91550
},
{
"epoch": 26.65905915230554,
"grad_norm": 0.44456830620765686,
"learning_rate": 0.0002802597553873034,
"loss": 3.1581,
"step": 91600
},
{
"epoch": 26.67361434559851,
"grad_norm": 0.4181954860687256,
"learning_rate": 0.000280085032032615,
"loss": 3.1771,
"step": 91650
},
{
"epoch": 26.688169538891476,
"grad_norm": 0.4161057770252228,
"learning_rate": 0.00027991030867792656,
"loss": 3.1825,
"step": 91700
},
{
"epoch": 26.702724732184443,
"grad_norm": 0.41539129614830017,
"learning_rate": 0.0002797355853232382,
"loss": 3.1715,
"step": 91750
},
{
"epoch": 26.71727992547741,
"grad_norm": 0.43947649002075195,
"learning_rate": 0.00027956086196854975,
"loss": 3.1684,
"step": 91800
},
{
"epoch": 26.731835118770377,
"grad_norm": 0.4386477768421173,
"learning_rate": 0.0002793861386138614,
"loss": 3.1752,
"step": 91850
},
{
"epoch": 26.746390312063344,
"grad_norm": 0.46494022011756897,
"learning_rate": 0.00027921141525917293,
"loss": 3.1694,
"step": 91900
},
{
"epoch": 26.76094550535631,
"grad_norm": 0.43608811497688293,
"learning_rate": 0.00027903669190448453,
"loss": 3.1695,
"step": 91950
},
{
"epoch": 26.775500698649278,
"grad_norm": 0.43629440665245056,
"learning_rate": 0.0002788619685497961,
"loss": 3.1776,
"step": 92000
},
{
"epoch": 26.775500698649278,
"eval_accuracy": 0.37460982403694304,
"eval_loss": 3.5446009635925293,
"eval_runtime": 181.4344,
"eval_samples_per_second": 91.774,
"eval_steps_per_second": 5.738,
"step": 92000
},
{
"epoch": 26.790055891942245,
"grad_norm": 0.4232460558414459,
"learning_rate": 0.0002786872451951077,
"loss": 3.1696,
"step": 92050
},
{
"epoch": 26.804611085235212,
"grad_norm": 0.42919155955314636,
"learning_rate": 0.0002785125218404193,
"loss": 3.175,
"step": 92100
},
{
"epoch": 26.81916627852818,
"grad_norm": 0.42091572284698486,
"learning_rate": 0.0002783377984857309,
"loss": 3.1751,
"step": 92150
},
{
"epoch": 26.833721471821146,
"grad_norm": 0.4050874412059784,
"learning_rate": 0.0002781630751310425,
"loss": 3.1813,
"step": 92200
},
{
"epoch": 26.848276665114113,
"grad_norm": 0.4387812614440918,
"learning_rate": 0.0002779883517763541,
"loss": 3.1837,
"step": 92250
},
{
"epoch": 26.86283185840708,
"grad_norm": 0.4179288446903229,
"learning_rate": 0.0002778136284216657,
"loss": 3.181,
"step": 92300
},
{
"epoch": 26.877387051700047,
"grad_norm": 0.42394018173217773,
"learning_rate": 0.0002776389050669773,
"loss": 3.1847,
"step": 92350
},
{
"epoch": 26.891942244993015,
"grad_norm": 0.44571471214294434,
"learning_rate": 0.0002774641817122888,
"loss": 3.2013,
"step": 92400
},
{
"epoch": 26.90649743828598,
"grad_norm": 0.4174729287624359,
"learning_rate": 0.00027728945835760047,
"loss": 3.187,
"step": 92450
},
{
"epoch": 26.92105263157895,
"grad_norm": 0.45103052258491516,
"learning_rate": 0.000277114735002912,
"loss": 3.175,
"step": 92500
},
{
"epoch": 26.935607824871916,
"grad_norm": 0.42905762791633606,
"learning_rate": 0.00027694001164822366,
"loss": 3.1818,
"step": 92550
},
{
"epoch": 26.950163018164883,
"grad_norm": 0.45374882221221924,
"learning_rate": 0.0002767652882935352,
"loss": 3.1782,
"step": 92600
},
{
"epoch": 26.96471821145785,
"grad_norm": 0.44403746724128723,
"learning_rate": 0.0002765905649388468,
"loss": 3.1673,
"step": 92650
},
{
"epoch": 26.979273404750813,
"grad_norm": 0.41546887159347534,
"learning_rate": 0.0002764158415841584,
"loss": 3.1826,
"step": 92700
},
{
"epoch": 26.99382859804378,
"grad_norm": 0.41803351044654846,
"learning_rate": 0.00027624111822947,
"loss": 3.1781,
"step": 92750
},
{
"epoch": 27.00844201210992,
"grad_norm": 0.4286218285560608,
"learning_rate": 0.0002760663948747816,
"loss": 3.1922,
"step": 92800
},
{
"epoch": 27.022997205402888,
"grad_norm": 0.4495951235294342,
"learning_rate": 0.00027589167152009317,
"loss": 3.1048,
"step": 92850
},
{
"epoch": 27.037552398695855,
"grad_norm": 0.4294961392879486,
"learning_rate": 0.00027571694816540477,
"loss": 3.1089,
"step": 92900
},
{
"epoch": 27.052107591988822,
"grad_norm": 0.44172975420951843,
"learning_rate": 0.0002755422248107163,
"loss": 3.1145,
"step": 92950
},
{
"epoch": 27.06666278528179,
"grad_norm": 0.40919336676597595,
"learning_rate": 0.00027536750145602795,
"loss": 3.1153,
"step": 93000
},
{
"epoch": 27.06666278528179,
"eval_accuracy": 0.3738698721220648,
"eval_loss": 3.5517330169677734,
"eval_runtime": 181.3112,
"eval_samples_per_second": 91.837,
"eval_steps_per_second": 5.742,
"step": 93000
},
{
"epoch": 27.081217978574756,
"grad_norm": 0.4204716682434082,
"learning_rate": 0.0002751927781013395,
"loss": 3.1158,
"step": 93050
},
{
"epoch": 27.095773171867723,
"grad_norm": 0.4470804035663605,
"learning_rate": 0.0002750180547466511,
"loss": 3.1204,
"step": 93100
},
{
"epoch": 27.11032836516069,
"grad_norm": 0.4344484508037567,
"learning_rate": 0.0002748433313919627,
"loss": 3.1196,
"step": 93150
},
{
"epoch": 27.124883558453657,
"grad_norm": 0.4729747474193573,
"learning_rate": 0.0002746686080372743,
"loss": 3.123,
"step": 93200
},
{
"epoch": 27.139438751746624,
"grad_norm": 0.40821605920791626,
"learning_rate": 0.00027449388468258587,
"loss": 3.1364,
"step": 93250
},
{
"epoch": 27.15399394503959,
"grad_norm": 0.4483500123023987,
"learning_rate": 0.00027431916132789747,
"loss": 3.1251,
"step": 93300
},
{
"epoch": 27.16854913833256,
"grad_norm": 0.42222732305526733,
"learning_rate": 0.00027414443797320906,
"loss": 3.1326,
"step": 93350
},
{
"epoch": 27.183104331625525,
"grad_norm": 0.41381341218948364,
"learning_rate": 0.00027396971461852065,
"loss": 3.1297,
"step": 93400
},
{
"epoch": 27.197659524918492,
"grad_norm": 0.4309457838535309,
"learning_rate": 0.00027379499126383225,
"loss": 3.1373,
"step": 93450
},
{
"epoch": 27.21221471821146,
"grad_norm": 0.42898011207580566,
"learning_rate": 0.00027362026790914384,
"loss": 3.1305,
"step": 93500
},
{
"epoch": 27.226769911504423,
"grad_norm": 0.449894517660141,
"learning_rate": 0.00027344554455445544,
"loss": 3.1401,
"step": 93550
},
{
"epoch": 27.24132510479739,
"grad_norm": 0.4240707457065582,
"learning_rate": 0.00027327082119976703,
"loss": 3.138,
"step": 93600
},
{
"epoch": 27.255880298090357,
"grad_norm": 0.4510543942451477,
"learning_rate": 0.00027309609784507857,
"loss": 3.1454,
"step": 93650
},
{
"epoch": 27.270435491383324,
"grad_norm": 0.41815170645713806,
"learning_rate": 0.0002729213744903902,
"loss": 3.1415,
"step": 93700
},
{
"epoch": 27.28499068467629,
"grad_norm": 0.4698253273963928,
"learning_rate": 0.00027274665113570176,
"loss": 3.1491,
"step": 93750
},
{
"epoch": 27.29954587796926,
"grad_norm": 0.44704052805900574,
"learning_rate": 0.00027257192778101335,
"loss": 3.1542,
"step": 93800
},
{
"epoch": 27.314101071262225,
"grad_norm": 0.4419684112071991,
"learning_rate": 0.00027239720442632495,
"loss": 3.1361,
"step": 93850
},
{
"epoch": 27.328656264555192,
"grad_norm": 0.4628223776817322,
"learning_rate": 0.00027222248107163654,
"loss": 3.1476,
"step": 93900
},
{
"epoch": 27.34321145784816,
"grad_norm": 0.43916580080986023,
"learning_rate": 0.00027204775771694814,
"loss": 3.1373,
"step": 93950
},
{
"epoch": 27.357766651141127,
"grad_norm": 0.40940552949905396,
"learning_rate": 0.00027187303436225973,
"loss": 3.1558,
"step": 94000
},
{
"epoch": 27.357766651141127,
"eval_accuracy": 0.37423256138656835,
"eval_loss": 3.548722743988037,
"eval_runtime": 181.1383,
"eval_samples_per_second": 91.924,
"eval_steps_per_second": 5.747,
"step": 94000
},
{
"epoch": 27.372321844434094,
"grad_norm": 0.4344089925289154,
"learning_rate": 0.0002716983110075713,
"loss": 3.1457,
"step": 94050
},
{
"epoch": 27.38687703772706,
"grad_norm": 0.4405762553215027,
"learning_rate": 0.0002715235876528829,
"loss": 3.1544,
"step": 94100
},
{
"epoch": 27.401432231020028,
"grad_norm": 0.4617936611175537,
"learning_rate": 0.0002713488642981945,
"loss": 3.1517,
"step": 94150
},
{
"epoch": 27.415987424312995,
"grad_norm": 0.4110572934150696,
"learning_rate": 0.00027117414094350606,
"loss": 3.162,
"step": 94200
},
{
"epoch": 27.430542617605962,
"grad_norm": 0.4259830713272095,
"learning_rate": 0.0002709994175888177,
"loss": 3.1644,
"step": 94250
},
{
"epoch": 27.44509781089893,
"grad_norm": 0.46111369132995605,
"learning_rate": 0.00027082469423412924,
"loss": 3.1597,
"step": 94300
},
{
"epoch": 27.459653004191896,
"grad_norm": 0.4082739055156708,
"learning_rate": 0.00027064997087944084,
"loss": 3.1719,
"step": 94350
},
{
"epoch": 27.474208197484863,
"grad_norm": 0.40091511607170105,
"learning_rate": 0.00027047524752475243,
"loss": 3.1593,
"step": 94400
},
{
"epoch": 27.48876339077783,
"grad_norm": 0.4257362484931946,
"learning_rate": 0.000270300524170064,
"loss": 3.1685,
"step": 94450
},
{
"epoch": 27.503318584070797,
"grad_norm": 0.44700610637664795,
"learning_rate": 0.0002701258008153757,
"loss": 3.1676,
"step": 94500
},
{
"epoch": 27.517873777363764,
"grad_norm": 0.45396730303764343,
"learning_rate": 0.0002699510774606872,
"loss": 3.1596,
"step": 94550
},
{
"epoch": 27.53242897065673,
"grad_norm": 0.41291770339012146,
"learning_rate": 0.0002697763541059988,
"loss": 3.1769,
"step": 94600
},
{
"epoch": 27.5469841639497,
"grad_norm": 0.44958990812301636,
"learning_rate": 0.0002696016307513104,
"loss": 3.1754,
"step": 94650
},
{
"epoch": 27.561539357242665,
"grad_norm": 0.4432903528213501,
"learning_rate": 0.000269426907396622,
"loss": 3.165,
"step": 94700
},
{
"epoch": 27.576094550535633,
"grad_norm": 0.43412747979164124,
"learning_rate": 0.0002692521840419336,
"loss": 3.1682,
"step": 94750
},
{
"epoch": 27.5906497438286,
"grad_norm": 0.4156794846057892,
"learning_rate": 0.0002690774606872452,
"loss": 3.1573,
"step": 94800
},
{
"epoch": 27.605204937121567,
"grad_norm": 0.4606965184211731,
"learning_rate": 0.0002689027373325568,
"loss": 3.1712,
"step": 94850
},
{
"epoch": 27.619760130414534,
"grad_norm": 0.4299561083316803,
"learning_rate": 0.0002687280139778683,
"loss": 3.1645,
"step": 94900
},
{
"epoch": 27.634315323707497,
"grad_norm": 0.4365476667881012,
"learning_rate": 0.00026855329062317997,
"loss": 3.1718,
"step": 94950
},
{
"epoch": 27.648870517000464,
"grad_norm": 0.4401813745498657,
"learning_rate": 0.0002683785672684915,
"loss": 3.1684,
"step": 95000
},
{
"epoch": 27.648870517000464,
"eval_accuracy": 0.3745749184272355,
"eval_loss": 3.541621446609497,
"eval_runtime": 178.674,
"eval_samples_per_second": 93.192,
"eval_steps_per_second": 5.826,
"step": 95000
},
{
"epoch": 27.66342571029343,
"grad_norm": 0.44295379519462585,
"learning_rate": 0.0002682038439138031,
"loss": 3.1729,
"step": 95050
},
{
"epoch": 27.6779809035864,
"grad_norm": 0.4356094300746918,
"learning_rate": 0.0002680291205591147,
"loss": 3.1698,
"step": 95100
},
{
"epoch": 27.692536096879365,
"grad_norm": 0.42831119894981384,
"learning_rate": 0.0002678543972044263,
"loss": 3.1754,
"step": 95150
},
{
"epoch": 27.707091290172333,
"grad_norm": 0.4276999831199646,
"learning_rate": 0.0002676796738497379,
"loss": 3.1777,
"step": 95200
},
{
"epoch": 27.7216464834653,
"grad_norm": 0.42521238327026367,
"learning_rate": 0.0002675049504950495,
"loss": 3.1809,
"step": 95250
},
{
"epoch": 27.736201676758267,
"grad_norm": 0.4228277802467346,
"learning_rate": 0.0002673302271403611,
"loss": 3.1755,
"step": 95300
},
{
"epoch": 27.750756870051234,
"grad_norm": 0.4252335727214813,
"learning_rate": 0.00026715550378567267,
"loss": 3.1874,
"step": 95350
},
{
"epoch": 27.7653120633442,
"grad_norm": 0.5082770586013794,
"learning_rate": 0.00026698078043098426,
"loss": 3.1646,
"step": 95400
},
{
"epoch": 27.779867256637168,
"grad_norm": 0.44211241602897644,
"learning_rate": 0.00026680605707629586,
"loss": 3.1714,
"step": 95450
},
{
"epoch": 27.794422449930135,
"grad_norm": 0.4420224726200104,
"learning_rate": 0.00026663133372160745,
"loss": 3.1849,
"step": 95500
},
{
"epoch": 27.808977643223102,
"grad_norm": 0.46785563230514526,
"learning_rate": 0.00026645661036691905,
"loss": 3.1791,
"step": 95550
},
{
"epoch": 27.82353283651607,
"grad_norm": 0.42737874388694763,
"learning_rate": 0.0002662818870122306,
"loss": 3.1751,
"step": 95600
},
{
"epoch": 27.838088029809036,
"grad_norm": 0.4250100255012512,
"learning_rate": 0.00026610716365754224,
"loss": 3.1702,
"step": 95650
},
{
"epoch": 27.852643223102003,
"grad_norm": 0.44536617398262024,
"learning_rate": 0.0002659324403028538,
"loss": 3.1824,
"step": 95700
},
{
"epoch": 27.86719841639497,
"grad_norm": 0.433122843503952,
"learning_rate": 0.00026575771694816537,
"loss": 3.1877,
"step": 95750
},
{
"epoch": 27.881753609687937,
"grad_norm": 0.4616224765777588,
"learning_rate": 0.00026558299359347696,
"loss": 3.1831,
"step": 95800
},
{
"epoch": 27.896308802980904,
"grad_norm": 0.41990095376968384,
"learning_rate": 0.00026540827023878856,
"loss": 3.1879,
"step": 95850
},
{
"epoch": 27.91086399627387,
"grad_norm": 0.4435485303401947,
"learning_rate": 0.00026523354688410015,
"loss": 3.1814,
"step": 95900
},
{
"epoch": 27.92541918956684,
"grad_norm": 0.43030399084091187,
"learning_rate": 0.00026505882352941175,
"loss": 3.2003,
"step": 95950
},
{
"epoch": 27.939974382859806,
"grad_norm": 0.42241382598876953,
"learning_rate": 0.00026488410017472334,
"loss": 3.1868,
"step": 96000
},
{
"epoch": 27.939974382859806,
"eval_accuracy": 0.375420879971596,
"eval_loss": 3.5337677001953125,
"eval_runtime": 178.7437,
"eval_samples_per_second": 93.156,
"eval_steps_per_second": 5.824,
"step": 96000
},
{
"epoch": 27.954529576152773,
"grad_norm": 0.46110999584198,
"learning_rate": 0.00026470937682003494,
"loss": 3.1818,
"step": 96050
},
{
"epoch": 27.96908476944574,
"grad_norm": 0.4265202283859253,
"learning_rate": 0.00026453465346534653,
"loss": 3.1885,
"step": 96100
},
{
"epoch": 27.983639962738707,
"grad_norm": 0.45619866251945496,
"learning_rate": 0.00026435993011065807,
"loss": 3.1869,
"step": 96150
},
{
"epoch": 27.998195156031674,
"grad_norm": 0.43252936005592346,
"learning_rate": 0.0002641852067559697,
"loss": 3.1825,
"step": 96200
},
{
"epoch": 28.01251746623195,
"grad_norm": 0.45363131165504456,
"learning_rate": 0.00026401048340128126,
"loss": 3.0828,
"step": 96250
},
{
"epoch": 28.02707265952492,
"grad_norm": 0.45988988876342773,
"learning_rate": 0.00026383576004659285,
"loss": 3.0949,
"step": 96300
},
{
"epoch": 28.041627852817886,
"grad_norm": 0.4816953241825104,
"learning_rate": 0.00026366103669190445,
"loss": 3.0957,
"step": 96350
},
{
"epoch": 28.056183046110853,
"grad_norm": 0.4343043267726898,
"learning_rate": 0.00026348631333721604,
"loss": 3.1143,
"step": 96400
},
{
"epoch": 28.07073823940382,
"grad_norm": 0.4606913626194,
"learning_rate": 0.00026331158998252764,
"loss": 3.1016,
"step": 96450
},
{
"epoch": 28.085293432696787,
"grad_norm": 0.44795188307762146,
"learning_rate": 0.00026313686662783923,
"loss": 3.098,
"step": 96500
},
{
"epoch": 28.099848625989754,
"grad_norm": 0.4226738214492798,
"learning_rate": 0.0002629621432731508,
"loss": 3.1013,
"step": 96550
},
{
"epoch": 28.11440381928272,
"grad_norm": 0.44529786705970764,
"learning_rate": 0.0002627874199184624,
"loss": 3.1166,
"step": 96600
},
{
"epoch": 28.128959012575688,
"grad_norm": 0.43973055481910706,
"learning_rate": 0.000262612696563774,
"loss": 3.103,
"step": 96650
},
{
"epoch": 28.143514205868655,
"grad_norm": 0.4689570963382721,
"learning_rate": 0.0002624379732090856,
"loss": 3.1211,
"step": 96700
},
{
"epoch": 28.158069399161622,
"grad_norm": 0.42313089966773987,
"learning_rate": 0.0002622632498543972,
"loss": 3.112,
"step": 96750
},
{
"epoch": 28.17262459245459,
"grad_norm": 0.5010049939155579,
"learning_rate": 0.0002620885264997088,
"loss": 3.1209,
"step": 96800
},
{
"epoch": 28.187179785747556,
"grad_norm": 0.4199390709400177,
"learning_rate": 0.00026191380314502034,
"loss": 3.1294,
"step": 96850
},
{
"epoch": 28.201734979040523,
"grad_norm": 0.427876353263855,
"learning_rate": 0.000261739079790332,
"loss": 3.1027,
"step": 96900
},
{
"epoch": 28.216290172333487,
"grad_norm": 0.4426726996898651,
"learning_rate": 0.0002615643564356435,
"loss": 3.1277,
"step": 96950
},
{
"epoch": 28.230845365626454,
"grad_norm": 0.43064725399017334,
"learning_rate": 0.0002613896330809551,
"loss": 3.1243,
"step": 97000
},
{
"epoch": 28.230845365626454,
"eval_accuracy": 0.37407354694234496,
"eval_loss": 3.555612325668335,
"eval_runtime": 178.5317,
"eval_samples_per_second": 93.266,
"eval_steps_per_second": 5.831,
"step": 97000
},
{
"epoch": 28.24540055891942,
"grad_norm": 0.4461931884288788,
"learning_rate": 0.0002612149097262667,
"loss": 3.1323,
"step": 97050
},
{
"epoch": 28.259955752212388,
"grad_norm": 0.43096446990966797,
"learning_rate": 0.0002610401863715783,
"loss": 3.1194,
"step": 97100
},
{
"epoch": 28.274510945505355,
"grad_norm": 0.47497278451919556,
"learning_rate": 0.0002608654630168899,
"loss": 3.1272,
"step": 97150
},
{
"epoch": 28.289066138798322,
"grad_norm": 0.4501521587371826,
"learning_rate": 0.0002606907396622015,
"loss": 3.1414,
"step": 97200
},
{
"epoch": 28.30362133209129,
"grad_norm": 0.43721261620521545,
"learning_rate": 0.0002605160163075131,
"loss": 3.125,
"step": 97250
},
{
"epoch": 28.318176525384256,
"grad_norm": 0.4478330612182617,
"learning_rate": 0.0002603412929528247,
"loss": 3.1445,
"step": 97300
},
{
"epoch": 28.332731718677223,
"grad_norm": 0.5095174312591553,
"learning_rate": 0.0002601665695981363,
"loss": 3.1409,
"step": 97350
},
{
"epoch": 28.34728691197019,
"grad_norm": 0.42724117636680603,
"learning_rate": 0.0002599918462434478,
"loss": 3.1428,
"step": 97400
},
{
"epoch": 28.361842105263158,
"grad_norm": 0.4426712989807129,
"learning_rate": 0.00025981712288875947,
"loss": 3.1387,
"step": 97450
},
{
"epoch": 28.376397298556125,
"grad_norm": 0.4426730275154114,
"learning_rate": 0.000259642399534071,
"loss": 3.1511,
"step": 97500
},
{
"epoch": 28.39095249184909,
"grad_norm": 0.42980238795280457,
"learning_rate": 0.0002594676761793826,
"loss": 3.1457,
"step": 97550
},
{
"epoch": 28.40550768514206,
"grad_norm": 0.4393332302570343,
"learning_rate": 0.0002592929528246942,
"loss": 3.1492,
"step": 97600
},
{
"epoch": 28.420062878435026,
"grad_norm": 0.43661534786224365,
"learning_rate": 0.0002591182294700058,
"loss": 3.1473,
"step": 97650
},
{
"epoch": 28.434618071727993,
"grad_norm": 0.43180370330810547,
"learning_rate": 0.0002589435061153174,
"loss": 3.1527,
"step": 97700
},
{
"epoch": 28.44917326502096,
"grad_norm": 0.45117655396461487,
"learning_rate": 0.000258768782760629,
"loss": 3.1472,
"step": 97750
},
{
"epoch": 28.463728458313927,
"grad_norm": 0.4737044870853424,
"learning_rate": 0.0002585940594059406,
"loss": 3.1518,
"step": 97800
},
{
"epoch": 28.478283651606894,
"grad_norm": 0.4503737688064575,
"learning_rate": 0.00025841933605125217,
"loss": 3.1522,
"step": 97850
},
{
"epoch": 28.49283884489986,
"grad_norm": 0.4357546865940094,
"learning_rate": 0.00025824461269656376,
"loss": 3.163,
"step": 97900
},
{
"epoch": 28.507394038192828,
"grad_norm": 0.4459472596645355,
"learning_rate": 0.00025806988934187536,
"loss": 3.148,
"step": 97950
},
{
"epoch": 28.521949231485795,
"grad_norm": 0.4553315043449402,
"learning_rate": 0.00025789516598718695,
"loss": 3.149,
"step": 98000
},
{
"epoch": 28.521949231485795,
"eval_accuracy": 0.3750310418995421,
"eval_loss": 3.5434606075286865,
"eval_runtime": 178.8143,
"eval_samples_per_second": 93.119,
"eval_steps_per_second": 5.822,
"step": 98000
},
{
"epoch": 28.536504424778762,
"grad_norm": 0.45329025387763977,
"learning_rate": 0.00025772044263249854,
"loss": 3.1607,
"step": 98050
},
{
"epoch": 28.55105961807173,
"grad_norm": 0.4378615617752075,
"learning_rate": 0.0002575457192778101,
"loss": 3.1634,
"step": 98100
},
{
"epoch": 28.565614811364696,
"grad_norm": 0.4968438446521759,
"learning_rate": 0.00025737099592312173,
"loss": 3.1435,
"step": 98150
},
{
"epoch": 28.580170004657663,
"grad_norm": 0.4444005787372589,
"learning_rate": 0.0002571962725684333,
"loss": 3.1574,
"step": 98200
},
{
"epoch": 28.59472519795063,
"grad_norm": 0.4303135275840759,
"learning_rate": 0.00025702154921374487,
"loss": 3.1583,
"step": 98250
},
{
"epoch": 28.609280391243594,
"grad_norm": 0.4470735192298889,
"learning_rate": 0.00025684682585905646,
"loss": 3.1603,
"step": 98300
},
{
"epoch": 28.62383558453656,
"grad_norm": 0.44142308831214905,
"learning_rate": 0.00025667210250436806,
"loss": 3.1726,
"step": 98350
},
{
"epoch": 28.638390777829528,
"grad_norm": 0.46621376276016235,
"learning_rate": 0.00025649737914967965,
"loss": 3.1674,
"step": 98400
},
{
"epoch": 28.652945971122495,
"grad_norm": 0.4749205410480499,
"learning_rate": 0.00025632265579499124,
"loss": 3.1734,
"step": 98450
},
{
"epoch": 28.667501164415462,
"grad_norm": 0.4264492988586426,
"learning_rate": 0.00025614793244030284,
"loss": 3.1614,
"step": 98500
},
{
"epoch": 28.68205635770843,
"grad_norm": 0.8595496416091919,
"learning_rate": 0.0002559732090856144,
"loss": 3.1595,
"step": 98550
},
{
"epoch": 28.696611551001396,
"grad_norm": 0.45727524161338806,
"learning_rate": 0.00025579848573092603,
"loss": 3.157,
"step": 98600
},
{
"epoch": 28.711166744294363,
"grad_norm": 0.4491088390350342,
"learning_rate": 0.0002556237623762376,
"loss": 3.1799,
"step": 98650
},
{
"epoch": 28.72572193758733,
"grad_norm": 0.4161786735057831,
"learning_rate": 0.0002554490390215492,
"loss": 3.1753,
"step": 98700
},
{
"epoch": 28.740277130880298,
"grad_norm": 0.4785554111003876,
"learning_rate": 0.0002552743156668608,
"loss": 3.1759,
"step": 98750
},
{
"epoch": 28.754832324173265,
"grad_norm": 0.4457080066204071,
"learning_rate": 0.00025509959231217235,
"loss": 3.1696,
"step": 98800
},
{
"epoch": 28.76938751746623,
"grad_norm": 0.4306055009365082,
"learning_rate": 0.000254924868957484,
"loss": 3.1665,
"step": 98850
},
{
"epoch": 28.7839427107592,
"grad_norm": 0.4257147014141083,
"learning_rate": 0.00025475014560279554,
"loss": 3.1832,
"step": 98900
},
{
"epoch": 28.798497904052166,
"grad_norm": 0.4394259452819824,
"learning_rate": 0.00025457542224810713,
"loss": 3.1635,
"step": 98950
},
{
"epoch": 28.813053097345133,
"grad_norm": 0.43619710206985474,
"learning_rate": 0.00025440069889341873,
"loss": 3.1667,
"step": 99000
},
{
"epoch": 28.813053097345133,
"eval_accuracy": 0.3752215536616161,
"eval_loss": 3.5366761684417725,
"eval_runtime": 178.9311,
"eval_samples_per_second": 93.058,
"eval_steps_per_second": 5.818,
"step": 99000
},
{
"epoch": 28.8276082906381,
"grad_norm": 0.46083733439445496,
"learning_rate": 0.0002542259755387303,
"loss": 3.1623,
"step": 99050
},
{
"epoch": 28.842163483931067,
"grad_norm": 0.48397934436798096,
"learning_rate": 0.0002540512521840419,
"loss": 3.1784,
"step": 99100
},
{
"epoch": 28.856718677224034,
"grad_norm": 0.456592321395874,
"learning_rate": 0.0002538765288293535,
"loss": 3.1693,
"step": 99150
},
{
"epoch": 28.871273870517,
"grad_norm": 0.4616560637950897,
"learning_rate": 0.0002537018054746651,
"loss": 3.1686,
"step": 99200
},
{
"epoch": 28.885829063809968,
"grad_norm": 0.44148775935173035,
"learning_rate": 0.00025352708211997664,
"loss": 3.1726,
"step": 99250
},
{
"epoch": 28.900384257102935,
"grad_norm": 0.4602034389972687,
"learning_rate": 0.0002533523587652883,
"loss": 3.1717,
"step": 99300
},
{
"epoch": 28.914939450395902,
"grad_norm": 0.4577277600765228,
"learning_rate": 0.00025317763541059983,
"loss": 3.1874,
"step": 99350
},
{
"epoch": 28.92949464368887,
"grad_norm": 0.46108654141426086,
"learning_rate": 0.0002530029120559115,
"loss": 3.1752,
"step": 99400
},
{
"epoch": 28.944049836981836,
"grad_norm": 0.4398329555988312,
"learning_rate": 0.000252828188701223,
"loss": 3.1825,
"step": 99450
},
{
"epoch": 28.958605030274803,
"grad_norm": 0.4647148847579956,
"learning_rate": 0.0002526534653465346,
"loss": 3.1825,
"step": 99500
},
{
"epoch": 28.97316022356777,
"grad_norm": 0.48289281129837036,
"learning_rate": 0.0002524787419918462,
"loss": 3.1766,
"step": 99550
},
{
"epoch": 28.987715416860738,
"grad_norm": 0.4246104955673218,
"learning_rate": 0.0002523040186371578,
"loss": 3.179,
"step": 99600
},
{
"epoch": 29.002037727061015,
"grad_norm": 0.4434986412525177,
"learning_rate": 0.0002521292952824694,
"loss": 3.1789,
"step": 99650
},
{
"epoch": 29.016592920353983,
"grad_norm": 0.481111615896225,
"learning_rate": 0.000251954571927781,
"loss": 3.0767,
"step": 99700
},
{
"epoch": 29.03114811364695,
"grad_norm": 0.4342561960220337,
"learning_rate": 0.0002517798485730926,
"loss": 3.0848,
"step": 99750
},
{
"epoch": 29.045703306939917,
"grad_norm": 0.4534182548522949,
"learning_rate": 0.0002516051252184042,
"loss": 3.08,
"step": 99800
},
{
"epoch": 29.060258500232884,
"grad_norm": 0.4300639033317566,
"learning_rate": 0.0002514304018637158,
"loss": 3.0883,
"step": 99850
},
{
"epoch": 29.07481369352585,
"grad_norm": 0.43499693274497986,
"learning_rate": 0.00025125567850902737,
"loss": 3.0863,
"step": 99900
},
{
"epoch": 29.089368886818818,
"grad_norm": 0.4732967019081116,
"learning_rate": 0.0002510809551543389,
"loss": 3.0884,
"step": 99950
},
{
"epoch": 29.103924080111785,
"grad_norm": 0.441400945186615,
"learning_rate": 0.00025090623179965056,
"loss": 3.1054,
"step": 100000
},
{
"epoch": 29.103924080111785,
"eval_accuracy": 0.37428838685663934,
"eval_loss": 3.5510168075561523,
"eval_runtime": 186.9219,
"eval_samples_per_second": 89.08,
"eval_steps_per_second": 5.569,
"step": 100000
},
{
"epoch": 29.118479273404752,
"grad_norm": 0.44024235010147095,
"learning_rate": 0.0002507315084449621,
"loss": 3.111,
"step": 100050
},
{
"epoch": 29.13303446669772,
"grad_norm": 0.4516540765762329,
"learning_rate": 0.00025055678509027375,
"loss": 3.092,
"step": 100100
},
{
"epoch": 29.147589659990686,
"grad_norm": 0.43763741850852966,
"learning_rate": 0.0002503820617355853,
"loss": 3.1217,
"step": 100150
},
{
"epoch": 29.162144853283653,
"grad_norm": 0.4587101638317108,
"learning_rate": 0.0002502073383808969,
"loss": 3.1219,
"step": 100200
},
{
"epoch": 29.17670004657662,
"grad_norm": 0.4364701807498932,
"learning_rate": 0.0002500326150262085,
"loss": 3.1208,
"step": 100250
},
{
"epoch": 29.191255239869584,
"grad_norm": 0.4533688426017761,
"learning_rate": 0.00024985789167152007,
"loss": 3.117,
"step": 100300
},
{
"epoch": 29.20581043316255,
"grad_norm": 0.454554945230484,
"learning_rate": 0.00024968316831683167,
"loss": 3.1305,
"step": 100350
},
{
"epoch": 29.220365626455518,
"grad_norm": 0.43322986364364624,
"learning_rate": 0.00024950844496214326,
"loss": 3.1151,
"step": 100400
},
{
"epoch": 29.234920819748485,
"grad_norm": 0.4622367322444916,
"learning_rate": 0.00024933372160745485,
"loss": 3.1159,
"step": 100450
},
{
"epoch": 29.249476013041452,
"grad_norm": 0.4399924874305725,
"learning_rate": 0.0002491589982527664,
"loss": 3.125,
"step": 100500
},
{
"epoch": 29.26403120633442,
"grad_norm": 0.4242802560329437,
"learning_rate": 0.00024898427489807804,
"loss": 3.111,
"step": 100550
},
{
"epoch": 29.278586399627386,
"grad_norm": 0.46443307399749756,
"learning_rate": 0.0002488095515433896,
"loss": 3.1034,
"step": 100600
},
{
"epoch": 29.293141592920353,
"grad_norm": 0.4604977071285248,
"learning_rate": 0.0002486348281887012,
"loss": 3.1327,
"step": 100650
},
{
"epoch": 29.30769678621332,
"grad_norm": 0.45278415083885193,
"learning_rate": 0.00024846010483401277,
"loss": 3.1281,
"step": 100700
},
{
"epoch": 29.322251979506287,
"grad_norm": 0.4434463381767273,
"learning_rate": 0.00024828538147932437,
"loss": 3.1237,
"step": 100750
},
{
"epoch": 29.336807172799254,
"grad_norm": 0.435261607170105,
"learning_rate": 0.00024811065812463596,
"loss": 3.1243,
"step": 100800
},
{
"epoch": 29.35136236609222,
"grad_norm": 0.4782884418964386,
"learning_rate": 0.00024793593476994755,
"loss": 3.1301,
"step": 100850
},
{
"epoch": 29.36591755938519,
"grad_norm": 0.4731723368167877,
"learning_rate": 0.00024776121141525915,
"loss": 3.1214,
"step": 100900
},
{
"epoch": 29.380472752678156,
"grad_norm": 0.4642447233200073,
"learning_rate": 0.00024758648806057074,
"loss": 3.1421,
"step": 100950
},
{
"epoch": 29.395027945971123,
"grad_norm": 0.4479381740093231,
"learning_rate": 0.00024741176470588234,
"loss": 3.1242,
"step": 101000
},
{
"epoch": 29.395027945971123,
"eval_accuracy": 0.3747103098830709,
"eval_loss": 3.5458545684814453,
"eval_runtime": 179.2165,
"eval_samples_per_second": 92.91,
"eval_steps_per_second": 5.809,
"step": 101000
},
{
"epoch": 29.40958313926409,
"grad_norm": 0.4384738802909851,
"learning_rate": 0.00024723704135119393,
"loss": 3.1327,
"step": 101050
},
{
"epoch": 29.424138332557057,
"grad_norm": 0.436471551656723,
"learning_rate": 0.0002470623179965055,
"loss": 3.1351,
"step": 101100
},
{
"epoch": 29.438693525850024,
"grad_norm": 0.4630633294582367,
"learning_rate": 0.0002468875946418171,
"loss": 3.1315,
"step": 101150
},
{
"epoch": 29.45324871914299,
"grad_norm": 0.43443524837493896,
"learning_rate": 0.00024671287128712866,
"loss": 3.153,
"step": 101200
},
{
"epoch": 29.467803912435958,
"grad_norm": 0.5008822083473206,
"learning_rate": 0.0002465381479324403,
"loss": 3.142,
"step": 101250
},
{
"epoch": 29.482359105728925,
"grad_norm": 0.4582183063030243,
"learning_rate": 0.00024636342457775185,
"loss": 3.155,
"step": 101300
},
{
"epoch": 29.496914299021892,
"grad_norm": 0.42436885833740234,
"learning_rate": 0.00024618870122306344,
"loss": 3.1489,
"step": 101350
},
{
"epoch": 29.51146949231486,
"grad_norm": 0.475890189409256,
"learning_rate": 0.00024601397786837504,
"loss": 3.1403,
"step": 101400
},
{
"epoch": 29.526024685607826,
"grad_norm": 0.43587788939476013,
"learning_rate": 0.00024583925451368663,
"loss": 3.134,
"step": 101450
},
{
"epoch": 29.540579878900793,
"grad_norm": 0.45687776803970337,
"learning_rate": 0.0002456645311589982,
"loss": 3.1401,
"step": 101500
},
{
"epoch": 29.55513507219376,
"grad_norm": 0.47364842891693115,
"learning_rate": 0.0002454898078043098,
"loss": 3.1547,
"step": 101550
},
{
"epoch": 29.569690265486727,
"grad_norm": 0.46195733547210693,
"learning_rate": 0.0002453150844496214,
"loss": 3.1403,
"step": 101600
},
{
"epoch": 29.58424545877969,
"grad_norm": 0.4433932304382324,
"learning_rate": 0.000245140361094933,
"loss": 3.1476,
"step": 101650
},
{
"epoch": 29.598800652072658,
"grad_norm": 0.4796285927295685,
"learning_rate": 0.0002449656377402446,
"loss": 3.1543,
"step": 101700
},
{
"epoch": 29.613355845365625,
"grad_norm": 0.4393634498119354,
"learning_rate": 0.00024479091438555614,
"loss": 3.1608,
"step": 101750
},
{
"epoch": 29.627911038658592,
"grad_norm": 0.4540051519870758,
"learning_rate": 0.0002446161910308678,
"loss": 3.1679,
"step": 101800
},
{
"epoch": 29.64246623195156,
"grad_norm": 0.45071864128112793,
"learning_rate": 0.0002444414676761794,
"loss": 3.1577,
"step": 101850
},
{
"epoch": 29.657021425244526,
"grad_norm": 0.44411563873291016,
"learning_rate": 0.0002442667443214909,
"loss": 3.1719,
"step": 101900
},
{
"epoch": 29.671576618537493,
"grad_norm": 0.452627956867218,
"learning_rate": 0.00024409202096680255,
"loss": 3.1593,
"step": 101950
},
{
"epoch": 29.68613181183046,
"grad_norm": 0.4586232602596283,
"learning_rate": 0.00024391729761211411,
"loss": 3.168,
"step": 102000
},
{
"epoch": 29.68613181183046,
"eval_accuracy": 0.37510555421117375,
"eval_loss": 3.5422708988189697,
"eval_runtime": 179.3978,
"eval_samples_per_second": 92.816,
"eval_steps_per_second": 5.803,
"step": 102000
},
{
"epoch": 29.700687005123427,
"grad_norm": 0.44371819496154785,
"learning_rate": 0.00024374257425742574,
"loss": 3.159,
"step": 102050
},
{
"epoch": 29.715242198416394,
"grad_norm": 0.4582088589668274,
"learning_rate": 0.0002435678509027373,
"loss": 3.1497,
"step": 102100
},
{
"epoch": 29.72979739170936,
"grad_norm": 0.467153936624527,
"learning_rate": 0.0002433931275480489,
"loss": 3.1633,
"step": 102150
},
{
"epoch": 29.74435258500233,
"grad_norm": 0.4523339867591858,
"learning_rate": 0.00024321840419336052,
"loss": 3.1521,
"step": 102200
},
{
"epoch": 29.758907778295296,
"grad_norm": 0.4426189959049225,
"learning_rate": 0.00024304368083867209,
"loss": 3.1679,
"step": 102250
},
{
"epoch": 29.773462971588263,
"grad_norm": 0.43963423371315,
"learning_rate": 0.00024286895748398365,
"loss": 3.1501,
"step": 102300
},
{
"epoch": 29.78801816488123,
"grad_norm": 0.4491553008556366,
"learning_rate": 0.00024269423412929527,
"loss": 3.1566,
"step": 102350
},
{
"epoch": 29.802573358174197,
"grad_norm": 0.46082642674446106,
"learning_rate": 0.00024251951077460684,
"loss": 3.1595,
"step": 102400
},
{
"epoch": 29.817128551467164,
"grad_norm": 0.4386240541934967,
"learning_rate": 0.00024234478741991844,
"loss": 3.172,
"step": 102450
},
{
"epoch": 29.83168374476013,
"grad_norm": 0.4534355700016022,
"learning_rate": 0.00024217006406523003,
"loss": 3.1628,
"step": 102500
},
{
"epoch": 29.846238938053098,
"grad_norm": 0.44121938943862915,
"learning_rate": 0.00024199534071054162,
"loss": 3.1749,
"step": 102550
},
{
"epoch": 29.860794131346065,
"grad_norm": 0.48554858565330505,
"learning_rate": 0.0002418206173558532,
"loss": 3.1665,
"step": 102600
},
{
"epoch": 29.875349324639032,
"grad_norm": 0.45584553480148315,
"learning_rate": 0.0002416458940011648,
"loss": 3.1708,
"step": 102650
},
{
"epoch": 29.889904517932,
"grad_norm": 0.44270968437194824,
"learning_rate": 0.00024147117064647638,
"loss": 3.1641,
"step": 102700
},
{
"epoch": 29.904459711224966,
"grad_norm": 0.47318506240844727,
"learning_rate": 0.000241296447291788,
"loss": 3.1675,
"step": 102750
},
{
"epoch": 29.919014904517933,
"grad_norm": 0.4316175580024719,
"learning_rate": 0.00024112172393709957,
"loss": 3.1721,
"step": 102800
},
{
"epoch": 29.9335700978109,
"grad_norm": 0.44327935576438904,
"learning_rate": 0.00024094700058241116,
"loss": 3.1656,
"step": 102850
},
{
"epoch": 29.948125291103867,
"grad_norm": 0.44737857580184937,
"learning_rate": 0.00024077227722772276,
"loss": 3.162,
"step": 102900
},
{
"epoch": 29.962680484396834,
"grad_norm": 0.4546486735343933,
"learning_rate": 0.00024059755387303435,
"loss": 3.1818,
"step": 102950
},
{
"epoch": 29.977235677689798,
"grad_norm": 0.43689993023872375,
"learning_rate": 0.00024042283051834592,
"loss": 3.1654,
"step": 103000
},
{
"epoch": 29.977235677689798,
"eval_accuracy": 0.37546554034765284,
"eval_loss": 3.537097454071045,
"eval_runtime": 179.3374,
"eval_samples_per_second": 92.847,
"eval_steps_per_second": 5.805,
"step": 103000
},
{
"epoch": 29.991790870982765,
"grad_norm": 0.45586585998535156,
"learning_rate": 0.00024024810716365754,
"loss": 3.1683,
"step": 103050
},
{
"epoch": 30.006113181183046,
"grad_norm": 0.4485780596733093,
"learning_rate": 0.0002400733838089691,
"loss": 3.128,
"step": 103100
},
{
"epoch": 30.020668374476013,
"grad_norm": 0.4525091350078583,
"learning_rate": 0.0002398986604542807,
"loss": 3.0769,
"step": 103150
},
{
"epoch": 30.03522356776898,
"grad_norm": 0.45657244324684143,
"learning_rate": 0.0002397239370995923,
"loss": 3.0872,
"step": 103200
},
{
"epoch": 30.049778761061948,
"grad_norm": 0.46582069993019104,
"learning_rate": 0.0002395492137449039,
"loss": 3.0911,
"step": 103250
},
{
"epoch": 30.064333954354915,
"grad_norm": 0.47504469752311707,
"learning_rate": 0.00023937449039021546,
"loss": 3.0842,
"step": 103300
},
{
"epoch": 30.07888914764788,
"grad_norm": 0.4897063672542572,
"learning_rate": 0.00023919976703552708,
"loss": 3.0813,
"step": 103350
},
{
"epoch": 30.09344434094085,
"grad_norm": 0.4772857427597046,
"learning_rate": 0.00023902504368083865,
"loss": 3.0985,
"step": 103400
},
{
"epoch": 30.107999534233816,
"grad_norm": 0.43550288677215576,
"learning_rate": 0.00023885032032615027,
"loss": 3.1014,
"step": 103450
},
{
"epoch": 30.122554727526783,
"grad_norm": 0.4723055958747864,
"learning_rate": 0.00023867559697146183,
"loss": 3.1032,
"step": 103500
},
{
"epoch": 30.13710992081975,
"grad_norm": 0.4526902735233307,
"learning_rate": 0.0002385008736167734,
"loss": 3.0951,
"step": 103550
},
{
"epoch": 30.151665114112717,
"grad_norm": 0.4515558183193207,
"learning_rate": 0.00023832615026208502,
"loss": 3.1004,
"step": 103600
},
{
"epoch": 30.166220307405684,
"grad_norm": 0.4724433720111847,
"learning_rate": 0.0002381514269073966,
"loss": 3.0963,
"step": 103650
},
{
"epoch": 30.180775500698648,
"grad_norm": 0.45959505438804626,
"learning_rate": 0.00023797670355270818,
"loss": 3.1053,
"step": 103700
},
{
"epoch": 30.195330693991615,
"grad_norm": 0.47177109122276306,
"learning_rate": 0.0002378019801980198,
"loss": 3.1237,
"step": 103750
},
{
"epoch": 30.20988588728458,
"grad_norm": 0.4628247618675232,
"learning_rate": 0.00023762725684333137,
"loss": 3.1103,
"step": 103800
},
{
"epoch": 30.22444108057755,
"grad_norm": 0.4638172388076782,
"learning_rate": 0.00023745253348864294,
"loss": 3.1001,
"step": 103850
},
{
"epoch": 30.238996273870516,
"grad_norm": 0.4565567672252655,
"learning_rate": 0.00023727781013395456,
"loss": 3.114,
"step": 103900
},
{
"epoch": 30.253551467163483,
"grad_norm": 0.44833269715309143,
"learning_rate": 0.00023710308677926613,
"loss": 3.1142,
"step": 103950
},
{
"epoch": 30.26810666045645,
"grad_norm": 0.4612325429916382,
"learning_rate": 0.00023692836342457772,
"loss": 3.1044,
"step": 104000
},
{
"epoch": 30.26810666045645,
"eval_accuracy": 0.37487837392981105,
"eval_loss": 3.5515880584716797,
"eval_runtime": 179.1596,
"eval_samples_per_second": 92.939,
"eval_steps_per_second": 5.81,
"step": 104000
},
{
"epoch": 30.282661853749417,
"grad_norm": 0.46742621064186096,
"learning_rate": 0.00023675364006988932,
"loss": 3.1101,
"step": 104050
},
{
"epoch": 30.297217047042384,
"grad_norm": 0.4602166712284088,
"learning_rate": 0.0002365789167152009,
"loss": 3.1211,
"step": 104100
},
{
"epoch": 30.31177224033535,
"grad_norm": 0.45198196172714233,
"learning_rate": 0.0002364041933605125,
"loss": 3.1238,
"step": 104150
},
{
"epoch": 30.326327433628318,
"grad_norm": 0.44531548023223877,
"learning_rate": 0.0002362294700058241,
"loss": 3.1364,
"step": 104200
},
{
"epoch": 30.340882626921285,
"grad_norm": 0.44484877586364746,
"learning_rate": 0.00023605474665113567,
"loss": 3.1233,
"step": 104250
},
{
"epoch": 30.355437820214252,
"grad_norm": 0.4585815668106079,
"learning_rate": 0.0002358800232964473,
"loss": 3.117,
"step": 104300
},
{
"epoch": 30.36999301350722,
"grad_norm": 0.4720662832260132,
"learning_rate": 0.00023570529994175886,
"loss": 3.1192,
"step": 104350
},
{
"epoch": 30.384548206800186,
"grad_norm": 0.4488585293292999,
"learning_rate": 0.00023553057658707045,
"loss": 3.1271,
"step": 104400
},
{
"epoch": 30.399103400093153,
"grad_norm": 0.46229711174964905,
"learning_rate": 0.00023535585323238204,
"loss": 3.1308,
"step": 104450
},
{
"epoch": 30.41365859338612,
"grad_norm": 0.4812774360179901,
"learning_rate": 0.00023518112987769364,
"loss": 3.1154,
"step": 104500
},
{
"epoch": 30.428213786679088,
"grad_norm": 0.45849233865737915,
"learning_rate": 0.0002350064065230052,
"loss": 3.1383,
"step": 104550
},
{
"epoch": 30.442768979972055,
"grad_norm": 0.46144720911979675,
"learning_rate": 0.00023483168316831683,
"loss": 3.1309,
"step": 104600
},
{
"epoch": 30.45732417326502,
"grad_norm": 0.4718158543109894,
"learning_rate": 0.0002346569598136284,
"loss": 3.1369,
"step": 104650
},
{
"epoch": 30.47187936655799,
"grad_norm": 0.4972895383834839,
"learning_rate": 0.00023448223645894,
"loss": 3.1443,
"step": 104700
},
{
"epoch": 30.486434559850956,
"grad_norm": 0.44092628359794617,
"learning_rate": 0.00023430751310425158,
"loss": 3.1188,
"step": 104750
},
{
"epoch": 30.500989753143923,
"grad_norm": 0.44488775730133057,
"learning_rate": 0.00023413278974956318,
"loss": 3.1493,
"step": 104800
},
{
"epoch": 30.51554494643689,
"grad_norm": 0.44548696279525757,
"learning_rate": 0.00023395806639487477,
"loss": 3.1299,
"step": 104850
},
{
"epoch": 30.530100139729857,
"grad_norm": 0.4458260238170624,
"learning_rate": 0.00023378334304018637,
"loss": 3.1381,
"step": 104900
},
{
"epoch": 30.544655333022824,
"grad_norm": 0.4345245957374573,
"learning_rate": 0.00023360861968549793,
"loss": 3.1361,
"step": 104950
},
{
"epoch": 30.55921052631579,
"grad_norm": 0.4373602271080017,
"learning_rate": 0.00023343389633080955,
"loss": 3.135,
"step": 105000
},
{
"epoch": 30.55921052631579,
"eval_accuracy": 0.3747435701105027,
"eval_loss": 3.5478925704956055,
"eval_runtime": 179.1076,
"eval_samples_per_second": 92.966,
"eval_steps_per_second": 5.812,
"step": 105000
},
{
"epoch": 30.573765719608758,
"grad_norm": 0.47670161724090576,
"learning_rate": 0.00023325917297612112,
"loss": 3.1391,
"step": 105050
},
{
"epoch": 30.58832091290172,
"grad_norm": 0.43521973490715027,
"learning_rate": 0.0002330844496214327,
"loss": 3.1376,
"step": 105100
},
{
"epoch": 30.60287610619469,
"grad_norm": 0.4696406424045563,
"learning_rate": 0.0002329097262667443,
"loss": 3.1465,
"step": 105150
},
{
"epoch": 30.617431299487656,
"grad_norm": 0.43479442596435547,
"learning_rate": 0.00023273500291205588,
"loss": 3.1442,
"step": 105200
},
{
"epoch": 30.631986492780623,
"grad_norm": 0.4464789927005768,
"learning_rate": 0.00023256027955736747,
"loss": 3.1509,
"step": 105250
},
{
"epoch": 30.64654168607359,
"grad_norm": 0.49761533737182617,
"learning_rate": 0.00023238555620267907,
"loss": 3.1401,
"step": 105300
},
{
"epoch": 30.661096879366557,
"grad_norm": 0.4205106496810913,
"learning_rate": 0.00023221083284799066,
"loss": 3.1504,
"step": 105350
},
{
"epoch": 30.675652072659524,
"grad_norm": 0.45064088702201843,
"learning_rate": 0.00023203610949330223,
"loss": 3.1418,
"step": 105400
},
{
"epoch": 30.69020726595249,
"grad_norm": 0.48725900053977966,
"learning_rate": 0.00023186138613861385,
"loss": 3.1581,
"step": 105450
},
{
"epoch": 30.704762459245458,
"grad_norm": 0.43696922063827515,
"learning_rate": 0.00023168666278392542,
"loss": 3.1458,
"step": 105500
},
{
"epoch": 30.719317652538425,
"grad_norm": 0.45358753204345703,
"learning_rate": 0.00023151193942923704,
"loss": 3.1522,
"step": 105550
},
{
"epoch": 30.733872845831392,
"grad_norm": 0.4805457592010498,
"learning_rate": 0.0002313372160745486,
"loss": 3.1392,
"step": 105600
},
{
"epoch": 30.74842803912436,
"grad_norm": 0.44270044565200806,
"learning_rate": 0.0002311624927198602,
"loss": 3.1485,
"step": 105650
},
{
"epoch": 30.762983232417326,
"grad_norm": 0.4392821192741394,
"learning_rate": 0.0002309877693651718,
"loss": 3.1522,
"step": 105700
},
{
"epoch": 30.777538425710294,
"grad_norm": 0.47344040870666504,
"learning_rate": 0.0002308130460104834,
"loss": 3.1454,
"step": 105750
},
{
"epoch": 30.79209361900326,
"grad_norm": 0.4679954946041107,
"learning_rate": 0.00023063832265579496,
"loss": 3.1484,
"step": 105800
},
{
"epoch": 30.806648812296228,
"grad_norm": 0.4455646872520447,
"learning_rate": 0.00023046359930110658,
"loss": 3.1572,
"step": 105850
},
{
"epoch": 30.821204005589195,
"grad_norm": 0.4564194679260254,
"learning_rate": 0.00023028887594641814,
"loss": 3.1502,
"step": 105900
},
{
"epoch": 30.83575919888216,
"grad_norm": 0.4792768657207489,
"learning_rate": 0.00023011415259172974,
"loss": 3.1549,
"step": 105950
},
{
"epoch": 30.85031439217513,
"grad_norm": 0.4297233819961548,
"learning_rate": 0.00022993942923704133,
"loss": 3.1584,
"step": 106000
},
{
"epoch": 30.85031439217513,
"eval_accuracy": 0.3756258475922357,
"eval_loss": 3.535869598388672,
"eval_runtime": 179.0451,
"eval_samples_per_second": 92.999,
"eval_steps_per_second": 5.814,
"step": 106000
},
{
"epoch": 30.864869585468096,
"grad_norm": 0.48069772124290466,
"learning_rate": 0.00022976470588235293,
"loss": 3.1533,
"step": 106050
},
{
"epoch": 30.879424778761063,
"grad_norm": 0.4900089502334595,
"learning_rate": 0.0002295899825276645,
"loss": 3.1557,
"step": 106100
},
{
"epoch": 30.89397997205403,
"grad_norm": 0.4526594281196594,
"learning_rate": 0.00022941525917297612,
"loss": 3.162,
"step": 106150
},
{
"epoch": 30.908535165346997,
"grad_norm": 0.4509666860103607,
"learning_rate": 0.00022924053581828768,
"loss": 3.1507,
"step": 106200
},
{
"epoch": 30.923090358639964,
"grad_norm": 0.44628453254699707,
"learning_rate": 0.0002290658124635993,
"loss": 3.1555,
"step": 106250
},
{
"epoch": 30.93764555193293,
"grad_norm": 0.4314361810684204,
"learning_rate": 0.00022889108910891087,
"loss": 3.1643,
"step": 106300
},
{
"epoch": 30.9522007452259,
"grad_norm": 0.44850438833236694,
"learning_rate": 0.00022871636575422247,
"loss": 3.1515,
"step": 106350
},
{
"epoch": 30.966755938518865,
"grad_norm": 0.46688973903656006,
"learning_rate": 0.00022854164239953406,
"loss": 3.1586,
"step": 106400
},
{
"epoch": 30.98131113181183,
"grad_norm": 0.46198490262031555,
"learning_rate": 0.00022836691904484565,
"loss": 3.1721,
"step": 106450
},
{
"epoch": 30.995866325104796,
"grad_norm": 0.45322471857070923,
"learning_rate": 0.00022819219569015722,
"loss": 3.1583,
"step": 106500
},
{
"epoch": 31.010188635305077,
"grad_norm": 0.46810445189476013,
"learning_rate": 0.00022801747233546884,
"loss": 3.1108,
"step": 106550
},
{
"epoch": 31.024743828598044,
"grad_norm": 0.4760522246360779,
"learning_rate": 0.0002278427489807804,
"loss": 3.063,
"step": 106600
},
{
"epoch": 31.03929902189101,
"grad_norm": 0.4931826889514923,
"learning_rate": 0.00022766802562609198,
"loss": 3.0837,
"step": 106650
},
{
"epoch": 31.05385421518398,
"grad_norm": 0.456632137298584,
"learning_rate": 0.0002274933022714036,
"loss": 3.0864,
"step": 106700
},
{
"epoch": 31.068409408476946,
"grad_norm": 0.45030152797698975,
"learning_rate": 0.00022731857891671517,
"loss": 3.0609,
"step": 106750
},
{
"epoch": 31.082964601769913,
"grad_norm": 0.4781184792518616,
"learning_rate": 0.00022714385556202676,
"loss": 3.0877,
"step": 106800
},
{
"epoch": 31.09751979506288,
"grad_norm": 0.4634675681591034,
"learning_rate": 0.00022696913220733835,
"loss": 3.0881,
"step": 106850
},
{
"epoch": 31.112074988355847,
"grad_norm": 0.46249765157699585,
"learning_rate": 0.00022679440885264995,
"loss": 3.0788,
"step": 106900
},
{
"epoch": 31.126630181648814,
"grad_norm": 0.46504703164100647,
"learning_rate": 0.00022661968549796157,
"loss": 3.0933,
"step": 106950
},
{
"epoch": 31.14118537494178,
"grad_norm": 0.44973164796829224,
"learning_rate": 0.00022644496214327314,
"loss": 3.0948,
"step": 107000
},
{
"epoch": 31.14118537494178,
"eval_accuracy": 0.37459031450424457,
"eval_loss": 3.554295539855957,
"eval_runtime": 181.1589,
"eval_samples_per_second": 91.914,
"eval_steps_per_second": 5.746,
"step": 107000
},
{
"epoch": 31.155740568234748,
"grad_norm": 0.47105175256729126,
"learning_rate": 0.0002262702387885847,
"loss": 3.0937,
"step": 107050
},
{
"epoch": 31.17029576152771,
"grad_norm": 0.4425552189350128,
"learning_rate": 0.00022609551543389633,
"loss": 3.0917,
"step": 107100
},
{
"epoch": 31.18485095482068,
"grad_norm": 0.4712986350059509,
"learning_rate": 0.0002259207920792079,
"loss": 3.0927,
"step": 107150
},
{
"epoch": 31.199406148113646,
"grad_norm": 0.4686832129955292,
"learning_rate": 0.0002257460687245195,
"loss": 3.0996,
"step": 107200
},
{
"epoch": 31.213961341406613,
"grad_norm": 0.4482789933681488,
"learning_rate": 0.00022557134536983108,
"loss": 3.0915,
"step": 107250
},
{
"epoch": 31.22851653469958,
"grad_norm": null,
"learning_rate": 0.00022539662201514268,
"loss": 3.1036,
"step": 107300
},
{
"epoch": 31.243071727992547,
"grad_norm": 0.4736534059047699,
"learning_rate": 0.00022522189866045424,
"loss": 3.1067,
"step": 107350
},
{
"epoch": 31.257626921285514,
"grad_norm": 0.44008541107177734,
"learning_rate": 0.00022504717530576586,
"loss": 3.0907,
"step": 107400
},
{
"epoch": 31.27218211457848,
"grad_norm": 0.4680997133255005,
"learning_rate": 0.00022487245195107743,
"loss": 3.1064,
"step": 107450
},
{
"epoch": 31.286737307871448,
"grad_norm": 0.4645274579524994,
"learning_rate": 0.00022469772859638903,
"loss": 3.111,
"step": 107500
},
{
"epoch": 31.301292501164415,
"grad_norm": 0.43999406695365906,
"learning_rate": 0.00022452300524170062,
"loss": 3.1186,
"step": 107550
},
{
"epoch": 31.315847694457382,
"grad_norm": 0.4434453547000885,
"learning_rate": 0.00022434828188701221,
"loss": 3.105,
"step": 107600
},
{
"epoch": 31.33040288775035,
"grad_norm": 0.4525635242462158,
"learning_rate": 0.0002241735585323238,
"loss": 3.1177,
"step": 107650
},
{
"epoch": 31.344958081043316,
"grad_norm": 0.4374854564666748,
"learning_rate": 0.0002239988351776354,
"loss": 3.1208,
"step": 107700
},
{
"epoch": 31.359513274336283,
"grad_norm": 0.4741731584072113,
"learning_rate": 0.00022382411182294697,
"loss": 3.1195,
"step": 107750
},
{
"epoch": 31.37406846762925,
"grad_norm": 0.4294305443763733,
"learning_rate": 0.0002236493884682586,
"loss": 3.115,
"step": 107800
},
{
"epoch": 31.388623660922217,
"grad_norm": 0.44635891914367676,
"learning_rate": 0.00022347466511357016,
"loss": 3.1123,
"step": 107850
},
{
"epoch": 31.403178854215184,
"grad_norm": 0.4601406455039978,
"learning_rate": 0.00022329994175888175,
"loss": 3.1098,
"step": 107900
},
{
"epoch": 31.41773404750815,
"grad_norm": 0.4337484538555145,
"learning_rate": 0.00022312521840419335,
"loss": 3.1193,
"step": 107950
},
{
"epoch": 31.43228924080112,
"grad_norm": 0.4300398826599121,
"learning_rate": 0.00022295049504950494,
"loss": 3.1236,
"step": 108000
},
{
"epoch": 31.43228924080112,
"eval_accuracy": 0.3750808734770371,
"eval_loss": 3.5484988689422607,
"eval_runtime": 181.0232,
"eval_samples_per_second": 91.983,
"eval_steps_per_second": 5.751,
"step": 108000
},
{
"epoch": 31.446844434094086,
"grad_norm": 0.5017353296279907,
"learning_rate": 0.0002227757716948165,
"loss": 3.126,
"step": 108050
},
{
"epoch": 31.461399627387053,
"grad_norm": 0.45702579617500305,
"learning_rate": 0.00022260104834012813,
"loss": 3.1345,
"step": 108100
},
{
"epoch": 31.47595482068002,
"grad_norm": 0.48082423210144043,
"learning_rate": 0.0002224263249854397,
"loss": 3.1295,
"step": 108150
},
{
"epoch": 31.490510013972987,
"grad_norm": 0.4717453718185425,
"learning_rate": 0.00022225160163075126,
"loss": 3.1349,
"step": 108200
},
{
"epoch": 31.505065207265954,
"grad_norm": 0.45219430327415466,
"learning_rate": 0.00022207687827606289,
"loss": 3.1237,
"step": 108250
},
{
"epoch": 31.51962040055892,
"grad_norm": 0.46613046526908875,
"learning_rate": 0.00022190215492137445,
"loss": 3.1341,
"step": 108300
},
{
"epoch": 31.534175593851888,
"grad_norm": 0.474324107170105,
"learning_rate": 0.00022172743156668607,
"loss": 3.125,
"step": 108350
},
{
"epoch": 31.548730787144855,
"grad_norm": 0.47235554456710815,
"learning_rate": 0.00022155270821199764,
"loss": 3.1349,
"step": 108400
},
{
"epoch": 31.56328598043782,
"grad_norm": 0.44595572352409363,
"learning_rate": 0.00022137798485730924,
"loss": 3.1368,
"step": 108450
},
{
"epoch": 31.577841173730786,
"grad_norm": 0.4679282307624817,
"learning_rate": 0.00022120326150262083,
"loss": 3.1317,
"step": 108500
},
{
"epoch": 31.592396367023753,
"grad_norm": 0.4556312561035156,
"learning_rate": 0.00022102853814793242,
"loss": 3.1197,
"step": 108550
},
{
"epoch": 31.60695156031672,
"grad_norm": 0.47826388478279114,
"learning_rate": 0.000220853814793244,
"loss": 3.1234,
"step": 108600
},
{
"epoch": 31.621506753609687,
"grad_norm": 0.48391714692115784,
"learning_rate": 0.0002206790914385556,
"loss": 3.1286,
"step": 108650
},
{
"epoch": 31.636061946902654,
"grad_norm": 0.47049498558044434,
"learning_rate": 0.00022050436808386718,
"loss": 3.1443,
"step": 108700
},
{
"epoch": 31.65061714019562,
"grad_norm": 0.45760464668273926,
"learning_rate": 0.00022032964472917877,
"loss": 3.1455,
"step": 108750
},
{
"epoch": 31.665172333488588,
"grad_norm": 0.4569242298603058,
"learning_rate": 0.00022015492137449037,
"loss": 3.1406,
"step": 108800
},
{
"epoch": 31.679727526781555,
"grad_norm": 0.4866321384906769,
"learning_rate": 0.00021998019801980196,
"loss": 3.1426,
"step": 108850
},
{
"epoch": 31.694282720074522,
"grad_norm": 0.4808866083621979,
"learning_rate": 0.00021980547466511353,
"loss": 3.1362,
"step": 108900
},
{
"epoch": 31.70883791336749,
"grad_norm": 0.44989365339279175,
"learning_rate": 0.00021963075131042515,
"loss": 3.1458,
"step": 108950
},
{
"epoch": 31.723393106660456,
"grad_norm": 0.456107497215271,
"learning_rate": 0.00021945602795573672,
"loss": 3.1344,
"step": 109000
},
{
"epoch": 31.723393106660456,
"eval_accuracy": 0.3755069099591581,
"eval_loss": 3.5397472381591797,
"eval_runtime": 180.9711,
"eval_samples_per_second": 92.009,
"eval_steps_per_second": 5.752,
"step": 109000
},
{
"epoch": 31.737948299953423,
"grad_norm": 0.44981271028518677,
"learning_rate": 0.00021928130460104834,
"loss": 3.1351,
"step": 109050
},
{
"epoch": 31.75250349324639,
"grad_norm": 0.48247814178466797,
"learning_rate": 0.0002191065812463599,
"loss": 3.1416,
"step": 109100
},
{
"epoch": 31.767058686539357,
"grad_norm": 0.4782215356826782,
"learning_rate": 0.0002189318578916715,
"loss": 3.1548,
"step": 109150
},
{
"epoch": 31.781613879832324,
"grad_norm": 0.4576863944530487,
"learning_rate": 0.0002187571345369831,
"loss": 3.153,
"step": 109200
},
{
"epoch": 31.79616907312529,
"grad_norm": 0.46203556656837463,
"learning_rate": 0.0002185824111822947,
"loss": 3.1496,
"step": 109250
},
{
"epoch": 31.81072426641826,
"grad_norm": 0.45210832357406616,
"learning_rate": 0.00021840768782760626,
"loss": 3.1359,
"step": 109300
},
{
"epoch": 31.825279459711226,
"grad_norm": 0.4440477788448334,
"learning_rate": 0.00021823296447291788,
"loss": 3.139,
"step": 109350
},
{
"epoch": 31.839834653004193,
"grad_norm": 0.47411999106407166,
"learning_rate": 0.00021805824111822945,
"loss": 3.1456,
"step": 109400
},
{
"epoch": 31.85438984629716,
"grad_norm": 0.46934619545936584,
"learning_rate": 0.00021788351776354104,
"loss": 3.1523,
"step": 109450
},
{
"epoch": 31.868945039590127,
"grad_norm": 0.4774060547351837,
"learning_rate": 0.00021770879440885263,
"loss": 3.1483,
"step": 109500
},
{
"epoch": 31.883500232883094,
"grad_norm": 0.4577077329158783,
"learning_rate": 0.00021753407105416423,
"loss": 3.1504,
"step": 109550
},
{
"epoch": 31.89805542617606,
"grad_norm": 0.4703179597854614,
"learning_rate": 0.0002173593476994758,
"loss": 3.1481,
"step": 109600
},
{
"epoch": 31.912610619469028,
"grad_norm": 0.4769929349422455,
"learning_rate": 0.00021718462434478742,
"loss": 3.1508,
"step": 109650
},
{
"epoch": 31.927165812761995,
"grad_norm": 0.49723508954048157,
"learning_rate": 0.00021700990099009898,
"loss": 3.156,
"step": 109700
},
{
"epoch": 31.941721006054962,
"grad_norm": 0.4571239650249481,
"learning_rate": 0.0002168351776354106,
"loss": 3.1584,
"step": 109750
},
{
"epoch": 31.956276199347926,
"grad_norm": 0.4813500642776489,
"learning_rate": 0.00021666045428072217,
"loss": 3.138,
"step": 109800
},
{
"epoch": 31.970831392640893,
"grad_norm": 0.4950752258300781,
"learning_rate": 0.00021648573092603374,
"loss": 3.1532,
"step": 109850
},
{
"epoch": 31.98538658593386,
"grad_norm": 0.4535057842731476,
"learning_rate": 0.00021631100757134536,
"loss": 3.1451,
"step": 109900
},
{
"epoch": 31.999941779226827,
"grad_norm": 0.46446654200553894,
"learning_rate": 0.00021613628421665693,
"loss": 3.1475,
"step": 109950
},
{
"epoch": 32.01426408942711,
"grad_norm": 0.4385596513748169,
"learning_rate": 0.00021596156086196852,
"loss": 3.0595,
"step": 110000
},
{
"epoch": 32.01426408942711,
"eval_accuracy": 0.37491116404802116,
"eval_loss": 3.550321340560913,
"eval_runtime": 179.2352,
"eval_samples_per_second": 92.9,
"eval_steps_per_second": 5.808,
"step": 110000
},
{
"epoch": 32.01426408942711,
"step": 110000,
"total_flos": 2.298801310138368e+18,
"train_loss": 0.5713334234064276,
"train_runtime": 39943.0578,
"train_samples_per_second": 343.998,
"train_steps_per_second": 4.301
}
],
"logging_steps": 50,
"max_steps": 171800,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 10000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 20,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 20
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.298801310138368e+18,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}