UD-VLA_CALVIN_ABCD_D / trainer_state.json
chenpyyy's picture
Upload folder using huggingface_hub
7e3b726 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 66.85236768802228,
"eval_steps": 500,
"global_step": 24000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.055710306406685235,
"grad_norm": 14.402867878604889,
"learning_rate": 3.2000000000000005e-05,
"loss": 10.1886,
"step": 20
},
{
"epoch": 0.11142061281337047,
"grad_norm": 13.635298426580595,
"learning_rate": 6.400000000000001e-05,
"loss": 8.826,
"step": 40
},
{
"epoch": 0.1671309192200557,
"grad_norm": 4.640229834066508,
"learning_rate": 7.999996773810157e-05,
"loss": 7.8841,
"step": 60
},
{
"epoch": 0.22284122562674094,
"grad_norm": 4.771787721259069,
"learning_rate": 7.999970964324714e-05,
"loss": 5.9641,
"step": 80
},
{
"epoch": 0.2785515320334262,
"grad_norm": 2.673493851023645,
"learning_rate": 7.999919345531461e-05,
"loss": 4.8468,
"step": 100
},
{
"epoch": 0.3342618384401114,
"grad_norm": 4.303381236511131,
"learning_rate": 7.999841917785668e-05,
"loss": 4.5736,
"step": 120
},
{
"epoch": 0.38997214484679665,
"grad_norm": 3.0474327568183464,
"learning_rate": 7.999738681620232e-05,
"loss": 4.492,
"step": 140
},
{
"epoch": 0.4456824512534819,
"grad_norm": 4.693843207680094,
"learning_rate": 7.999609637745683e-05,
"loss": 4.2847,
"step": 160
},
{
"epoch": 0.5013927576601671,
"grad_norm": 2.9247230468930265,
"learning_rate": 7.999454787050167e-05,
"loss": 3.8923,
"step": 180
},
{
"epoch": 0.5571030640668524,
"grad_norm": 1.489196861741293,
"learning_rate": 7.999274130599451e-05,
"loss": 3.7348,
"step": 200
},
{
"epoch": 0.6128133704735376,
"grad_norm": 1.61920958013195,
"learning_rate": 7.999067669636909e-05,
"loss": 3.6525,
"step": 220
},
{
"epoch": 0.6685236768802229,
"grad_norm": 1.7425863171418448,
"learning_rate": 7.998835405583514e-05,
"loss": 3.606,
"step": 240
},
{
"epoch": 0.724233983286908,
"grad_norm": 1.5257744205454054,
"learning_rate": 7.998577340037835e-05,
"loss": 3.5769,
"step": 260
},
{
"epoch": 0.7799442896935933,
"grad_norm": 2.667757094800373,
"learning_rate": 7.998293474776016e-05,
"loss": 3.5703,
"step": 280
},
{
"epoch": 0.8356545961002786,
"grad_norm": 1.7404959255578403,
"learning_rate": 7.997983811751768e-05,
"loss": 3.5621,
"step": 300
},
{
"epoch": 0.8913649025069638,
"grad_norm": 1.634375074224456,
"learning_rate": 7.99764835309636e-05,
"loss": 3.5179,
"step": 320
},
{
"epoch": 0.947075208913649,
"grad_norm": 1.7342700062168488,
"learning_rate": 7.997287101118597e-05,
"loss": 3.4854,
"step": 340
},
{
"epoch": 1.0027855153203342,
"grad_norm": 1.3788139455421626,
"learning_rate": 7.996900058304807e-05,
"loss": 3.4837,
"step": 360
},
{
"epoch": 1.0584958217270195,
"grad_norm": 2.0400921187647367,
"learning_rate": 7.996487227318829e-05,
"loss": 3.4779,
"step": 380
},
{
"epoch": 1.1142061281337048,
"grad_norm": 1.3557958841205344,
"learning_rate": 7.996048611001985e-05,
"loss": 3.4484,
"step": 400
},
{
"epoch": 1.16991643454039,
"grad_norm": 1.2968269160609383,
"learning_rate": 7.995584212373067e-05,
"loss": 3.4364,
"step": 420
},
{
"epoch": 1.2256267409470751,
"grad_norm": 1.552015046378834,
"learning_rate": 7.995094034628315e-05,
"loss": 3.428,
"step": 440
},
{
"epoch": 1.2813370473537604,
"grad_norm": 1.5830966652724554,
"learning_rate": 7.994578081141396e-05,
"loss": 3.4002,
"step": 460
},
{
"epoch": 1.3370473537604457,
"grad_norm": 1.1969791506799554,
"learning_rate": 7.994036355463378e-05,
"loss": 3.3879,
"step": 480
},
{
"epoch": 1.392757660167131,
"grad_norm": 1.615715341656064,
"learning_rate": 7.993468861322705e-05,
"loss": 3.3804,
"step": 500
},
{
"epoch": 1.448467966573816,
"grad_norm": 1.7044068053200199,
"learning_rate": 7.992875602625179e-05,
"loss": 3.3872,
"step": 520
},
{
"epoch": 1.5041782729805013,
"grad_norm": 1.376009869207643,
"learning_rate": 7.99225658345392e-05,
"loss": 3.3729,
"step": 540
},
{
"epoch": 1.5598885793871866,
"grad_norm": 1.3796323292210666,
"learning_rate": 7.991611808069354e-05,
"loss": 3.3832,
"step": 560
},
{
"epoch": 1.615598885793872,
"grad_norm": 1.5448889484720325,
"learning_rate": 7.990941280909165e-05,
"loss": 3.372,
"step": 580
},
{
"epoch": 1.6713091922005572,
"grad_norm": 1.492544519723889,
"learning_rate": 7.990245006588282e-05,
"loss": 3.3374,
"step": 600
},
{
"epoch": 1.7270194986072425,
"grad_norm": 1.9983170136669428,
"learning_rate": 7.98952298989884e-05,
"loss": 3.3147,
"step": 620
},
{
"epoch": 1.7827298050139275,
"grad_norm": 1.3253294659757162,
"learning_rate": 7.988775235810143e-05,
"loss": 3.3236,
"step": 640
},
{
"epoch": 1.8384401114206128,
"grad_norm": 1.235983274303143,
"learning_rate": 7.988001749468634e-05,
"loss": 3.293,
"step": 660
},
{
"epoch": 1.894150417827298,
"grad_norm": 1.4225853711687855,
"learning_rate": 7.987202536197861e-05,
"loss": 3.3039,
"step": 680
},
{
"epoch": 1.9498607242339832,
"grad_norm": 1.3045957220402902,
"learning_rate": 7.986377601498437e-05,
"loss": 3.2981,
"step": 700
},
{
"epoch": 2.0055710306406684,
"grad_norm": 1.3450960062726653,
"learning_rate": 7.985526951048004e-05,
"loss": 3.2797,
"step": 720
},
{
"epoch": 2.0612813370473537,
"grad_norm": 1.3839789031500191,
"learning_rate": 7.984650590701197e-05,
"loss": 3.2485,
"step": 740
},
{
"epoch": 2.116991643454039,
"grad_norm": 1.2061805725063066,
"learning_rate": 7.983748526489592e-05,
"loss": 3.2598,
"step": 760
},
{
"epoch": 2.1727019498607243,
"grad_norm": 1.272502438657944,
"learning_rate": 7.98282076462168e-05,
"loss": 3.2587,
"step": 780
},
{
"epoch": 2.2284122562674096,
"grad_norm": 1.2869044872960342,
"learning_rate": 7.981867311482816e-05,
"loss": 3.2227,
"step": 800
},
{
"epoch": 2.284122562674095,
"grad_norm": 1.3303768462152665,
"learning_rate": 7.980888173635174e-05,
"loss": 3.2648,
"step": 820
},
{
"epoch": 2.33983286908078,
"grad_norm": 1.3112716268503581,
"learning_rate": 7.979883357817706e-05,
"loss": 3.2745,
"step": 840
},
{
"epoch": 2.3955431754874654,
"grad_norm": 1.286190262319452,
"learning_rate": 7.978852870946091e-05,
"loss": 3.2425,
"step": 860
},
{
"epoch": 2.4512534818941503,
"grad_norm": 1.220565008870163,
"learning_rate": 7.977796720112692e-05,
"loss": 3.2243,
"step": 880
},
{
"epoch": 2.5069637883008355,
"grad_norm": 1.2416999049169055,
"learning_rate": 7.976714912586503e-05,
"loss": 3.2217,
"step": 900
},
{
"epoch": 2.562674094707521,
"grad_norm": 1.649909126615191,
"learning_rate": 7.975607455813105e-05,
"loss": 3.2232,
"step": 920
},
{
"epoch": 2.618384401114206,
"grad_norm": 1.2360938114511875,
"learning_rate": 7.974474357414606e-05,
"loss": 3.1888,
"step": 940
},
{
"epoch": 2.6740947075208914,
"grad_norm": 1.3519864827851877,
"learning_rate": 7.973315625189597e-05,
"loss": 3.1782,
"step": 960
},
{
"epoch": 2.7298050139275767,
"grad_norm": 1.2890437174386116,
"learning_rate": 7.972131267113096e-05,
"loss": 3.192,
"step": 980
},
{
"epoch": 2.785515320334262,
"grad_norm": 1.2063188725654215,
"learning_rate": 7.970921291336485e-05,
"loss": 3.1869,
"step": 1000
},
{
"epoch": 2.841225626740947,
"grad_norm": 1.2749039180973243,
"learning_rate": 7.969685706187467e-05,
"loss": 3.1663,
"step": 1020
},
{
"epoch": 2.896935933147632,
"grad_norm": 1.1624439689926256,
"learning_rate": 7.968424520170001e-05,
"loss": 3.1558,
"step": 1040
},
{
"epoch": 2.9526462395543174,
"grad_norm": 1.3645084603771966,
"learning_rate": 7.967137741964243e-05,
"loss": 3.2151,
"step": 1060
},
{
"epoch": 3.0083565459610027,
"grad_norm": 1.3095011308845006,
"learning_rate": 7.965825380426492e-05,
"loss": 3.1241,
"step": 1080
},
{
"epoch": 3.064066852367688,
"grad_norm": 1.3067653478209693,
"learning_rate": 7.96448744458912e-05,
"loss": 3.1433,
"step": 1100
},
{
"epoch": 3.1197771587743732,
"grad_norm": 1.299162093757844,
"learning_rate": 7.963123943660518e-05,
"loss": 3.1515,
"step": 1120
},
{
"epoch": 3.1754874651810585,
"grad_norm": 1.1814155508957564,
"learning_rate": 7.961734887025032e-05,
"loss": 3.1658,
"step": 1140
},
{
"epoch": 3.231197771587744,
"grad_norm": 1.4814086313964214,
"learning_rate": 7.96032028424289e-05,
"loss": 3.1477,
"step": 1160
},
{
"epoch": 3.286908077994429,
"grad_norm": 1.2408741761500102,
"learning_rate": 7.958880145050149e-05,
"loss": 3.1562,
"step": 1180
},
{
"epoch": 3.3426183844011144,
"grad_norm": 1.240867953717542,
"learning_rate": 7.957414479358615e-05,
"loss": 3.128,
"step": 1200
},
{
"epoch": 3.3983286908077996,
"grad_norm": 1.3259135820081311,
"learning_rate": 7.955923297255786e-05,
"loss": 3.1341,
"step": 1220
},
{
"epoch": 3.4540389972144845,
"grad_norm": 1.24731715301278,
"learning_rate": 7.954406609004775e-05,
"loss": 3.1352,
"step": 1240
},
{
"epoch": 3.5097493036211698,
"grad_norm": 1.4328700677390847,
"learning_rate": 7.952864425044241e-05,
"loss": 3.1776,
"step": 1260
},
{
"epoch": 3.565459610027855,
"grad_norm": 1.2128896131543132,
"learning_rate": 7.951296755988323e-05,
"loss": 3.155,
"step": 1280
},
{
"epoch": 3.6211699164345403,
"grad_norm": 1.2116171750344513,
"learning_rate": 7.949703612626555e-05,
"loss": 3.1577,
"step": 1300
},
{
"epoch": 3.6768802228412256,
"grad_norm": 1.1238332423339203,
"learning_rate": 7.948085005923804e-05,
"loss": 3.1176,
"step": 1320
},
{
"epoch": 3.732590529247911,
"grad_norm": 1.2861584858089754,
"learning_rate": 7.94644094702019e-05,
"loss": 3.1444,
"step": 1340
},
{
"epoch": 3.788300835654596,
"grad_norm": 1.392865877163545,
"learning_rate": 7.944771447231002e-05,
"loss": 3.1275,
"step": 1360
},
{
"epoch": 3.8440111420612815,
"grad_norm": 1.319680796984498,
"learning_rate": 7.943076518046636e-05,
"loss": 3.1178,
"step": 1380
},
{
"epoch": 3.8997214484679663,
"grad_norm": 1.6517648664445108,
"learning_rate": 7.9413561711325e-05,
"loss": 3.113,
"step": 1400
},
{
"epoch": 3.9554317548746516,
"grad_norm": 1.2610682032071576,
"learning_rate": 7.939610418328943e-05,
"loss": 3.1197,
"step": 1420
},
{
"epoch": 4.011142061281337,
"grad_norm": 1.451143416466195,
"learning_rate": 7.937839271651169e-05,
"loss": 3.081,
"step": 1440
},
{
"epoch": 4.066852367688022,
"grad_norm": 1.377524653184373,
"learning_rate": 7.936042743289158e-05,
"loss": 3.0716,
"step": 1460
},
{
"epoch": 4.1225626740947074,
"grad_norm": 1.2134154671846935,
"learning_rate": 7.934220845607582e-05,
"loss": 3.0934,
"step": 1480
},
{
"epoch": 4.178272980501393,
"grad_norm": 1.1951444826901634,
"learning_rate": 7.932373591145714e-05,
"loss": 3.0666,
"step": 1500
},
{
"epoch": 4.233983286908078,
"grad_norm": 1.3540057345367065,
"learning_rate": 7.93050099261735e-05,
"loss": 3.1106,
"step": 1520
},
{
"epoch": 4.289693593314763,
"grad_norm": 1.0915162692508482,
"learning_rate": 7.928603062910715e-05,
"loss": 3.0979,
"step": 1540
},
{
"epoch": 4.345403899721449,
"grad_norm": 1.3408430692556432,
"learning_rate": 7.926679815088376e-05,
"loss": 3.0822,
"step": 1560
},
{
"epoch": 4.401114206128134,
"grad_norm": 1.244967969301378,
"learning_rate": 7.924731262387156e-05,
"loss": 3.0636,
"step": 1580
},
{
"epoch": 4.456824512534819,
"grad_norm": 1.2480585827601505,
"learning_rate": 7.922757418218038e-05,
"loss": 3.0699,
"step": 1600
},
{
"epoch": 4.512534818941504,
"grad_norm": 1.5801893483515768,
"learning_rate": 7.920758296166072e-05,
"loss": 3.0814,
"step": 1620
},
{
"epoch": 4.56824512534819,
"grad_norm": 1.2273710344040911,
"learning_rate": 7.918733909990287e-05,
"loss": 3.0844,
"step": 1640
},
{
"epoch": 4.623955431754875,
"grad_norm": 1.2276563480543434,
"learning_rate": 7.916684273623593e-05,
"loss": 3.042,
"step": 1660
},
{
"epoch": 4.67966573816156,
"grad_norm": 1.2696026342294924,
"learning_rate": 7.914609401172687e-05,
"loss": 3.0693,
"step": 1680
},
{
"epoch": 4.735376044568245,
"grad_norm": 1.2199175574002186,
"learning_rate": 7.912509306917949e-05,
"loss": 3.0728,
"step": 1700
},
{
"epoch": 4.791086350974931,
"grad_norm": 1.2583579762159638,
"learning_rate": 7.910384005313353e-05,
"loss": 3.0661,
"step": 1720
},
{
"epoch": 4.846796657381615,
"grad_norm": 1.1722768148065734,
"learning_rate": 7.908233510986363e-05,
"loss": 3.0687,
"step": 1740
},
{
"epoch": 4.9025069637883005,
"grad_norm": 1.3035051706656053,
"learning_rate": 7.906057838737831e-05,
"loss": 3.032,
"step": 1760
},
{
"epoch": 4.958217270194986,
"grad_norm": 1.1974756008535596,
"learning_rate": 7.903857003541898e-05,
"loss": 3.0866,
"step": 1780
},
{
"epoch": 5.013927576601671,
"grad_norm": 1.1642430602684481,
"learning_rate": 7.901631020545893e-05,
"loss": 3.0565,
"step": 1800
},
{
"epoch": 5.069637883008356,
"grad_norm": 1.4004497345779519,
"learning_rate": 7.899379905070219e-05,
"loss": 3.0445,
"step": 1820
},
{
"epoch": 5.125348189415042,
"grad_norm": 1.17997512711719,
"learning_rate": 7.89710367260826e-05,
"loss": 3.035,
"step": 1840
},
{
"epoch": 5.181058495821727,
"grad_norm": 1.3826862632813843,
"learning_rate": 7.894802338826267e-05,
"loss": 3.0447,
"step": 1860
},
{
"epoch": 5.236768802228412,
"grad_norm": 1.1816414577958267,
"learning_rate": 7.89247591956325e-05,
"loss": 3.0637,
"step": 1880
},
{
"epoch": 5.2924791086350975,
"grad_norm": 1.1654353865755456,
"learning_rate": 7.890124430830871e-05,
"loss": 3.0468,
"step": 1900
},
{
"epoch": 5.348189415041783,
"grad_norm": 1.165385013993255,
"learning_rate": 7.887747888813336e-05,
"loss": 3.0313,
"step": 1920
},
{
"epoch": 5.403899721448468,
"grad_norm": 1.2162837114192384,
"learning_rate": 7.88534630986728e-05,
"loss": 3.0466,
"step": 1940
},
{
"epoch": 5.459610027855153,
"grad_norm": 1.20720099919841,
"learning_rate": 7.882919710521653e-05,
"loss": 3.0551,
"step": 1960
},
{
"epoch": 5.515320334261839,
"grad_norm": 1.177139436715742,
"learning_rate": 7.880468107477611e-05,
"loss": 3.0376,
"step": 1980
},
{
"epoch": 5.571030640668524,
"grad_norm": 1.1524202569240398,
"learning_rate": 7.8779915176084e-05,
"loss": 3.0291,
"step": 2000
},
{
"epoch": 5.626740947075209,
"grad_norm": 1.2146587030127356,
"learning_rate": 7.875489957959237e-05,
"loss": 3.0191,
"step": 2020
},
{
"epoch": 5.6824512534818945,
"grad_norm": 1.128843488220497,
"learning_rate": 7.872963445747195e-05,
"loss": 3.0227,
"step": 2040
},
{
"epoch": 5.73816155988858,
"grad_norm": 1.0994966342019519,
"learning_rate": 7.870411998361084e-05,
"loss": 3.02,
"step": 2060
},
{
"epoch": 5.793871866295264,
"grad_norm": 1.2808892137589254,
"learning_rate": 7.867835633361329e-05,
"loss": 3.0469,
"step": 2080
},
{
"epoch": 5.84958217270195,
"grad_norm": 1.5413808282647536,
"learning_rate": 7.865234368479853e-05,
"loss": 3.0436,
"step": 2100
},
{
"epoch": 5.905292479108635,
"grad_norm": 1.2503860090167789,
"learning_rate": 7.862608221619959e-05,
"loss": 3.0106,
"step": 2120
},
{
"epoch": 5.96100278551532,
"grad_norm": 1.125474961018013,
"learning_rate": 7.859957210856188e-05,
"loss": 3.0519,
"step": 2140
},
{
"epoch": 6.016713091922005,
"grad_norm": 1.1689492826416197,
"learning_rate": 7.857281354434221e-05,
"loss": 2.9989,
"step": 2160
},
{
"epoch": 6.072423398328691,
"grad_norm": 1.154286021730802,
"learning_rate": 7.854580670770731e-05,
"loss": 3.0334,
"step": 2180
},
{
"epoch": 6.128133704735376,
"grad_norm": 1.4350398534810123,
"learning_rate": 7.851855178453272e-05,
"loss": 2.988,
"step": 2200
},
{
"epoch": 6.183844011142061,
"grad_norm": 1.113383813885033,
"learning_rate": 7.84910489624014e-05,
"loss": 2.9763,
"step": 2220
},
{
"epoch": 6.2395543175487465,
"grad_norm": 1.1728629584155144,
"learning_rate": 7.846329843060248e-05,
"loss": 3.0121,
"step": 2240
},
{
"epoch": 6.295264623955432,
"grad_norm": 1.3311805526252927,
"learning_rate": 7.843530038012998e-05,
"loss": 3.0093,
"step": 2260
},
{
"epoch": 6.350974930362117,
"grad_norm": 1.6172845191063454,
"learning_rate": 7.840705500368151e-05,
"loss": 3.006,
"step": 2280
},
{
"epoch": 6.406685236768802,
"grad_norm": 1.11224616315367,
"learning_rate": 7.837856249565682e-05,
"loss": 3.0092,
"step": 2300
},
{
"epoch": 6.462395543175488,
"grad_norm": 1.2756438414289364,
"learning_rate": 7.834982305215663e-05,
"loss": 2.992,
"step": 2320
},
{
"epoch": 6.518105849582173,
"grad_norm": 1.2405302540136935,
"learning_rate": 7.832083687098119e-05,
"loss": 3.0005,
"step": 2340
},
{
"epoch": 6.573816155988858,
"grad_norm": 1.16507749139941,
"learning_rate": 7.829160415162888e-05,
"loss": 2.9687,
"step": 2360
},
{
"epoch": 6.629526462395543,
"grad_norm": 1.1740957839848314,
"learning_rate": 7.826212509529497e-05,
"loss": 2.99,
"step": 2380
},
{
"epoch": 6.685236768802229,
"grad_norm": 1.1161100776511597,
"learning_rate": 7.823239990487008e-05,
"loss": 2.9827,
"step": 2400
},
{
"epoch": 6.740947075208914,
"grad_norm": 1.157820224450206,
"learning_rate": 7.820242878493888e-05,
"loss": 2.9993,
"step": 2420
},
{
"epoch": 6.796657381615599,
"grad_norm": 1.1204269029527796,
"learning_rate": 7.817221194177869e-05,
"loss": 2.9845,
"step": 2440
},
{
"epoch": 6.852367688022284,
"grad_norm": 1.1227241115622848,
"learning_rate": 7.814174958335797e-05,
"loss": 3.0135,
"step": 2460
},
{
"epoch": 6.908077994428969,
"grad_norm": 1.314572529939298,
"learning_rate": 7.8111041919335e-05,
"loss": 3.0121,
"step": 2480
},
{
"epoch": 6.963788300835654,
"grad_norm": 1.3816466804127303,
"learning_rate": 7.808008916105636e-05,
"loss": 3.0031,
"step": 2500
},
{
"epoch": 7.0194986072423395,
"grad_norm": 1.1289207328451878,
"learning_rate": 7.804889152155548e-05,
"loss": 2.9677,
"step": 2520
},
{
"epoch": 7.075208913649025,
"grad_norm": 1.1378808677658065,
"learning_rate": 7.801744921555127e-05,
"loss": 2.9911,
"step": 2540
},
{
"epoch": 7.13091922005571,
"grad_norm": 1.2254491174881055,
"learning_rate": 7.798576245944647e-05,
"loss": 2.9853,
"step": 2560
},
{
"epoch": 7.186629526462395,
"grad_norm": 1.1570802675245002,
"learning_rate": 7.795383147132631e-05,
"loss": 2.9589,
"step": 2580
},
{
"epoch": 7.242339832869081,
"grad_norm": 1.2894488302976834,
"learning_rate": 7.792165647095696e-05,
"loss": 2.9776,
"step": 2600
},
{
"epoch": 7.298050139275766,
"grad_norm": 1.0528540016974788,
"learning_rate": 7.788923767978396e-05,
"loss": 2.96,
"step": 2620
},
{
"epoch": 7.353760445682451,
"grad_norm": 1.2125786891416614,
"learning_rate": 7.785657532093085e-05,
"loss": 3.0041,
"step": 2640
},
{
"epoch": 7.4094707520891365,
"grad_norm": 1.1920890603213412,
"learning_rate": 7.78236696191974e-05,
"loss": 2.9508,
"step": 2660
},
{
"epoch": 7.465181058495822,
"grad_norm": 1.2174124036610061,
"learning_rate": 7.779052080105831e-05,
"loss": 2.9744,
"step": 2680
},
{
"epoch": 7.520891364902507,
"grad_norm": 1.1450363781873376,
"learning_rate": 7.77571290946615e-05,
"loss": 2.9648,
"step": 2700
},
{
"epoch": 7.576601671309192,
"grad_norm": 1.0906384870617993,
"learning_rate": 7.772349472982652e-05,
"loss": 2.9472,
"step": 2720
},
{
"epoch": 7.632311977715878,
"grad_norm": 1.321934725618673,
"learning_rate": 7.768961793804312e-05,
"loss": 2.9812,
"step": 2740
},
{
"epoch": 7.688022284122563,
"grad_norm": 1.3145051192938724,
"learning_rate": 7.765549895246952e-05,
"loss": 2.9936,
"step": 2760
},
{
"epoch": 7.743732590529248,
"grad_norm": 1.1688841213500007,
"learning_rate": 7.762113800793083e-05,
"loss": 2.9673,
"step": 2780
},
{
"epoch": 7.7994428969359335,
"grad_norm": 1.3151433447911725,
"learning_rate": 7.758653534091746e-05,
"loss": 2.9899,
"step": 2800
},
{
"epoch": 7.855153203342619,
"grad_norm": 1.1890842453445192,
"learning_rate": 7.75516911895835e-05,
"loss": 2.9372,
"step": 2820
},
{
"epoch": 7.910863509749303,
"grad_norm": 1.1869067775771354,
"learning_rate": 7.751660579374505e-05,
"loss": 2.9741,
"step": 2840
},
{
"epoch": 7.9665738161559885,
"grad_norm": 1.2713091007536645,
"learning_rate": 7.74812793948786e-05,
"loss": 2.9583,
"step": 2860
},
{
"epoch": 8.022284122562674,
"grad_norm": 1.2300818760281924,
"learning_rate": 7.74457122361193e-05,
"loss": 2.9214,
"step": 2880
},
{
"epoch": 8.07799442896936,
"grad_norm": 1.1631255228333053,
"learning_rate": 7.740990456225944e-05,
"loss": 2.9644,
"step": 2900
},
{
"epoch": 8.133704735376044,
"grad_norm": 1.2750643723536295,
"learning_rate": 7.737385661974655e-05,
"loss": 2.9401,
"step": 2920
},
{
"epoch": 8.18941504178273,
"grad_norm": 1.1034800758000585,
"learning_rate": 7.733756865668189e-05,
"loss": 2.9726,
"step": 2940
},
{
"epoch": 8.245125348189415,
"grad_norm": 1.1418415045379222,
"learning_rate": 7.730104092281867e-05,
"loss": 2.9504,
"step": 2960
},
{
"epoch": 8.300835654596101,
"grad_norm": 1.1744854672216198,
"learning_rate": 7.726427366956026e-05,
"loss": 2.9361,
"step": 2980
},
{
"epoch": 8.356545961002785,
"grad_norm": 1.3072658416879444,
"learning_rate": 7.722726714995862e-05,
"loss": 2.9589,
"step": 3000
},
{
"epoch": 8.412256267409472,
"grad_norm": 1.441058710439587,
"learning_rate": 7.719002161871242e-05,
"loss": 2.9417,
"step": 3020
},
{
"epoch": 8.467966573816156,
"grad_norm": 1.067582173805221,
"learning_rate": 7.715253733216534e-05,
"loss": 2.9067,
"step": 3040
},
{
"epoch": 8.52367688022284,
"grad_norm": 1.1284427897815015,
"learning_rate": 7.711481454830433e-05,
"loss": 2.899,
"step": 3060
},
{
"epoch": 8.579387186629527,
"grad_norm": 1.081964247959463,
"learning_rate": 7.707685352675777e-05,
"loss": 2.9379,
"step": 3080
},
{
"epoch": 8.635097493036211,
"grad_norm": 1.1987325305883298,
"learning_rate": 7.703865452879372e-05,
"loss": 2.9327,
"step": 3100
},
{
"epoch": 8.690807799442897,
"grad_norm": 1.285067490039356,
"learning_rate": 7.700021781731815e-05,
"loss": 2.9105,
"step": 3120
},
{
"epoch": 8.746518105849582,
"grad_norm": 1.2495135447033165,
"learning_rate": 7.696154365687308e-05,
"loss": 2.9324,
"step": 3140
},
{
"epoch": 8.802228412256268,
"grad_norm": 1.2973981114841804,
"learning_rate": 7.69226323136348e-05,
"loss": 2.9255,
"step": 3160
},
{
"epoch": 8.857938718662952,
"grad_norm": 1.4170513439543635,
"learning_rate": 7.6883484055412e-05,
"loss": 2.9497,
"step": 3180
},
{
"epoch": 8.913649025069638,
"grad_norm": 1.1690142712727585,
"learning_rate": 7.684409915164392e-05,
"loss": 2.923,
"step": 3200
},
{
"epoch": 8.969359331476323,
"grad_norm": 1.1510038194930527,
"learning_rate": 7.680447787339861e-05,
"loss": 2.926,
"step": 3220
},
{
"epoch": 9.025069637883009,
"grad_norm": 1.2532790582579474,
"learning_rate": 7.676462049337088e-05,
"loss": 2.9202,
"step": 3240
},
{
"epoch": 9.080779944289693,
"grad_norm": 1.1281216086435024,
"learning_rate": 7.672452728588057e-05,
"loss": 2.962,
"step": 3260
},
{
"epoch": 9.13649025069638,
"grad_norm": 1.1715186252733858,
"learning_rate": 7.668419852687062e-05,
"loss": 2.9135,
"step": 3280
},
{
"epoch": 9.192200557103064,
"grad_norm": 1.1013980953917584,
"learning_rate": 7.664363449390508e-05,
"loss": 2.9017,
"step": 3300
},
{
"epoch": 9.24791086350975,
"grad_norm": 1.2577466487795455,
"learning_rate": 7.660283546616741e-05,
"loss": 2.9397,
"step": 3320
},
{
"epoch": 9.303621169916434,
"grad_norm": 1.2666590722136728,
"learning_rate": 7.656180172445832e-05,
"loss": 2.9291,
"step": 3340
},
{
"epoch": 9.35933147632312,
"grad_norm": 1.0677411210264725,
"learning_rate": 7.6520533551194e-05,
"loss": 2.8936,
"step": 3360
},
{
"epoch": 9.415041782729805,
"grad_norm": 1.2801627452281688,
"learning_rate": 7.647903123040411e-05,
"loss": 2.9053,
"step": 3380
},
{
"epoch": 9.47075208913649,
"grad_norm": 1.4058461947277687,
"learning_rate": 7.643729504772985e-05,
"loss": 2.9267,
"step": 3400
},
{
"epoch": 9.526462395543176,
"grad_norm": 1.0476841151648881,
"learning_rate": 7.639532529042196e-05,
"loss": 2.9067,
"step": 3420
},
{
"epoch": 9.58217270194986,
"grad_norm": 1.1192055741834313,
"learning_rate": 7.635312224733879e-05,
"loss": 2.9217,
"step": 3440
},
{
"epoch": 9.637883008356546,
"grad_norm": 1.0978122554766025,
"learning_rate": 7.631068620894427e-05,
"loss": 2.9008,
"step": 3460
},
{
"epoch": 9.69359331476323,
"grad_norm": 1.1633036806799766,
"learning_rate": 7.626801746730594e-05,
"loss": 2.9058,
"step": 3480
},
{
"epoch": 9.749303621169917,
"grad_norm": 1.09083755501122,
"learning_rate": 7.622511631609293e-05,
"loss": 2.9128,
"step": 3500
},
{
"epoch": 9.805013927576601,
"grad_norm": 1.0388545560459703,
"learning_rate": 7.618198305057391e-05,
"loss": 2.9161,
"step": 3520
},
{
"epoch": 9.860724233983287,
"grad_norm": 1.1015281389024363,
"learning_rate": 7.613861796761513e-05,
"loss": 2.901,
"step": 3540
},
{
"epoch": 9.916434540389972,
"grad_norm": 1.0877360587312859,
"learning_rate": 7.609502136567829e-05,
"loss": 2.9284,
"step": 3560
},
{
"epoch": 9.972144846796658,
"grad_norm": 1.0365425361156384,
"learning_rate": 7.605119354481855e-05,
"loss": 2.902,
"step": 3580
},
{
"epoch": 10.027855153203342,
"grad_norm": 1.2163943871082232,
"learning_rate": 7.600713480668244e-05,
"loss": 2.8877,
"step": 3600
},
{
"epoch": 10.083565459610028,
"grad_norm": 1.3467097823122347,
"learning_rate": 7.596284545450579e-05,
"loss": 2.902,
"step": 3620
},
{
"epoch": 10.139275766016713,
"grad_norm": 1.1614707108221107,
"learning_rate": 7.591832579311162e-05,
"loss": 2.8924,
"step": 3640
},
{
"epoch": 10.194986072423399,
"grad_norm": 1.154263677555927,
"learning_rate": 7.587357612890807e-05,
"loss": 2.8906,
"step": 3660
},
{
"epoch": 10.250696378830083,
"grad_norm": 1.2048421202419115,
"learning_rate": 7.582859676988631e-05,
"loss": 2.91,
"step": 3680
},
{
"epoch": 10.30640668523677,
"grad_norm": 1.0867049785413572,
"learning_rate": 7.578338802561835e-05,
"loss": 2.9205,
"step": 3700
},
{
"epoch": 10.362116991643454,
"grad_norm": 1.2226191180056192,
"learning_rate": 7.573795020725498e-05,
"loss": 2.891,
"step": 3720
},
{
"epoch": 10.41782729805014,
"grad_norm": 1.0288993145273457,
"learning_rate": 7.569228362752359e-05,
"loss": 2.8813,
"step": 3740
},
{
"epoch": 10.473537604456824,
"grad_norm": 1.322014772637416,
"learning_rate": 7.564638860072602e-05,
"loss": 2.8942,
"step": 3760
},
{
"epoch": 10.52924791086351,
"grad_norm": 1.7224079895109572,
"learning_rate": 7.560026544273644e-05,
"loss": 2.89,
"step": 3780
},
{
"epoch": 10.584958217270195,
"grad_norm": 1.06553603535958,
"learning_rate": 7.555391447099909e-05,
"loss": 2.8933,
"step": 3800
},
{
"epoch": 10.64066852367688,
"grad_norm": 1.6005573677632197,
"learning_rate": 7.550733600452618e-05,
"loss": 2.8778,
"step": 3820
},
{
"epoch": 10.696378830083566,
"grad_norm": 1.183378791005643,
"learning_rate": 7.546053036389568e-05,
"loss": 2.8785,
"step": 3840
},
{
"epoch": 10.75208913649025,
"grad_norm": 1.0673503444473083,
"learning_rate": 7.541349787124903e-05,
"loss": 2.8656,
"step": 3860
},
{
"epoch": 10.807799442896936,
"grad_norm": 1.1161036292131443,
"learning_rate": 7.536623885028903e-05,
"loss": 2.8949,
"step": 3880
},
{
"epoch": 10.86350974930362,
"grad_norm": 1.1044441414641109,
"learning_rate": 7.53187536262776e-05,
"loss": 2.8852,
"step": 3900
},
{
"epoch": 10.919220055710307,
"grad_norm": 1.1221763618181442,
"learning_rate": 7.527104252603341e-05,
"loss": 2.8687,
"step": 3920
},
{
"epoch": 10.974930362116991,
"grad_norm": 1.0239803282921194,
"learning_rate": 7.522310587792984e-05,
"loss": 2.8738,
"step": 3940
},
{
"epoch": 11.030640668523677,
"grad_norm": 1.0403254568733065,
"learning_rate": 7.517494401189256e-05,
"loss": 2.8654,
"step": 3960
},
{
"epoch": 11.086350974930362,
"grad_norm": 1.3774438500736799,
"learning_rate": 7.512655725939733e-05,
"loss": 2.8514,
"step": 3980
},
{
"epoch": 11.142061281337048,
"grad_norm": 1.0311405284757509,
"learning_rate": 7.507794595346767e-05,
"loss": 2.8698,
"step": 4000
},
{
"epoch": 11.197771587743732,
"grad_norm": 1.0136275961779155,
"learning_rate": 7.502911042867261e-05,
"loss": 2.8141,
"step": 4020
},
{
"epoch": 11.253481894150418,
"grad_norm": 1.2412045833199774,
"learning_rate": 7.498005102112435e-05,
"loss": 2.894,
"step": 4040
},
{
"epoch": 11.309192200557103,
"grad_norm": 1.070331458091606,
"learning_rate": 7.493076806847605e-05,
"loss": 2.8753,
"step": 4060
},
{
"epoch": 11.364902506963789,
"grad_norm": 1.1760961554541343,
"learning_rate": 7.488126190991936e-05,
"loss": 2.8722,
"step": 4080
},
{
"epoch": 11.420612813370473,
"grad_norm": 1.1616540292064435,
"learning_rate": 7.483153288618215e-05,
"loss": 2.8909,
"step": 4100
},
{
"epoch": 11.47632311977716,
"grad_norm": 1.0331404489020481,
"learning_rate": 7.478158133952619e-05,
"loss": 2.853,
"step": 4120
},
{
"epoch": 11.532033426183844,
"grad_norm": 1.254677923406745,
"learning_rate": 7.473140761374479e-05,
"loss": 2.8674,
"step": 4140
},
{
"epoch": 11.587743732590528,
"grad_norm": 1.4723808775722143,
"learning_rate": 7.468101205416035e-05,
"loss": 2.8738,
"step": 4160
},
{
"epoch": 11.643454038997215,
"grad_norm": 1.2019875338287322,
"learning_rate": 7.463039500762213e-05,
"loss": 2.8878,
"step": 4180
},
{
"epoch": 11.699164345403899,
"grad_norm": 1.2787778339880642,
"learning_rate": 7.457955682250372e-05,
"loss": 2.8797,
"step": 4200
},
{
"epoch": 11.754874651810585,
"grad_norm": 1.054866858779706,
"learning_rate": 7.452849784870072e-05,
"loss": 2.8617,
"step": 4220
},
{
"epoch": 11.81058495821727,
"grad_norm": 1.1047418002149563,
"learning_rate": 7.447721843762836e-05,
"loss": 2.8519,
"step": 4240
},
{
"epoch": 11.866295264623956,
"grad_norm": 1.3479580629482741,
"learning_rate": 7.442571894221898e-05,
"loss": 2.8764,
"step": 4260
},
{
"epoch": 11.92200557103064,
"grad_norm": 1.0996191502382984,
"learning_rate": 7.437399971691968e-05,
"loss": 2.8742,
"step": 4280
},
{
"epoch": 11.977715877437326,
"grad_norm": 1.1829758402369137,
"learning_rate": 7.432206111768985e-05,
"loss": 2.8795,
"step": 4300
},
{
"epoch": 12.03342618384401,
"grad_norm": 1.2260323749324813,
"learning_rate": 7.426990350199874e-05,
"loss": 2.8393,
"step": 4320
},
{
"epoch": 12.089136490250697,
"grad_norm": 1.0838502363024154,
"learning_rate": 7.421752722882299e-05,
"loss": 2.8434,
"step": 4340
},
{
"epoch": 12.144846796657381,
"grad_norm": 1.3481847345508688,
"learning_rate": 7.416493265864415e-05,
"loss": 2.8609,
"step": 4360
},
{
"epoch": 12.200557103064067,
"grad_norm": 1.0824899172058005,
"learning_rate": 7.411212015344622e-05,
"loss": 2.8521,
"step": 4380
},
{
"epoch": 12.256267409470752,
"grad_norm": 1.2218416915168235,
"learning_rate": 7.40590900767131e-05,
"loss": 2.8913,
"step": 4400
},
{
"epoch": 12.311977715877438,
"grad_norm": 1.1304024863305357,
"learning_rate": 7.400584279342621e-05,
"loss": 2.8493,
"step": 4420
},
{
"epoch": 12.367688022284122,
"grad_norm": 1.0421994761055569,
"learning_rate": 7.395237867006185e-05,
"loss": 2.8292,
"step": 4440
},
{
"epoch": 12.423398328690809,
"grad_norm": 1.265322995175785,
"learning_rate": 7.389869807458872e-05,
"loss": 2.8576,
"step": 4460
},
{
"epoch": 12.479108635097493,
"grad_norm": 1.2148541733313956,
"learning_rate": 7.384480137646545e-05,
"loss": 2.8684,
"step": 4480
},
{
"epoch": 12.534818941504179,
"grad_norm": 1.0816243901094347,
"learning_rate": 7.379068894663795e-05,
"loss": 2.8608,
"step": 4500
},
{
"epoch": 12.590529247910863,
"grad_norm": 1.0319313826952614,
"learning_rate": 7.373636115753691e-05,
"loss": 2.8381,
"step": 4520
},
{
"epoch": 12.64623955431755,
"grad_norm": 1.362502756526822,
"learning_rate": 7.368181838307531e-05,
"loss": 2.8361,
"step": 4540
},
{
"epoch": 12.701949860724234,
"grad_norm": 1.1691483178804676,
"learning_rate": 7.36270609986457e-05,
"loss": 2.8476,
"step": 4560
},
{
"epoch": 12.757660167130918,
"grad_norm": 1.099737609334872,
"learning_rate": 7.357208938111772e-05,
"loss": 2.8317,
"step": 4580
},
{
"epoch": 12.813370473537605,
"grad_norm": 1.2465031640095632,
"learning_rate": 7.351690390883547e-05,
"loss": 2.8607,
"step": 4600
},
{
"epoch": 12.869080779944289,
"grad_norm": 1.0626741966239892,
"learning_rate": 7.346150496161489e-05,
"loss": 2.8482,
"step": 4620
},
{
"epoch": 12.924791086350975,
"grad_norm": 1.4215858090007158,
"learning_rate": 7.340589292074123e-05,
"loss": 2.828,
"step": 4640
},
{
"epoch": 12.98050139275766,
"grad_norm": 1.189746289044543,
"learning_rate": 7.33500681689663e-05,
"loss": 2.8392,
"step": 4660
},
{
"epoch": 13.036211699164346,
"grad_norm": 1.1816710302425453,
"learning_rate": 7.329403109050598e-05,
"loss": 2.8439,
"step": 4680
},
{
"epoch": 13.09192200557103,
"grad_norm": 1.1422328461301028,
"learning_rate": 7.323778207103738e-05,
"loss": 2.8458,
"step": 4700
},
{
"epoch": 13.147632311977716,
"grad_norm": 1.286190919467895,
"learning_rate": 7.318132149769639e-05,
"loss": 2.8373,
"step": 4720
},
{
"epoch": 13.2033426183844,
"grad_norm": 1.1535174701098847,
"learning_rate": 7.312464975907494e-05,
"loss": 2.8287,
"step": 4740
},
{
"epoch": 13.259052924791087,
"grad_norm": 1.098410005946985,
"learning_rate": 7.306776724521822e-05,
"loss": 2.8347,
"step": 4760
},
{
"epoch": 13.314763231197771,
"grad_norm": 1.0730663678590038,
"learning_rate": 7.301067434762217e-05,
"loss": 2.8022,
"step": 4780
},
{
"epoch": 13.370473537604457,
"grad_norm": 1.2860001127878453,
"learning_rate": 7.295337145923068e-05,
"loss": 2.8209,
"step": 4800
},
{
"epoch": 13.426183844011142,
"grad_norm": 1.1788256509864643,
"learning_rate": 7.28958589744329e-05,
"loss": 2.8202,
"step": 4820
},
{
"epoch": 13.481894150417828,
"grad_norm": 1.2476156265733942,
"learning_rate": 7.283813728906054e-05,
"loss": 2.8301,
"step": 4840
},
{
"epoch": 13.537604456824512,
"grad_norm": 1.1806963623362805,
"learning_rate": 7.278020680038514e-05,
"loss": 2.8325,
"step": 4860
},
{
"epoch": 13.593314763231199,
"grad_norm": 1.312577644195395,
"learning_rate": 7.272206790711534e-05,
"loss": 2.8268,
"step": 4880
},
{
"epoch": 13.649025069637883,
"grad_norm": 1.2945260257216111,
"learning_rate": 7.266372100939415e-05,
"loss": 2.8474,
"step": 4900
},
{
"epoch": 13.704735376044567,
"grad_norm": 1.1825196002989207,
"learning_rate": 7.26051665087961e-05,
"loss": 2.8245,
"step": 4920
},
{
"epoch": 13.760445682451254,
"grad_norm": 1.1409651239961929,
"learning_rate": 7.254640480832468e-05,
"loss": 2.8342,
"step": 4940
},
{
"epoch": 13.816155988857938,
"grad_norm": 1.1047252543056303,
"learning_rate": 7.248743631240934e-05,
"loss": 2.8504,
"step": 4960
},
{
"epoch": 13.871866295264624,
"grad_norm": 1.1114472045482278,
"learning_rate": 7.242826142690284e-05,
"loss": 2.8238,
"step": 4980
},
{
"epoch": 13.927576601671309,
"grad_norm": 1.0521613836121042,
"learning_rate": 7.236888055907841e-05,
"loss": 2.8524,
"step": 5000
},
{
"epoch": 13.983286908077995,
"grad_norm": 1.1464930432732499,
"learning_rate": 7.230929411762698e-05,
"loss": 2.8309,
"step": 5020
},
{
"epoch": 14.038997214484679,
"grad_norm": 1.2249548851439938,
"learning_rate": 7.224950251265438e-05,
"loss": 2.8166,
"step": 5040
},
{
"epoch": 14.094707520891365,
"grad_norm": 1.1496013157017706,
"learning_rate": 7.218950615567839e-05,
"loss": 2.8176,
"step": 5060
},
{
"epoch": 14.15041782729805,
"grad_norm": 1.2406955049253514,
"learning_rate": 7.212930545962609e-05,
"loss": 2.8452,
"step": 5080
},
{
"epoch": 14.206128133704736,
"grad_norm": 1.0793023104931123,
"learning_rate": 7.206890083883089e-05,
"loss": 2.7934,
"step": 5100
},
{
"epoch": 14.26183844011142,
"grad_norm": 1.0555332439739342,
"learning_rate": 7.200829270902974e-05,
"loss": 2.7967,
"step": 5120
},
{
"epoch": 14.317548746518106,
"grad_norm": 1.088008092694001,
"learning_rate": 7.194748148736022e-05,
"loss": 2.8118,
"step": 5140
},
{
"epoch": 14.37325905292479,
"grad_norm": 1.1664773713662093,
"learning_rate": 7.18864675923577e-05,
"loss": 2.8322,
"step": 5160
},
{
"epoch": 14.428969359331477,
"grad_norm": 1.180830286270773,
"learning_rate": 7.182525144395254e-05,
"loss": 2.7889,
"step": 5180
},
{
"epoch": 14.484679665738161,
"grad_norm": 1.289182787809941,
"learning_rate": 7.176383346346697e-05,
"loss": 2.8145,
"step": 5200
},
{
"epoch": 14.540389972144848,
"grad_norm": 1.3156558914225063,
"learning_rate": 7.170221407361246e-05,
"loss": 2.8057,
"step": 5220
},
{
"epoch": 14.596100278551532,
"grad_norm": 1.311742340109156,
"learning_rate": 7.164039369848662e-05,
"loss": 2.7996,
"step": 5240
},
{
"epoch": 14.651810584958218,
"grad_norm": 1.1167094205760877,
"learning_rate": 7.157837276357038e-05,
"loss": 2.8106,
"step": 5260
},
{
"epoch": 14.707520891364902,
"grad_norm": 1.0697751811552614,
"learning_rate": 7.151615169572499e-05,
"loss": 2.8089,
"step": 5280
},
{
"epoch": 14.763231197771589,
"grad_norm": 1.3217920715681262,
"learning_rate": 7.145373092318921e-05,
"loss": 2.8295,
"step": 5300
},
{
"epoch": 14.818941504178273,
"grad_norm": 1.2030049127163769,
"learning_rate": 7.139111087557614e-05,
"loss": 2.8208,
"step": 5320
},
{
"epoch": 14.874651810584957,
"grad_norm": 1.1560181548327002,
"learning_rate": 7.132829198387052e-05,
"loss": 2.7894,
"step": 5340
},
{
"epoch": 14.930362116991644,
"grad_norm": 1.3677024675114284,
"learning_rate": 7.12652746804256e-05,
"loss": 2.8208,
"step": 5360
},
{
"epoch": 14.986072423398328,
"grad_norm": 1.1758085824901958,
"learning_rate": 7.120205939896016e-05,
"loss": 2.7816,
"step": 5380
},
{
"epoch": 15.041782729805014,
"grad_norm": 1.236149394894879,
"learning_rate": 7.113864657455565e-05,
"loss": 2.8242,
"step": 5400
},
{
"epoch": 15.097493036211699,
"grad_norm": 1.2013719116832535,
"learning_rate": 7.107503664365306e-05,
"loss": 2.8048,
"step": 5420
},
{
"epoch": 15.153203342618385,
"grad_norm": 1.1890270902537186,
"learning_rate": 7.101123004404999e-05,
"loss": 2.7988,
"step": 5440
},
{
"epoch": 15.20891364902507,
"grad_norm": 0.9947321024811,
"learning_rate": 7.094722721489762e-05,
"loss": 2.8023,
"step": 5460
},
{
"epoch": 15.264623955431755,
"grad_norm": 1.097217550580232,
"learning_rate": 7.088302859669767e-05,
"loss": 2.7876,
"step": 5480
},
{
"epoch": 15.32033426183844,
"grad_norm": 1.1297335196918232,
"learning_rate": 7.081863463129943e-05,
"loss": 2.81,
"step": 5500
},
{
"epoch": 15.376044568245126,
"grad_norm": 1.1252485351464636,
"learning_rate": 7.075404576189664e-05,
"loss": 2.8104,
"step": 5520
},
{
"epoch": 15.43175487465181,
"grad_norm": 1.0952308860470021,
"learning_rate": 7.068926243302446e-05,
"loss": 2.8134,
"step": 5540
},
{
"epoch": 15.487465181058496,
"grad_norm": 1.1033772218566738,
"learning_rate": 7.062428509055645e-05,
"loss": 2.7919,
"step": 5560
},
{
"epoch": 15.54317548746518,
"grad_norm": 1.1682663344591195,
"learning_rate": 7.055911418170146e-05,
"loss": 2.8255,
"step": 5580
},
{
"epoch": 15.598885793871867,
"grad_norm": 1.131084807002329,
"learning_rate": 7.049375015500061e-05,
"loss": 2.7911,
"step": 5600
},
{
"epoch": 15.654596100278551,
"grad_norm": 1.200817519578834,
"learning_rate": 7.042819346032408e-05,
"loss": 2.8178,
"step": 5620
},
{
"epoch": 15.710306406685238,
"grad_norm": 1.14175820047776,
"learning_rate": 7.036244454886818e-05,
"loss": 2.7656,
"step": 5640
},
{
"epoch": 15.766016713091922,
"grad_norm": 1.1227562173996573,
"learning_rate": 7.029650387315208e-05,
"loss": 2.8176,
"step": 5660
},
{
"epoch": 15.821727019498606,
"grad_norm": 1.1575039434693588,
"learning_rate": 7.023037188701485e-05,
"loss": 2.7942,
"step": 5680
},
{
"epoch": 15.877437325905293,
"grad_norm": 1.229577946005525,
"learning_rate": 7.01640490456122e-05,
"loss": 2.786,
"step": 5700
},
{
"epoch": 15.933147632311977,
"grad_norm": 1.2021995784653015,
"learning_rate": 7.009753580541344e-05,
"loss": 2.7857,
"step": 5720
},
{
"epoch": 15.988857938718663,
"grad_norm": 1.1325819395096217,
"learning_rate": 7.003083262419829e-05,
"loss": 2.7999,
"step": 5740
},
{
"epoch": 16.044568245125348,
"grad_norm": 1.1155399032222173,
"learning_rate": 6.996393996105378e-05,
"loss": 2.7835,
"step": 5760
},
{
"epoch": 16.100278551532032,
"grad_norm": 1.152416883414678,
"learning_rate": 6.989685827637099e-05,
"loss": 2.7879,
"step": 5780
},
{
"epoch": 16.15598885793872,
"grad_norm": 1.0579264010465572,
"learning_rate": 6.982958803184201e-05,
"loss": 2.7968,
"step": 5800
},
{
"epoch": 16.211699164345404,
"grad_norm": 1.2482769970776515,
"learning_rate": 6.976212969045668e-05,
"loss": 2.7628,
"step": 5820
},
{
"epoch": 16.26740947075209,
"grad_norm": 1.1876572666161167,
"learning_rate": 6.969448371649945e-05,
"loss": 2.7645,
"step": 5840
},
{
"epoch": 16.323119777158773,
"grad_norm": 1.0502937038876212,
"learning_rate": 6.962665057554606e-05,
"loss": 2.7836,
"step": 5860
},
{
"epoch": 16.37883008356546,
"grad_norm": 1.1579505600465934,
"learning_rate": 6.955863073446054e-05,
"loss": 2.8117,
"step": 5880
},
{
"epoch": 16.434540389972145,
"grad_norm": 1.1026773625827397,
"learning_rate": 6.949042466139187e-05,
"loss": 2.7684,
"step": 5900
},
{
"epoch": 16.49025069637883,
"grad_norm": 1.080576186890865,
"learning_rate": 6.942203282577072e-05,
"loss": 2.8201,
"step": 5920
},
{
"epoch": 16.545961002785514,
"grad_norm": 1.1201896108002356,
"learning_rate": 6.935345569830636e-05,
"loss": 2.7998,
"step": 5940
},
{
"epoch": 16.601671309192202,
"grad_norm": 1.0674451923986512,
"learning_rate": 6.928469375098327e-05,
"loss": 2.7513,
"step": 5960
},
{
"epoch": 16.657381615598887,
"grad_norm": 1.393151947824109,
"learning_rate": 6.921574745705798e-05,
"loss": 2.7765,
"step": 5980
},
{
"epoch": 16.71309192200557,
"grad_norm": 1.0998714589532412,
"learning_rate": 6.91466172910558e-05,
"loss": 2.7645,
"step": 6000
},
{
"epoch": 16.768802228412255,
"grad_norm": 1.161893735809738,
"learning_rate": 6.907730372876756e-05,
"loss": 2.7775,
"step": 6020
},
{
"epoch": 16.824512534818943,
"grad_norm": 1.1249294683555988,
"learning_rate": 6.90078072472463e-05,
"loss": 2.7751,
"step": 6040
},
{
"epoch": 16.880222841225628,
"grad_norm": 1.0278563705072414,
"learning_rate": 6.8938128324804e-05,
"loss": 2.7886,
"step": 6060
},
{
"epoch": 16.935933147632312,
"grad_norm": 1.061083533262939,
"learning_rate": 6.886826744100831e-05,
"loss": 2.7706,
"step": 6080
},
{
"epoch": 16.991643454038996,
"grad_norm": 1.1962487610461527,
"learning_rate": 6.879822507667925e-05,
"loss": 2.778,
"step": 6100
},
{
"epoch": 17.04735376044568,
"grad_norm": 1.084120872740396,
"learning_rate": 6.872800171388584e-05,
"loss": 2.758,
"step": 6120
},
{
"epoch": 17.10306406685237,
"grad_norm": 1.2302739964645966,
"learning_rate": 6.865759783594288e-05,
"loss": 2.7437,
"step": 6140
},
{
"epoch": 17.158774373259053,
"grad_norm": 1.2503733070591003,
"learning_rate": 6.858701392740755e-05,
"loss": 2.7828,
"step": 6160
},
{
"epoch": 17.214484679665738,
"grad_norm": 1.0453068724351287,
"learning_rate": 6.85162504740761e-05,
"loss": 2.7804,
"step": 6180
},
{
"epoch": 17.270194986072422,
"grad_norm": 1.102788495625949,
"learning_rate": 6.844530796298049e-05,
"loss": 2.7794,
"step": 6200
},
{
"epoch": 17.32590529247911,
"grad_norm": 1.1692774985464567,
"learning_rate": 6.837418688238506e-05,
"loss": 2.7432,
"step": 6220
},
{
"epoch": 17.381615598885794,
"grad_norm": 1.1331591899737494,
"learning_rate": 6.830288772178319e-05,
"loss": 2.7716,
"step": 6240
},
{
"epoch": 17.43732590529248,
"grad_norm": 1.2148411223909634,
"learning_rate": 6.823141097189384e-05,
"loss": 2.7696,
"step": 6260
},
{
"epoch": 17.493036211699163,
"grad_norm": 1.393055281753607,
"learning_rate": 6.815975712465829e-05,
"loss": 2.7415,
"step": 6280
},
{
"epoch": 17.54874651810585,
"grad_norm": 1.3474136405073431,
"learning_rate": 6.808792667323665e-05,
"loss": 2.781,
"step": 6300
},
{
"epoch": 17.604456824512535,
"grad_norm": 1.036698414343895,
"learning_rate": 6.80159201120046e-05,
"loss": 2.7695,
"step": 6320
},
{
"epoch": 17.66016713091922,
"grad_norm": 1.1451609472767672,
"learning_rate": 6.79437379365498e-05,
"loss": 2.7744,
"step": 6340
},
{
"epoch": 17.715877437325904,
"grad_norm": 1.0591481814927388,
"learning_rate": 6.787138064366862e-05,
"loss": 2.7892,
"step": 6360
},
{
"epoch": 17.771587743732592,
"grad_norm": 1.1623299583698332,
"learning_rate": 6.779884873136271e-05,
"loss": 2.7675,
"step": 6380
},
{
"epoch": 17.827298050139277,
"grad_norm": 1.0147233449592903,
"learning_rate": 6.772614269883552e-05,
"loss": 2.7427,
"step": 6400
},
{
"epoch": 17.88300835654596,
"grad_norm": 1.1368649796660046,
"learning_rate": 6.765326304648889e-05,
"loss": 2.7683,
"step": 6420
},
{
"epoch": 17.938718662952645,
"grad_norm": 1.1490699285660757,
"learning_rate": 6.758021027591959e-05,
"loss": 2.7886,
"step": 6440
},
{
"epoch": 17.99442896935933,
"grad_norm": 0.9785051726498183,
"learning_rate": 6.75069848899159e-05,
"loss": 2.7515,
"step": 6460
},
{
"epoch": 18.050139275766018,
"grad_norm": 1.2031239642894627,
"learning_rate": 6.743358739245416e-05,
"loss": 2.7646,
"step": 6480
},
{
"epoch": 18.105849582172702,
"grad_norm": 1.0544436866470264,
"learning_rate": 6.736001828869522e-05,
"loss": 2.7755,
"step": 6500
},
{
"epoch": 18.161559888579387,
"grad_norm": 1.2123837569916929,
"learning_rate": 6.728627808498102e-05,
"loss": 2.726,
"step": 6520
},
{
"epoch": 18.21727019498607,
"grad_norm": 1.0826550987274146,
"learning_rate": 6.721236728883116e-05,
"loss": 2.7447,
"step": 6540
},
{
"epoch": 18.27298050139276,
"grad_norm": 1.0759411138187283,
"learning_rate": 6.71382864089393e-05,
"loss": 2.7457,
"step": 6560
},
{
"epoch": 18.328690807799443,
"grad_norm": 1.097324968536256,
"learning_rate": 6.706403595516969e-05,
"loss": 2.7833,
"step": 6580
},
{
"epoch": 18.384401114206128,
"grad_norm": 1.057266074357713,
"learning_rate": 6.69896164385537e-05,
"loss": 2.7441,
"step": 6600
},
{
"epoch": 18.440111420612812,
"grad_norm": 1.1747200892285241,
"learning_rate": 6.691502837128632e-05,
"loss": 2.7255,
"step": 6620
},
{
"epoch": 18.4958217270195,
"grad_norm": 0.9921075391724958,
"learning_rate": 6.684027226672256e-05,
"loss": 2.749,
"step": 6640
},
{
"epoch": 18.551532033426184,
"grad_norm": 1.1919910460153382,
"learning_rate": 6.676534863937394e-05,
"loss": 2.7244,
"step": 6660
},
{
"epoch": 18.60724233983287,
"grad_norm": 0.9700416046708717,
"learning_rate": 6.669025800490496e-05,
"loss": 2.7578,
"step": 6680
},
{
"epoch": 18.662952646239553,
"grad_norm": 1.7798390717297705,
"learning_rate": 6.66150008801296e-05,
"loss": 2.7497,
"step": 6700
},
{
"epoch": 18.71866295264624,
"grad_norm": 1.3075654856552488,
"learning_rate": 6.653957778300764e-05,
"loss": 2.7627,
"step": 6720
},
{
"epoch": 18.774373259052926,
"grad_norm": 1.109643118621012,
"learning_rate": 6.646398923264127e-05,
"loss": 2.7451,
"step": 6740
},
{
"epoch": 18.83008356545961,
"grad_norm": 1.0116510422849132,
"learning_rate": 6.638823574927133e-05,
"loss": 2.7904,
"step": 6760
},
{
"epoch": 18.885793871866294,
"grad_norm": 1.154541906977704,
"learning_rate": 6.631231785427385e-05,
"loss": 2.7375,
"step": 6780
},
{
"epoch": 18.94150417827298,
"grad_norm": 0.9834780422588326,
"learning_rate": 6.623623607015642e-05,
"loss": 2.7324,
"step": 6800
},
{
"epoch": 18.997214484679667,
"grad_norm": 1.0967675089569602,
"learning_rate": 6.615999092055462e-05,
"loss": 2.7377,
"step": 6820
},
{
"epoch": 19.05292479108635,
"grad_norm": 1.033720234986334,
"learning_rate": 6.608358293022839e-05,
"loss": 2.7455,
"step": 6840
},
{
"epoch": 19.108635097493035,
"grad_norm": 1.4074135325652137,
"learning_rate": 6.600701262505844e-05,
"loss": 2.7175,
"step": 6860
},
{
"epoch": 19.16434540389972,
"grad_norm": 1.2217671501431764,
"learning_rate": 6.593028053204258e-05,
"loss": 2.7459,
"step": 6880
},
{
"epoch": 19.220055710306408,
"grad_norm": 1.157829388585607,
"learning_rate": 6.585338717929218e-05,
"loss": 2.7437,
"step": 6900
},
{
"epoch": 19.275766016713092,
"grad_norm": 1.0614455080094343,
"learning_rate": 6.577633309602842e-05,
"loss": 2.7703,
"step": 6920
},
{
"epoch": 19.331476323119777,
"grad_norm": 1.1146275976097708,
"learning_rate": 6.569911881257878e-05,
"loss": 2.7435,
"step": 6940
},
{
"epoch": 19.38718662952646,
"grad_norm": 1.0865432136684192,
"learning_rate": 6.56217448603733e-05,
"loss": 2.732,
"step": 6960
},
{
"epoch": 19.44289693593315,
"grad_norm": 1.1353040144610025,
"learning_rate": 6.554421177194095e-05,
"loss": 2.7285,
"step": 6980
},
{
"epoch": 19.498607242339833,
"grad_norm": 1.0801239077744584,
"learning_rate": 6.546652008090591e-05,
"loss": 2.7449,
"step": 7000
},
{
"epoch": 19.554317548746518,
"grad_norm": 1.3301081392287104,
"learning_rate": 6.538867032198405e-05,
"loss": 2.758,
"step": 7020
},
{
"epoch": 19.610027855153202,
"grad_norm": 1.378308356595679,
"learning_rate": 6.531066303097907e-05,
"loss": 2.7296,
"step": 7040
},
{
"epoch": 19.66573816155989,
"grad_norm": 1.0272775678623267,
"learning_rate": 6.523249874477889e-05,
"loss": 2.7366,
"step": 7060
},
{
"epoch": 19.721448467966574,
"grad_norm": 1.3280539231741249,
"learning_rate": 6.515417800135199e-05,
"loss": 2.7206,
"step": 7080
},
{
"epoch": 19.77715877437326,
"grad_norm": 1.2190042029662624,
"learning_rate": 6.507570133974366e-05,
"loss": 2.7413,
"step": 7100
},
{
"epoch": 19.832869080779943,
"grad_norm": 1.0253976846781938,
"learning_rate": 6.499706930007227e-05,
"loss": 2.7194,
"step": 7120
},
{
"epoch": 19.88857938718663,
"grad_norm": 1.0998801088450254,
"learning_rate": 6.491828242352565e-05,
"loss": 2.7299,
"step": 7140
},
{
"epoch": 19.944289693593316,
"grad_norm": 1.2547720080479265,
"learning_rate": 6.483934125235726e-05,
"loss": 2.6907,
"step": 7160
},
{
"epoch": 20.0,
"grad_norm": 1.1249680628198624,
"learning_rate": 6.47602463298825e-05,
"loss": 2.7427,
"step": 7180
},
{
"epoch": 20.055710306406684,
"grad_norm": 1.438020280162261,
"learning_rate": 6.468099820047495e-05,
"loss": 2.7324,
"step": 7200
},
{
"epoch": 20.11142061281337,
"grad_norm": 1.2513883791091014,
"learning_rate": 6.46015974095627e-05,
"loss": 2.7433,
"step": 7220
},
{
"epoch": 20.167130919220057,
"grad_norm": 1.1732322456488424,
"learning_rate": 6.452204450362446e-05,
"loss": 2.7287,
"step": 7240
},
{
"epoch": 20.22284122562674,
"grad_norm": 1.099413306167727,
"learning_rate": 6.444234003018595e-05,
"loss": 2.7166,
"step": 7260
},
{
"epoch": 20.278551532033426,
"grad_norm": 1.1933888070643845,
"learning_rate": 6.436248453781604e-05,
"loss": 2.7084,
"step": 7280
},
{
"epoch": 20.33426183844011,
"grad_norm": 1.1295980078738417,
"learning_rate": 6.428247857612295e-05,
"loss": 2.7101,
"step": 7300
},
{
"epoch": 20.389972144846798,
"grad_norm": 1.1784232492865596,
"learning_rate": 6.420232269575055e-05,
"loss": 2.7238,
"step": 7320
},
{
"epoch": 20.445682451253482,
"grad_norm": 1.600869680659427,
"learning_rate": 6.412201744837451e-05,
"loss": 2.7048,
"step": 7340
},
{
"epoch": 20.501392757660167,
"grad_norm": 1.15499859140262,
"learning_rate": 6.404156338669859e-05,
"loss": 2.6977,
"step": 7360
},
{
"epoch": 20.55710306406685,
"grad_norm": 1.1378208760575172,
"learning_rate": 6.396096106445064e-05,
"loss": 2.7181,
"step": 7380
},
{
"epoch": 20.61281337047354,
"grad_norm": 1.123871481777876,
"learning_rate": 6.388021103637904e-05,
"loss": 2.7155,
"step": 7400
},
{
"epoch": 20.668523676880223,
"grad_norm": 1.0841648486082098,
"learning_rate": 6.37993138582487e-05,
"loss": 2.7354,
"step": 7420
},
{
"epoch": 20.724233983286908,
"grad_norm": 1.1664188982324037,
"learning_rate": 6.371827008683732e-05,
"loss": 2.7238,
"step": 7440
},
{
"epoch": 20.779944289693592,
"grad_norm": 1.104589633589391,
"learning_rate": 6.363708027993152e-05,
"loss": 2.6975,
"step": 7460
},
{
"epoch": 20.83565459610028,
"grad_norm": 1.421916066233274,
"learning_rate": 6.355574499632301e-05,
"loss": 2.7423,
"step": 7480
},
{
"epoch": 20.891364902506965,
"grad_norm": 1.081254160033794,
"learning_rate": 6.347426479580477e-05,
"loss": 2.725,
"step": 7500
},
{
"epoch": 20.94707520891365,
"grad_norm": 1.1098202109359585,
"learning_rate": 6.339264023916715e-05,
"loss": 2.7272,
"step": 7520
},
{
"epoch": 21.002785515320333,
"grad_norm": 1.0514058782353737,
"learning_rate": 6.331087188819405e-05,
"loss": 2.739,
"step": 7540
},
{
"epoch": 21.058495821727018,
"grad_norm": 1.1184046430910828,
"learning_rate": 6.322896030565905e-05,
"loss": 2.703,
"step": 7560
},
{
"epoch": 21.114206128133706,
"grad_norm": 1.1842229516973584,
"learning_rate": 6.31469060553215e-05,
"loss": 2.7151,
"step": 7580
},
{
"epoch": 21.16991643454039,
"grad_norm": 1.0858640050274855,
"learning_rate": 6.30647097019227e-05,
"loss": 2.7033,
"step": 7600
},
{
"epoch": 21.225626740947074,
"grad_norm": 1.2333467648992407,
"learning_rate": 6.298237181118193e-05,
"loss": 2.6952,
"step": 7620
},
{
"epoch": 21.28133704735376,
"grad_norm": 1.3879484824315327,
"learning_rate": 6.289989294979264e-05,
"loss": 2.7024,
"step": 7640
},
{
"epoch": 21.337047353760447,
"grad_norm": 1.2106236210149603,
"learning_rate": 6.281727368541853e-05,
"loss": 2.7047,
"step": 7660
},
{
"epoch": 21.39275766016713,
"grad_norm": 1.0115849386843587,
"learning_rate": 6.273451458668961e-05,
"loss": 2.7075,
"step": 7680
},
{
"epoch": 21.448467966573816,
"grad_norm": 1.051437728204155,
"learning_rate": 6.265161622319829e-05,
"loss": 2.7247,
"step": 7700
},
{
"epoch": 21.5041782729805,
"grad_norm": 1.1019904640067848,
"learning_rate": 6.256857916549548e-05,
"loss": 2.691,
"step": 7720
},
{
"epoch": 21.559888579387188,
"grad_norm": 1.1014027785637281,
"learning_rate": 6.248540398508673e-05,
"loss": 2.6992,
"step": 7740
},
{
"epoch": 21.615598885793872,
"grad_norm": 1.0988814252962915,
"learning_rate": 6.240209125442806e-05,
"loss": 2.714,
"step": 7760
},
{
"epoch": 21.671309192200557,
"grad_norm": 1.2025654774415926,
"learning_rate": 6.231864154692237e-05,
"loss": 2.7042,
"step": 7780
},
{
"epoch": 21.72701949860724,
"grad_norm": 1.1583280781305814,
"learning_rate": 6.223505543691518e-05,
"loss": 2.7081,
"step": 7800
},
{
"epoch": 21.78272980501393,
"grad_norm": 1.1105375147778487,
"learning_rate": 6.215133349969086e-05,
"loss": 2.6869,
"step": 7820
},
{
"epoch": 21.838440111420613,
"grad_norm": 1.1146300728660752,
"learning_rate": 6.206747631146862e-05,
"loss": 2.6988,
"step": 7840
},
{
"epoch": 21.894150417827298,
"grad_norm": 1.0660501800448277,
"learning_rate": 6.198348444939849e-05,
"loss": 2.6491,
"step": 7860
},
{
"epoch": 21.949860724233982,
"grad_norm": 1.1013215569007113,
"learning_rate": 6.189935849155747e-05,
"loss": 2.7103,
"step": 7880
},
{
"epoch": 22.00557103064067,
"grad_norm": 1.1802406207769038,
"learning_rate": 6.18150990169454e-05,
"loss": 2.7193,
"step": 7900
},
{
"epoch": 22.061281337047355,
"grad_norm": 1.1108973776747364,
"learning_rate": 6.173070660548112e-05,
"loss": 2.6831,
"step": 7920
},
{
"epoch": 22.11699164345404,
"grad_norm": 1.092182682917437,
"learning_rate": 6.16461818379984e-05,
"loss": 2.6557,
"step": 7940
},
{
"epoch": 22.172701949860723,
"grad_norm": 1.145560140100733,
"learning_rate": 6.156152529624193e-05,
"loss": 2.6672,
"step": 7960
},
{
"epoch": 22.228412256267408,
"grad_norm": 1.2287470982936533,
"learning_rate": 6.147673756286334e-05,
"loss": 2.7312,
"step": 7980
},
{
"epoch": 22.284122562674096,
"grad_norm": 1.0846368953094574,
"learning_rate": 6.139181922141721e-05,
"loss": 2.7017,
"step": 8000
},
{
"epoch": 22.33983286908078,
"grad_norm": 1.1767612465335586,
"learning_rate": 6.130677085635704e-05,
"loss": 2.7118,
"step": 8020
},
{
"epoch": 22.395543175487465,
"grad_norm": 1.1460490982941247,
"learning_rate": 6.12215930530312e-05,
"loss": 2.6819,
"step": 8040
},
{
"epoch": 22.45125348189415,
"grad_norm": 1.1479085095640083,
"learning_rate": 6.113628639767893e-05,
"loss": 2.6877,
"step": 8060
},
{
"epoch": 22.506963788300837,
"grad_norm": 1.1375246029213462,
"learning_rate": 6.105085147742632e-05,
"loss": 2.6925,
"step": 8080
},
{
"epoch": 22.56267409470752,
"grad_norm": 1.1123231981092023,
"learning_rate": 6.0965288880282214e-05,
"loss": 2.6822,
"step": 8100
},
{
"epoch": 22.618384401114206,
"grad_norm": 1.2752545626840799,
"learning_rate": 6.087959919513422e-05,
"loss": 2.7205,
"step": 8120
},
{
"epoch": 22.67409470752089,
"grad_norm": 1.2262605320485462,
"learning_rate": 6.079378301174464e-05,
"loss": 2.6924,
"step": 8140
},
{
"epoch": 22.729805013927578,
"grad_norm": 1.0730006469584497,
"learning_rate": 6.0707840920746374e-05,
"loss": 2.7124,
"step": 8160
},
{
"epoch": 22.785515320334262,
"grad_norm": 1.3257454551313221,
"learning_rate": 6.0621773513638905e-05,
"loss": 2.6762,
"step": 8180
},
{
"epoch": 22.841225626740947,
"grad_norm": 1.2930384450871677,
"learning_rate": 6.0535581382784216e-05,
"loss": 2.6623,
"step": 8200
},
{
"epoch": 22.89693593314763,
"grad_norm": 1.139885910912536,
"learning_rate": 6.0449265121402686e-05,
"loss": 2.6867,
"step": 8220
},
{
"epoch": 22.95264623955432,
"grad_norm": 1.125710423580146,
"learning_rate": 6.036282532356904e-05,
"loss": 2.6742,
"step": 8240
},
{
"epoch": 23.008356545961004,
"grad_norm": 1.131377041116149,
"learning_rate": 6.027626258420825e-05,
"loss": 2.7031,
"step": 8260
},
{
"epoch": 23.064066852367688,
"grad_norm": 1.0869417103268262,
"learning_rate": 6.0189577499091424e-05,
"loss": 2.6683,
"step": 8280
},
{
"epoch": 23.119777158774372,
"grad_norm": 1.1391157676417962,
"learning_rate": 6.010277066483174e-05,
"loss": 2.707,
"step": 8300
},
{
"epoch": 23.175487465181057,
"grad_norm": 1.2945583722046563,
"learning_rate": 6.001584267888028e-05,
"loss": 2.6522,
"step": 8320
},
{
"epoch": 23.231197771587745,
"grad_norm": 1.1565608753713996,
"learning_rate": 5.9928794139522025e-05,
"loss": 2.6717,
"step": 8340
},
{
"epoch": 23.28690807799443,
"grad_norm": 1.2394122917368424,
"learning_rate": 5.9841625645871575e-05,
"loss": 2.7024,
"step": 8360
},
{
"epoch": 23.342618384401113,
"grad_norm": 1.0903861377505122,
"learning_rate": 5.975433779786921e-05,
"loss": 2.6455,
"step": 8380
},
{
"epoch": 23.398328690807798,
"grad_norm": 1.0661921555569798,
"learning_rate": 5.966693119627662e-05,
"loss": 2.6706,
"step": 8400
},
{
"epoch": 23.454038997214486,
"grad_norm": 1.0655312130218653,
"learning_rate": 5.957940644267282e-05,
"loss": 2.6906,
"step": 8420
},
{
"epoch": 23.50974930362117,
"grad_norm": 1.1134166788328776,
"learning_rate": 5.949176413945003e-05,
"loss": 2.6432,
"step": 8440
},
{
"epoch": 23.565459610027855,
"grad_norm": 1.650527244066039,
"learning_rate": 5.94040048898095e-05,
"loss": 2.6937,
"step": 8460
},
{
"epoch": 23.62116991643454,
"grad_norm": 1.1235056862121802,
"learning_rate": 5.931612929775738e-05,
"loss": 2.6705,
"step": 8480
},
{
"epoch": 23.676880222841227,
"grad_norm": 1.0368298826041793,
"learning_rate": 5.922813796810054e-05,
"loss": 2.6724,
"step": 8500
},
{
"epoch": 23.73259052924791,
"grad_norm": 1.215524406135999,
"learning_rate": 5.914003150644242e-05,
"loss": 2.6768,
"step": 8520
},
{
"epoch": 23.788300835654596,
"grad_norm": 1.2276511679169064,
"learning_rate": 5.905181051917883e-05,
"loss": 2.7046,
"step": 8540
},
{
"epoch": 23.84401114206128,
"grad_norm": 1.0144214891191534,
"learning_rate": 5.896347561349387e-05,
"loss": 2.652,
"step": 8560
},
{
"epoch": 23.899721448467968,
"grad_norm": 1.206243520777862,
"learning_rate": 5.887502739735565e-05,
"loss": 2.6965,
"step": 8580
},
{
"epoch": 23.955431754874652,
"grad_norm": 1.0056037720791713,
"learning_rate": 5.878646647951213e-05,
"loss": 2.6475,
"step": 8600
},
{
"epoch": 24.011142061281337,
"grad_norm": 1.0641520059001517,
"learning_rate": 5.8697793469486964e-05,
"loss": 2.6991,
"step": 8620
},
{
"epoch": 24.06685236768802,
"grad_norm": 1.2174985172315418,
"learning_rate": 5.860900897757528e-05,
"loss": 2.6711,
"step": 8640
},
{
"epoch": 24.12256267409471,
"grad_norm": 1.0515823218930573,
"learning_rate": 5.852011361483949e-05,
"loss": 2.6625,
"step": 8660
},
{
"epoch": 24.178272980501394,
"grad_norm": 1.3132128877951654,
"learning_rate": 5.8431107993105076e-05,
"loss": 2.6604,
"step": 8680
},
{
"epoch": 24.233983286908078,
"grad_norm": 1.0603177321270034,
"learning_rate": 5.834199272495636e-05,
"loss": 2.6663,
"step": 8700
},
{
"epoch": 24.289693593314762,
"grad_norm": 1.262687868955379,
"learning_rate": 5.8252768423732364e-05,
"loss": 2.6708,
"step": 8720
},
{
"epoch": 24.345403899721447,
"grad_norm": 1.0873862910591228,
"learning_rate": 5.816343570352244e-05,
"loss": 2.6367,
"step": 8740
},
{
"epoch": 24.401114206128135,
"grad_norm": 1.2746436419501377,
"learning_rate": 5.8073995179162254e-05,
"loss": 2.7081,
"step": 8760
},
{
"epoch": 24.45682451253482,
"grad_norm": 1.092203478224612,
"learning_rate": 5.798444746622934e-05,
"loss": 2.6693,
"step": 8780
},
{
"epoch": 24.512534818941504,
"grad_norm": 1.1045845154960057,
"learning_rate": 5.7894793181039e-05,
"loss": 2.6981,
"step": 8800
},
{
"epoch": 24.568245125348188,
"grad_norm": 1.139876543282688,
"learning_rate": 5.780503294064005e-05,
"loss": 2.6539,
"step": 8820
},
{
"epoch": 24.623955431754876,
"grad_norm": 1.1892780737352568,
"learning_rate": 5.771516736281051e-05,
"loss": 2.6676,
"step": 8840
},
{
"epoch": 24.67966573816156,
"grad_norm": 1.124479629411898,
"learning_rate": 5.7625197066053374e-05,
"loss": 2.6712,
"step": 8860
},
{
"epoch": 24.735376044568245,
"grad_norm": 1.078433196751875,
"learning_rate": 5.753512266959242e-05,
"loss": 2.6658,
"step": 8880
},
{
"epoch": 24.79108635097493,
"grad_norm": 1.0663315697490754,
"learning_rate": 5.744494479336786e-05,
"loss": 2.6488,
"step": 8900
},
{
"epoch": 24.846796657381617,
"grad_norm": 1.1044363572012328,
"learning_rate": 5.735466405803211e-05,
"loss": 2.6905,
"step": 8920
},
{
"epoch": 24.9025069637883,
"grad_norm": 1.0926049616035345,
"learning_rate": 5.7264281084945534e-05,
"loss": 2.6744,
"step": 8940
},
{
"epoch": 24.958217270194986,
"grad_norm": 1.0597627637210976,
"learning_rate": 5.717379649617212e-05,
"loss": 2.6501,
"step": 8960
},
{
"epoch": 25.01392757660167,
"grad_norm": 0.9918782369429666,
"learning_rate": 5.70832109144753e-05,
"loss": 2.6394,
"step": 8980
},
{
"epoch": 25.069637883008358,
"grad_norm": 1.1550215185121195,
"learning_rate": 5.6992524963313494e-05,
"loss": 2.6491,
"step": 9000
},
{
"epoch": 25.125348189415043,
"grad_norm": 1.2681147669552022,
"learning_rate": 5.6901739266835976e-05,
"loss": 2.6637,
"step": 9020
},
{
"epoch": 25.181058495821727,
"grad_norm": 1.1669892724232989,
"learning_rate": 5.681085444987855e-05,
"loss": 2.6595,
"step": 9040
},
{
"epoch": 25.23676880222841,
"grad_norm": 1.118544311154742,
"learning_rate": 5.6719871137959136e-05,
"loss": 2.6602,
"step": 9060
},
{
"epoch": 25.2924791086351,
"grad_norm": 1.1886485765626151,
"learning_rate": 5.6628789957273634e-05,
"loss": 2.6209,
"step": 9080
},
{
"epoch": 25.348189415041784,
"grad_norm": 1.170833233617083,
"learning_rate": 5.653761153469147e-05,
"loss": 2.6986,
"step": 9100
},
{
"epoch": 25.403899721448468,
"grad_norm": 1.248618232385229,
"learning_rate": 5.644633649775136e-05,
"loss": 2.6686,
"step": 9120
},
{
"epoch": 25.459610027855152,
"grad_norm": 1.1165667219670397,
"learning_rate": 5.6354965474657e-05,
"loss": 2.6708,
"step": 9140
},
{
"epoch": 25.515320334261837,
"grad_norm": 1.1574567530077358,
"learning_rate": 5.626349909427265e-05,
"loss": 2.6521,
"step": 9160
},
{
"epoch": 25.571030640668525,
"grad_norm": 1.0835835883220495,
"learning_rate": 5.617193798611895e-05,
"loss": 2.6581,
"step": 9180
},
{
"epoch": 25.62674094707521,
"grad_norm": 1.144587311838764,
"learning_rate": 5.6080282780368435e-05,
"loss": 2.6602,
"step": 9200
},
{
"epoch": 25.682451253481894,
"grad_norm": 1.112741624680344,
"learning_rate": 5.598853410784133e-05,
"loss": 2.6598,
"step": 9220
},
{
"epoch": 25.738161559888578,
"grad_norm": 1.168044637764499,
"learning_rate": 5.589669260000109e-05,
"loss": 2.6645,
"step": 9240
},
{
"epoch": 25.793871866295266,
"grad_norm": 1.120774568517253,
"learning_rate": 5.580475888895015e-05,
"loss": 2.6602,
"step": 9260
},
{
"epoch": 25.84958217270195,
"grad_norm": 1.2114817500368666,
"learning_rate": 5.571273360742552e-05,
"loss": 2.6328,
"step": 9280
},
{
"epoch": 25.905292479108635,
"grad_norm": 1.2356256005701014,
"learning_rate": 5.5620617388794466e-05,
"loss": 2.6384,
"step": 9300
},
{
"epoch": 25.96100278551532,
"grad_norm": 1.2235414625020526,
"learning_rate": 5.552841086705014e-05,
"loss": 2.6681,
"step": 9320
},
{
"epoch": 26.016713091922007,
"grad_norm": 1.2819636542569275,
"learning_rate": 5.5436114676807156e-05,
"loss": 2.6561,
"step": 9340
},
{
"epoch": 26.07242339832869,
"grad_norm": 1.2032746744995522,
"learning_rate": 5.534372945329733e-05,
"loss": 2.6384,
"step": 9360
},
{
"epoch": 26.128133704735376,
"grad_norm": 1.1226061787109827,
"learning_rate": 5.525125583236522e-05,
"loss": 2.6294,
"step": 9380
},
{
"epoch": 26.18384401114206,
"grad_norm": 1.021042107378643,
"learning_rate": 5.515869445046379e-05,
"loss": 2.6588,
"step": 9400
},
{
"epoch": 26.23955431754875,
"grad_norm": 1.291042490575208,
"learning_rate": 5.506604594465004e-05,
"loss": 2.6264,
"step": 9420
},
{
"epoch": 26.295264623955433,
"grad_norm": 1.1032208560398462,
"learning_rate": 5.4973310952580576e-05,
"loss": 2.6169,
"step": 9440
},
{
"epoch": 26.350974930362117,
"grad_norm": 1.2761063536817212,
"learning_rate": 5.488049011250727e-05,
"loss": 2.6506,
"step": 9460
},
{
"epoch": 26.4066852367688,
"grad_norm": 1.1511514672695646,
"learning_rate": 5.478758406327282e-05,
"loss": 2.6698,
"step": 9480
},
{
"epoch": 26.462395543175486,
"grad_norm": 1.0992455713094436,
"learning_rate": 5.469459344430642e-05,
"loss": 2.6097,
"step": 9500
},
{
"epoch": 26.518105849582174,
"grad_norm": 1.172205701527116,
"learning_rate": 5.4601518895619284e-05,
"loss": 2.6293,
"step": 9520
},
{
"epoch": 26.573816155988858,
"grad_norm": 1.2031374723785744,
"learning_rate": 5.4508361057800276e-05,
"loss": 2.6199,
"step": 9540
},
{
"epoch": 26.629526462395543,
"grad_norm": 1.0768039312990048,
"learning_rate": 5.441512057201152e-05,
"loss": 2.6497,
"step": 9560
},
{
"epoch": 26.685236768802227,
"grad_norm": 1.182782939690952,
"learning_rate": 5.432179807998395e-05,
"loss": 2.6439,
"step": 9580
},
{
"epoch": 26.740947075208915,
"grad_norm": 1.1202931202000697,
"learning_rate": 5.422839422401295e-05,
"loss": 2.622,
"step": 9600
},
{
"epoch": 26.7966573816156,
"grad_norm": 1.291606489378618,
"learning_rate": 5.413490964695381e-05,
"loss": 2.6146,
"step": 9620
},
{
"epoch": 26.852367688022284,
"grad_norm": 1.1538604314310363,
"learning_rate": 5.404134499221748e-05,
"loss": 2.6338,
"step": 9640
},
{
"epoch": 26.908077994428968,
"grad_norm": 1.6695600582971142,
"learning_rate": 5.3947700903765986e-05,
"loss": 2.6499,
"step": 9660
},
{
"epoch": 26.963788300835656,
"grad_norm": 1.245827651961817,
"learning_rate": 5.3853978026108086e-05,
"loss": 2.6421,
"step": 9680
},
{
"epoch": 27.01949860724234,
"grad_norm": 1.173940924924453,
"learning_rate": 5.37601770042948e-05,
"loss": 2.6403,
"step": 9700
},
{
"epoch": 27.075208913649025,
"grad_norm": 1.0519719376965715,
"learning_rate": 5.3666298483914984e-05,
"loss": 2.6203,
"step": 9720
},
{
"epoch": 27.13091922005571,
"grad_norm": 1.133593745206024,
"learning_rate": 5.357234311109086e-05,
"loss": 2.6574,
"step": 9740
},
{
"epoch": 27.186629526462397,
"grad_norm": 1.2653772970355646,
"learning_rate": 5.347831153247361e-05,
"loss": 2.6414,
"step": 9760
},
{
"epoch": 27.24233983286908,
"grad_norm": 1.109574259928109,
"learning_rate": 5.338420439523891e-05,
"loss": 2.6147,
"step": 9780
},
{
"epoch": 27.298050139275766,
"grad_norm": 1.2411069069646816,
"learning_rate": 5.329002234708245e-05,
"loss": 2.608,
"step": 9800
},
{
"epoch": 27.35376044568245,
"grad_norm": 1.1182716681713758,
"learning_rate": 5.319576603621553e-05,
"loss": 2.6413,
"step": 9820
},
{
"epoch": 27.409470752089135,
"grad_norm": 1.2003086684148825,
"learning_rate": 5.3101436111360504e-05,
"loss": 2.6275,
"step": 9840
},
{
"epoch": 27.465181058495823,
"grad_norm": 1.0717097302386294,
"learning_rate": 5.300703322174646e-05,
"loss": 2.6328,
"step": 9860
},
{
"epoch": 27.520891364902507,
"grad_norm": 1.4444979446572614,
"learning_rate": 5.29125580171046e-05,
"loss": 2.6201,
"step": 9880
},
{
"epoch": 27.57660167130919,
"grad_norm": 1.0746363462650246,
"learning_rate": 5.281801114766385e-05,
"loss": 2.6123,
"step": 9900
},
{
"epoch": 27.632311977715876,
"grad_norm": 1.0661192079701574,
"learning_rate": 5.272339326414642e-05,
"loss": 2.5964,
"step": 9920
},
{
"epoch": 27.688022284122564,
"grad_norm": 1.1015082831508238,
"learning_rate": 5.262870501776321e-05,
"loss": 2.5953,
"step": 9940
},
{
"epoch": 27.74373259052925,
"grad_norm": 1.1382427443811807,
"learning_rate": 5.253394706020944e-05,
"loss": 2.6181,
"step": 9960
},
{
"epoch": 27.799442896935933,
"grad_norm": 1.0943370244357078,
"learning_rate": 5.243912004366008e-05,
"loss": 2.6116,
"step": 9980
},
{
"epoch": 27.855153203342617,
"grad_norm": 1.0772056082741257,
"learning_rate": 5.234422462076547e-05,
"loss": 2.5998,
"step": 10000
},
{
"epoch": 27.910863509749305,
"grad_norm": 1.0546759553565561,
"learning_rate": 5.2249261444646674e-05,
"loss": 2.5937,
"step": 10020
},
{
"epoch": 27.96657381615599,
"grad_norm": 1.176478667272748,
"learning_rate": 5.2154231168891134e-05,
"loss": 2.6093,
"step": 10040
},
{
"epoch": 28.022284122562674,
"grad_norm": 1.1321140906627838,
"learning_rate": 5.2059134447548076e-05,
"loss": 2.6229,
"step": 10060
},
{
"epoch": 28.077994428969358,
"grad_norm": 1.065907796206171,
"learning_rate": 5.196397193512405e-05,
"loss": 2.6205,
"step": 10080
},
{
"epoch": 28.133704735376046,
"grad_norm": 1.233651032400036,
"learning_rate": 5.1868744286578406e-05,
"loss": 2.5931,
"step": 10100
},
{
"epoch": 28.18941504178273,
"grad_norm": 1.0599465164229271,
"learning_rate": 5.177345215731881e-05,
"loss": 2.6147,
"step": 10120
},
{
"epoch": 28.245125348189415,
"grad_norm": 1.1538820509055618,
"learning_rate": 5.167809620319672e-05,
"loss": 2.6167,
"step": 10140
},
{
"epoch": 28.3008356545961,
"grad_norm": 1.154317454601146,
"learning_rate": 5.158267708050286e-05,
"loss": 2.5937,
"step": 10160
},
{
"epoch": 28.356545961002787,
"grad_norm": 1.0665328766519204,
"learning_rate": 5.1487195445962715e-05,
"loss": 2.604,
"step": 10180
},
{
"epoch": 28.41225626740947,
"grad_norm": 1.1694960107465548,
"learning_rate": 5.139165195673201e-05,
"loss": 2.5995,
"step": 10200
},
{
"epoch": 28.467966573816156,
"grad_norm": 1.0526752329813267,
"learning_rate": 5.1296047270392175e-05,
"loss": 2.6209,
"step": 10220
},
{
"epoch": 28.52367688022284,
"grad_norm": 1.1414513270535445,
"learning_rate": 5.120038204494588e-05,
"loss": 2.5929,
"step": 10240
},
{
"epoch": 28.579387186629525,
"grad_norm": 1.0966114959097728,
"learning_rate": 5.1104656938812394e-05,
"loss": 2.5924,
"step": 10260
},
{
"epoch": 28.635097493036213,
"grad_norm": 1.1174765080610298,
"learning_rate": 5.1008872610823155e-05,
"loss": 2.6202,
"step": 10280
},
{
"epoch": 28.690807799442897,
"grad_norm": 1.1145726415269017,
"learning_rate": 5.091302972021719e-05,
"loss": 2.5968,
"step": 10300
},
{
"epoch": 28.74651810584958,
"grad_norm": 1.0586956135779206,
"learning_rate": 5.08171289266366e-05,
"loss": 2.6123,
"step": 10320
},
{
"epoch": 28.802228412256266,
"grad_norm": 1.3940508320205856,
"learning_rate": 5.072117089012195e-05,
"loss": 2.597,
"step": 10340
},
{
"epoch": 28.857938718662954,
"grad_norm": 1.0350493924845274,
"learning_rate": 5.062515627110785e-05,
"loss": 2.6207,
"step": 10360
},
{
"epoch": 28.91364902506964,
"grad_norm": 1.2315404210100842,
"learning_rate": 5.0529085730418306e-05,
"loss": 2.6179,
"step": 10380
},
{
"epoch": 28.969359331476323,
"grad_norm": 1.1045820694597503,
"learning_rate": 5.0432959929262205e-05,
"loss": 2.6008,
"step": 10400
},
{
"epoch": 29.025069637883007,
"grad_norm": 1.146579867692895,
"learning_rate": 5.03367795292288e-05,
"loss": 2.6202,
"step": 10420
},
{
"epoch": 29.080779944289695,
"grad_norm": 1.2733330944071208,
"learning_rate": 5.0240545192283056e-05,
"loss": 2.6123,
"step": 10440
},
{
"epoch": 29.13649025069638,
"grad_norm": 1.0494794859758667,
"learning_rate": 5.0144257580761224e-05,
"loss": 2.5829,
"step": 10460
},
{
"epoch": 29.192200557103064,
"grad_norm": 1.1190765883905507,
"learning_rate": 5.0047917357366194e-05,
"loss": 2.6223,
"step": 10480
},
{
"epoch": 29.24791086350975,
"grad_norm": 1.5745640876405715,
"learning_rate": 4.995152518516296e-05,
"loss": 2.6133,
"step": 10500
},
{
"epoch": 29.303621169916436,
"grad_norm": 1.1765754248618923,
"learning_rate": 4.9855081727574066e-05,
"loss": 2.6047,
"step": 10520
},
{
"epoch": 29.35933147632312,
"grad_norm": 1.498763487641365,
"learning_rate": 4.975858764837501e-05,
"loss": 2.5656,
"step": 10540
},
{
"epoch": 29.415041782729805,
"grad_norm": 1.1132252850299105,
"learning_rate": 4.966204361168971e-05,
"loss": 2.5914,
"step": 10560
},
{
"epoch": 29.47075208913649,
"grad_norm": 1.1686019155914007,
"learning_rate": 4.956545028198591e-05,
"loss": 2.5874,
"step": 10580
},
{
"epoch": 29.526462395543177,
"grad_norm": 1.1003078614700978,
"learning_rate": 4.946880832407062e-05,
"loss": 2.6143,
"step": 10600
},
{
"epoch": 29.58217270194986,
"grad_norm": 1.0784823374299444,
"learning_rate": 4.937211840308553e-05,
"loss": 2.6153,
"step": 10620
},
{
"epoch": 29.637883008356546,
"grad_norm": 1.2454601562477818,
"learning_rate": 4.927538118450244e-05,
"loss": 2.5872,
"step": 10640
},
{
"epoch": 29.69359331476323,
"grad_norm": 1.2988040165269858,
"learning_rate": 4.917859733411869e-05,
"loss": 2.603,
"step": 10660
},
{
"epoch": 29.749303621169915,
"grad_norm": 1.1745460752963417,
"learning_rate": 4.908176751805253e-05,
"loss": 2.5681,
"step": 10680
},
{
"epoch": 29.805013927576603,
"grad_norm": 1.3297312991458439,
"learning_rate": 4.898489240273864e-05,
"loss": 2.6095,
"step": 10700
},
{
"epoch": 29.860724233983287,
"grad_norm": 1.1339679906193685,
"learning_rate": 4.888797265492338e-05,
"loss": 2.6067,
"step": 10720
},
{
"epoch": 29.91643454038997,
"grad_norm": 1.1806251948614392,
"learning_rate": 4.879100894166038e-05,
"loss": 2.5967,
"step": 10740
},
{
"epoch": 29.972144846796656,
"grad_norm": 1.165860935100357,
"learning_rate": 4.8694001930305794e-05,
"loss": 2.5785,
"step": 10760
},
{
"epoch": 30.027855153203344,
"grad_norm": 1.3776136936276104,
"learning_rate": 4.859695228851381e-05,
"loss": 2.5897,
"step": 10780
},
{
"epoch": 30.08356545961003,
"grad_norm": 1.2587071827316874,
"learning_rate": 4.8499860684232066e-05,
"loss": 2.5797,
"step": 10800
},
{
"epoch": 30.139275766016713,
"grad_norm": 1.1970210105742216,
"learning_rate": 4.84027277856969e-05,
"loss": 2.5672,
"step": 10820
},
{
"epoch": 30.194986072423397,
"grad_norm": 1.3454554228099718,
"learning_rate": 4.830555426142899e-05,
"loss": 2.5934,
"step": 10840
},
{
"epoch": 30.250696378830085,
"grad_norm": 1.264747247911362,
"learning_rate": 4.8208340780228475e-05,
"loss": 2.5894,
"step": 10860
},
{
"epoch": 30.30640668523677,
"grad_norm": 1.05872829319226,
"learning_rate": 4.811108801117065e-05,
"loss": 2.5867,
"step": 10880
},
{
"epoch": 30.362116991643454,
"grad_norm": 1.084586036460413,
"learning_rate": 4.80137966236011e-05,
"loss": 2.5901,
"step": 10900
},
{
"epoch": 30.41782729805014,
"grad_norm": 1.229834581800877,
"learning_rate": 4.7916467287131244e-05,
"loss": 2.5604,
"step": 10920
},
{
"epoch": 30.473537604456826,
"grad_norm": 1.4451033607435961,
"learning_rate": 4.7819100671633706e-05,
"loss": 2.597,
"step": 10940
},
{
"epoch": 30.52924791086351,
"grad_norm": 1.2548193540172925,
"learning_rate": 4.772169744723762e-05,
"loss": 2.5529,
"step": 10960
},
{
"epoch": 30.584958217270195,
"grad_norm": 1.5052254853750195,
"learning_rate": 4.762425828432416e-05,
"loss": 2.6054,
"step": 10980
},
{
"epoch": 30.64066852367688,
"grad_norm": 1.1519592174538793,
"learning_rate": 4.7526783853521796e-05,
"loss": 2.5836,
"step": 11000
},
{
"epoch": 30.696378830083564,
"grad_norm": 1.1375488161597391,
"learning_rate": 4.742927482570176e-05,
"loss": 2.5621,
"step": 11020
},
{
"epoch": 30.75208913649025,
"grad_norm": 1.0552495588001027,
"learning_rate": 4.733173187197335e-05,
"loss": 2.5886,
"step": 11040
},
{
"epoch": 30.807799442896936,
"grad_norm": 1.2026052803495149,
"learning_rate": 4.723415566367945e-05,
"loss": 2.576,
"step": 11060
},
{
"epoch": 30.86350974930362,
"grad_norm": 1.2031132145653618,
"learning_rate": 4.713654687239171e-05,
"loss": 2.5871,
"step": 11080
},
{
"epoch": 30.919220055710305,
"grad_norm": 1.152263728928741,
"learning_rate": 4.703890616990612e-05,
"loss": 2.586,
"step": 11100
},
{
"epoch": 30.974930362116993,
"grad_norm": 1.3435496121402817,
"learning_rate": 4.6941234228238256e-05,
"loss": 2.5813,
"step": 11120
},
{
"epoch": 31.030640668523677,
"grad_norm": 1.1328862858818538,
"learning_rate": 4.684353171961873e-05,
"loss": 2.5917,
"step": 11140
},
{
"epoch": 31.08635097493036,
"grad_norm": 1.2484233364341746,
"learning_rate": 4.674579931648851e-05,
"loss": 2.5619,
"step": 11160
},
{
"epoch": 31.142061281337046,
"grad_norm": 1.3584774397600772,
"learning_rate": 4.664803769149427e-05,
"loss": 2.5569,
"step": 11180
},
{
"epoch": 31.197771587743734,
"grad_norm": 1.1992013763670537,
"learning_rate": 4.6550247517483926e-05,
"loss": 2.5468,
"step": 11200
},
{
"epoch": 31.25348189415042,
"grad_norm": 1.1878315761868057,
"learning_rate": 4.645242946750176e-05,
"loss": 2.5693,
"step": 11220
},
{
"epoch": 31.309192200557103,
"grad_norm": 1.2856028595876672,
"learning_rate": 4.635458421478398e-05,
"loss": 2.5959,
"step": 11240
},
{
"epoch": 31.364902506963787,
"grad_norm": 1.0789628687819908,
"learning_rate": 4.6256712432754e-05,
"loss": 2.5813,
"step": 11260
},
{
"epoch": 31.420612813370475,
"grad_norm": 1.3393871601470777,
"learning_rate": 4.615881479501779e-05,
"loss": 2.5487,
"step": 11280
},
{
"epoch": 31.47632311977716,
"grad_norm": 1.4170061638611984,
"learning_rate": 4.606089197535936e-05,
"loss": 2.5672,
"step": 11300
},
{
"epoch": 31.532033426183844,
"grad_norm": 1.200423025841862,
"learning_rate": 4.5962944647735934e-05,
"loss": 2.5586,
"step": 11320
},
{
"epoch": 31.58774373259053,
"grad_norm": 1.1437564881838662,
"learning_rate": 4.586497348627349e-05,
"loss": 2.5968,
"step": 11340
},
{
"epoch": 31.643454038997213,
"grad_norm": 1.08141997515126,
"learning_rate": 4.576697916526199e-05,
"loss": 2.5688,
"step": 11360
},
{
"epoch": 31.6991643454039,
"grad_norm": 1.2066477041237875,
"learning_rate": 4.5668962359150815e-05,
"loss": 2.593,
"step": 11380
},
{
"epoch": 31.754874651810585,
"grad_norm": 1.2032961351385616,
"learning_rate": 4.557092374254412e-05,
"loss": 2.5883,
"step": 11400
},
{
"epoch": 31.81058495821727,
"grad_norm": 1.1746121842250725,
"learning_rate": 4.547286399019614e-05,
"loss": 2.5669,
"step": 11420
},
{
"epoch": 31.866295264623954,
"grad_norm": 1.1916678733066106,
"learning_rate": 4.53747837770066e-05,
"loss": 2.5613,
"step": 11440
},
{
"epoch": 31.922005571030642,
"grad_norm": 1.151296833307102,
"learning_rate": 4.5276683778015984e-05,
"loss": 2.5574,
"step": 11460
},
{
"epoch": 31.977715877437326,
"grad_norm": 1.1387505510412506,
"learning_rate": 4.517856466840108e-05,
"loss": 2.5778,
"step": 11480
},
{
"epoch": 32.033426183844014,
"grad_norm": 1.1307628256541518,
"learning_rate": 4.50804271234701e-05,
"loss": 2.58,
"step": 11500
},
{
"epoch": 32.089136490250695,
"grad_norm": 1.143654969969148,
"learning_rate": 4.498227181865816e-05,
"loss": 2.5342,
"step": 11520
},
{
"epoch": 32.14484679665738,
"grad_norm": 1.0872870621719841,
"learning_rate": 4.488409942952261e-05,
"loss": 2.5615,
"step": 11540
},
{
"epoch": 32.200557103064064,
"grad_norm": 1.258161916655144,
"learning_rate": 4.478591063173842e-05,
"loss": 2.5566,
"step": 11560
},
{
"epoch": 32.25626740947075,
"grad_norm": 1.2208070919269458,
"learning_rate": 4.468770610109344e-05,
"loss": 2.5549,
"step": 11580
},
{
"epoch": 32.31197771587744,
"grad_norm": 1.169221911082448,
"learning_rate": 4.458948651348383e-05,
"loss": 2.5896,
"step": 11600
},
{
"epoch": 32.36768802228412,
"grad_norm": 1.095014238667817,
"learning_rate": 4.4491252544909394e-05,
"loss": 2.5633,
"step": 11620
},
{
"epoch": 32.42339832869081,
"grad_norm": 1.2970244190994054,
"learning_rate": 4.439300487146887e-05,
"loss": 2.5643,
"step": 11640
},
{
"epoch": 32.4791086350975,
"grad_norm": 1.2886541818686938,
"learning_rate": 4.429474416935536e-05,
"loss": 2.6024,
"step": 11660
},
{
"epoch": 32.53481894150418,
"grad_norm": 1.1502247567681423,
"learning_rate": 4.419647111485162e-05,
"loss": 2.5393,
"step": 11680
},
{
"epoch": 32.590529247910865,
"grad_norm": 1.2887795563474687,
"learning_rate": 4.4098186384325424e-05,
"loss": 2.5511,
"step": 11700
},
{
"epoch": 32.646239554317546,
"grad_norm": 1.1497087680966827,
"learning_rate": 4.399989065422491e-05,
"loss": 2.5538,
"step": 11720
},
{
"epoch": 32.701949860724234,
"grad_norm": 1.2553351424149752,
"learning_rate": 4.39015846010739e-05,
"loss": 2.5896,
"step": 11740
},
{
"epoch": 32.75766016713092,
"grad_norm": 1.1172142499914282,
"learning_rate": 4.380326890146732e-05,
"loss": 2.5503,
"step": 11760
},
{
"epoch": 32.8133704735376,
"grad_norm": 1.0957163400827985,
"learning_rate": 4.370494423206639e-05,
"loss": 2.5527,
"step": 11780
},
{
"epoch": 32.86908077994429,
"grad_norm": 1.1727453881618946,
"learning_rate": 4.360661126959418e-05,
"loss": 2.5808,
"step": 11800
},
{
"epoch": 32.92479108635097,
"grad_norm": 1.5043031377227738,
"learning_rate": 4.3508270690830764e-05,
"loss": 2.5809,
"step": 11820
},
{
"epoch": 32.98050139275766,
"grad_norm": 3.061321585302142,
"learning_rate": 4.340992317260865e-05,
"loss": 2.5672,
"step": 11840
},
{
"epoch": 33.03621169916435,
"grad_norm": 1.2959039372508918,
"learning_rate": 4.3311569391808116e-05,
"loss": 2.5542,
"step": 11860
},
{
"epoch": 33.09192200557103,
"grad_norm": 1.1013587892712957,
"learning_rate": 4.321321002535253e-05,
"loss": 2.5175,
"step": 11880
},
{
"epoch": 33.147632311977716,
"grad_norm": 1.0924446035892936,
"learning_rate": 4.311484575020373e-05,
"loss": 2.538,
"step": 11900
},
{
"epoch": 33.203342618384404,
"grad_norm": 1.4345799681219358,
"learning_rate": 4.3016477243357297e-05,
"loss": 2.5775,
"step": 11920
},
{
"epoch": 33.259052924791085,
"grad_norm": 1.2806133296858657,
"learning_rate": 4.291810518183797e-05,
"loss": 2.5358,
"step": 11940
},
{
"epoch": 33.31476323119777,
"grad_norm": 1.1660089750322047,
"learning_rate": 4.2819730242694924e-05,
"loss": 2.5516,
"step": 11960
},
{
"epoch": 33.370473537604454,
"grad_norm": 1.5552478865676143,
"learning_rate": 4.272135310299719e-05,
"loss": 2.5551,
"step": 11980
},
{
"epoch": 33.42618384401114,
"grad_norm": 1.3019947250320747,
"learning_rate": 4.262297443982888e-05,
"loss": 2.5147,
"step": 12000
},
{
"epoch": 33.48189415041783,
"grad_norm": 1.1712469646586066,
"learning_rate": 4.252459493028466e-05,
"loss": 2.5448,
"step": 12020
},
{
"epoch": 33.53760445682451,
"grad_norm": 1.190656040631796,
"learning_rate": 4.2426215251464944e-05,
"loss": 2.5421,
"step": 12040
},
{
"epoch": 33.5933147632312,
"grad_norm": 1.166057800751934,
"learning_rate": 4.232783608047138e-05,
"loss": 2.5225,
"step": 12060
},
{
"epoch": 33.64902506963789,
"grad_norm": 1.1827485566010056,
"learning_rate": 4.222945809440208e-05,
"loss": 2.5264,
"step": 12080
},
{
"epoch": 33.70473537604457,
"grad_norm": 1.0982238340826698,
"learning_rate": 4.213108197034701e-05,
"loss": 2.5311,
"step": 12100
},
{
"epoch": 33.760445682451255,
"grad_norm": 1.3570517309524062,
"learning_rate": 4.2032708385383325e-05,
"loss": 2.5381,
"step": 12120
},
{
"epoch": 33.816155988857936,
"grad_norm": 1.221101343240307,
"learning_rate": 4.193433801657072e-05,
"loss": 2.5085,
"step": 12140
},
{
"epoch": 33.871866295264624,
"grad_norm": 1.1074955804736837,
"learning_rate": 4.183597154094672e-05,
"loss": 2.554,
"step": 12160
},
{
"epoch": 33.92757660167131,
"grad_norm": 1.2061033239097223,
"learning_rate": 4.173760963552209e-05,
"loss": 2.5144,
"step": 12180
},
{
"epoch": 33.98328690807799,
"grad_norm": 1.5819083614189133,
"learning_rate": 4.1639252977276076e-05,
"loss": 2.5495,
"step": 12200
},
{
"epoch": 34.03899721448468,
"grad_norm": 1.2608802822567775,
"learning_rate": 4.1540902243151906e-05,
"loss": 2.5386,
"step": 12220
},
{
"epoch": 34.09470752089136,
"grad_norm": 1.1987980606427822,
"learning_rate": 4.144255811005199e-05,
"loss": 2.5521,
"step": 12240
},
{
"epoch": 34.15041782729805,
"grad_norm": 1.5620621378858097,
"learning_rate": 4.134422125483328e-05,
"loss": 2.547,
"step": 12260
},
{
"epoch": 34.20612813370474,
"grad_norm": 1.1486672391725685,
"learning_rate": 4.124589235430266e-05,
"loss": 2.5527,
"step": 12280
},
{
"epoch": 34.26183844011142,
"grad_norm": 1.0658317601206686,
"learning_rate": 4.114757208521229e-05,
"loss": 2.5188,
"step": 12300
},
{
"epoch": 34.317548746518106,
"grad_norm": 1.1632885977667755,
"learning_rate": 4.104926112425487e-05,
"loss": 2.5066,
"step": 12320
},
{
"epoch": 34.373259052924794,
"grad_norm": 1.1978332103431018,
"learning_rate": 4.095096014805907e-05,
"loss": 2.5242,
"step": 12340
},
{
"epoch": 34.428969359331475,
"grad_norm": 1.2329715563026837,
"learning_rate": 4.0852669833184864e-05,
"loss": 2.5121,
"step": 12360
},
{
"epoch": 34.48467966573816,
"grad_norm": 1.4254336559578342,
"learning_rate": 4.075439085611879e-05,
"loss": 2.5327,
"step": 12380
},
{
"epoch": 34.540389972144844,
"grad_norm": 1.3455389003917915,
"learning_rate": 4.065612389326941e-05,
"loss": 2.5282,
"step": 12400
},
{
"epoch": 34.59610027855153,
"grad_norm": 1.1098645710519481,
"learning_rate": 4.055786962096253e-05,
"loss": 2.5414,
"step": 12420
},
{
"epoch": 34.65181058495822,
"grad_norm": 1.0712441022651848,
"learning_rate": 4.04596287154367e-05,
"loss": 2.5243,
"step": 12440
},
{
"epoch": 34.7075208913649,
"grad_norm": 1.2229530919327418,
"learning_rate": 4.0361401852838415e-05,
"loss": 2.5391,
"step": 12460
},
{
"epoch": 34.76323119777159,
"grad_norm": 1.1116414230831893,
"learning_rate": 4.026318970921751e-05,
"loss": 2.5549,
"step": 12480
},
{
"epoch": 34.81894150417827,
"grad_norm": 1.3725266835406396,
"learning_rate": 4.016499296052257e-05,
"loss": 2.5375,
"step": 12500
},
{
"epoch": 34.87465181058496,
"grad_norm": 1.1846019558051342,
"learning_rate": 4.0066812282596165e-05,
"loss": 2.5508,
"step": 12520
},
{
"epoch": 34.930362116991645,
"grad_norm": 1.155996133148295,
"learning_rate": 3.9968648351170285e-05,
"loss": 2.5284,
"step": 12540
},
{
"epoch": 34.986072423398326,
"grad_norm": 1.3251336278575234,
"learning_rate": 3.987050184186168e-05,
"loss": 2.5112,
"step": 12560
},
{
"epoch": 35.041782729805014,
"grad_norm": 1.2962788212433607,
"learning_rate": 3.9772373430167165e-05,
"loss": 2.5334,
"step": 12580
},
{
"epoch": 35.0974930362117,
"grad_norm": 1.1720387548872857,
"learning_rate": 3.967426379145899e-05,
"loss": 2.5233,
"step": 12600
},
{
"epoch": 35.15320334261838,
"grad_norm": 1.0933571811040883,
"learning_rate": 3.957617360098023e-05,
"loss": 2.5134,
"step": 12620
},
{
"epoch": 35.20891364902507,
"grad_norm": 1.2440055643446601,
"learning_rate": 3.9478103533840095e-05,
"loss": 2.5155,
"step": 12640
},
{
"epoch": 35.26462395543175,
"grad_norm": 1.1329596221636413,
"learning_rate": 3.938005426500927e-05,
"loss": 2.5335,
"step": 12660
},
{
"epoch": 35.32033426183844,
"grad_norm": 1.185464659161853,
"learning_rate": 3.928202646931534e-05,
"loss": 2.5438,
"step": 12680
},
{
"epoch": 35.37604456824513,
"grad_norm": 1.3785291897615783,
"learning_rate": 3.918402082143804e-05,
"loss": 2.5442,
"step": 12700
},
{
"epoch": 35.43175487465181,
"grad_norm": 2.546941401301199,
"learning_rate": 3.908603799590476e-05,
"loss": 2.5113,
"step": 12720
},
{
"epoch": 35.4874651810585,
"grad_norm": 1.1505394021830595,
"learning_rate": 3.898807866708572e-05,
"loss": 2.5527,
"step": 12740
},
{
"epoch": 35.543175487465184,
"grad_norm": 1.1406223829986795,
"learning_rate": 3.889014350918947e-05,
"loss": 2.5169,
"step": 12760
},
{
"epoch": 35.598885793871865,
"grad_norm": 1.5268262564835722,
"learning_rate": 3.8792233196258226e-05,
"loss": 2.5385,
"step": 12780
},
{
"epoch": 35.65459610027855,
"grad_norm": 1.4366052923997545,
"learning_rate": 3.869434840216315e-05,
"loss": 2.5138,
"step": 12800
},
{
"epoch": 35.710306406685234,
"grad_norm": 1.3493022957735832,
"learning_rate": 3.8596489800599826e-05,
"loss": 2.5012,
"step": 12820
},
{
"epoch": 35.76601671309192,
"grad_norm": 1.2612297939888255,
"learning_rate": 3.849865806508352e-05,
"loss": 2.5167,
"step": 12840
},
{
"epoch": 35.82172701949861,
"grad_norm": 1.1655785400988308,
"learning_rate": 3.8400853868944604e-05,
"loss": 2.5054,
"step": 12860
},
{
"epoch": 35.87743732590529,
"grad_norm": 1.195789035259279,
"learning_rate": 3.8303077885323945e-05,
"loss": 2.5038,
"step": 12880
},
{
"epoch": 35.93314763231198,
"grad_norm": 1.1195435490995402,
"learning_rate": 3.820533078716821e-05,
"loss": 2.5628,
"step": 12900
},
{
"epoch": 35.98885793871866,
"grad_norm": 1.7921872957808758,
"learning_rate": 3.810761324722523e-05,
"loss": 2.5052,
"step": 12920
},
{
"epoch": 36.04456824512535,
"grad_norm": 1.2261058827590274,
"learning_rate": 3.800992593803946e-05,
"loss": 2.5112,
"step": 12940
},
{
"epoch": 36.100278551532035,
"grad_norm": 1.135621182926646,
"learning_rate": 3.791226953194725e-05,
"loss": 2.5028,
"step": 12960
},
{
"epoch": 36.155988857938716,
"grad_norm": 1.6277687709086734,
"learning_rate": 3.7814644701072246e-05,
"loss": 2.5162,
"step": 12980
},
{
"epoch": 36.211699164345404,
"grad_norm": 1.1697686185114848,
"learning_rate": 3.771705211732085e-05,
"loss": 2.4937,
"step": 13000
},
{
"epoch": 36.26740947075209,
"grad_norm": 1.1470445311609152,
"learning_rate": 3.761949245237742e-05,
"loss": 2.4959,
"step": 13020
},
{
"epoch": 36.32311977715877,
"grad_norm": 1.1830067451201864,
"learning_rate": 3.752196637769983e-05,
"loss": 2.5184,
"step": 13040
},
{
"epoch": 36.37883008356546,
"grad_norm": 1.467366443353488,
"learning_rate": 3.742447456451474e-05,
"loss": 2.5167,
"step": 13060
},
{
"epoch": 36.43454038997214,
"grad_norm": 1.4104527622849412,
"learning_rate": 3.732701768381299e-05,
"loss": 2.5044,
"step": 13080
},
{
"epoch": 36.49025069637883,
"grad_norm": 1.2155633839198112,
"learning_rate": 3.722959640634501e-05,
"loss": 2.5472,
"step": 13100
},
{
"epoch": 36.54596100278552,
"grad_norm": 1.083177335128614,
"learning_rate": 3.713221140261619e-05,
"loss": 2.5002,
"step": 13120
},
{
"epoch": 36.6016713091922,
"grad_norm": 1.2014071998350737,
"learning_rate": 3.703486334288228e-05,
"loss": 2.5114,
"step": 13140
},
{
"epoch": 36.65738161559889,
"grad_norm": 1.1428988294911389,
"learning_rate": 3.693755289714471e-05,
"loss": 2.4979,
"step": 13160
},
{
"epoch": 36.713091922005574,
"grad_norm": 1.1304274747808816,
"learning_rate": 3.68402807351461e-05,
"loss": 2.4936,
"step": 13180
},
{
"epoch": 36.768802228412255,
"grad_norm": 1.3650882764211232,
"learning_rate": 3.674304752636551e-05,
"loss": 2.5157,
"step": 13200
},
{
"epoch": 36.82451253481894,
"grad_norm": 1.2182169011081385,
"learning_rate": 3.664585394001398e-05,
"loss": 2.5035,
"step": 13220
},
{
"epoch": 36.880222841225624,
"grad_norm": 1.272624756548235,
"learning_rate": 3.654870064502978e-05,
"loss": 2.4992,
"step": 13240
},
{
"epoch": 36.93593314763231,
"grad_norm": 1.0715589202820812,
"learning_rate": 3.6451588310073895e-05,
"loss": 2.5021,
"step": 13260
},
{
"epoch": 36.991643454039,
"grad_norm": 1.2942781983724116,
"learning_rate": 3.6354517603525434e-05,
"loss": 2.4859,
"step": 13280
},
{
"epoch": 37.04735376044568,
"grad_norm": 1.2606778534888776,
"learning_rate": 3.625748919347694e-05,
"loss": 2.506,
"step": 13300
},
{
"epoch": 37.10306406685237,
"grad_norm": 1.2777601010216262,
"learning_rate": 3.616050374772989e-05,
"loss": 2.4778,
"step": 13320
},
{
"epoch": 37.15877437325905,
"grad_norm": 1.176481978013248,
"learning_rate": 3.606356193379004e-05,
"loss": 2.5033,
"step": 13340
},
{
"epoch": 37.21448467966574,
"grad_norm": 1.2169531494104022,
"learning_rate": 3.596666441886285e-05,
"loss": 2.4996,
"step": 13360
},
{
"epoch": 37.270194986072426,
"grad_norm": 1.3217947072787108,
"learning_rate": 3.586981186984891e-05,
"loss": 2.4884,
"step": 13380
},
{
"epoch": 37.325905292479106,
"grad_norm": 1.2091550344263509,
"learning_rate": 3.577300495333929e-05,
"loss": 2.4643,
"step": 13400
},
{
"epoch": 37.381615598885794,
"grad_norm": 1.3020086095843864,
"learning_rate": 3.5676244335611045e-05,
"loss": 2.5115,
"step": 13420
},
{
"epoch": 37.43732590529248,
"grad_norm": 1.224631401229228,
"learning_rate": 3.5579530682622527e-05,
"loss": 2.5052,
"step": 13440
},
{
"epoch": 37.49303621169916,
"grad_norm": 1.2937153649240984,
"learning_rate": 3.548286466000888e-05,
"loss": 2.4887,
"step": 13460
},
{
"epoch": 37.54874651810585,
"grad_norm": 1.1629156767485442,
"learning_rate": 3.5386246933077437e-05,
"loss": 2.4835,
"step": 13480
},
{
"epoch": 37.60445682451253,
"grad_norm": 1.409151745433056,
"learning_rate": 3.52896781668031e-05,
"loss": 2.4844,
"step": 13500
},
{
"epoch": 37.66016713091922,
"grad_norm": 1.219993695453392,
"learning_rate": 3.519315902582384e-05,
"loss": 2.4891,
"step": 13520
},
{
"epoch": 37.71587743732591,
"grad_norm": 1.286568356632355,
"learning_rate": 3.509669017443603e-05,
"loss": 2.5028,
"step": 13540
},
{
"epoch": 37.77158774373259,
"grad_norm": 1.2144839115218449,
"learning_rate": 3.500027227658998e-05,
"loss": 2.4808,
"step": 13560
},
{
"epoch": 37.82729805013928,
"grad_norm": 1.4206777606666374,
"learning_rate": 3.490390599588527e-05,
"loss": 2.4884,
"step": 13580
},
{
"epoch": 37.88300835654596,
"grad_norm": 1.119769219553682,
"learning_rate": 3.480759199556625e-05,
"loss": 2.532,
"step": 13600
},
{
"epoch": 37.938718662952645,
"grad_norm": 1.216344088823688,
"learning_rate": 3.4711330938517415e-05,
"loss": 2.4825,
"step": 13620
},
{
"epoch": 37.99442896935933,
"grad_norm": 1.2356798590645826,
"learning_rate": 3.4615123487258904e-05,
"loss": 2.477,
"step": 13640
},
{
"epoch": 38.050139275766014,
"grad_norm": 1.1889430421938458,
"learning_rate": 3.45189703039419e-05,
"loss": 2.4657,
"step": 13660
},
{
"epoch": 38.1058495821727,
"grad_norm": 1.2167871317287668,
"learning_rate": 3.442287205034409e-05,
"loss": 2.4873,
"step": 13680
},
{
"epoch": 38.16155988857939,
"grad_norm": 1.3417090804209535,
"learning_rate": 3.4326829387865105e-05,
"loss": 2.4978,
"step": 13700
},
{
"epoch": 38.21727019498607,
"grad_norm": 1.215214848291835,
"learning_rate": 3.423084297752197e-05,
"loss": 2.4873,
"step": 13720
},
{
"epoch": 38.27298050139276,
"grad_norm": 1.3942080399553185,
"learning_rate": 3.413491347994455e-05,
"loss": 2.4869,
"step": 13740
},
{
"epoch": 38.32869080779944,
"grad_norm": 1.1596082595198587,
"learning_rate": 3.4039041555370985e-05,
"loss": 2.4742,
"step": 13760
},
{
"epoch": 38.38440111420613,
"grad_norm": 1.300823510553577,
"learning_rate": 3.394322786364321e-05,
"loss": 2.4824,
"step": 13780
},
{
"epoch": 38.440111420612816,
"grad_norm": 1.4023204575319392,
"learning_rate": 3.384747306420234e-05,
"loss": 2.5132,
"step": 13800
},
{
"epoch": 38.4958217270195,
"grad_norm": 1.169311221589696,
"learning_rate": 3.375177781608417e-05,
"loss": 2.4931,
"step": 13820
},
{
"epoch": 38.551532033426184,
"grad_norm": 1.3513790522732192,
"learning_rate": 3.365614277791463e-05,
"loss": 2.5037,
"step": 13840
},
{
"epoch": 38.60724233983287,
"grad_norm": 1.127031391091575,
"learning_rate": 3.3560568607905244e-05,
"loss": 2.5187,
"step": 13860
},
{
"epoch": 38.66295264623955,
"grad_norm": 1.2275938957516415,
"learning_rate": 3.346505596384864e-05,
"loss": 2.4657,
"step": 13880
},
{
"epoch": 38.71866295264624,
"grad_norm": 1.3320190791797168,
"learning_rate": 3.336960550311395e-05,
"loss": 2.4951,
"step": 13900
},
{
"epoch": 38.77437325905292,
"grad_norm": 1.5869673766120833,
"learning_rate": 3.3274217882642355e-05,
"loss": 2.5087,
"step": 13920
},
{
"epoch": 38.83008356545961,
"grad_norm": 1.2695582913500114,
"learning_rate": 3.317889375894252e-05,
"loss": 2.4826,
"step": 13940
},
{
"epoch": 38.8857938718663,
"grad_norm": 1.430015517022631,
"learning_rate": 3.3083633788086115e-05,
"loss": 2.4652,
"step": 13960
},
{
"epoch": 38.94150417827298,
"grad_norm": 1.2238102984760655,
"learning_rate": 3.2988438625703226e-05,
"loss": 2.5151,
"step": 13980
},
{
"epoch": 38.99721448467967,
"grad_norm": 1.1505694948469867,
"learning_rate": 3.2893308926977964e-05,
"loss": 2.4639,
"step": 14000
},
{
"epoch": 39.05292479108635,
"grad_norm": 1.3358804025647248,
"learning_rate": 3.2798245346643826e-05,
"loss": 2.4831,
"step": 14020
},
{
"epoch": 39.108635097493035,
"grad_norm": 1.1965450681906031,
"learning_rate": 3.270324853897926e-05,
"loss": 2.4934,
"step": 14040
},
{
"epoch": 39.16434540389972,
"grad_norm": 1.2724701888326422,
"learning_rate": 3.260831915780317e-05,
"loss": 2.515,
"step": 14060
},
{
"epoch": 39.220055710306404,
"grad_norm": 1.226431107683309,
"learning_rate": 3.251345785647037e-05,
"loss": 2.4912,
"step": 14080
},
{
"epoch": 39.27576601671309,
"grad_norm": 1.4978021366863286,
"learning_rate": 3.241866528786712e-05,
"loss": 2.4666,
"step": 14100
},
{
"epoch": 39.33147632311978,
"grad_norm": 1.1852102410219578,
"learning_rate": 3.232394210440664e-05,
"loss": 2.453,
"step": 14120
},
{
"epoch": 39.38718662952646,
"grad_norm": 1.1514347696655127,
"learning_rate": 3.222928895802457e-05,
"loss": 2.492,
"step": 14140
},
{
"epoch": 39.44289693593315,
"grad_norm": 1.1727022967870606,
"learning_rate": 3.213470650017457e-05,
"loss": 2.4671,
"step": 14160
},
{
"epoch": 39.49860724233983,
"grad_norm": 1.2255693825516534,
"learning_rate": 3.204019538182371e-05,
"loss": 2.47,
"step": 14180
},
{
"epoch": 39.55431754874652,
"grad_norm": 1.3308024547484585,
"learning_rate": 3.194575625344813e-05,
"loss": 2.4705,
"step": 14200
},
{
"epoch": 39.610027855153206,
"grad_norm": 1.29922812166292,
"learning_rate": 3.185138976502847e-05,
"loss": 2.4756,
"step": 14220
},
{
"epoch": 39.66573816155989,
"grad_norm": 1.172050649565444,
"learning_rate": 3.175709656604543e-05,
"loss": 2.4795,
"step": 14240
},
{
"epoch": 39.721448467966574,
"grad_norm": 1.1860108514337921,
"learning_rate": 3.166287730547528e-05,
"loss": 2.4682,
"step": 14260
},
{
"epoch": 39.77715877437326,
"grad_norm": 1.1464631864975399,
"learning_rate": 3.1568732631785405e-05,
"loss": 2.4649,
"step": 14280
},
{
"epoch": 39.83286908077994,
"grad_norm": 1.2912682284287014,
"learning_rate": 3.147466319292988e-05,
"loss": 2.458,
"step": 14300
},
{
"epoch": 39.88857938718663,
"grad_norm": 1.24473741562901,
"learning_rate": 3.138066963634491e-05,
"loss": 2.4418,
"step": 14320
},
{
"epoch": 39.94428969359331,
"grad_norm": 1.2188645511392617,
"learning_rate": 3.1286752608944504e-05,
"loss": 2.4666,
"step": 14340
},
{
"epoch": 40.0,
"grad_norm": 1.515575618356932,
"learning_rate": 3.11929127571159e-05,
"loss": 2.4695,
"step": 14360
},
{
"epoch": 40.05571030640669,
"grad_norm": 1.267322258199449,
"learning_rate": 3.10991507267152e-05,
"loss": 2.4631,
"step": 14380
},
{
"epoch": 40.11142061281337,
"grad_norm": 1.2899851180690782,
"learning_rate": 3.100546716306292e-05,
"loss": 2.4461,
"step": 14400
},
{
"epoch": 40.16713091922006,
"grad_norm": 1.14649768727926,
"learning_rate": 3.091186271093947e-05,
"loss": 2.4534,
"step": 14420
},
{
"epoch": 40.22284122562674,
"grad_norm": 1.2682086289441403,
"learning_rate": 3.081833801458084e-05,
"loss": 2.4369,
"step": 14440
},
{
"epoch": 40.278551532033426,
"grad_norm": 1.2091006451986415,
"learning_rate": 3.0724893717674023e-05,
"loss": 2.4586,
"step": 14460
},
{
"epoch": 40.33426183844011,
"grad_norm": 1.3592584263818426,
"learning_rate": 3.063153046335271e-05,
"loss": 2.4591,
"step": 14480
},
{
"epoch": 40.389972144846794,
"grad_norm": 1.3946508542140756,
"learning_rate": 3.0538248894192804e-05,
"loss": 2.4411,
"step": 14500
},
{
"epoch": 40.44568245125348,
"grad_norm": 1.1777413578534581,
"learning_rate": 3.0445049652207995e-05,
"loss": 2.4261,
"step": 14520
},
{
"epoch": 40.50139275766017,
"grad_norm": 1.2195721573601,
"learning_rate": 3.035193337884538e-05,
"loss": 2.4421,
"step": 14540
},
{
"epoch": 40.55710306406685,
"grad_norm": 1.3520200371008158,
"learning_rate": 3.0258900714981e-05,
"loss": 2.4602,
"step": 14560
},
{
"epoch": 40.61281337047354,
"grad_norm": 1.3624148046187146,
"learning_rate": 3.016595230091545e-05,
"loss": 2.4655,
"step": 14580
},
{
"epoch": 40.66852367688022,
"grad_norm": 1.2699195161380987,
"learning_rate": 3.0073088776369473e-05,
"loss": 2.4279,
"step": 14600
},
{
"epoch": 40.72423398328691,
"grad_norm": 1.1758469344658353,
"learning_rate": 2.998031078047958e-05,
"loss": 2.473,
"step": 14620
},
{
"epoch": 40.779944289693596,
"grad_norm": 1.215388378764313,
"learning_rate": 2.9887618951793587e-05,
"loss": 2.4955,
"step": 14640
},
{
"epoch": 40.83565459610028,
"grad_norm": 1.2477068837194507,
"learning_rate": 2.97950139282663e-05,
"loss": 2.4784,
"step": 14660
},
{
"epoch": 40.891364902506965,
"grad_norm": 1.3420186707298443,
"learning_rate": 2.9702496347255056e-05,
"loss": 2.4768,
"step": 14680
},
{
"epoch": 40.94707520891365,
"grad_norm": 1.39249618130456,
"learning_rate": 2.9610066845515383e-05,
"loss": 2.4385,
"step": 14700
},
{
"epoch": 41.00278551532033,
"grad_norm": 1.2754818915657757,
"learning_rate": 2.9517726059196613e-05,
"loss": 2.4569,
"step": 14720
},
{
"epoch": 41.05849582172702,
"grad_norm": 1.4277410738801621,
"learning_rate": 2.942547462383744e-05,
"loss": 2.4587,
"step": 14740
},
{
"epoch": 41.1142061281337,
"grad_norm": 1.1986405858066078,
"learning_rate": 2.9333313174361673e-05,
"loss": 2.4533,
"step": 14760
},
{
"epoch": 41.16991643454039,
"grad_norm": 1.4106771315477926,
"learning_rate": 2.924124234507371e-05,
"loss": 2.4564,
"step": 14780
},
{
"epoch": 41.22562674094708,
"grad_norm": 1.1572284880554229,
"learning_rate": 2.9149262769654307e-05,
"loss": 2.4403,
"step": 14800
},
{
"epoch": 41.28133704735376,
"grad_norm": 1.3398664349194382,
"learning_rate": 2.9057375081156153e-05,
"loss": 2.4632,
"step": 14820
},
{
"epoch": 41.33704735376045,
"grad_norm": 1.4232783195653564,
"learning_rate": 2.89655799119995e-05,
"loss": 2.4457,
"step": 14840
},
{
"epoch": 41.39275766016713,
"grad_norm": 1.2303189537876713,
"learning_rate": 2.887387789396784e-05,
"loss": 2.4454,
"step": 14860
},
{
"epoch": 41.448467966573816,
"grad_norm": 1.4753554867267846,
"learning_rate": 2.8782269658203593e-05,
"loss": 2.4708,
"step": 14880
},
{
"epoch": 41.5041782729805,
"grad_norm": 2.0554338333623225,
"learning_rate": 2.8690755835203644e-05,
"loss": 2.4174,
"step": 14900
},
{
"epoch": 41.559888579387184,
"grad_norm": 1.2638050115107629,
"learning_rate": 2.8599337054815128e-05,
"loss": 2.4576,
"step": 14920
},
{
"epoch": 41.61559888579387,
"grad_norm": 1.6592795850932565,
"learning_rate": 2.8508013946231054e-05,
"loss": 2.4439,
"step": 14940
},
{
"epoch": 41.67130919220056,
"grad_norm": 1.2018369861968858,
"learning_rate": 2.8416787137985912e-05,
"loss": 2.4677,
"step": 14960
},
{
"epoch": 41.72701949860724,
"grad_norm": 1.1787125181340552,
"learning_rate": 2.832565725795147e-05,
"loss": 2.4423,
"step": 14980
},
{
"epoch": 41.78272980501393,
"grad_norm": 1.3144876376584371,
"learning_rate": 2.8234624933332324e-05,
"loss": 2.4166,
"step": 15000
},
{
"epoch": 41.83844011142061,
"grad_norm": 1.2101663058378904,
"learning_rate": 2.8143690790661687e-05,
"loss": 2.431,
"step": 15020
},
{
"epoch": 41.8941504178273,
"grad_norm": 1.3306396247714227,
"learning_rate": 2.8052855455797008e-05,
"loss": 2.423,
"step": 15040
},
{
"epoch": 41.949860724233986,
"grad_norm": 1.1740048371108092,
"learning_rate": 2.7962119553915685e-05,
"loss": 2.4543,
"step": 15060
},
{
"epoch": 42.00557103064067,
"grad_norm": 1.2505959391308659,
"learning_rate": 2.7871483709510788e-05,
"loss": 2.4612,
"step": 15080
},
{
"epoch": 42.061281337047355,
"grad_norm": 1.1705839887196592,
"learning_rate": 2.7780948546386702e-05,
"loss": 2.4248,
"step": 15100
},
{
"epoch": 42.116991643454035,
"grad_norm": 1.5770301620040164,
"learning_rate": 2.76905146876549e-05,
"loss": 2.4475,
"step": 15120
},
{
"epoch": 42.17270194986072,
"grad_norm": 1.3540734118986908,
"learning_rate": 2.760018275572962e-05,
"loss": 2.4186,
"step": 15140
},
{
"epoch": 42.22841225626741,
"grad_norm": 1.146718032535289,
"learning_rate": 2.750995337232356e-05,
"loss": 2.4091,
"step": 15160
},
{
"epoch": 42.28412256267409,
"grad_norm": 1.2196868218322996,
"learning_rate": 2.7419827158443667e-05,
"loss": 2.4309,
"step": 15180
},
{
"epoch": 42.33983286908078,
"grad_norm": 1.5485243840943164,
"learning_rate": 2.7329804734386765e-05,
"loss": 2.4602,
"step": 15200
},
{
"epoch": 42.39554317548747,
"grad_norm": 1.2206227305933974,
"learning_rate": 2.723988671973541e-05,
"loss": 2.4701,
"step": 15220
},
{
"epoch": 42.45125348189415,
"grad_norm": 1.26332460678578,
"learning_rate": 2.7150073733353484e-05,
"loss": 2.4528,
"step": 15240
},
{
"epoch": 42.50696378830084,
"grad_norm": 1.311901210503493,
"learning_rate": 2.706036639338207e-05,
"loss": 2.4283,
"step": 15260
},
{
"epoch": 42.56267409470752,
"grad_norm": 1.2690533418017822,
"learning_rate": 2.6970765317235096e-05,
"loss": 2.4345,
"step": 15280
},
{
"epoch": 42.618384401114206,
"grad_norm": 1.2676520230160475,
"learning_rate": 2.6881271121595137e-05,
"loss": 2.4048,
"step": 15300
},
{
"epoch": 42.674094707520894,
"grad_norm": 1.4516895593566883,
"learning_rate": 2.6791884422409157e-05,
"loss": 2.4279,
"step": 15320
},
{
"epoch": 42.729805013927574,
"grad_norm": 1.3872513872471008,
"learning_rate": 2.6702605834884283e-05,
"loss": 2.4026,
"step": 15340
},
{
"epoch": 42.78551532033426,
"grad_norm": 1.3767239373202538,
"learning_rate": 2.6613435973483546e-05,
"loss": 2.4219,
"step": 15360
},
{
"epoch": 42.84122562674095,
"grad_norm": 1.206741931800155,
"learning_rate": 2.6524375451921694e-05,
"loss": 2.426,
"step": 15380
},
{
"epoch": 42.89693593314763,
"grad_norm": 1.314107492262272,
"learning_rate": 2.643542488316087e-05,
"loss": 2.4027,
"step": 15400
},
{
"epoch": 42.95264623955432,
"grad_norm": 1.2591566602374167,
"learning_rate": 2.6346584879406546e-05,
"loss": 2.4105,
"step": 15420
},
{
"epoch": 43.008356545961,
"grad_norm": 1.2543202609095945,
"learning_rate": 2.6257856052103176e-05,
"loss": 2.4174,
"step": 15440
},
{
"epoch": 43.06406685236769,
"grad_norm": 1.3161836673091634,
"learning_rate": 2.616923901193006e-05,
"loss": 2.4146,
"step": 15460
},
{
"epoch": 43.119777158774376,
"grad_norm": 1.171075292652416,
"learning_rate": 2.6080734368797124e-05,
"loss": 2.4159,
"step": 15480
},
{
"epoch": 43.17548746518106,
"grad_norm": 1.207718728407823,
"learning_rate": 2.599234273184067e-05,
"loss": 2.404,
"step": 15500
},
{
"epoch": 43.231197771587745,
"grad_norm": 1.2836648932544974,
"learning_rate": 2.5904064709419275e-05,
"loss": 2.4147,
"step": 15520
},
{
"epoch": 43.286908077994426,
"grad_norm": 1.2867738641320774,
"learning_rate": 2.5815900909109578e-05,
"loss": 2.4405,
"step": 15540
},
{
"epoch": 43.34261838440111,
"grad_norm": 1.314400827907675,
"learning_rate": 2.572785193770205e-05,
"loss": 2.384,
"step": 15560
},
{
"epoch": 43.3983286908078,
"grad_norm": 1.4291537299918844,
"learning_rate": 2.5639918401196828e-05,
"loss": 2.4408,
"step": 15580
},
{
"epoch": 43.45403899721448,
"grad_norm": 1.5382813225617216,
"learning_rate": 2.555210090479959e-05,
"loss": 2.4224,
"step": 15600
},
{
"epoch": 43.50974930362117,
"grad_norm": 1.2172238724441946,
"learning_rate": 2.5464400052917377e-05,
"loss": 2.4273,
"step": 15620
},
{
"epoch": 43.56545961002786,
"grad_norm": 1.3185716102890666,
"learning_rate": 2.537681644915439e-05,
"loss": 2.4399,
"step": 15640
},
{
"epoch": 43.62116991643454,
"grad_norm": 1.7970207701573762,
"learning_rate": 2.528935069630791e-05,
"loss": 2.438,
"step": 15660
},
{
"epoch": 43.67688022284123,
"grad_norm": 1.336384852624976,
"learning_rate": 2.5202003396364028e-05,
"loss": 2.4104,
"step": 15680
},
{
"epoch": 43.73259052924791,
"grad_norm": 1.2492741812810837,
"learning_rate": 2.5114775150493652e-05,
"loss": 2.4372,
"step": 15700
},
{
"epoch": 43.788300835654596,
"grad_norm": 1.1839597940386342,
"learning_rate": 2.5027666559048265e-05,
"loss": 2.4374,
"step": 15720
},
{
"epoch": 43.844011142061284,
"grad_norm": 1.5042190873869037,
"learning_rate": 2.4940678221555836e-05,
"loss": 2.4131,
"step": 15740
},
{
"epoch": 43.899721448467965,
"grad_norm": 1.263884884008274,
"learning_rate": 2.485381073671668e-05,
"loss": 2.4481,
"step": 15760
},
{
"epoch": 43.95543175487465,
"grad_norm": 1.3105766012420574,
"learning_rate": 2.4767064702399307e-05,
"loss": 2.4316,
"step": 15780
},
{
"epoch": 44.01114206128134,
"grad_norm": 1.3070696897883654,
"learning_rate": 2.4680440715636386e-05,
"loss": 2.4113,
"step": 15800
},
{
"epoch": 44.06685236768802,
"grad_norm": 1.3679934235167148,
"learning_rate": 2.459393937262057e-05,
"loss": 2.462,
"step": 15820
},
{
"epoch": 44.12256267409471,
"grad_norm": 1.2961531328086042,
"learning_rate": 2.45075612687004e-05,
"loss": 2.3913,
"step": 15840
},
{
"epoch": 44.17827298050139,
"grad_norm": 1.5918138107382298,
"learning_rate": 2.4421306998376247e-05,
"loss": 2.4062,
"step": 15860
},
{
"epoch": 44.23398328690808,
"grad_norm": 1.5407959855411433,
"learning_rate": 2.4335177155296173e-05,
"loss": 2.4135,
"step": 15880
},
{
"epoch": 44.289693593314766,
"grad_norm": 1.3864718482505074,
"learning_rate": 2.4249172332251867e-05,
"loss": 2.435,
"step": 15900
},
{
"epoch": 44.34540389972145,
"grad_norm": 1.5629168020574962,
"learning_rate": 2.4163293121174586e-05,
"loss": 2.42,
"step": 15920
},
{
"epoch": 44.401114206128135,
"grad_norm": 1.3404420567150592,
"learning_rate": 2.4077540113131e-05,
"loss": 2.3939,
"step": 15940
},
{
"epoch": 44.456824512534816,
"grad_norm": 1.2610215128317497,
"learning_rate": 2.3991913898319236e-05,
"loss": 2.3981,
"step": 15960
},
{
"epoch": 44.5125348189415,
"grad_norm": 1.1948628905515135,
"learning_rate": 2.390641506606475e-05,
"loss": 2.4259,
"step": 15980
},
{
"epoch": 44.56824512534819,
"grad_norm": 1.5442691993168876,
"learning_rate": 2.3821044204816285e-05,
"loss": 2.4106,
"step": 16000
},
{
"epoch": 44.62395543175487,
"grad_norm": 1.3385181560005985,
"learning_rate": 2.3735801902141812e-05,
"loss": 2.4231,
"step": 16020
},
{
"epoch": 44.67966573816156,
"grad_norm": 1.5429534444435276,
"learning_rate": 2.3650688744724484e-05,
"loss": 2.4094,
"step": 16040
},
{
"epoch": 44.73537604456825,
"grad_norm": 1.3480847228783814,
"learning_rate": 2.356570531835862e-05,
"loss": 2.3925,
"step": 16060
},
{
"epoch": 44.79108635097493,
"grad_norm": 1.3495527852819211,
"learning_rate": 2.348085220794566e-05,
"loss": 2.4055,
"step": 16080
},
{
"epoch": 44.84679665738162,
"grad_norm": 1.3160942477102502,
"learning_rate": 2.3396129997490143e-05,
"loss": 2.4,
"step": 16100
},
{
"epoch": 44.9025069637883,
"grad_norm": 1.1577155684233915,
"learning_rate": 2.3311539270095685e-05,
"loss": 2.4214,
"step": 16120
},
{
"epoch": 44.958217270194986,
"grad_norm": 1.1408523761505251,
"learning_rate": 2.3227080607960936e-05,
"loss": 2.3958,
"step": 16140
},
{
"epoch": 45.013927576601674,
"grad_norm": 1.2730620543264026,
"learning_rate": 2.314275459237564e-05,
"loss": 2.3903,
"step": 16160
},
{
"epoch": 45.069637883008355,
"grad_norm": 1.1827924678817745,
"learning_rate": 2.3058561803716587e-05,
"loss": 2.4268,
"step": 16180
},
{
"epoch": 45.12534818941504,
"grad_norm": 1.3034192039686017,
"learning_rate": 2.2974502821443615e-05,
"loss": 2.3954,
"step": 16200
},
{
"epoch": 45.18105849582173,
"grad_norm": 1.4633461110410906,
"learning_rate": 2.289057822409564e-05,
"loss": 2.3956,
"step": 16220
},
{
"epoch": 45.23676880222841,
"grad_norm": 1.4777159861247156,
"learning_rate": 2.2806788589286683e-05,
"loss": 2.3643,
"step": 16240
},
{
"epoch": 45.2924791086351,
"grad_norm": 1.2317517708690167,
"learning_rate": 2.2723134493701863e-05,
"loss": 2.3884,
"step": 16260
},
{
"epoch": 45.34818941504178,
"grad_norm": 1.2821403751825975,
"learning_rate": 2.2639616513093453e-05,
"loss": 2.4146,
"step": 16280
},
{
"epoch": 45.40389972144847,
"grad_norm": 1.445641063656784,
"learning_rate": 2.2556235222276924e-05,
"loss": 2.4316,
"step": 16300
},
{
"epoch": 45.459610027855156,
"grad_norm": 1.4577958162948974,
"learning_rate": 2.2472991195126933e-05,
"loss": 2.417,
"step": 16320
},
{
"epoch": 45.51532033426184,
"grad_norm": 1.3311809796824847,
"learning_rate": 2.2389885004573452e-05,
"loss": 2.4165,
"step": 16340
},
{
"epoch": 45.571030640668525,
"grad_norm": 1.404962587268908,
"learning_rate": 2.2306917222597776e-05,
"loss": 2.4204,
"step": 16360
},
{
"epoch": 45.626740947075206,
"grad_norm": 1.2925568482512864,
"learning_rate": 2.2224088420228597e-05,
"loss": 2.3624,
"step": 16380
},
{
"epoch": 45.682451253481894,
"grad_norm": 1.2480563390495507,
"learning_rate": 2.21413991675381e-05,
"loss": 2.4226,
"step": 16400
},
{
"epoch": 45.73816155988858,
"grad_norm": 1.395538579066591,
"learning_rate": 2.2058850033637958e-05,
"loss": 2.4103,
"step": 16420
},
{
"epoch": 45.79387186629526,
"grad_norm": 1.2886437393173196,
"learning_rate": 2.197644158667552e-05,
"loss": 2.4027,
"step": 16440
},
{
"epoch": 45.84958217270195,
"grad_norm": 1.2805552396725532,
"learning_rate": 2.1894174393829843e-05,
"loss": 2.3974,
"step": 16460
},
{
"epoch": 45.90529247910864,
"grad_norm": 1.3616296005412893,
"learning_rate": 2.1812049021307776e-05,
"loss": 2.389,
"step": 16480
},
{
"epoch": 45.96100278551532,
"grad_norm": 1.2441651106621028,
"learning_rate": 2.1730066034340133e-05,
"loss": 2.397,
"step": 16500
},
{
"epoch": 46.01671309192201,
"grad_norm": 1.294168032433046,
"learning_rate": 2.1648225997177664e-05,
"loss": 2.4032,
"step": 16520
},
{
"epoch": 46.07242339832869,
"grad_norm": 1.3259623680432362,
"learning_rate": 2.1566529473087366e-05,
"loss": 2.409,
"step": 16540
},
{
"epoch": 46.128133704735376,
"grad_norm": 1.4193156574119963,
"learning_rate": 2.1484977024348456e-05,
"loss": 2.3973,
"step": 16560
},
{
"epoch": 46.183844011142064,
"grad_norm": 1.423133534891623,
"learning_rate": 2.1403569212248545e-05,
"loss": 2.4221,
"step": 16580
},
{
"epoch": 46.239554317548745,
"grad_norm": 1.289312718105723,
"learning_rate": 2.1322306597079752e-05,
"loss": 2.4058,
"step": 16600
},
{
"epoch": 46.29526462395543,
"grad_norm": 1.3742336820519754,
"learning_rate": 2.1241189738134925e-05,
"loss": 2.3803,
"step": 16620
},
{
"epoch": 46.35097493036211,
"grad_norm": 1.3293445511084065,
"learning_rate": 2.116021919370371e-05,
"loss": 2.3779,
"step": 16640
},
{
"epoch": 46.4066852367688,
"grad_norm": 1.438873266798883,
"learning_rate": 2.1079395521068736e-05,
"loss": 2.4125,
"step": 16660
},
{
"epoch": 46.46239554317549,
"grad_norm": 1.5684362505574445,
"learning_rate": 2.099871927650181e-05,
"loss": 2.4172,
"step": 16680
},
{
"epoch": 46.51810584958217,
"grad_norm": 1.219039146798224,
"learning_rate": 2.091819101526001e-05,
"loss": 2.3556,
"step": 16700
},
{
"epoch": 46.57381615598886,
"grad_norm": 1.290744476185817,
"learning_rate": 2.083781129158196e-05,
"loss": 2.3915,
"step": 16720
},
{
"epoch": 46.629526462395546,
"grad_norm": 1.2783924627018073,
"learning_rate": 2.075758065868394e-05,
"loss": 2.4021,
"step": 16740
},
{
"epoch": 46.68523676880223,
"grad_norm": 1.4723296505015109,
"learning_rate": 2.0677499668756148e-05,
"loss": 2.4076,
"step": 16760
},
{
"epoch": 46.740947075208915,
"grad_norm": 1.4361000627178464,
"learning_rate": 2.0597568872958793e-05,
"loss": 2.3704,
"step": 16780
},
{
"epoch": 46.796657381615596,
"grad_norm": 1.3406172199532498,
"learning_rate": 2.051778882141842e-05,
"loss": 2.4095,
"step": 16800
},
{
"epoch": 46.852367688022284,
"grad_norm": 1.2728857237580478,
"learning_rate": 2.0438160063224055e-05,
"loss": 2.3951,
"step": 16820
},
{
"epoch": 46.90807799442897,
"grad_norm": 1.307601696226092,
"learning_rate": 2.035868314642344e-05,
"loss": 2.3762,
"step": 16840
},
{
"epoch": 46.96378830083565,
"grad_norm": 1.341375077755919,
"learning_rate": 2.0279358618019277e-05,
"loss": 2.3747,
"step": 16860
},
{
"epoch": 47.01949860724234,
"grad_norm": 1.3661705880679602,
"learning_rate": 2.0200187023965426e-05,
"loss": 2.3762,
"step": 16880
},
{
"epoch": 47.07520891364903,
"grad_norm": 1.498474803717406,
"learning_rate": 2.0121168909163192e-05,
"loss": 2.3757,
"step": 16900
},
{
"epoch": 47.13091922005571,
"grad_norm": 1.30345057097787,
"learning_rate": 2.0042304817457542e-05,
"loss": 2.4065,
"step": 16920
},
{
"epoch": 47.1866295264624,
"grad_norm": 1.8505180253101237,
"learning_rate": 1.9963595291633392e-05,
"loss": 2.3871,
"step": 16940
},
{
"epoch": 47.24233983286908,
"grad_norm": 1.450593355694572,
"learning_rate": 1.9885040873411806e-05,
"loss": 2.3871,
"step": 16960
},
{
"epoch": 47.298050139275766,
"grad_norm": 1.3343700388758895,
"learning_rate": 1.980664210344637e-05,
"loss": 2.3649,
"step": 16980
},
{
"epoch": 47.353760445682454,
"grad_norm": 1.355886861721978,
"learning_rate": 1.9728399521319373e-05,
"loss": 2.4009,
"step": 17000
},
{
"epoch": 47.409470752089135,
"grad_norm": 1.271470013080887,
"learning_rate": 1.9650313665538177e-05,
"loss": 2.3921,
"step": 17020
},
{
"epoch": 47.46518105849582,
"grad_norm": 1.3641748668938396,
"learning_rate": 1.957238507353144e-05,
"loss": 2.3785,
"step": 17040
},
{
"epoch": 47.5208913649025,
"grad_norm": 1.3453345779522274,
"learning_rate": 1.9494614281645438e-05,
"loss": 2.3535,
"step": 17060
},
{
"epoch": 47.57660167130919,
"grad_norm": 1.4507718501211375,
"learning_rate": 1.9417001825140412e-05,
"loss": 2.3866,
"step": 17080
},
{
"epoch": 47.63231197771588,
"grad_norm": 1.246940923314926,
"learning_rate": 1.9339548238186828e-05,
"loss": 2.3664,
"step": 17100
},
{
"epoch": 47.68802228412256,
"grad_norm": 1.369776524169899,
"learning_rate": 1.9262254053861745e-05,
"loss": 2.4068,
"step": 17120
},
{
"epoch": 47.74373259052925,
"grad_norm": 1.4449578327650376,
"learning_rate": 1.9185119804145137e-05,
"loss": 2.386,
"step": 17140
},
{
"epoch": 47.799442896935936,
"grad_norm": 1.3430555560772082,
"learning_rate": 1.9108146019916174e-05,
"loss": 2.4116,
"step": 17160
},
{
"epoch": 47.85515320334262,
"grad_norm": 1.3639059020256794,
"learning_rate": 1.9031333230949668e-05,
"loss": 2.3732,
"step": 17180
},
{
"epoch": 47.910863509749305,
"grad_norm": 1.2408281799864953,
"learning_rate": 1.8954681965912332e-05,
"loss": 2.3787,
"step": 17200
},
{
"epoch": 47.966573816155986,
"grad_norm": 1.3061826212099938,
"learning_rate": 1.8878192752359258e-05,
"loss": 2.3728,
"step": 17220
},
{
"epoch": 48.022284122562674,
"grad_norm": 1.538238890457239,
"learning_rate": 1.8801866116730123e-05,
"loss": 2.3755,
"step": 17240
},
{
"epoch": 48.07799442896936,
"grad_norm": 1.3762558112205403,
"learning_rate": 1.872570258434571e-05,
"loss": 2.3727,
"step": 17260
},
{
"epoch": 48.13370473537604,
"grad_norm": 1.2476738434905705,
"learning_rate": 1.8649702679404223e-05,
"loss": 2.3652,
"step": 17280
},
{
"epoch": 48.18941504178273,
"grad_norm": 1.278376392904509,
"learning_rate": 1.8573866924977697e-05,
"loss": 2.3867,
"step": 17300
},
{
"epoch": 48.24512534818942,
"grad_norm": 1.5831828264230967,
"learning_rate": 1.84981958430084e-05,
"loss": 2.382,
"step": 17320
},
{
"epoch": 48.3008356545961,
"grad_norm": 1.2626351145463612,
"learning_rate": 1.842268995430522e-05,
"loss": 2.36,
"step": 17340
},
{
"epoch": 48.35654596100279,
"grad_norm": 1.3775126997716187,
"learning_rate": 1.834734977854011e-05,
"loss": 2.3553,
"step": 17360
},
{
"epoch": 48.41225626740947,
"grad_norm": 1.2704692987761135,
"learning_rate": 1.8272175834244497e-05,
"loss": 2.3722,
"step": 17380
},
{
"epoch": 48.467966573816156,
"grad_norm": 1.2840542491455302,
"learning_rate": 1.8197168638805704e-05,
"loss": 2.3766,
"step": 17400
},
{
"epoch": 48.523676880222844,
"grad_norm": 1.2405421296938253,
"learning_rate": 1.812232870846343e-05,
"loss": 2.3796,
"step": 17420
},
{
"epoch": 48.579387186629525,
"grad_norm": 1.5352436430288825,
"learning_rate": 1.8047656558306114e-05,
"loss": 2.3297,
"step": 17440
},
{
"epoch": 48.63509749303621,
"grad_norm": 1.3152930932933073,
"learning_rate": 1.797315270226748e-05,
"loss": 2.3763,
"step": 17460
},
{
"epoch": 48.690807799442894,
"grad_norm": 1.162288641400069,
"learning_rate": 1.789881765312296e-05,
"loss": 2.378,
"step": 17480
},
{
"epoch": 48.74651810584958,
"grad_norm": 1.5801279533275956,
"learning_rate": 1.7824651922486156e-05,
"loss": 2.3697,
"step": 17500
},
{
"epoch": 48.80222841225627,
"grad_norm": 1.471173585562619,
"learning_rate": 1.7750656020805324e-05,
"loss": 2.3874,
"step": 17520
},
{
"epoch": 48.85793871866295,
"grad_norm": 1.4109592484040796,
"learning_rate": 1.767683045735989e-05,
"loss": 2.3444,
"step": 17540
},
{
"epoch": 48.91364902506964,
"grad_norm": 1.3726614971919417,
"learning_rate": 1.7603175740256895e-05,
"loss": 2.3635,
"step": 17560
},
{
"epoch": 48.969359331476326,
"grad_norm": 1.4758519191463757,
"learning_rate": 1.752969237642755e-05,
"loss": 2.3672,
"step": 17580
},
{
"epoch": 49.02506963788301,
"grad_norm": 1.4221040255565391,
"learning_rate": 1.745638087162368e-05,
"loss": 2.3578,
"step": 17600
},
{
"epoch": 49.080779944289695,
"grad_norm": 1.3119388898351543,
"learning_rate": 1.7383241730414324e-05,
"loss": 2.3895,
"step": 17620
},
{
"epoch": 49.136490250696376,
"grad_norm": 1.3268973264123183,
"learning_rate": 1.7310275456182212e-05,
"loss": 2.3383,
"step": 17640
},
{
"epoch": 49.192200557103064,
"grad_norm": 1.3097397298281048,
"learning_rate": 1.72374825511203e-05,
"loss": 2.3623,
"step": 17660
},
{
"epoch": 49.24791086350975,
"grad_norm": 1.1921177397839031,
"learning_rate": 1.716486351622835e-05,
"loss": 2.351,
"step": 17680
},
{
"epoch": 49.30362116991643,
"grad_norm": 1.3660254674411707,
"learning_rate": 1.709241885130941e-05,
"loss": 2.3643,
"step": 17700
},
{
"epoch": 49.35933147632312,
"grad_norm": 1.6307520440880778,
"learning_rate": 1.7020149054966462e-05,
"loss": 2.3624,
"step": 17720
},
{
"epoch": 49.41504178272981,
"grad_norm": 1.4329708911636774,
"learning_rate": 1.694805462459894e-05,
"loss": 2.3278,
"step": 17740
},
{
"epoch": 49.47075208913649,
"grad_norm": 1.516855939329352,
"learning_rate": 1.6876136056399307e-05,
"loss": 2.3734,
"step": 17760
},
{
"epoch": 49.52646239554318,
"grad_norm": 1.2457026011328314,
"learning_rate": 1.6804393845349665e-05,
"loss": 2.3626,
"step": 17780
},
{
"epoch": 49.58217270194986,
"grad_norm": 1.7297834390596316,
"learning_rate": 1.6732828485218297e-05,
"loss": 2.3713,
"step": 17800
},
{
"epoch": 49.637883008356546,
"grad_norm": 1.429662136545568,
"learning_rate": 1.6661440468556335e-05,
"loss": 2.3455,
"step": 17820
},
{
"epoch": 49.693593314763234,
"grad_norm": 1.26740617213478,
"learning_rate": 1.6590230286694328e-05,
"loss": 2.3659,
"step": 17840
},
{
"epoch": 49.749303621169915,
"grad_norm": 1.681389697921109,
"learning_rate": 1.651919842973888e-05,
"loss": 2.3445,
"step": 17860
},
{
"epoch": 49.8050139275766,
"grad_norm": 1.441881294284973,
"learning_rate": 1.6448345386569248e-05,
"loss": 2.3834,
"step": 17880
},
{
"epoch": 49.860724233983284,
"grad_norm": 1.341854199720449,
"learning_rate": 1.637767164483401e-05,
"loss": 2.3699,
"step": 17900
},
{
"epoch": 49.91643454038997,
"grad_norm": 1.252618865244937,
"learning_rate": 1.6307177690947698e-05,
"loss": 2.3635,
"step": 17920
},
{
"epoch": 49.97214484679666,
"grad_norm": 1.226307845622635,
"learning_rate": 1.6236864010087446e-05,
"loss": 2.3485,
"step": 17940
},
{
"epoch": 50.02785515320334,
"grad_norm": 1.3356553843028374,
"learning_rate": 1.616673108618965e-05,
"loss": 2.3578,
"step": 17960
},
{
"epoch": 50.08356545961003,
"grad_norm": 1.3848996607905597,
"learning_rate": 1.6096779401946624e-05,
"loss": 2.3504,
"step": 17980
},
{
"epoch": 50.139275766016716,
"grad_norm": 1.498723321509667,
"learning_rate": 1.6027009438803323e-05,
"loss": 2.3496,
"step": 18000
},
{
"epoch": 50.1949860724234,
"grad_norm": 1.5191100216493636,
"learning_rate": 1.595742167695398e-05,
"loss": 2.3461,
"step": 18020
},
{
"epoch": 50.250696378830085,
"grad_norm": 1.2559077943231471,
"learning_rate": 1.5888016595338836e-05,
"loss": 2.371,
"step": 18040
},
{
"epoch": 50.306406685236766,
"grad_norm": 1.3899128057224512,
"learning_rate": 1.5818794671640822e-05,
"loss": 2.349,
"step": 18060
},
{
"epoch": 50.362116991643454,
"grad_norm": 1.354712886418284,
"learning_rate": 1.574975638228226e-05,
"loss": 2.3709,
"step": 18080
},
{
"epoch": 50.41782729805014,
"grad_norm": 1.3042322626317018,
"learning_rate": 1.5680902202421623e-05,
"loss": 2.3456,
"step": 18100
},
{
"epoch": 50.47353760445682,
"grad_norm": 1.3508893425795137,
"learning_rate": 1.5612232605950247e-05,
"loss": 2.3353,
"step": 18120
},
{
"epoch": 50.52924791086351,
"grad_norm": 1.4691874741171684,
"learning_rate": 1.554374806548906e-05,
"loss": 2.3336,
"step": 18140
},
{
"epoch": 50.5849582172702,
"grad_norm": 3.0455576787450673,
"learning_rate": 1.5475449052385337e-05,
"loss": 2.3218,
"step": 18160
},
{
"epoch": 50.64066852367688,
"grad_norm": 1.4402674578269268,
"learning_rate": 1.540733603670942e-05,
"loss": 2.3372,
"step": 18180
},
{
"epoch": 50.69637883008357,
"grad_norm": 1.4795688476423643,
"learning_rate": 1.5339409487251585e-05,
"loss": 2.341,
"step": 18200
},
{
"epoch": 50.75208913649025,
"grad_norm": 1.4070357885010039,
"learning_rate": 1.5271669871518705e-05,
"loss": 2.3241,
"step": 18220
},
{
"epoch": 50.807799442896936,
"grad_norm": 1.3031362800514608,
"learning_rate": 1.5204117655731085e-05,
"loss": 2.3621,
"step": 18240
},
{
"epoch": 50.863509749303624,
"grad_norm": 1.434766758940882,
"learning_rate": 1.5136753304819218e-05,
"loss": 2.3302,
"step": 18260
},
{
"epoch": 50.919220055710305,
"grad_norm": 1.274818587770635,
"learning_rate": 1.5069577282420647e-05,
"loss": 2.3465,
"step": 18280
},
{
"epoch": 50.97493036211699,
"grad_norm": 1.2730001227993584,
"learning_rate": 1.500259005087672e-05,
"loss": 2.3294,
"step": 18300
},
{
"epoch": 51.030640668523674,
"grad_norm": 1.3369943012249967,
"learning_rate": 1.493579207122943e-05,
"loss": 2.3577,
"step": 18320
},
{
"epoch": 51.08635097493036,
"grad_norm": 1.255851793002455,
"learning_rate": 1.4869183803218242e-05,
"loss": 2.3442,
"step": 18340
},
{
"epoch": 51.14206128133705,
"grad_norm": 1.465379611467944,
"learning_rate": 1.4802765705276894e-05,
"loss": 2.3361,
"step": 18360
},
{
"epoch": 51.19777158774373,
"grad_norm": 1.598753189094513,
"learning_rate": 1.4736538234530309e-05,
"loss": 2.3488,
"step": 18380
},
{
"epoch": 51.25348189415042,
"grad_norm": 1.3758579974135776,
"learning_rate": 1.4670501846791401e-05,
"loss": 2.341,
"step": 18400
},
{
"epoch": 51.309192200557106,
"grad_norm": 1.3023167947311844,
"learning_rate": 1.4604656996557936e-05,
"loss": 2.3496,
"step": 18420
},
{
"epoch": 51.36490250696379,
"grad_norm": 1.2931564385198864,
"learning_rate": 1.4539004137009436e-05,
"loss": 2.3394,
"step": 18440
},
{
"epoch": 51.420612813370475,
"grad_norm": 1.6620919275534023,
"learning_rate": 1.4473543720004015e-05,
"loss": 2.3285,
"step": 18460
},
{
"epoch": 51.476323119777156,
"grad_norm": 1.3917496152973825,
"learning_rate": 1.4408276196075313e-05,
"loss": 2.3486,
"step": 18480
},
{
"epoch": 51.532033426183844,
"grad_norm": 1.3562772472225384,
"learning_rate": 1.4343202014429376e-05,
"loss": 2.3323,
"step": 18500
},
{
"epoch": 51.58774373259053,
"grad_norm": 1.2319819791798092,
"learning_rate": 1.4278321622941556e-05,
"loss": 2.3439,
"step": 18520
},
{
"epoch": 51.64345403899721,
"grad_norm": 1.5358845964171328,
"learning_rate": 1.4213635468153446e-05,
"loss": 2.3307,
"step": 18540
},
{
"epoch": 51.6991643454039,
"grad_norm": 1.292648527837981,
"learning_rate": 1.4149143995269799e-05,
"loss": 2.3303,
"step": 18560
},
{
"epoch": 51.75487465181058,
"grad_norm": 1.9631233257274625,
"learning_rate": 1.4084847648155449e-05,
"loss": 2.3382,
"step": 18580
},
{
"epoch": 51.81058495821727,
"grad_norm": 1.6354418847695984,
"learning_rate": 1.4020746869332296e-05,
"loss": 2.3761,
"step": 18600
},
{
"epoch": 51.86629526462396,
"grad_norm": 1.4408996452028136,
"learning_rate": 1.3956842099976191e-05,
"loss": 2.3899,
"step": 18620
},
{
"epoch": 51.92200557103064,
"grad_norm": 1.3154420223017438,
"learning_rate": 1.3893133779913992e-05,
"loss": 2.3267,
"step": 18640
},
{
"epoch": 51.977715877437326,
"grad_norm": 1.2664206876617758,
"learning_rate": 1.382962234762045e-05,
"loss": 2.3145,
"step": 18660
},
{
"epoch": 52.033426183844014,
"grad_norm": 1.461002841812497,
"learning_rate": 1.3766308240215257e-05,
"loss": 2.337,
"step": 18680
},
{
"epoch": 52.089136490250695,
"grad_norm": 1.3350308045413666,
"learning_rate": 1.3703191893460002e-05,
"loss": 2.3553,
"step": 18700
},
{
"epoch": 52.14484679665738,
"grad_norm": 1.4095843649708175,
"learning_rate": 1.364027374175515e-05,
"loss": 2.3408,
"step": 18720
},
{
"epoch": 52.200557103064064,
"grad_norm": 1.8553789055534144,
"learning_rate": 1.357755421813712e-05,
"loss": 2.3513,
"step": 18740
},
{
"epoch": 52.25626740947075,
"grad_norm": 1.517906600566457,
"learning_rate": 1.3515033754275249e-05,
"loss": 2.3512,
"step": 18760
},
{
"epoch": 52.31197771587744,
"grad_norm": 1.3004637489061956,
"learning_rate": 1.3452712780468846e-05,
"loss": 2.3344,
"step": 18780
},
{
"epoch": 52.36768802228412,
"grad_norm": 1.6081005585159005,
"learning_rate": 1.3390591725644231e-05,
"loss": 2.3714,
"step": 18800
},
{
"epoch": 52.42339832869081,
"grad_norm": 1.3820776705600462,
"learning_rate": 1.3328671017351728e-05,
"loss": 2.3472,
"step": 18820
},
{
"epoch": 52.4791086350975,
"grad_norm": 1.741399862442912,
"learning_rate": 1.3266951081762823e-05,
"loss": 2.3318,
"step": 18840
},
{
"epoch": 52.53481894150418,
"grad_norm": 1.6610982616432777,
"learning_rate": 1.320543234366714e-05,
"loss": 2.3564,
"step": 18860
},
{
"epoch": 52.590529247910865,
"grad_norm": 1.534678472008335,
"learning_rate": 1.3144115226469601e-05,
"loss": 2.3453,
"step": 18880
},
{
"epoch": 52.646239554317546,
"grad_norm": 1.6155419457685751,
"learning_rate": 1.3083000152187406e-05,
"loss": 2.3193,
"step": 18900
},
{
"epoch": 52.701949860724234,
"grad_norm": 1.3933017087549446,
"learning_rate": 1.3022087541447226e-05,
"loss": 2.3263,
"step": 18920
},
{
"epoch": 52.75766016713092,
"grad_norm": 1.4752643448356435,
"learning_rate": 1.2961377813482258e-05,
"loss": 2.3198,
"step": 18940
},
{
"epoch": 52.8133704735376,
"grad_norm": 1.4046488544858395,
"learning_rate": 1.2900871386129355e-05,
"loss": 2.3076,
"step": 18960
},
{
"epoch": 52.86908077994429,
"grad_norm": 1.3496360542086223,
"learning_rate": 1.2840568675826145e-05,
"loss": 2.3298,
"step": 18980
},
{
"epoch": 52.92479108635097,
"grad_norm": 1.441744999480469,
"learning_rate": 1.2780470097608155e-05,
"loss": 2.3579,
"step": 19000
},
{
"epoch": 52.98050139275766,
"grad_norm": 1.441039976490415,
"learning_rate": 1.272057606510598e-05,
"loss": 2.3408,
"step": 19020
},
{
"epoch": 53.03621169916435,
"grad_norm": 1.394791214620375,
"learning_rate": 1.2660886990542415e-05,
"loss": 2.3151,
"step": 19040
},
{
"epoch": 53.09192200557103,
"grad_norm": 1.315868085708281,
"learning_rate": 1.2601403284729635e-05,
"loss": 2.3304,
"step": 19060
},
{
"epoch": 53.147632311977716,
"grad_norm": 1.3975934028921921,
"learning_rate": 1.2542125357066354e-05,
"loss": 2.3314,
"step": 19080
},
{
"epoch": 53.203342618384404,
"grad_norm": 1.3231737653504019,
"learning_rate": 1.2483053615534986e-05,
"loss": 2.3419,
"step": 19100
},
{
"epoch": 53.259052924791085,
"grad_norm": 1.4696480235764173,
"learning_rate": 1.2424188466698894e-05,
"loss": 2.327,
"step": 19120
},
{
"epoch": 53.31476323119777,
"grad_norm": 1.369796074017674,
"learning_rate": 1.2365530315699543e-05,
"loss": 2.2887,
"step": 19140
},
{
"epoch": 53.370473537604454,
"grad_norm": 1.318991041965293,
"learning_rate": 1.2307079566253733e-05,
"loss": 2.3359,
"step": 19160
},
{
"epoch": 53.42618384401114,
"grad_norm": 1.3653752917448936,
"learning_rate": 1.2248836620650818e-05,
"loss": 2.3091,
"step": 19180
},
{
"epoch": 53.48189415041783,
"grad_norm": 1.3620553290863062,
"learning_rate": 1.219080187974993e-05,
"loss": 2.333,
"step": 19200
},
{
"epoch": 53.53760445682451,
"grad_norm": 1.521530969044834,
"learning_rate": 1.2132975742977222e-05,
"loss": 2.3477,
"step": 19220
},
{
"epoch": 53.5933147632312,
"grad_norm": 1.3604837849834415,
"learning_rate": 1.2075358608323133e-05,
"loss": 2.3018,
"step": 19240
},
{
"epoch": 53.64902506963789,
"grad_norm": 1.3342516336240242,
"learning_rate": 1.2017950872339636e-05,
"loss": 2.3189,
"step": 19260
},
{
"epoch": 53.70473537604457,
"grad_norm": 1.388890930917694,
"learning_rate": 1.1960752930137489e-05,
"loss": 2.3289,
"step": 19280
},
{
"epoch": 53.760445682451255,
"grad_norm": 1.3001131964873058,
"learning_rate": 1.1903765175383552e-05,
"loss": 2.2918,
"step": 19300
},
{
"epoch": 53.816155988857936,
"grad_norm": 1.3807429463362486,
"learning_rate": 1.1846988000298073e-05,
"loss": 2.2947,
"step": 19320
},
{
"epoch": 53.871866295264624,
"grad_norm": 1.3500489069634025,
"learning_rate": 1.1790421795651973e-05,
"loss": 2.3193,
"step": 19340
},
{
"epoch": 53.92757660167131,
"grad_norm": 1.440888239581446,
"learning_rate": 1.1734066950764138e-05,
"loss": 2.3342,
"step": 19360
},
{
"epoch": 53.98328690807799,
"grad_norm": 1.5457618066269658,
"learning_rate": 1.1677923853498792e-05,
"loss": 2.2843,
"step": 19380
},
{
"epoch": 54.03899721448468,
"grad_norm": 1.4026171429295824,
"learning_rate": 1.162199289026279e-05,
"loss": 2.2993,
"step": 19400
},
{
"epoch": 54.09470752089136,
"grad_norm": 1.5830085773681513,
"learning_rate": 1.156627444600296e-05,
"loss": 2.3154,
"step": 19420
},
{
"epoch": 54.15041782729805,
"grad_norm": 1.4881826390608948,
"learning_rate": 1.151076890420348e-05,
"loss": 2.3147,
"step": 19440
},
{
"epoch": 54.20612813370474,
"grad_norm": 1.3551091744705666,
"learning_rate": 1.1455476646883177e-05,
"loss": 2.3427,
"step": 19460
},
{
"epoch": 54.26183844011142,
"grad_norm": 1.4419537324222909,
"learning_rate": 1.1400398054592988e-05,
"loss": 2.3253,
"step": 19480
},
{
"epoch": 54.317548746518106,
"grad_norm": 1.3708788026201257,
"learning_rate": 1.1345533506413266e-05,
"loss": 2.2869,
"step": 19500
},
{
"epoch": 54.373259052924794,
"grad_norm": 1.474440189127514,
"learning_rate": 1.1290883379951205e-05,
"loss": 2.3095,
"step": 19520
},
{
"epoch": 54.428969359331475,
"grad_norm": 1.3454680397423404,
"learning_rate": 1.1236448051338234e-05,
"loss": 2.2888,
"step": 19540
},
{
"epoch": 54.48467966573816,
"grad_norm": 1.429931802787514,
"learning_rate": 1.1182227895227435e-05,
"loss": 2.3356,
"step": 19560
},
{
"epoch": 54.540389972144844,
"grad_norm": 1.5782912526289399,
"learning_rate": 1.112822328479094e-05,
"loss": 2.3116,
"step": 19580
},
{
"epoch": 54.59610027855153,
"grad_norm": 1.4640262618715514,
"learning_rate": 1.1074434591717396e-05,
"loss": 2.3333,
"step": 19600
},
{
"epoch": 54.65181058495822,
"grad_norm": 1.7289128171607941,
"learning_rate": 1.102086218620939e-05,
"loss": 2.2807,
"step": 19620
},
{
"epoch": 54.7075208913649,
"grad_norm": 1.4358145943314486,
"learning_rate": 1.0967506436980888e-05,
"loss": 2.3362,
"step": 19640
},
{
"epoch": 54.76323119777159,
"grad_norm": 1.2782981684370716,
"learning_rate": 1.0914367711254726e-05,
"loss": 2.3087,
"step": 19660
},
{
"epoch": 54.81894150417827,
"grad_norm": 1.4574671055158,
"learning_rate": 1.0861446374760058e-05,
"loss": 2.329,
"step": 19680
},
{
"epoch": 54.87465181058496,
"grad_norm": 1.6398070121291626,
"learning_rate": 1.0808742791729863e-05,
"loss": 2.3005,
"step": 19700
},
{
"epoch": 54.930362116991645,
"grad_norm": 1.3682990675605438,
"learning_rate": 1.075625732489842e-05,
"loss": 2.3105,
"step": 19720
},
{
"epoch": 54.986072423398326,
"grad_norm": 1.4113101451622823,
"learning_rate": 1.0703990335498795e-05,
"loss": 2.3004,
"step": 19740
},
{
"epoch": 55.041782729805014,
"grad_norm": 1.3747716130043024,
"learning_rate": 1.0651942183260405e-05,
"loss": 2.3123,
"step": 19760
},
{
"epoch": 55.0974930362117,
"grad_norm": 1.5773531144976136,
"learning_rate": 1.0600113226406483e-05,
"loss": 2.31,
"step": 19780
},
{
"epoch": 55.15320334261838,
"grad_norm": 1.636015923365525,
"learning_rate": 1.0548503821651675e-05,
"loss": 2.2963,
"step": 19800
},
{
"epoch": 55.20891364902507,
"grad_norm": 1.9126127423976698,
"learning_rate": 1.0497114324199536e-05,
"loss": 2.3125,
"step": 19820
},
{
"epoch": 55.26462395543175,
"grad_norm": 1.3810199319505396,
"learning_rate": 1.0445945087740083e-05,
"loss": 2.2836,
"step": 19840
},
{
"epoch": 55.32033426183844,
"grad_norm": 1.655152735076459,
"learning_rate": 1.0394996464447398e-05,
"loss": 2.3183,
"step": 19860
},
{
"epoch": 55.37604456824513,
"grad_norm": 1.7280728341318472,
"learning_rate": 1.0344268804977195e-05,
"loss": 2.3056,
"step": 19880
},
{
"epoch": 55.43175487465181,
"grad_norm": 1.3354142969390423,
"learning_rate": 1.029376245846439e-05,
"loss": 2.2894,
"step": 19900
},
{
"epoch": 55.4874651810585,
"grad_norm": 1.6222487674242974,
"learning_rate": 1.024347777252068e-05,
"loss": 2.3073,
"step": 19920
},
{
"epoch": 55.543175487465184,
"grad_norm": 1.454409209087223,
"learning_rate": 1.0193415093232206e-05,
"loss": 2.3023,
"step": 19940
},
{
"epoch": 55.598885793871865,
"grad_norm": 1.5315652454207556,
"learning_rate": 1.0143574765157128e-05,
"loss": 2.3427,
"step": 19960
},
{
"epoch": 55.65459610027855,
"grad_norm": 1.441290570573882,
"learning_rate": 1.0093957131323262e-05,
"loss": 2.3211,
"step": 19980
},
{
"epoch": 55.710306406685234,
"grad_norm": 1.6552345609147763,
"learning_rate": 1.004456253322574e-05,
"loss": 2.3032,
"step": 20000
},
{
"epoch": 55.76601671309192,
"grad_norm": 1.4344844434843587,
"learning_rate": 9.995391310824615e-06,
"loss": 2.32,
"step": 20020
},
{
"epoch": 55.82172701949861,
"grad_norm": 1.5149262550896996,
"learning_rate": 9.946443802542573e-06,
"loss": 2.3054,
"step": 20040
},
{
"epoch": 55.87743732590529,
"grad_norm": 1.4983617276276844,
"learning_rate": 9.89772034526257e-06,
"loss": 2.2887,
"step": 20060
},
{
"epoch": 55.93314763231198,
"grad_norm": 1.3214738473117364,
"learning_rate": 9.849221274325526e-06,
"loss": 2.3222,
"step": 20080
},
{
"epoch": 55.98885793871866,
"grad_norm": 1.477089337511352,
"learning_rate": 9.800946923528015e-06,
"loss": 2.2982,
"step": 20100
},
{
"epoch": 56.04456824512535,
"grad_norm": 1.4526733288769058,
"learning_rate": 9.752897625119957e-06,
"loss": 2.2978,
"step": 20120
},
{
"epoch": 56.100278551532035,
"grad_norm": 1.4020841003024251,
"learning_rate": 9.705073709802343e-06,
"loss": 2.2945,
"step": 20140
},
{
"epoch": 56.155988857938716,
"grad_norm": 1.5600664300784186,
"learning_rate": 9.657475506724974e-06,
"loss": 2.2782,
"step": 20160
},
{
"epoch": 56.211699164345404,
"grad_norm": 1.8810092843791293,
"learning_rate": 9.610103343484164e-06,
"loss": 2.3072,
"step": 20180
},
{
"epoch": 56.26740947075209,
"grad_norm": 1.5355656388936216,
"learning_rate": 9.562957546120497e-06,
"loss": 2.2978,
"step": 20200
},
{
"epoch": 56.32311977715877,
"grad_norm": 1.49909865084026,
"learning_rate": 9.51603843911659e-06,
"loss": 2.3092,
"step": 20220
},
{
"epoch": 56.37883008356546,
"grad_norm": 1.5161221850342854,
"learning_rate": 9.469346345394869e-06,
"loss": 2.2818,
"step": 20240
},
{
"epoch": 56.43454038997214,
"grad_norm": 1.7615731834241355,
"learning_rate": 9.422881586315314e-06,
"loss": 2.3084,
"step": 20260
},
{
"epoch": 56.49025069637883,
"grad_norm": 1.329887631910666,
"learning_rate": 9.376644481673266e-06,
"loss": 2.3056,
"step": 20280
},
{
"epoch": 56.54596100278552,
"grad_norm": 1.4720910620951293,
"learning_rate": 9.33063534969724e-06,
"loss": 2.3108,
"step": 20300
},
{
"epoch": 56.6016713091922,
"grad_norm": 1.3281512249998089,
"learning_rate": 9.284854507046706e-06,
"loss": 2.2901,
"step": 20320
},
{
"epoch": 56.65738161559889,
"grad_norm": 1.3508495829729492,
"learning_rate": 9.239302268809946e-06,
"loss": 2.3169,
"step": 20340
},
{
"epoch": 56.713091922005574,
"grad_norm": 1.4459681060448604,
"learning_rate": 9.19397894850185e-06,
"loss": 2.2935,
"step": 20360
},
{
"epoch": 56.768802228412255,
"grad_norm": 1.435466464580322,
"learning_rate": 9.148884858061761e-06,
"loss": 2.297,
"step": 20380
},
{
"epoch": 56.82451253481894,
"grad_norm": 1.4747023153570098,
"learning_rate": 9.10402030785136e-06,
"loss": 2.2758,
"step": 20400
},
{
"epoch": 56.880222841225624,
"grad_norm": 1.732004184834518,
"learning_rate": 9.059385606652494e-06,
"loss": 2.2663,
"step": 20420
},
{
"epoch": 56.93593314763231,
"grad_norm": 1.627581542112412,
"learning_rate": 9.014981061665082e-06,
"loss": 2.3057,
"step": 20440
},
{
"epoch": 56.991643454039,
"grad_norm": 1.557984274560907,
"learning_rate": 8.970806978504978e-06,
"loss": 2.3203,
"step": 20460
},
{
"epoch": 57.04735376044568,
"grad_norm": 1.3370492439725272,
"learning_rate": 8.926863661201858e-06,
"loss": 2.2901,
"step": 20480
},
{
"epoch": 57.10306406685237,
"grad_norm": 1.5834661112813444,
"learning_rate": 8.883151412197163e-06,
"loss": 2.3148,
"step": 20500
},
{
"epoch": 57.15877437325905,
"grad_norm": 1.444140494560892,
"learning_rate": 8.839670532341993e-06,
"loss": 2.2811,
"step": 20520
},
{
"epoch": 57.21448467966574,
"grad_norm": 1.2924659150251059,
"learning_rate": 8.796421320895056e-06,
"loss": 2.2812,
"step": 20540
},
{
"epoch": 57.270194986072426,
"grad_norm": 1.278167875022471,
"learning_rate": 8.753404075520562e-06,
"loss": 2.2695,
"step": 20560
},
{
"epoch": 57.325905292479106,
"grad_norm": 1.489794984401024,
"learning_rate": 8.710619092286228e-06,
"loss": 2.2812,
"step": 20580
},
{
"epoch": 57.381615598885794,
"grad_norm": 1.4707110829209729,
"learning_rate": 8.668066665661217e-06,
"loss": 2.2903,
"step": 20600
},
{
"epoch": 57.43732590529248,
"grad_norm": 1.4687558384788093,
"learning_rate": 8.625747088514107e-06,
"loss": 2.306,
"step": 20620
},
{
"epoch": 57.49303621169916,
"grad_norm": 1.4060321766361261,
"learning_rate": 8.583660652110897e-06,
"loss": 2.3054,
"step": 20640
},
{
"epoch": 57.54874651810585,
"grad_norm": 1.416314142016587,
"learning_rate": 8.541807646112959e-06,
"loss": 2.2776,
"step": 20660
},
{
"epoch": 57.60445682451253,
"grad_norm": 1.528612849317557,
"learning_rate": 8.50018835857509e-06,
"loss": 2.2615,
"step": 20680
},
{
"epoch": 57.66016713091922,
"grad_norm": 1.5253351674209896,
"learning_rate": 8.45880307594351e-06,
"loss": 2.2641,
"step": 20700
},
{
"epoch": 57.71587743732591,
"grad_norm": 1.4358983735975828,
"learning_rate": 8.417652083053896e-06,
"loss": 2.2722,
"step": 20720
},
{
"epoch": 57.77158774373259,
"grad_norm": 1.7734798200078705,
"learning_rate": 8.376735663129412e-06,
"loss": 2.3084,
"step": 20740
},
{
"epoch": 57.82729805013928,
"grad_norm": 1.5869547851700487,
"learning_rate": 8.336054097778755e-06,
"loss": 2.2899,
"step": 20760
},
{
"epoch": 57.88300835654596,
"grad_norm": 1.4546940753793316,
"learning_rate": 8.295607666994244e-06,
"loss": 2.3095,
"step": 20780
},
{
"epoch": 57.938718662952645,
"grad_norm": 2.102508648107544,
"learning_rate": 8.255396649149872e-06,
"loss": 2.2591,
"step": 20800
},
{
"epoch": 57.99442896935933,
"grad_norm": 1.7539480811989963,
"learning_rate": 8.215421320999385e-06,
"loss": 2.2713,
"step": 20820
},
{
"epoch": 58.050139275766014,
"grad_norm": 2.230779439808303,
"learning_rate": 8.175681957674403e-06,
"loss": 2.3016,
"step": 20840
},
{
"epoch": 58.1058495821727,
"grad_norm": 1.382319191320228,
"learning_rate": 8.136178832682491e-06,
"loss": 2.3041,
"step": 20860
},
{
"epoch": 58.16155988857939,
"grad_norm": 1.5265747955874778,
"learning_rate": 8.096912217905309e-06,
"loss": 2.2702,
"step": 20880
},
{
"epoch": 58.21727019498607,
"grad_norm": 1.4460542045577416,
"learning_rate": 8.057882383596717e-06,
"loss": 2.3015,
"step": 20900
},
{
"epoch": 58.27298050139276,
"grad_norm": 1.4566734252474305,
"learning_rate": 8.019089598380943e-06,
"loss": 2.2889,
"step": 20920
},
{
"epoch": 58.32869080779944,
"grad_norm": 1.2969700785128098,
"learning_rate": 7.98053412925069e-06,
"loss": 2.3123,
"step": 20940
},
{
"epoch": 58.38440111420613,
"grad_norm": 1.4381747219027274,
"learning_rate": 7.942216241565335e-06,
"loss": 2.2903,
"step": 20960
},
{
"epoch": 58.440111420612816,
"grad_norm": 1.41845772591463,
"learning_rate": 7.904136199049108e-06,
"loss": 2.2915,
"step": 20980
},
{
"epoch": 58.4958217270195,
"grad_norm": 1.7118356239586723,
"learning_rate": 7.866294263789243e-06,
"loss": 2.272,
"step": 21000
},
{
"epoch": 58.551532033426184,
"grad_norm": 1.6802721584331735,
"learning_rate": 7.828690696234207e-06,
"loss": 2.2831,
"step": 21020
},
{
"epoch": 58.60724233983287,
"grad_norm": 1.3312453916466178,
"learning_rate": 7.791325755191866e-06,
"loss": 2.3042,
"step": 21040
},
{
"epoch": 58.66295264623955,
"grad_norm": 1.4400016262848356,
"learning_rate": 7.754199697827755e-06,
"loss": 2.2708,
"step": 21060
},
{
"epoch": 58.71866295264624,
"grad_norm": 1.4653785109530788,
"learning_rate": 7.717312779663285e-06,
"loss": 2.298,
"step": 21080
},
{
"epoch": 58.77437325905292,
"grad_norm": 1.4617048241574984,
"learning_rate": 7.680665254573972e-06,
"loss": 2.295,
"step": 21100
},
{
"epoch": 58.83008356545961,
"grad_norm": 1.5216063506001387,
"learning_rate": 7.644257374787696e-06,
"loss": 2.276,
"step": 21120
},
{
"epoch": 58.8857938718663,
"grad_norm": 1.3935611767924123,
"learning_rate": 7.6080893908829835e-06,
"loss": 2.2758,
"step": 21140
},
{
"epoch": 58.94150417827298,
"grad_norm": 1.4853858842912901,
"learning_rate": 7.572161551787261e-06,
"loss": 2.2871,
"step": 21160
},
{
"epoch": 58.99721448467967,
"grad_norm": 1.5467142471984074,
"learning_rate": 7.536474104775158e-06,
"loss": 2.2848,
"step": 21180
},
{
"epoch": 59.05292479108635,
"grad_norm": 1.5612739193627336,
"learning_rate": 7.501027295466781e-06,
"loss": 2.2918,
"step": 21200
},
{
"epoch": 59.108635097493035,
"grad_norm": 1.5107647532729618,
"learning_rate": 7.4658213678260586e-06,
"loss": 2.2938,
"step": 21220
},
{
"epoch": 59.16434540389972,
"grad_norm": 1.613803688667171,
"learning_rate": 7.430856564159026e-06,
"loss": 2.2624,
"step": 21240
},
{
"epoch": 59.220055710306404,
"grad_norm": 1.4075510840449976,
"learning_rate": 7.396133125112186e-06,
"loss": 2.2882,
"step": 21260
},
{
"epoch": 59.27576601671309,
"grad_norm": 1.4680721335227742,
"learning_rate": 7.361651289670837e-06,
"loss": 2.2772,
"step": 21280
},
{
"epoch": 59.33147632311978,
"grad_norm": 1.9100448192464394,
"learning_rate": 7.327411295157427e-06,
"loss": 2.2552,
"step": 21300
},
{
"epoch": 59.38718662952646,
"grad_norm": 1.6058890472494596,
"learning_rate": 7.293413377229926e-06,
"loss": 2.2458,
"step": 21320
},
{
"epoch": 59.44289693593315,
"grad_norm": 1.760861350098991,
"learning_rate": 7.259657769880218e-06,
"loss": 2.2921,
"step": 21340
},
{
"epoch": 59.49860724233983,
"grad_norm": 1.4588818247613144,
"learning_rate": 7.226144705432453e-06,
"loss": 2.2647,
"step": 21360
},
{
"epoch": 59.55431754874652,
"grad_norm": 1.347496064491126,
"learning_rate": 7.192874414541492e-06,
"loss": 2.3212,
"step": 21380
},
{
"epoch": 59.610027855153206,
"grad_norm": 1.4180100417862518,
"learning_rate": 7.159847126191279e-06,
"loss": 2.2922,
"step": 21400
},
{
"epoch": 59.66573816155989,
"grad_norm": 1.3383217284211308,
"learning_rate": 7.127063067693305e-06,
"loss": 2.2642,
"step": 21420
},
{
"epoch": 59.721448467966574,
"grad_norm": 1.5431150296701466,
"learning_rate": 7.094522464685003e-06,
"loss": 2.2798,
"step": 21440
},
{
"epoch": 59.77715877437326,
"grad_norm": 1.453049719160441,
"learning_rate": 7.062225541128232e-06,
"loss": 2.2882,
"step": 21460
},
{
"epoch": 59.83286908077994,
"grad_norm": 1.5504386381902358,
"learning_rate": 7.030172519307708e-06,
"loss": 2.2702,
"step": 21480
},
{
"epoch": 59.88857938718663,
"grad_norm": 1.3068595652128718,
"learning_rate": 6.998363619829485e-06,
"loss": 2.2867,
"step": 21500
},
{
"epoch": 59.94428969359331,
"grad_norm": 1.5921797096923227,
"learning_rate": 6.966799061619429e-06,
"loss": 2.3073,
"step": 21520
},
{
"epoch": 60.0,
"grad_norm": 1.4377189989333592,
"learning_rate": 6.935479061921752e-06,
"loss": 2.2524,
"step": 21540
},
{
"epoch": 60.05571030640669,
"grad_norm": 1.4965660553834361,
"learning_rate": 6.904403836297449e-06,
"loss": 2.2908,
"step": 21560
},
{
"epoch": 60.11142061281337,
"grad_norm": 1.5433152079814891,
"learning_rate": 6.873573598622855e-06,
"loss": 2.3,
"step": 21580
},
{
"epoch": 60.16713091922006,
"grad_norm": 1.532824089241608,
"learning_rate": 6.842988561088175e-06,
"loss": 2.2503,
"step": 21600
},
{
"epoch": 60.22284122562674,
"grad_norm": 1.404540197652475,
"learning_rate": 6.81264893419601e-06,
"loss": 2.2671,
"step": 21620
},
{
"epoch": 60.278551532033426,
"grad_norm": 2.3304693566638277,
"learning_rate": 6.782554926759919e-06,
"loss": 2.272,
"step": 21640
},
{
"epoch": 60.33426183844011,
"grad_norm": 1.4431191588807148,
"learning_rate": 6.752706745902972e-06,
"loss": 2.2741,
"step": 21660
},
{
"epoch": 60.389972144846794,
"grad_norm": 1.7541038462058614,
"learning_rate": 6.723104597056326e-06,
"loss": 2.2679,
"step": 21680
},
{
"epoch": 60.44568245125348,
"grad_norm": 1.5107298093918222,
"learning_rate": 6.693748683957818e-06,
"loss": 2.2439,
"step": 21700
},
{
"epoch": 60.50139275766017,
"grad_norm": 1.8531551454644686,
"learning_rate": 6.664639208650558e-06,
"loss": 2.3011,
"step": 21720
},
{
"epoch": 60.55710306406685,
"grad_norm": 1.7982354405211904,
"learning_rate": 6.635776371481545e-06,
"loss": 2.2564,
"step": 21740
},
{
"epoch": 60.61281337047354,
"grad_norm": 1.4660071270340647,
"learning_rate": 6.607160371100274e-06,
"loss": 2.2991,
"step": 21760
},
{
"epoch": 60.66852367688022,
"grad_norm": 1.6108738101591895,
"learning_rate": 6.578791404457377e-06,
"loss": 2.2712,
"step": 21780
},
{
"epoch": 60.72423398328691,
"grad_norm": 1.7280681944263292,
"learning_rate": 6.550669666803269e-06,
"loss": 2.2645,
"step": 21800
},
{
"epoch": 60.779944289693596,
"grad_norm": 1.634845479160207,
"learning_rate": 6.522795351686807e-06,
"loss": 2.264,
"step": 21820
},
{
"epoch": 60.83565459610028,
"grad_norm": 1.4425486098001479,
"learning_rate": 6.495168650953954e-06,
"loss": 2.2848,
"step": 21840
},
{
"epoch": 60.891364902506965,
"grad_norm": 1.4822789289824396,
"learning_rate": 6.467789754746452e-06,
"loss": 2.2683,
"step": 21860
},
{
"epoch": 60.94707520891365,
"grad_norm": 1.732381751649639,
"learning_rate": 6.440658851500523e-06,
"loss": 2.2965,
"step": 21880
},
{
"epoch": 61.00278551532033,
"grad_norm": 1.5491944453547546,
"learning_rate": 6.413776127945568e-06,
"loss": 2.2874,
"step": 21900
},
{
"epoch": 61.05849582172702,
"grad_norm": 1.3257669016068976,
"learning_rate": 6.3871417691028895e-06,
"loss": 2.2499,
"step": 21920
},
{
"epoch": 61.1142061281337,
"grad_norm": 1.9195662310637283,
"learning_rate": 6.360755958284388e-06,
"loss": 2.2535,
"step": 21940
},
{
"epoch": 61.16991643454039,
"grad_norm": 1.5972372792438843,
"learning_rate": 6.334618877091354e-06,
"loss": 2.2632,
"step": 21960
},
{
"epoch": 61.22562674094708,
"grad_norm": 1.3893966806690632,
"learning_rate": 6.308730705413165e-06,
"loss": 2.2583,
"step": 21980
},
{
"epoch": 61.28133704735376,
"grad_norm": 1.3230497193349502,
"learning_rate": 6.283091621426083e-06,
"loss": 2.2836,
"step": 22000
},
{
"epoch": 61.33704735376045,
"grad_norm": 1.3952354521448391,
"learning_rate": 6.257701801592015e-06,
"loss": 2.257,
"step": 22020
},
{
"epoch": 61.39275766016713,
"grad_norm": 1.554543762365429,
"learning_rate": 6.232561420657287e-06,
"loss": 2.2712,
"step": 22040
},
{
"epoch": 61.448467966573816,
"grad_norm": 1.5605932348002485,
"learning_rate": 6.207670651651461e-06,
"loss": 2.2724,
"step": 22060
},
{
"epoch": 61.5041782729805,
"grad_norm": 1.3976211106358032,
"learning_rate": 6.183029665886133e-06,
"loss": 2.2473,
"step": 22080
},
{
"epoch": 61.559888579387184,
"grad_norm": 1.8546230069148926,
"learning_rate": 6.158638632953763e-06,
"loss": 2.2717,
"step": 22100
},
{
"epoch": 61.61559888579387,
"grad_norm": 1.89636102470963,
"learning_rate": 6.134497720726502e-06,
"loss": 2.2812,
"step": 22120
},
{
"epoch": 61.67130919220056,
"grad_norm": 1.5811862192223516,
"learning_rate": 6.110607095355023e-06,
"loss": 2.2526,
"step": 22140
},
{
"epoch": 61.72701949860724,
"grad_norm": 1.4824595544381087,
"learning_rate": 6.0869669212674075e-06,
"loss": 2.2745,
"step": 22160
},
{
"epoch": 61.78272980501393,
"grad_norm": 1.457324859716249,
"learning_rate": 6.063577361167978e-06,
"loss": 2.2999,
"step": 22180
},
{
"epoch": 61.83844011142061,
"grad_norm": 1.7057172055075098,
"learning_rate": 6.040438576036232e-06,
"loss": 2.2332,
"step": 22200
},
{
"epoch": 61.8941504178273,
"grad_norm": 1.4146229574598475,
"learning_rate": 6.0175507251256545e-06,
"loss": 2.2701,
"step": 22220
},
{
"epoch": 61.949860724233986,
"grad_norm": 1.583083000184335,
"learning_rate": 5.994913965962701e-06,
"loss": 2.2528,
"step": 22240
},
{
"epoch": 62.00557103064067,
"grad_norm": 1.4267458097429977,
"learning_rate": 5.972528454345661e-06,
"loss": 2.2459,
"step": 22260
},
{
"epoch": 62.061281337047355,
"grad_norm": 1.868289697809984,
"learning_rate": 5.950394344343613e-06,
"loss": 2.2553,
"step": 22280
},
{
"epoch": 62.116991643454035,
"grad_norm": 1.6100749427479117,
"learning_rate": 5.928511788295353e-06,
"loss": 2.258,
"step": 22300
},
{
"epoch": 62.17270194986072,
"grad_norm": 2.033401841218128,
"learning_rate": 5.906880936808346e-06,
"loss": 2.2656,
"step": 22320
},
{
"epoch": 62.22841225626741,
"grad_norm": 1.5336132691384432,
"learning_rate": 5.8855019387576895e-06,
"loss": 2.2713,
"step": 22340
},
{
"epoch": 62.28412256267409,
"grad_norm": 1.4006290924081595,
"learning_rate": 5.864374941285097e-06,
"loss": 2.273,
"step": 22360
},
{
"epoch": 62.33983286908078,
"grad_norm": 1.5727786325700963,
"learning_rate": 5.843500089797875e-06,
"loss": 2.2698,
"step": 22380
},
{
"epoch": 62.39554317548747,
"grad_norm": 1.5471505670773764,
"learning_rate": 5.822877527967931e-06,
"loss": 2.2366,
"step": 22400
},
{
"epoch": 62.45125348189415,
"grad_norm": 1.7387583369728248,
"learning_rate": 5.802507397730769e-06,
"loss": 2.2517,
"step": 22420
},
{
"epoch": 62.50696378830084,
"grad_norm": 1.34648038991986,
"learning_rate": 5.782389839284539e-06,
"loss": 2.2792,
"step": 22440
},
{
"epoch": 62.56267409470752,
"grad_norm": 1.4257642559426869,
"learning_rate": 5.76252499108904e-06,
"loss": 2.2639,
"step": 22460
},
{
"epoch": 62.618384401114206,
"grad_norm": 1.4992603072132409,
"learning_rate": 5.7429129898647996e-06,
"loss": 2.2469,
"step": 22480
},
{
"epoch": 62.674094707520894,
"grad_norm": 1.3812098236775807,
"learning_rate": 5.723553970592111e-06,
"loss": 2.2778,
"step": 22500
},
{
"epoch": 62.729805013927574,
"grad_norm": 1.5792548079682394,
"learning_rate": 5.704448066510095e-06,
"loss": 2.267,
"step": 22520
},
{
"epoch": 62.78551532033426,
"grad_norm": 1.3978392074739474,
"learning_rate": 5.6855954091158275e-06,
"loss": 2.2949,
"step": 22540
},
{
"epoch": 62.84122562674095,
"grad_norm": 1.323364408912749,
"learning_rate": 5.666996128163389e-06,
"loss": 2.239,
"step": 22560
},
{
"epoch": 62.89693593314763,
"grad_norm": 1.47299942786999,
"learning_rate": 5.648650351662984e-06,
"loss": 2.2428,
"step": 22580
},
{
"epoch": 62.95264623955432,
"grad_norm": 1.5746917164193233,
"learning_rate": 5.630558205880067e-06,
"loss": 2.2717,
"step": 22600
},
{
"epoch": 63.008356545961,
"grad_norm": 1.3453881918350201,
"learning_rate": 5.612719815334472e-06,
"loss": 2.2605,
"step": 22620
},
{
"epoch": 63.06406685236769,
"grad_norm": 1.5732476592031857,
"learning_rate": 5.595135302799554e-06,
"loss": 2.2981,
"step": 22640
},
{
"epoch": 63.119777158774376,
"grad_norm": 1.4933682090479892,
"learning_rate": 5.577804789301342e-06,
"loss": 2.2629,
"step": 22660
},
{
"epoch": 63.17548746518106,
"grad_norm": 2.277146814514644,
"learning_rate": 5.560728394117715e-06,
"loss": 2.2708,
"step": 22680
},
{
"epoch": 63.231197771587745,
"grad_norm": 1.272383107286924,
"learning_rate": 5.543906234777552e-06,
"loss": 2.2573,
"step": 22700
},
{
"epoch": 63.286908077994426,
"grad_norm": 1.5182549490627633,
"learning_rate": 5.527338427059974e-06,
"loss": 2.2316,
"step": 22720
},
{
"epoch": 63.34261838440111,
"grad_norm": 1.7891996636535088,
"learning_rate": 5.511025084993495e-06,
"loss": 2.2441,
"step": 22740
},
{
"epoch": 63.3983286908078,
"grad_norm": 1.5802790373457376,
"learning_rate": 5.494966320855273e-06,
"loss": 2.2617,
"step": 22760
},
{
"epoch": 63.45403899721448,
"grad_norm": 1.6073072729662374,
"learning_rate": 5.479162245170319e-06,
"loss": 2.2458,
"step": 22780
},
{
"epoch": 63.50974930362117,
"grad_norm": 1.5697619339543767,
"learning_rate": 5.4636129667107414e-06,
"loss": 2.2971,
"step": 22800
},
{
"epoch": 63.56545961002786,
"grad_norm": 1.3744571764690732,
"learning_rate": 5.448318592495002e-06,
"loss": 2.2844,
"step": 22820
},
{
"epoch": 63.62116991643454,
"grad_norm": 1.6689901020657363,
"learning_rate": 5.433279227787173e-06,
"loss": 2.2517,
"step": 22840
},
{
"epoch": 63.67688022284123,
"grad_norm": 1.382942198241601,
"learning_rate": 5.418494976096209e-06,
"loss": 2.26,
"step": 22860
},
{
"epoch": 63.73259052924791,
"grad_norm": 1.5877846358641807,
"learning_rate": 5.403965939175251e-06,
"loss": 2.2572,
"step": 22880
},
{
"epoch": 63.788300835654596,
"grad_norm": 1.816527139540087,
"learning_rate": 5.389692217020904e-06,
"loss": 2.2546,
"step": 22900
},
{
"epoch": 63.844011142061284,
"grad_norm": 1.7654697501524792,
"learning_rate": 5.375673907872574e-06,
"loss": 2.2418,
"step": 22920
},
{
"epoch": 63.899721448467965,
"grad_norm": 1.5888334127364063,
"learning_rate": 5.36191110821176e-06,
"loss": 2.2664,
"step": 22940
},
{
"epoch": 63.95543175487465,
"grad_norm": 1.4670614525113759,
"learning_rate": 5.348403912761424e-06,
"loss": 2.2343,
"step": 22960
},
{
"epoch": 64.01114206128133,
"grad_norm": 1.3543942876062933,
"learning_rate": 5.335152414485308e-06,
"loss": 2.2503,
"step": 22980
},
{
"epoch": 64.06685236768803,
"grad_norm": 1.4563546735325645,
"learning_rate": 5.32215670458733e-06,
"loss": 2.2304,
"step": 23000
},
{
"epoch": 64.12256267409471,
"grad_norm": 1.622565484000199,
"learning_rate": 5.309416872510913e-06,
"loss": 2.2452,
"step": 23020
},
{
"epoch": 64.17827298050139,
"grad_norm": 1.4460308265765018,
"learning_rate": 5.296933005938412e-06,
"loss": 2.2938,
"step": 23040
},
{
"epoch": 64.23398328690807,
"grad_norm": 1.4440564061553423,
"learning_rate": 5.284705190790466e-06,
"loss": 2.2453,
"step": 23060
},
{
"epoch": 64.28969359331477,
"grad_norm": 1.4478148572192913,
"learning_rate": 5.272733511225455e-06,
"loss": 2.2343,
"step": 23080
},
{
"epoch": 64.34540389972145,
"grad_norm": 1.4180812216938306,
"learning_rate": 5.261018049638886e-06,
"loss": 2.2665,
"step": 23100
},
{
"epoch": 64.40111420612813,
"grad_norm": 1.5831915297512447,
"learning_rate": 5.24955888666284e-06,
"loss": 2.2539,
"step": 23120
},
{
"epoch": 64.45682451253482,
"grad_norm": 1.6700085502977315,
"learning_rate": 5.238356101165407e-06,
"loss": 2.2677,
"step": 23140
},
{
"epoch": 64.5125348189415,
"grad_norm": 1.4281340861580372,
"learning_rate": 5.227409770250158e-06,
"loss": 2.2693,
"step": 23160
},
{
"epoch": 64.56824512534818,
"grad_norm": 1.474080653934136,
"learning_rate": 5.216719969255597e-06,
"loss": 2.2576,
"step": 23180
},
{
"epoch": 64.62395543175488,
"grad_norm": 1.5546948660357771,
"learning_rate": 5.206286771754661e-06,
"loss": 2.2718,
"step": 23200
},
{
"epoch": 64.67966573816156,
"grad_norm": 1.5619037183872753,
"learning_rate": 5.196110249554205e-06,
"loss": 2.2617,
"step": 23220
},
{
"epoch": 64.73537604456824,
"grad_norm": 1.4612049949245505,
"learning_rate": 5.186190472694495e-06,
"loss": 2.2531,
"step": 23240
},
{
"epoch": 64.79108635097494,
"grad_norm": 1.4463084271801094,
"learning_rate": 5.176527509448752e-06,
"loss": 2.2492,
"step": 23260
},
{
"epoch": 64.84679665738162,
"grad_norm": 1.52054949971835,
"learning_rate": 5.167121426322663e-06,
"loss": 2.265,
"step": 23280
},
{
"epoch": 64.9025069637883,
"grad_norm": 1.6137336460770288,
"learning_rate": 5.157972288053923e-06,
"loss": 2.2761,
"step": 23300
},
{
"epoch": 64.958217270195,
"grad_norm": 1.5343563001642067,
"learning_rate": 5.1490801576118046e-06,
"loss": 2.2589,
"step": 23320
},
{
"epoch": 65.01392757660167,
"grad_norm": 1.4067304900789732,
"learning_rate": 5.140445096196706e-06,
"loss": 2.2344,
"step": 23340
},
{
"epoch": 65.06963788300835,
"grad_norm": 1.6266281172023442,
"learning_rate": 5.132067163239744e-06,
"loss": 2.2327,
"step": 23360
},
{
"epoch": 65.12534818941504,
"grad_norm": 1.6114614480347964,
"learning_rate": 5.123946416402338e-06,
"loss": 2.2252,
"step": 23380
},
{
"epoch": 65.18105849582173,
"grad_norm": 1.4381934112576613,
"learning_rate": 5.116082911575816e-06,
"loss": 2.2376,
"step": 23400
},
{
"epoch": 65.23676880222841,
"grad_norm": 1.6265621183224508,
"learning_rate": 5.108476702881032e-06,
"loss": 2.2575,
"step": 23420
},
{
"epoch": 65.29247910863509,
"grad_norm": 1.6332251534091453,
"learning_rate": 5.101127842667981e-06,
"loss": 2.2482,
"step": 23440
},
{
"epoch": 65.34818941504179,
"grad_norm": 1.4703516792242604,
"learning_rate": 5.094036381515459e-06,
"loss": 2.2636,
"step": 23460
},
{
"epoch": 65.40389972144847,
"grad_norm": 1.828744349221896,
"learning_rate": 5.087202368230689e-06,
"loss": 2.2676,
"step": 23480
},
{
"epoch": 65.45961002785515,
"grad_norm": 1.5728681116117378,
"learning_rate": 5.080625849849016e-06,
"loss": 2.2408,
"step": 23500
},
{
"epoch": 65.51532033426184,
"grad_norm": 1.3646586794266737,
"learning_rate": 5.074306871633561e-06,
"loss": 2.2594,
"step": 23520
},
{
"epoch": 65.57103064066852,
"grad_norm": 1.344312658230311,
"learning_rate": 5.068245477074914e-06,
"loss": 2.2548,
"step": 23540
},
{
"epoch": 65.6267409470752,
"grad_norm": 2.3009044106769543,
"learning_rate": 5.062441707890833e-06,
"loss": 2.2515,
"step": 23560
},
{
"epoch": 65.6824512534819,
"grad_norm": 1.5477394057524316,
"learning_rate": 5.056895604025971e-06,
"loss": 2.2286,
"step": 23580
},
{
"epoch": 65.73816155988858,
"grad_norm": 1.473509209538851,
"learning_rate": 5.051607203651582e-06,
"loss": 2.2558,
"step": 23600
},
{
"epoch": 65.79387186629526,
"grad_norm": 1.6163262255244608,
"learning_rate": 5.046576543165266e-06,
"loss": 2.2587,
"step": 23620
},
{
"epoch": 65.84958217270194,
"grad_norm": 1.7522873848064437,
"learning_rate": 5.041803657190727e-06,
"loss": 2.262,
"step": 23640
},
{
"epoch": 65.90529247910864,
"grad_norm": 1.9510070766562828,
"learning_rate": 5.037288578577515e-06,
"loss": 2.2731,
"step": 23660
},
{
"epoch": 65.96100278551532,
"grad_norm": 1.3249968235869283,
"learning_rate": 5.033031338400824e-06,
"loss": 2.2357,
"step": 23680
},
{
"epoch": 66.016713091922,
"grad_norm": 1.8089068811586904,
"learning_rate": 5.0290319659612565e-06,
"loss": 2.2264,
"step": 23700
},
{
"epoch": 66.0724233983287,
"grad_norm": 1.6823052379660255,
"learning_rate": 5.0252904887846365e-06,
"loss": 2.2241,
"step": 23720
},
{
"epoch": 66.12813370473538,
"grad_norm": 1.4348819340116656,
"learning_rate": 5.02180693262181e-06,
"loss": 2.2448,
"step": 23740
},
{
"epoch": 66.18384401114206,
"grad_norm": 1.48816291038319,
"learning_rate": 5.01858132144848e-06,
"loss": 2.2445,
"step": 23760
},
{
"epoch": 66.23955431754875,
"grad_norm": 1.4921612956391412,
"learning_rate": 5.015613677465031e-06,
"loss": 2.2608,
"step": 23780
},
{
"epoch": 66.29526462395543,
"grad_norm": 1.8304620275041354,
"learning_rate": 5.0129040210963695e-06,
"loss": 2.2599,
"step": 23800
},
{
"epoch": 66.35097493036211,
"grad_norm": 1.3873830650491625,
"learning_rate": 5.010452370991807e-06,
"loss": 2.2506,
"step": 23820
},
{
"epoch": 66.40668523676881,
"grad_norm": 1.4260103007082212,
"learning_rate": 5.008258744024913e-06,
"loss": 2.2474,
"step": 23840
},
{
"epoch": 66.46239554317549,
"grad_norm": 1.5475790370983857,
"learning_rate": 5.006323155293398e-06,
"loss": 2.2718,
"step": 23860
},
{
"epoch": 66.51810584958217,
"grad_norm": 1.6513082696882795,
"learning_rate": 5.004645618119022e-06,
"loss": 2.2305,
"step": 23880
},
{
"epoch": 66.57381615598885,
"grad_norm": 1.3518749191666553,
"learning_rate": 5.0032261440475e-06,
"loss": 2.2475,
"step": 23900
},
{
"epoch": 66.62952646239555,
"grad_norm": 1.4415217716791036,
"learning_rate": 5.0020647428484e-06,
"loss": 2.2413,
"step": 23920
},
{
"epoch": 66.68523676880223,
"grad_norm": 1.463396039264776,
"learning_rate": 5.001161422515119e-06,
"loss": 2.2409,
"step": 23940
},
{
"epoch": 66.74094707520891,
"grad_norm": 1.6094576785573045,
"learning_rate": 5.000516189264787e-06,
"loss": 2.2368,
"step": 23960
},
{
"epoch": 66.7966573816156,
"grad_norm": 1.8634319644864112,
"learning_rate": 5.000129047538239e-06,
"loss": 2.2534,
"step": 23980
},
{
"epoch": 66.85236768802228,
"grad_norm": 1.5147891109103364,
"learning_rate": 5e-06,
"loss": 2.2525,
"step": 24000
}
],
"logging_steps": 20,
"max_steps": 24000,
"num_input_tokens_seen": 0,
"num_train_epochs": 67,
"save_steps": 3000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4048509763584000.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}